# racd03.R  March 2012 for R version 2.15.0

# Create log file
sink("racd03.Rout")

rm(list=ls())

# ********** OVERVIEW OF racd03.R **********

# STATA Program 
# copyright C 2012 by A. Colin Cameron and Pravin K. Trivedi 
# used for "Regression Analyis of Count Data" SECOND EDITION
# by A. Colin Cameron and Pravin K. Trivedi (2012)
# Cambridge University Press

# To run you need file
#   racd03data.dta
# and R packages foreign, pscl, sandwich, boot, MASS and gamlss 

# This R program does some of the analysis for Chapter 3
# (The Stata program racd03.do does more)
#   3.2 POISSON REGRESSION WITH VARIOUS STANDARD ERRORS
#   3.3 NEGATIVE BINOMIAL WITH VARIOUS STANDARD ERRORS
#   3.5 MARGINAl EFFECTS AFTER POISSON
#   3.7 OTHER MODELS
 
# ********* DATA DESCRIPTION

# The data set racd3data.dta is the same data as originally used in
# (1) A.C. Cameron and P.K. Trivedi (1986), "Econometric Models Based on
# Count Data: Comparisons and Applications of  Some Estimators and Tests",
# Journal of Applied Econometrics, Vol. 1, pp. 29-54.
# and in other papers.

# This data is not a representative sample of Australians as it oversamples
# young and old. In particular, use of health services may be overstated.
# This is because while the original sample of 40,650 individuals
# from the 1977-78 Australian Health Survey is representative,
# the sample used here is restricted to single people over 18 years of age.
 
# See the R.E.Stud. (1988, pp.85-106) section 3 for more detailed
# discussion of the data than that given in the RACD book.
# Also see racd03makedata.do for further details 

# ********* 3.2 READ DATA AND SUMMARIZE 

# install.packages("foreign") 
library(foreign)
data.ch3 <- read.dta(file = "racd03data.dta")
# Allows variables in database to be accessed simply by giving names
attach(data.ch3)   
# Lists first six observations
head(data.ch3)

# *** TABLE 3.1: FREQUENCIES

# Tabulate counts of doctor visits
table(DVISITS)
print(table(DVISITS)/nrow(data.ch3),digits=2)

# *** TABLE 3.2: VARIABLE DEFINITIONS AND SUMMARY STATISTICS

# Variable list
names(data.ch3)

# Summary statistics and (mean,sd)
summary(data.ch3)
cbind(sapply(data.ch3,mean),sapply(data.ch3,sd))

# Formula for regressors and the model estimated in this chapter - shortens the commands below
XLIST="SEX+AGE+AGESQ+INCOME+LEVYPLUS+FREEPOOR+FREEREPA+ILLNESS+ACTDAYS+HSCORE+CHCOND1+CHCOND2"
formula.ch3model <- as.formula(paste("DVISITS~",XLIST))

# Alternative in one step is
# formula.ch3model <- as.formula(DVISITS ~ SEX+AGE+AGESQ+INCOME+LEVYPLUS
#                      +FREEPOOR+FREEREPA+ILLNESS+ACTDAYS+HSCORE+CHCOND1+CHCOND2)

# ********** 3.2 POISSON REGRESSION WITH VARIOUS STANDARD ERRORS

# *** TABLE 3.3: In order columns NB1, RS, MLH, and boot

# Poisson with NB1 standard errors (assumes variance is a multiple of the mean)
model.poissqmle <- glm(formula.ch3model, family=quasipoisson()) 
print(summary(model.poissqmle),digits=3)

# Poisson with robust standard errors (no assumption on variance)
# install.packages("sandwich")
library(sandwich)
cov.robust <- vcovHC (model.poissqmle, type="HC0")
se.robust <- sqrt(diag(cov.robust))
coeffs <- coef(model.poissqmle)
t.robust <- coeffs / se.robust
summary.poissrobust <- cbind(coeffs, se.robust, t.robust, pvalue=round(2*(1-pnorm(abs(coeffs/se.robust))),5), 
           lower=coeffs-1.96*se.robust, upper=coeffs+1.96*se.robust)    
print(summary.poissrobust,digits=3)

# Poisson Default standard errors (variance equals mean) based on the inverse of the Hessian
model.poissdef <- glm(formula.ch3model, family=poisson()) 
print(summary(model.poissdef),digits=3)

# Robust sandwich based on model.poissdef same as from model.poissqmle

# Poisson Bootstrap standard errors (variance equals mean)
# This takes time
# Based on http://socserv.mcmaster.ca/jfox/Books/Companion/appendix/Appendix-Bootstrapping.pdf
# install.packages("boot")
library(boot)
boot.poiss <- function(data, indices) {
  data <- data[indices, ] 
  model <- glm(formula.ch3model, family=poisson(), data=data)
  coefficients(model)
  }
set.seed(10101)
# To speed up this uses only 39 bootstraps. Should instead use 399 bootstraps.
# dvisits.boot <- boot(data.ch3, boot.poiss, 399)
summary.poissboot <- boot(data.ch3, boot.poiss, 39)
print(summary.poissboot,digits=3)

# ********** 3.3 NEGATIVE BINOMIAL NB1 and NB2 MLE

# ** NB2 USING PACKAGE MASS

# MASS does just NB2
# install.packages("MASS")
library(MASS)

# Note: MASS  reports as the overdispersion parameter 1/alpha not alpha
# In this example R reports 0.9285 and 1/0.9285 = 1.077 as in Stata 

# NB2 with default standard errors (inverse of Hessian ?)
model.nb2.MASS <- glm.nb(formula.ch3model) 
print(summary(model.nb2.MASS),digits=3)

# ** NB2 USING PACKAGE MASS

# gamlss does NB1 as well as NB2 and allows variance parameter to depend on regressors
# install.packages("gamlss")
library(gamlss)

# Note: What we call NB2 package gamlss calls NBI

# Note: gamlss reports as the overdispersion parameter log(alpha) not alpha
# In this example R reports 0.07461 and exp(0.07461) = 1.077 as in Stata 

# NB2 with default standard errors (inverse of Hessian ?) 
# Same resaults as for MASS default
model.nb2.gamlss <- gamlss(formula.ch3model,family=NBI, data=data.ch3, method=mixed(1,20))
print(summary(model.nb2.gamlss),digits=3)

# *** NB1 USING PACKAGE MASS

# Note: What we call NB1 this package calls NBII

# Note: gamlss reports as the overdispersion parameter log(alpha) not alpha
# In this example R reports -0.7863 and exp(-0.7863) = 0.455 as in Stata 

# NB1 with default standard errors (inverse of Hessian ?) 
model.nb1.gamlss <- gamlss(formula.ch3model,family=NBII, data=data.ch3, method=mixed(1,20))
print(summary(model.nb1.gamlss),digits=3)

# TABLE 3.4: MB2 and NB1 model estimators and standard errors

# (1) NB2 MLE coefficients in Table 3.4 are same as from gamlss or MASS NBII
#             standard errors are different as gamlss or MASS givede default not RS
# (2) NB2 QGP is not computed here (requires own coding of an objective function)
# (3) NB1 MLE coefficients in Table 3.4 are same as from gamlss or MASS NBI
#             standard errors are different as gamlss or MASS givede default not RS
# (4) NB1 GLM coefficients in Table 3.4 are same as from glm(., family=quasipoisson())
#             standard errors are same as robust sandwich - see summary.poissboot

# ********** 3.5 USING REGRESSION RESULTS 

# Average marginal effects
# This uses result that for Poisson QMLE with intercept AME is simply mean(y)*bhat
meany <- mean(DVISITS)
AME <- meany * coeffs
AME

# Marginal effect at means
# First compute the means
Xmean <- cbind(1,mean(SEX),mean(AGE),mean(AGESQ),mean(INCOME),mean(LEVYPLUS),
  mean(FREEPOOR),mean(FREEREPA),mean(ILLNESS),mean(ACTDAYS),mean(HSCORE),
  mean(CHCOND1),mean(CHCOND2))
Xmean
coeffs <- coef(model.poissqmle)
expxbhat <- exp(coeffs%*%t(Xmean))
expxbhat 
MEM <- expxbhat * coeffs
MEM

# Aside: Alternative way to compute means is
# Xmean <- colMeans(cbind(1,SEX,AGE,AGESQ,INCOME,LEVYPLUS,FREEPOOR,FREEREPA,
          ILLNESS,ACTDAYS,HSCORE,CHCOND1,CHCOND2))
# Aside: Another alternative way to compute means (of columns so apply( ,2, ) )

# OLS coefficients
coeffs.ols <- coef(lm(formula.ch3model)) 

# Elasticities
# Here * is element by element multiplication
Elast <- coeffs * t(Xmean)    

# Scale by standard deviation the coefficients
# First compute the standard deviations
Xsd <- cbind(1,sd(SEX),sd(AGE),sd(AGESQ),sd(INCOME),sd(LEVYPLUS),
  sd(FREEPOOR),sd(FREEREPA),sd(ILLNESS),sd(ACTDAYS),sd(HSCORE),
  sd(CHCOND1),sd(CHCOND2))
SSC <- coeffs * t(Xsd)

### TABLE 3.6: MARGINAL EFFECTS and RELATED INFORMATION

print(cbind(AME,MEM,coeffs.ols,Elast,SSC,t(Xmean),t(Xsd)),digits=4)

# ********** 3.5 PREDICTION 

model.poiss <- glm(formula.ch3model, family=quasipoisson()) 
DVISITShat <- fitted.values(model.poiss)
summary(cbind(DVISITS,DVISITShat))
apply(cbind(DVISITS,DVISITShat),2,mean)
apply(cbind(DVISITS,DVISITShat),2,sd)

# Predicted probabilities using package pscl 
# install.packages("pscl")
library(pscl)

# Yields a matrix that is N times (maxy - miny + 1)
# Then take the average over individuals

# Poisson
ppoiss <- predprob(model.poissdef)
ppoissave <- colMeans(ppoiss)

# NB2
pnb2 <- predprob(model.nb2.MASS)
pnb2ave <- colMeans(pnb2)
relfreqs <- table(DVISITS) / nrow(data.ch3)

# Relative frequencies and predicted probabilities
predictedprobs <- cbind(relfreqs,ppoissave, pnb2ave)
print(predictedprobs,digits=3)

# ********** 3.7 OTHER MODELS

# Complementary log-log same as binary Poisson
# First create a binary dependent variable
BINDVISITS <- DVISITS
BINDVISITS[DVISITS>0] <- 1     
model.cloglog <- glm(as.formula(paste("BINDVISITS~",XLIST)),family=binomial(link="cloglog"))
print(summary(model.cloglog),digits=3)
coeffs.cloglog <- coef(model.cloglog)

# cloglog with robust sandwich standard errors
# library(sandwich)
vcovHC.cloglog <- vcovHC(model.cloglog)
robustse.cloglog <- sqrt(diag(vcovHC.cloglog))
coeffs.cloglog <- coef(model.cloglog)
robustt.cloglog <- coeffs.cloglog / robustse.cloglog
summary.cloglogrobust <- cbind(coeffs.cloglog, robustse.cloglog, robustt.cloglog, 
           pvalue=round(2*(1-pnorm(abs(coeffs.cloglog/robustse.cloglog))),5), 
           lower=coeffs-1.96*robustse.cloglog, upper=coeffs+1.96*robustse.cloglog)    
print(summary.cloglogrobust,digits=3)

# Ordered probit requires dependent variable to be declared as a factor variable
# Uses command polr in package MASS
# Slope coefficients are same as those for Stata
# Note that book gave different standardized coefficients
DVISITSFACTOR <- factor(DVISITS)
model.orderedprobit <- polr(as.formula(paste("DVISITSFACTOR~",XLIST)), method="probit")
print(summary(model.orderedprobit),digits=3)
# Following saves slopes but not cutoff parameters
coeffs.orderedprobit <- coefficients(model.orderedprobit) 

# OLS with default standard errors
model.ols <- lm(formula.ch3model) 
summary(model.ols) 
coeffs.ols <- coef(model.ols)

# OLS with robust sandwich standard errors
# library(sandwich)
vcovHC.ols <- vcovHC(model.ols)
robustse.ols <- sqrt(diag(vcovHC.ols))
coeffs.ols <- coef(model.ols)
robustt.ols <- coeffs.ols / robustse.ols
summary.olsrobust <- cbind(coeffs.ols, robustse.ols, robustt.ols, 
           pvalue=round(2*(1-pnorm(abs(coeffs.ols/robustse.ols))),5), 
           lower=coeffs-1.96*robustse.ols, upper=coeffs+1.96*robustse.ols)    
print(summary.olsrobust,digits=3)

# OLS for ln(y+0.1)
lny <- log(DVISITS+0.1)
model.lny <- lm(as.formula(paste("lny~",XLIST)))
summary(model.lny)
coeffs.lny <- coef(model.lny)

# OLS for ln(y+0.1) with robust sandwich standard errors
vcovHC.lny <- vcovHC(model.lny)
robustse.lny <- sqrt(diag(vcovHC.lny))
coeffs.lny <- coef(model.lny)
robustt.lny <- coeffs.lny / robustse.lny
summary.lnyrobust <- cbind(coeffs.lny, robustse.lny, robustt.lny, 
           pvalue=round(2*(1-pnorm(abs(coeffs.sqrt/robustse.lny))),5), 
           lower=coeffs-1.96*robustse.lny, upper=coeffs+1.96*robustse.lny)    
print(summary.lnyrobust,digits=3)

# OLS for square root transformation
sqrty <- sqrt(DVISITS)
model.sqrt <- lm(as.formula(paste("sqrty~",XLIST)))
summary(model.sqrt)
coeffs.sqrt <- coef(model.sqrt)

# OLS for square root transformation with robust sandwich standard errors
vcovHC.sqrt <- vcovHC(model.sqrt)
robustse.sqrt <- sqrt(diag(vcovHC.sqrt))
coeffs.sqrt <- coef(model.sqrt)
robustt.sqrt <- coeffs.sqrt / robustse.sqrt
summary.sqrtrobust <- cbind(coeffs.sqrt, robustse.sqrt, robustt.sqrt, 
           pvalue=round(2*(1-pnorm(abs(coeffs.sqrt/robustse.sqrt))),5), 
           lower=coeffs-1.96*robustse.sqrt, upper=coeffs+1.96*robustse.sqrt)    
print(summary.sqrtrobust,digits=3)

# Poisson
model.poiss <- glm(formula.ch3model, family=quasipoisson()) 
coeffs.poiss <- coef(model.poiss)
# From earlier
robustse.poiss <- se.robust
robustt.poiss <- t.robust

# Nonlinear least squares with default standard errors
# Same as glm for Gaussian with log link and easier than using comamnd nls
# Default is nonrobust standard errors
# Use Poisson estimates as starting values
startfornls <- coef(model.poiss) 
model.nls <- glm(formula.ch3model, family=gaussian(link="log"), start=startfornls)
print(summary(model.nls),digits=3)

# Nonlinear least squares with robust standard errors
vcovHC.nls <- vcovHC(model.nls)
robustse.nls <- sqrt(diag(vcovHC.nls))
coeffs.nls <- coef(model.nls)
robustt.nls <- coeffs.nls / robustse.nls
summary.nlsrobust <- cbind(coeffs.nls, robustse.nls, robustt.nls, 
           pvalue=round(2*(1-pnorm(abs(coeffs.nls/robustse.nls))),5), 
           lower=coeffs-1.96*robustse.nls, upper=coeffs+1.96*robustse.nls)    
print(summary.nlsrobust,digits=3)

### TABLE 3.7

# Most columns except ordered probit
print(cbind(coeffs.cloglog,coeffs.ols,coeffs.lny,coeffs.sqrt,coeffs.poiss,coeffs.nls),digits=3)

# Standard errors
print(cbind(robustse.cloglog,robustse.ols,robustse.lny,robustse.sqrt,robustse.poiss,robustse.nls),digits=3)

# T statistics
print(cbind(robustt.cloglog,robustt.ols,robustt.lny,robustt.sqrt,robustt.poiss,robustt.nls),digits=3)

# Not done - but to save default standard errors use package arm
# install.packages("arm")
# library(arm)
# se.cloglog <- se.coef(model.cloglog)
# t.cloglog <- coeffs.cloglog) / se.cloglog

# close log file
sink()

#  rhs="AGE+SEX"
# glm(as.formula(paste("DVISITS~",rhs)),family=poisson(),data=data.ch3)

