# racd11.R  January 2013 for R version 2.15.0

rm(list=ls())

# Create log file
sink("racd11.Rout")

# ********** OVERVIEW OF racd11.R **********

# R Program 
# copyright C 2013 by A. Colin Cameron and Pravin K. Trivedi 
# used for "Regression Analyis of Count Data" SECOND EDITION
# by A. Colin Cameron and Pravin K. Trivedi (2013)
# Cambridge University Press

# To run you need file
#   racd09data.dta
# and R packages
#   foreign, MASS, np
# This program mostly uses R package np (Hayfield and Racine (2008))

# This R program analyses patents data for chapter 11
#  11.8.2 NONPARAMETRIC DENSITY ESTIMATION
#  11.8.3 NONPARAMETRIC AND SEMIPARAMETIC CONDITIONAL MEAN ESTIMATION
 
# ********* DATA DESCRIPTION

#  The original data are from 
#  Bronwyn Hall, Zvi Griliches, and Jerry Hausman (1986), 
#  "Patents and R&D: Is There a Lag?", 
#  International Economic Review, 27, 265-283.
#  See this article for more detailed discussion 
#  Also see racd09makedata.do for further details  

#  NOTE: Here we use just 1979 data 
#  For more details see racd09makedata.do

# ********* 11.8 READ DATA AND SUMMARIZE 

# Read in and select data
library(foreign)
data.ch11 <- read.dta(file = "racd09data.dta")
data.ch11 <- subset(data.ch11,data.ch11$YEAR==5)    # Selects 1979 only
# Use the following to speed up program
# data.ch11 <- subset(data.ch11,data.ch11$OBSNO<50)   # Selects first 50 observations

# Allows variables in database to be accessed simply by giving names
attach(data.ch11)
# Lists first six observations
head(data.ch11)

# Tabulate counts of doctor visits
table(PAT)
table(PAT) / nrow(data.ch11)

# Create extra variable and include in the data frame
LOGRandD <- log(exp(LOGR)+exp(LOGR1)+exp(LOGR2)+exp(LOGR3)+exp(LOGR4)+exp(LOGR5))
data.ch11 <- cbind(data.ch11,LOGRandD)

# Variable list
names(data.ch11)

# Summary statistics
summary(data.ch11)
sapply(data.ch11,mean)
sapply(data.ch11,sd)

# Formula for the model estimated in this chapter - shortens the commands below
formula.ch11model <- as.formula(PAT ~ LOGRandD+LOGK+SCISECT)

# Poisson with default standard errors (variance is a multiple of the mean)
model.poiss <- glm(formula.ch11model, family=poisson(), data=data.ch11) 
summary(model.poiss)
yhat.poiss <- fitted(model.poiss)

# Negative binomial MLE - a benchmark model
library(MASS)
model.nb <- glm.nb(formula.ch11model, data=data.ch11) 
summary(model.nb)
yhat.nb <- fitted(model.nb)

#### 11.8.3 NONPARAMETRIC AND SEMIPARAMETIC CONDITIONAL MEAN ESTIMATION  ###

# Package np programs first find the bandwidth then do the estimation
# Nonparametric conditional mean:            npreg and npregbw  
# Semiparametric single-index mean:          npindex and npindexbw
# Semiparametric partial linear mean:        npplreg and npplregbw  
# Nonparametric conditional quantile:        npplreg and npplregbw  
# Nonparametric unconditional density & cdf: npudens and npudensbw
# Nonparametric conditional density and cdf: npcdens and npdensbw

# Bootstrap defaults: random.seed=42 and boot.num=399 and boot.method="iid"
# I reduce boot.num to 99 to speed up program

# install.packages("np")
library(np)

# Nonparametric conditional mean estimation (local linear kernel)

# bw.npreg <- npregbw(formula=PAT~LOGRandD+factor(SCISECT)+LOGK,data=data.ch11)
bw.npreg <- npregbw(formula=PAT~LOGRandD+factor(SCISECT)+LOGK,regtype="ll",bwmethod="cv.aic",data=data.ch11)
summary(bw.npreg)                                  # Bandwidth
model.npreg <- npreg(bws=bw.npreg, gradients=TRUE) 
summary(model.npreg)                               # Nonparametric regression
npsigtest(model.npreg,boot.num=99)                 # Test statistical significance
summary(gradients(model.npreg))                    # Summarize gradients
apply(gradients(model.npreg),2,mean)
apply(gradients(model.npreg),2,sd)
plot(bw.npreg,plot.errors.method="bootstrap", 
      boot.num=99)                                 # Conditional mean vs each x
dev.copy(png,'racd11npreg.png')
dev.off()
# plot(bw.npreg, plot.errors.method="asymptotic")  # Did not work here
plot(bw.npreg,gradients=TRUE, 
   plot.errors.method="bootstrap",boot.num=99)     # Change in cond. mean vs each x
dev.copy(png,'racd11npreggrads.png')
dev.off()
yhat.npreg <- fitted(model.npreg)                  # Predicted mean

# Semiparametric single index conditional mean estimation

# Results differrent and strange if order of regressors PAT~LOGRandD+factor(SCISECT)+LOGK 
# bw.npindex <- npindexbw(formula=PAT~LOGRandD+factor(SCISECT)+LOGK,data=data.ch11)
bw.npindex <- npindexbw(formula=PAT~factor(SCISECT)+LOGK+LOGRandD,data=data.ch11)
summary(bw.npindex)
model.npindex <- npindex(bws=bw.npindex, gradients=TRUE) 
summary(model.npindex)
plot(bw.npindex,plot.errors.method="bootstrap",bootnum=99)
dev.copy(png,'racd11npindex.png')
dev.off()
plot(bw.npindex,gradients=TRUE,plot.errors.method="bootstrap",bootnum=99)
dev.copy(png,'racd11npindexgrads.png')
dev.off()
yhat.npindex <- fitted(model.npindex)

# Semiparametric partial linear conditional mean estimation
# Here E[y|x] = b1*SCISECT + b2*LOGK + g(LOGRandD)
# For illustraion. For counts a partial linear model is usually not appropriate.
bw.npplreg <- npplregbw(formula=PAT~factor(SCISECT)+LOGK|LOGRandD,data=data.ch11)
summary(bw.npplreg)
model.npplreg <- npplreg(bws=bw.npplreg, gradients=TRUE) 
summary(model.npplreg)
# The following takes longer than the preceding commands
plot(bw.npplreg,plot.errors.method="bootstrap",boot.num=99)
dev.copy(png,'racd11npplreg.png')
dev.off()
# plot with GRADIENTS=TRUE not applicable
yhat.npplreg <- fitted(model.npplreg)

### TABLE 11.3: SUMMARY OF FITTED VALUES 

# Compare the various predicted conditional means
predictedmeans <- cbind(PAT,yhat.poiss,yhat.nb,yhat.npreg,yhat.npindex,yhat.npplreg)
apply(predictedmeans,2,mean)
apply(predictedmeans,2,sd)
summary(predictedmeans) 

### TABLE 11.4: CORRELATIONS OF FITTED VALUES 

cor(predictedmeans)    # Note - header for yhat.npplreg not given

### FIGURE 11.1: FITTED VALUES PLOTTED AGAINST ACTUAL VALUES

# Give plots for the various conditional means against y
par(mfrow=c(2,2))    # Four panels - 2 x 2
# par(oma=c(1,1,1,1))
# par(mar=c(1,1,1,1))
# par(mar=c(5, 4, 4, 2)+0.1)
par(mar=c(5, 4, 1, 1))
plot(yhat.nb~PAT,ylab="Fitted: Negative binomial")
plot(yhat.npreg~PAT,ylab="Fitted: Nonparametric")
plot(yhat.npindex~PAT,ylab="Fitted: Single-index")
plot(yhat.nb~PAT,ylab="Fitted: Partial linear")
dev.print(device=postscript, "racd11fig1.eps", onefile=FALSE, horizontal=FALSE)
dev.copy(png,'racd11fig1.png')
dev.off()
par(mfrow=c(1,1))

###  11.8.2 NONPARAMETRIC DENSITY ESTIMATION   ###

# Nonparametric unconditional density and c.d.f. estimation

# Univariate treating count PAT as discrete
bw.npudens <- npudensbw(formula=~ordered(PAT))
summary(bw.npudens)
model.npudens <- npudens(bws=bw.npudens) 
summary(model.npudens)
plot(model.npudens)
dev.copy(png,'racd11npudens.png')        # density
dev.off()
plot(model.npudens,cdf=TRUE)       # cdf
dev.copy(png,'racd11npucdf.png')
dev.off()

# Univariate treating count PAT as continuous
bw.npudens.cont <- npudensbw(formula=~(PAT))
summary(bw.npudens.cont)
model.npudens.cont <- npudens(bws=bw.npudens.cont) 
summary(model.npudens.cont)
plot(model.npudens.cont)            # density
dev.copy(png,'racd11npudenscont.png')
dev.off()
plot(model.npudens.cont,cdf=TRUE)   # Cdf
dev.copy(png,'racd11npucdfcont.png')
dev.off()
 
# Bivariate - one continuous and one ordered
bw.npudens.2 <- npudensbw(formula=~ordered(PAT)+LOGRandD)
summary(bw.npudens.2)
model.npudens.2 <- npudens(bws=bw.npudens.2)
summary(model.npudens.2)
plot(model.npudens.2,xtrim=-0.2,view="fixed",main="")
dev.copy(png,'racd11npudens2.png')
dev.off()
plot(model.npudens.2,xtrim=-0.2,cdf=TRUE,view="fixed",main="")
dev.copy(png,'racd11npucdf2.png')
dev.off()

# Nonparametric conditional density and c.d.f. estimation

# Following does not always work
# bw.npcdens <- npcdensbw(formula=ordered(PAT)~LOGRandD+factor(SCISECT)+LOGK,oxkertype="liracine",uxkertype="liracine",data=data.ch11)
bw.npcdens <- npcdensbw(formula=ordered(PAT)~LOGRandD+factor(SCISECT)+LOGK,data=data.ch11)
summary(bw.npcdens)
model.npcdens <- npcdens(bws=bw.npcdens)  # Predicted conditional probability
summary(model.npcdens)
plot(bw.npcdens,plot.errors.method="bootstrap",boot.num=99)  # Plot conditional density
dev.copy(png,'racd11npcdens.png')
dev.off()
plot(bw.npcdens,cdf=TRUE)       # Plot conditional c.d.f.
dev.copy(png,'racd11npccdf.png')
dev.off()

# Nonparametric conditional mode 

### TABLE 11.2: ACTUAL VALUES AGAINST MOST HIGHLY LIKELY PREDICTED VALUE

# npconmode(bws=bw.npcdens,ordered(PAT)~LOGRandD+factor(SCISECT)+LOGK,uxkertype="liracine",data=data.ch11)
model.npconmode <- npconmode(bws=bw.npcdens,ordered(PAT)~LOGRandD+factor(SCISECT)+LOGK)
summary(model.npconmode)
fitted(model.npconmode)

model <- npconmode(ordered(PAT)~LOGRandD+factor(SCISECT)+LOGK,oxkertype="liracine",uxkertype="liracine")
summary(model#bws)
summary(model)

# Nonparametric conditional quantile 
# This requires treat PAT as continuous

bw.npcdens.cont <- npcdensbw(formula=PAT~LOGRandD+factor(SCISECT)+LOGK,data=data.ch11)
summary(bw.npcdens.cont)

model.npcdens.cont <- npcdens(bws=bw.npcdens.cont)  # Predicted conditional probability
summary(model.npcdens.cont)

# 25th quantiles fid not work. Presumably too many zeroes.
# model.q0.25 <- npqreg(bws=bw.npcdens.cont,tau=0.25)  

model.q0.50 = npqreg(bws=bw.npcdens.cont,tau=0.50)   # Several restarts
summary(model.q0.50)
plot(model.q0.50)
dev.copy(png,'racd11q50.png')
dev.off()
yhat.q0.50 <- fitted(model.q0.50)
summary(yhat.q0.50)
plot(model.q0.50$quantile~yhat.npreg)
yhat.q0.50 <- model.q0.50$quantile

model.q0.75 = npqreg(bws=bw.npcdens.cont,tau=0.75)   # One obs needed many restarts 
summary(model.q0.75)
plot(model.q0.75)
dev.copy(png,'racd11q75.png')
dev.off()
yhat.q0.75 <- fitted(model.q0.75)
summary(yhat.q0.75)
yhat.q0.75 <- model.q0.75$quantile

predictedquantiles <- cbind(PAT,yhat.q0.50,yhat.q0.75,yhat.npreg)
apply(predictedquantiles,2,mean)
apply(predictedquantiles,2,sd)
summary(predictedquantiles) 
cor(predictedquantiles)    

# Give plots for the various conditional means against y
par(mfrow=c(2,2))    # Four panels - 2 x 2
# par(oma=c(1,1,1,1))
# par(mar=c(1,1,1,1))
# par(mar=c(5, 4, 4, 2)+0.1)
par(mar=c(5, 4, 1, 1))
plot(yhat.npreg~PAT,ylab="Fitted: Nonparametric")
plot(yhat.q0.50~PAT,ylab="Fitted: 50th quantile")
plot(yhat.q0.75~PAT,ylab="Fitted: 75th quantile")
plot(yhat.q0.50~yhat.q0.75,ylab="50th vs 75th quantile")
dev.copy(png,'racd11quantiles.png')
dev.off()
par(mfrow=c(1,1))

# Check the pattern of the 50th quantiles against the regressor - is it exponential ?
# bw.check <- npregbw(formula=PAT~LOGRandD+factor(SCISECT)+LOGK,data=data.ch11)
bw.q50check <- npregbw(formula=yhat.q0.50~LOGRandD+factor(SCISECT)+LOGK,regtype="ll",bwmethod="cv.aic",data=data.ch11)
summary(bw.q50check)                                  # Bandwidth
model.q50check <- npreg(bws=bw.q50check, gradients=TRUE) 
summary(modelq50.check)                               # Nonparametric regression
plot(bw.q50check) 
dev.copy(png,'racd11qheck.png')
dev.off()

# Not used - possible flexible parametric
# install.packages("tweedie")
# library(tweedie)
# glm(PAT ~ LOGRandD+LOGK+SCISECT,family=tweedie(var.power=3,link.power=0))

# close log file
sink()

