* racd11.do  January 2013 for Stata version 12

capture log close
log using racd11.txt, text replace

********** OVERVIEW OF racd11.do **********

* STATA Program 
* copyright C 2013 by A. Colin Cameron and Pravin K. Trivedi 
* used for "Regression Analyis of Count Data" SECOND EDITION
* by A. Colin Cameron and Pravin K. Trivedi (2013)
* Cambridge University Press

* This STATA program analyzes Patents data in Chapter 11
*   11.8.1 PATENTS - TABLE 11.1 only (parametric models)

* NOTE: The rest of chapter 11 is done using R program racd11.R

* To run you need files
*   racd09data.dta
* and user-written Stata addon
*   fmm
* in your directory

********** SETUP **********

set more off
version 11.2
clear all
* set linesize 82
set scheme s1mono  /* Graphics scheme */

************

* NOTE: This program produces just TABLE 11.1
* The rest of chapter 11 is done using R program racd09.R

********** DATA DESCRIPTION

*  The original data is from 
*  Bronwyn Hall, Zvi Griliches, and Jerry Hausman (1986), 
*  "Patents and R&D: Is There a Lag?", 
*  International Economic Review, 27, 265-283.
*  See this article for more detailed discussion 
*  Also see racd09makedata.do for further details  

*  NOTE: Here we use just 1979 data 

********** 11.8.1 PATENTS - PARAMETRIC MODELS

use racd09data.dta, clear

* Create log of total R&D over five years
generate LOGRandD = ln(exp(LOGR)+exp(LOGR1)+exp(LOGR2)+exp(LOGR3)+exp(LOGR4)+exp(LOGR5))

* Use only 1979 data
keep if YEAR==5

* Regressor list
global XLIST LOGRandD LOGK SCISECT
* global XLIST LOGR LOGR1 LOGR2 LOGR3 LOGR4 LOGR5 LOGK SCISECT

* Variable descriptions and summary statistics
describe PAT $XLIST
summarize PAT $XLIST

****** POISSON AND NB2: Model estimates and (grouped) predicted probabilities

* Data frequencies grouped
generate yle0 = PAT <= 0
generate yle1 = PAT <= 1
generate yle2 = PAT <= 2
generate yle5 = PAT <= 5
generate yle10 = PAT <= 10
generate yle20 = PAT <= 20
generate yle50 = PAT <= 50
generate yle100 = PAT <= 100
generate yle300 = PAT <= 300
generate yle500 = PAT <= 500

* Poisson regression
poisson PAT $XLIST, vce(robust)
estimates store POISSON
predict ple0, pr(0)
predict ple1, pr(0,1)
predict ple2, pr(0,2)
predict ple5, pr(0,5)
predict ple10, pr(0,10)
predict ple20, pr(0,20)
predict ple50, pr(0,50)
predict ple100, pr(0,100)
predict ple300, pr(0,300)
predict ple500, pr(0,1000)

* NB2 intercept-only
nbreg PAT, vce(robust)
predict nbintle0, pr(0)
predict nbintle1, pr(0,1)
predict nbintle2, pr(0,2)
predict nbintle5, pr(0,5)
predict nbintle10, pr(0,10)
predict nbintle20, pr(0,20)
predict nbintle50, pr(0,50)
predict nbintle100, pr(0,100)
predict nbintle300, pr(0,300)
predict nbintle500, pr(0,1000)

* NB2 regression
nbreg PAT $XLIST, vce(robust)
estimates store NB2
predict nble0, pr(0)
predict nble1, pr(0,1)
predict nble2, pr(0,2)
predict nble5, pr(0,5)
predict nble10, pr(0,10)
predict nble20, pr(0,20)
predict nble50, pr(0,50)
predict nble100, pr(0,100)
predict nble300, pr(0,300)
predict nble500, pr(0,1000)

* And model fit for NB2
predict yhat, n
summarize yhat PAT
correlate yhat PAT
display "Squared correlation of y and yhat = " r(rho)^2

* Aside: Calculate Li and Racine R2 measure
quietly sum PAT
scalar ymean = r(mean)
generate ycross = (PAT-ymean)*(yhat-ymean)
generate ylessybarsq = (PAT-ymean)^2
generate yhatlessybarsq = (yhat-ymean)^2
quietly summarize ycross
scalar Rnumerator = r(sum)
quietly summarize ylessybarsq
scalar Rdenominator1 = r(sum)
quietly summarize yhatlessybarsq
scalar Rdenominator2 = r(sum)
display "Li and Racine R-squared = " Rnumerator^2 / (Rdenominator1*Rdenominator2)

* Find the mode probability for each observation (i.e. k than maximizes Pr[y = k] 
predict nbfit0, pr(0)
generate mode = 0
forvalues i = 1/600 {
   predict nbfit1, pr(`i')
   quietly replace mode = `i'   if nbfit1> nbfit0
   quietly replace nbfit0 = nbfit1
   drop nbfit1
   }

* Compare the actual count to the predicted mode 
recode mode (0=0) (1=1) (2/5=2) (6/20=6) (21/50=31) (51/100=51) /// 
 (101/200=101) (201/300=201) (301/600=301), gen(modegrouped)
recode PAT (0=0) (1=1) (2/5=2) (6/20=6) (21/50=31) (51/100=51)  ///
  (101/200=101) (201/300=201) (301/600=301), gen(PATgrouped)
tabulate PATgrouped modegrouped
tabulate mode
count if PAT == mode
count if PATgrouped == modegrouped

sum yle0 yle1 yle2 yle5 yle10 yle20 yle50 yle100 yle300 yle500
sum ple0 ple1 ple2 ple5 ple10 ple20 ple50 ple100 ple300 ple500
sum nbintle0 nbintle1 nbintle2 nbintle5 nbintle10 nbintle20 nbintle50 ///
    nbintle100 nbintle300 nbintle500
sum nble0 nble1 nble2 nble5 nble10 nble20 nble50 nble100 nble300 nble500

***** NB2 MODEL WITH FIRST ORDER POLYNOMIAL - Estimates and (grouped) predicted probabilities

program lfnb2p1
  version 11
  args lnf theta1 alpha a          // theta1=x'b, alpha=alpha, a=a
  tempvar mu 
  local y $ML_y1                  // Define y so program more readable
  generate double `mu'  = exp(`theta1')
  quietly replace `lnf' = lngamma(`y'+(1/`alpha')) - lngamma((1/`alpha'))  ///
               -  lnfactorial(`y') - (`y'+(1/`alpha'))*ln(1+`alpha'*`mu')  ///
               + `y'*ln(`alpha') + `y'*ln(`mu')                            ///
    + ln((1+`a'*`y')^2) - ln(1+2*`a'*`mu'+(`a'^2)*(`mu'+(1+`alpha')*(`mu'^2)))
end
ml model lf lfnb2p1 (PAT = $XLIST) () (), vce(robust)
ml search
* ml init .7 .3 .0 1.0 .0 -.1 .0 -.6 .9 -.5, copy
ml maximize
estimates store NB2P1
estat ic

* Use the results to get predicted probabilities
matrix b = e(b)
scalar a1 = b[1,e(k)]
scalar alpha = b[1,e(k)-1]
matrix b = b[1,1..e(k)-2]
generate mu = 0
generate one = 1
local i 1
foreach var of varlist $XLIST one {
  scalar beta =  b[1,`i']
  quietly replace mu = mu + beta*`var'
  local i = `i' + 1
}
replace mu = exp(mu)
scalar ainv = 1/alpha
forvalues i = 0/500 {
   generate pfitnb2p1`i' = lngamma(`i'+ainv) - lngamma(ainv) - lnfactorial(`i') + ///
   ainv*ln(ainv/(ainv+mu)) + `i'*ln(mu/(ainv+mu)) + ln((1+a1*`i')^2) - ln(1+2*a1*mu+(a1^2)*(mu+(1+alpha)*(mu^2)))
   quietly replace pfitnb2p1`i' = exp(pfitnb2p1`i')
   }
generate nb2p1le0 = pfitnb2p10
generate nb2p1le1 = pfitnb2p10 + pfitnb2p11
generate nb2p1le2 = pfitnb2p10 + pfitnb2p11 + pfitnb2p12
generate nb2p1le5=pfitnb2p10+pfitnb2p11+pfitnb2p12+pfitnb2p13+pfitnb2p13+pfitnb2p15
generate nb2p1le10 = 0
generate nb2p1le20 = 0
generate nb2p1le50 = 0
generate nb2p1le100 = 0
generate nb2p1le300 = 0
generate nb2p1le500 = 0
forvalues i = 0/500 {
     quietly replace nb2p1le10 = nb2p1le10 +  pfitnb2p1`i' if `i'<=10
     quietly replace nb2p1le20 = nb2p1le20 +  pfitnb2p1`i' if `i'<=20
     quietly replace nb2p1le50 = nb2p1le50 +  pfitnb2p1`i' if `i'<=50
     quietly replace nb2p1le100 = nb2p1le100 +  pfitnb2p1`i' if `i'<=100
     quietly replace nb2p1le300 = nb2p1le300 +  pfitnb2p1`i' if `i'<=300
     quietly replace nb2p1le500 = nb2p1le500 +  pfitnb2p1`i' if `i'<=500
     }

* The following gives NB2P1 parametric results corresponding to Table 11.2 
summarize nb2p1le*

****** FINITE MIXTURE MODEL NB2-2 component - Estimates 

* Following does not converge
fmm PAT $XLIST, mixtureof(negbin2) components(2) vce(robust)
estimates store FM2NB2

****** POISSON MODEL WITH POLYNOMIALS HAS LOW LOG-LIKELIHOOD .... 

*** POISSON MODEL WITH FIRST ORDER POLYNOMIAL 

program lfpp1
  version 11
  args lnf theta1 a1    // theta1=x'b, alpha=alpha, a=a
  tempvar mu m1 m2 
  local y $ML_y1                  // Define y so program more readable
  generate double `mu'  = exp(`theta1')
  generate double `m1' = `mu'
  generate double `m2' = `mu'+`mu'^2
  quietly replace `lnf' = -`mu' + `y'*`theta1' - lnfactorial(`y') ///
      + ln((1+`a1'*`y')^2) - ln(1 + 2*`a1'*`m1' + (`a1'^2)*`m2' ) 
end
ml model lf lfpp1 (PAT = $XLIST) (), vce(robust)
ml search
* ml init .5 .5 -.04 .5 .03 -.06 .03 -2.3 5.3 .8, copy
ml maximize, iter(20) 
estat ic
estimates store PP1

*** POISSON MODEL WITH SECOND ORDER POLYNOMIAL 

program lfpp2
  version 11
  args lnf theta1 a1 a2    // theta1=x'b, alpha=alpha, a=a
  tempvar mu m1 m2 m3 m4
  local y $ML_y1                  // Define y so program more readable
  generate double `mu'  = exp(`theta1')
  generate double `m1' = `mu'
  generate double `m2' = `mu'+`mu'^2
  generate double `m3' = `mu'+3*`mu'^2+`mu'^3
  generate double `m4' = `mu'+7*`mu'^2+6*`mu'^3+`mu'^4
  quietly replace `lnf' = -`mu' + `y'*`theta1' - lnfactorial(`y') ///
      + ln((1+`a1'*`y'+`a2'*`y'^2)^2)                  ///
      - ln(1 + 2*`a1'*`m1' + (`a1'^2+2*`a2')*`m2' + 2*`a1'*`a2'*`m3' + `a2'^2*`m4') 
end
ml model lf lfpp2 (PAT = $XLIST) () (), vce(robust)
ml search
* ml init .5 .5 -.04 .5 .03 -.06 .03 -2.3 5.3 .8, copy
ml maximize, iter(20) 
estat ic
estimates store PP2

*** POISSON MODEL WITH THIRD ORDER POLYNOMIAL 

program lfpp3
  version 11
  args lnf theta1 a1 a2 a3    // theta1=x'b, alpha=alpha, a=a
  tempvar mu m1 m2 m3 m4 m5 m6
  local y $ML_y1                  // Define y so program more readable
  generate double `mu'  = exp(`theta1')
  generate double `m1' = `mu'
  generate double `m2' = `mu'+`mu'^2
  generate double `m3' = `mu'+3*`mu'^2+`mu'^3
  generate double `m4' = `mu'+7*`mu'^2+6*`mu'^3+`mu'^4
  generate double `m5' = `mu'+15*`mu'^2+25*`mu'^3+10*`mu'^4+`mu'^5
  generate double `m6' = `mu'+31*`mu'^2+90*`mu'^3+65*`mu'^4+15+`mu'^5+`mu'^6
  quietly replace `lnf' = -`mu' + `y'*`theta1' - lnfactorial(`y') ///
      + ln((1+`a1'*`y'+`a2'*`y'^2+`a3'*`y'^3)^2)                  ///
      - ln(1 + 2*`a1'*`m1' + (`a1'^2+2*`a2')*`m2' + (2*`a1'*`a2'+2*`a3')*`m3' /// 
            + (2*`a1'*`a3'+`a2'^2)*`m4' + 2*`a2'*`a3'*`m5' + `a3'^2*`m6')
end
ml model lf lfpp3 (PAT = $XLIST) () () (), vce(robust)
ml search
* ml init .72 .36 -.01 1.1 .02 -.06 .04 -.66 .94 -.52, copy
ml maximize, iter(20) 
estat ic
estimates store PP3

****** RESULTS

*** TABLE 11.1: PARAMETRIC MODEL ESTIMATES

estimates table POISSON NB2 FM2NB2 NB2P1 PP1,  ///
   b(%7.4f) se(%7.3f) stats(N ll) stfmt(%9.1f)

********** CLOSE OUTPUT

* log close
* exit
* clear

