##################################### #Lab 3 Instrumental variables Stat 209 2/18/08 Rogosa R-Session ##################################### In this my session output I did not duplicate all the explanation in the lab text, but did all the basic analyses and added some notes. So these two versions should be looked at side by side R version 2.5.1 (2007-06-27) Copyright (C) 2007 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. > # Lab 3 > mroz87 = read.table( "http://www-stat.stanford.edu/~rag/stat209/Mroz87.dat", header = T) > names(mroz87) [1] "lfp" "hours" "kids5" "kids618" "age" "educ" [7] "wage" "repwage" "hushrs" "husage" "huseduc" "huswage" [13] "faminc" "mtr" "motheduc" "fatheduc" "unem" "city" [19] "exper" "nwifeinc" "wifecoll" "huscoll" > # Variable definition in lab script (insert here) # lfp Dummy variable for labor-force participation. # hours Wife's hours of work in 1975. # kids5 Number of children 5 years old or younger. # kids618 Number of children 6 to 18 years old. # age Wife's age. # educ Wife's educational attainment, in years. # wage Wife's average hourly earnings, in 1975 dollars. # repwage Wife's wage reported at the time of the 1976 interview. # hushrs Husband's hours worked in 1975. # husage Husband's age. # huseduc Husband's educational attainment, in years. # huswage Husband's wage, in 1975 dollars. # faminc Family income, in 1975 dollars. # mtr Marginal tax rate facing the wife. # motheduc Wife's mother's educational attainment, in years. # fatheduc Wife's father's educational attainment, in years. # unem Unemployment rate in county of residence, in percentage points. # city Dummy variable = 1 if live in large city, else 0. # exper Actual years of wife's previous labor market experience. # nwifeinc Non-wife income. # wifecoll Dummy variable for wife's college attendance. # huscoll Dummy variable for husband's college attendance. > library(sem) > # I already had sem installed; if not install.packages("sem") > help(tsls) > # tsls is our main tool for IV > # outcome variable is log-wage for the 428 working women > attach(mroz87) > table(lfp) lfp 0 1 325 428 > table(wage > 0) FALSE TRUE 325 428 > detach(mroz87) > poswage = subset(mroz87, wage > 0) # my new data subset > poswage$logWage = log( poswage$wage ) # adding logWage to the data set for session > attach(poswage) > names(poswage) [1] "lfp" "hours" "kids5" "kids618" "age" "educ" [7] "wage" "repwage" "hushrs" "husage" "huseduc" "huswage" [13] "faminc" "mtr" "motheduc" "fatheduc" "unem" "city" [19] "exper" "nwifeinc" "wifecoll" "huscoll" "logWage" > length(logWage) [1] 428 > table(lfp) #all the women in this subset are in the workforce lfp 1 428 > # so the poswage data subset is the 428 working women, and appended > # to that subset is logWage # see what you have by take log(wage) > logWage[1:10] [1] 1.21015366 0.32851207 1.51413773 0.09212329 1.52427210 1.55648008 [7] 2.12025954 2.05963416 0.75433635 1.54489939 > wage[1:10] [1] 3.3540 1.3889 4.5455 1.0965 4.5918 4.7421 8.3333 7.8431 2.1262 4.6875 > log(3.354) [1] 1.210154 > min(logWage) [1] -2.054164 > min(wage) #not paid so well [1] 0.1282 > quantile(wage) 0% 25% 50% 75% 100% 0.12820 2.26260 3.48190 4.97075 25.00000 > table(wage < 1) FALSE TRUE 411 17 > # so we have a few very poorly paid women in this dataset it seems < 1$/hr > # onto fitting regression (predictor educ) > lm.posWage = lm(logWage ~ educ) > summary(lm.posWage) Call: lm(formula = logWage ~ educ) Residuals: Min 1Q Median 3Q Max -3.10256 -0.31473 0.06434 0.40081 2.10029 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.1852 0.1852 -1.000 0.318 educ 0.1086 0.0144 7.545 2.76e-13 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.68 on 426 degrees of freedom Multiple R-Squared: 0.1179, Adjusted R-squared: 0.1158 F-statistic: 56.93 on 1 and 426 DF, p-value: 2.761e-13 > # we get a highly significant slope (but not big Rsq), > # year increase in educ, fit increases 11 cents hourly wage > # now the IV fit using fatheduc as instrument (omitted vars concern) > cor.test(educ, fatheduc) Pearson's product-moment correlation data: educ and fatheduc t = 9.4255, df = 426, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.3337579 0.4908623 sample estimates: cor 0.4154030 > # significant, fairly strong (moderate at least) correlation educ and instrument > instr.posWage = tsls(logWage ~ educ, ~ fatheduc) > summary(instr.posWage) 2SLS Estimates Model Formula: logWage ~ educ Instruments: ~fatheduc Residuals: Min. 1st Qu. Median Mean 3rd Qu. Max. -3.09e+00 -3.39e-01 5.25e-02 -1.99e-14 4.04e-01 2.07e+00 Estimate Std. Error t value Pr(>|t|) (Intercept) 0.44110 0.44610 0.9888 0.32332 educ 0.05917 0.03514 1.6839 0.09294 Residual standard error: 0.6894 on 426 degrees of freedom > # educ slope reduced by half, se increased, no longer signif > # you can show by math and I meant to in class when I showed the > # textbook results for these data, that if the omitted variable(s)are > # positively related to predictor, the regression will overestimate the slope > > cov(logWage, fatheduc)/cov(educ, fatheduc) [1] 0.05917348 > # IV estimate of slope > .0144/.4154 [1] 0.03466538 > # inflation of OLS standard error for the IV regression > # do two stage least squares as 2 stages > lm.1 = lm(educ ~ fatheduc) > lm.2 = lm(logWage ~ fitted(lm.1)) > summary(lm.2) Call: lm(formula = logWage ~ fitted(lm.1)) Residuals: Min 1Q Median 3Q Max -3.21264 -0.37631 0.05632 0.41728 2.06040 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.44110 0.46711 0.944 0.346 fitted(lm.1) 0.05917 0.03680 1.608 0.109 Residual standard error: 0.7219 on 426 degrees of freedom Multiple R-Squared: 0.006034, Adjusted R-squared: 0.003701 F-statistic: 2.586 on 1 and 426 DF, p-value: 0.1086 > # matches the output from tsls pretty closely > > > # move on to Task 2, adding exper to the prediction of logWage (use exper and exper^2) > lm.posWage2 = lm(logWage ~ educ + exper + I(exper^2)) # I made more typing for myself by not creating a separate variable for exper^2 > summary(lm.posWage2) Call: lm(formula = logWage ~ educ + exper + I(exper^2)) Residuals: Min 1Q Median 3Q Max -3.08404 -0.30627 0.04952 0.37498 2.37115 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.5220406 0.1986321 -2.628 0.00890 ** educ 0.1074896 0.0141465 7.598 1.94e-13 *** exper 0.0415665 0.0131752 3.155 0.00172 ** I(exper^2) -0.0008112 0.0003932 -2.063 0.03974 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.6664 on 424 degrees of freedom Multiple R-Squared: 0.1568, Adjusted R-squared: 0.1509 F-statistic: 26.29 on 3 and 424 DF, p-value: 1.302e-15 > # use motheduc and fatheduc as exogenous instruments for this regression > instr.posWage2 = tsls(logWage ~ educ + exper + I(exper^2), ~fatheduc + motheduc + exper + I(exper^2)) > summary(instr.posWage2) 2SLS Estimates Model Formula: logWage ~ educ + exper + I(exper^2) Instruments: ~fatheduc + motheduc + exper + I(exper^2) Residuals: Min. 1st Qu. Median Mean 3rd Qu. Max. -3.10e+00 -3.20e-01 5.51e-02 1.87e-15 3.69e-01 2.35e+00 Estimate Std. Error t value Pr(>|t|) (Intercept) 0.048100 0.4003281 0.1202 0.904419 educ 0.061397 0.0314367 1.9530 0.051474 exper 0.044170 0.0134325 3.2883 0.001092 I(exper^2) -0.000899 0.0004017 -2.2380 0.025740 Residual standard error: 0.6747 on 424 degrees of freedom > # once again using IV for omitted vars cuts down the return to educ estimate, also more than doubles se of coeff > # you can check these results against textbook (Wooldridge) versions using stata > # go to http://fmwww.bc.edu/gstat/examples/wooldridge/wooldridge15.html Ex 15.5 (also 15.1) > > > # onto Task 3 simultaneous eqs: more complex modelling, we have a supply and demand eq > # supply, outcome hours with logWage educ age kidslt6 nwifeinc as predictors > # demand, logwage outcome, with hours educ and exper (Task 2) as predictors > # these are rgarded as inked eqs as outcome from supply is predictor in demand > # so here we have linked eqs predictor in one is the outcome in the other, hours wages > # IV TSLS allows estimation of both, you can mess around with reduced form etc, # but here we go to the direct IV estimation, using all exogenous vars as instruments # Results match Wooldridge text, see also # http://fmwww.bc.edu/gstat/examples/wooldridge/wooldridge16.html, Ex 16.5 > sem.hours = tsls( hours ~ logWage + educ + age + kids5 + nwifeinc, instruments =~ educ + age + kids5 + nwifeinc + exper + I(exper^2) ) > summary(sem.hours) 2SLS Estimates Model Formula: hours ~ logWage + educ + age + kids5 + nwifeinc Instruments: ~educ + age + kids5 + nwifeinc + exper + I(exper^2) Residuals: Min. 1st Qu. Median Mean 3rd Qu. Max. -4.57e+03 -6.54e+02 -3.69e+01 -4.58e-12 5.70e+02 8.37e+03 Estimate Std. Error t value Pr(>|t|) (Intercept) 2225.662 574.564 3.8737 0.0001242 logWage 1639.556 470.576 3.4841 0.0005454 educ -183.751 59.100 -3.1092 0.0020032 age -7.806 9.378 -0.8324 0.4056640 kids5 -198.154 182.929 -1.0832 0.2793249 nwifeinc -10.170 6.615 -1.5374 0.1249417 Residual standard error: 1354.2045 on 422 degrees of freedom > sem.logWage = tsls( logWage ~ hours + educ + exper + I(exper^2), instruments =~ educ + exper + I(exper^2) + age + kids5 + nwifeinc ) > summary(sem.logWage) 2SLS Estimates Model Formula: logWage ~ hours + educ + exper + I(exper^2) Instruments: ~educ + exper + I(exper^2) + age + kids5 + nwifeinc Residuals: Min. 1st Qu. Median Mean 3rd Qu. Max. -3.50e+00 -2.93e-01 3.21e-02 1.48e-14 3.65e-01 2.46e+00 Estimate Std. Error t value Pr(>|t|) (Intercept) -0.6557254 0.3377883 -1.9412 5.289e-02 hours 0.0001259 0.0002546 0.4945 6.212e-01 educ 0.1103300 0.0155244 7.1069 5.077e-12 exper 0.0345824 0.0194916 1.7742 7.675e-02 I(exper^2) -0.0007058 0.0004541 -1.5543 1.209e-01 Residual standard error: 0.6794 on 423 degrees of freedom > # now isn't it interesting that with the more complex model, coeff for educ is back up to > # value and signif given the the original, naive single predictor eq estimated by OLS > # being an economist isn't as easy as it looks? >