Lab 4 Stat209 2/26/09 Rogosa Lab 4 R-session # I Start from a relatively clean install, get MatchIt R version 2.8.1 (2008-12-22) Copyright (C) 2008 The R Foundation for Statistical Computing ISBN 3-900051-07-0 > install.packages("MatchIt") --- Please select a CRAN mirror for use in this session --- trying URL 'http://cran.cnr.Berkeley.edu/bin/windows/contrib/2.8/MatchIt_2.4-10.zip' Content type 'application/zip' length 483447 bytes (472 Kb) opened URL downloaded 472 Kb package 'MatchIt' successfully unpacked and MD5 sums checked The downloaded packages are in C:\Documents and Settings\Administrator\Local Settings\Temp\Rtmp3fS3ST\downloaded_packages updating HTML package descriptions > library(MatchIt) Loading required package: MASS ## ## MatchIt (Version 2.4-10, built: 2009-02-02) ## Please refer to http://gking.harvard.edu/matchit for full documentation ## or help.matchit() for help with commands supported by MatchIt. ## > data(lalonde) # get lalonde data from MatchIt, there are different versions under this name # data description http://gking.harvard.edu/matchit/docs/The_Lalonde_Data.html # or simply help(lalonde) #produces ----------------------------------- lalonde package:MatchIt R Documentation Data from National Supported Work Demonstration and PSID, as analyzed by Dehejia and Wahba (1999). Description: This is a subsample of the data from the treated group in the National Supported Work Demonstration (NSW) and the comparison sample from the Current Population Survey (CPS). This data was previously analyzed extensively by Lalonde (1986) and Dehejia and Wahba (1999). The full dataset is available at . Usage: data(lalonde) Format: A data frame with 313 observations (185 treated, 429 control). There are 10 variables measured for each individual. "treat" is the treatment assignment (1=treated, 0=control). "age" is age in years. "educ" is education in number of years of schooling. "black" is an indicator for African-American (1=African-American, 0=not). "hispan" is an indicator for being of Hispanic origin (1=Hispanic, 0=not). "married" is an indicator for married (1=married, 0=not married). "nodegree" is an indicator for whether the individual has a high school degree (1=no degree, 0=degree). "re74" is income in 1974, in U.S. dollars. "re75" is income in 1975, in U.S. dollars. "re78" is income in 1978, in U.S. dollars. Source: References: Lalonde, R. (1986). Evaluating the econometric evaluations of training programs with experimental data. American Economic Review 76: 604-620. Dehejia, R.H. and Wahba, S. (1999). Causal Effects in Nonexperimental Studies: Re-Evaluating the Evaluation of Training Programs. Journal of the American Statistical Association 94: 1053-1062. ----------------------------------------------- > dim(lalonde) # we have 614 cases, 10 vars; see lab script part 2 [1] 614 10 > names(lalonde) [1] "treat" "age" "educ" "black" "hispan" "married" [7] "nodegree" "re74" "re75" "re78" > attach(lalonde) > table(treat) # so these summaries synch with data description treat 0 1 429 185 > lalonde[1:10,] # here's the first 10 observation, all from the treatement treat age educ black hispan married nodegree re74 re75 re78 NSW1 1 37 11 1 0 1 1 0 0 9930.0 NSW2 1 22 9 0 1 0 1 0 0 3595.9 NSW3 1 30 12 1 0 0 0 0 0 24909.5 NSW4 1 27 11 1 0 0 1 0 0 7506.1 NSW5 1 33 8 1 0 0 1 0 0 289.8 NSW6 1 22 9 1 0 0 1 0 0 4056.5 NSW7 1 23 12 1 0 0 0 0 0 0.0 NSW8 1 32 11 1 0 0 1 0 0 8472.2 NSW9 1 22 16 1 0 0 0 0 0 2164.0 NSW10 1 33 12 0 0 1 0 0 0 12418.1 > # brute-force matching-- Lab4 script item 3.1 > numObs = dim(lalonde)[1] > numObs [1] 614 > # that's the 429 + 185, not going to distinguish between T/C here > indiv1 = lalonde[1,] > indiv1 # the target for this matching exercise treat age educ black hispan married nodegree re74 re75 re78 NSW1 1 37 11 1 0 1 1 0 0 9930 > # aside indiv1 is in the treatment group, the first 185 cases are all in treatment > # for example, examine lalonde[185,] lalonde[186,] > matches = matrix( nrow = numObs, ncol = 1, data = FALSE ) > # set up a dummy matrix and then loop through the data with specified calipers > for ( i in 1:numObs) { + matches[i,1] <- all( + c(lalonde[i,4:7] == indiv1[4:7], + (lalonde[i,3] < indiv1[3] + 3) && (lalonde[i,3] > indiv1[3] - 3), + (lalonde[i,2] < indiv1[2] + 3) && (lalonde[i,2] > indiv1[2] - 3) + ) ) + } > sum(as.numeric(matches) ) [1] 4 # this picks up a match to itself (the target indiv1) > # we only got 3 matches to others cases 183, 191 212, the latter 2 being control group members > matches # you can print out all 614 [,1] [1,] TRUE [2,] FALSE [182,] FALSE [183,] TRUE [190,] FALSE [191,] TRUE [211,] FALSE [212,] TRUE [613,] FALSE [614,] FALSE > lalonde[191,] treat age educ black hispan married nodegree re74 re75 re78 PSID6 0 37 9 1 0 1 1 13685 12756 17833 > lalonde[212,] treat age educ black hispan married nodegree re74 re75 re78 PSID27 0 36 9 1 0 1 1 13256 8457 0 > lalonde[183,] treat age educ black hispan married nodegree re74 re75 re78 NSW183 1 35 9 1 0 1 1 13602 13831 12804 > indiv1 treat age educ black hispan married nodegree re74 re75 re78 NSW1 1 37 11 1 0 1 1 0 0 9930 > # if we took re74 re75 into account we wouldn't have any control matches for indiv1 > # optmatch (Ben Hansen) might? do a better job > > # Lab script 3.2 # now do the logistic regression that computes propensity scores (matching packages will do this for you) > glm.lalonde = glm( treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, + data = lalonde, family = binomial) > summary(glm.lalonde) Call: glm(formula = treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = binomial, data = lalonde) Deviance Residuals: Min 1Q Median 3Q Max -1.764 -0.474 -0.286 0.751 2.717 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.73e+00 1.02e+00 -4.65 3.3e-06 *** age 1.58e-02 1.36e-02 1.16 0.2452 educ 1.61e-01 6.51e-02 2.48 0.0133 * black 3.07e+00 2.87e-01 10.70 < 2e-16 *** hispan 9.84e-01 4.26e-01 2.31 0.0208 * married -8.32e-01 2.90e-01 -2.87 0.0042 ** nodegree 7.07e-01 3.38e-01 2.09 0.0362 * re74 -7.18e-05 2.87e-05 -2.50 0.0125 * re75 5.34e-05 4.63e-05 1.15 0.2488 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 751.49 on 613 degrees of freedom Residual deviance: 487.84 on 605 degrees of freedom AIC: 505.8 Number of Fisher Scoring iterations: 5 > propen = fitted(glm.lalonde) # now we have the propensity scores, Lab script calls these propScores > tapply(propen, treat, quantile) # look at overlap via 5-number summary (or side-by-side boxplots) not real good overlap, as noted in class handout $`0` 0% 25% 50% 75% 100% 0.00908 0.03888 0.07585 0.19514 0.78917 $`1` 0% 25% 50% 75% 100% 0.02495 0.52646 0.65368 0.72660 0.85315 > # we are fitting prob(treat = 1) fits for those in treatment group > # will be larger, we need some overlap for matching purposes > tapply(propen, treat, stem) The decimal point is 1 digit(s) to the left of the | 0 | 11111111111112222222222222222222222222222222222222222222233333333333+57 0 | 55555555555555555555555555555556666666666666666666666666777777777777+39 1 | 000000000000000011111111111111112222222222223334444 1 | 56677888888889 2 | 0000011122244 2 | 55667788 3 | 0 3 | 6778899 4 | 1233444 4 | 568999 5 | 1144 5 | 5679 6 | 11112234444 6 | 555556667778888899999999 7 | 111112222444 7 | 55555555689 The decimal point is 1 digit(s) to the left of the | 0 | 244 0 | 6788899 1 | 000122334444 1 | 55 2 | 012 2 | 579 3 | 3 | 8899 4 | 0134 4 | 5559 5 | 1112334 5 | 6666677778899999 6 | 00111122222334444 6 | 55555555555666666778888888899999 7 | 000000001111222222222223333344444 7 | 555555566666777777777778888888999 8 | 1122 8 | 5 $`0` NULL $`1` NULL > quantile(propen) # overall distrib 0% 25% 50% 75% 100% 0.00908 0.04854 0.12068 0.63872 0.85315 # or you can have MatchIt compute propensity scores for you and do a simple match ## http://gking.harvard.edu/matchit/docs/Propensity_Score_Match.html > # the common use of the propensity scores (backed by theory, class handout 2/26)) > # is to stratify by quintiles > # the simple-minded way I do it is to use "cut", Lab script is fancier programming > ?cut # this is a simple function to create bins > k = 1:4 > quantile(propen, k/5) 20% 40% 60% 80% 0.04015 0.08721 0.26978 0.67085 > propbin = cut(propen, c(0, .04015,.08721,.26978,.67085,1)) > propen[1:10] # show process of bins for first 10 cases NSW1 NSW2 NSW3 NSW4 NSW5 NSW6 NSW7 NSW8 NSW9 NSW10 0.63877 0.22463 0.67824 0.77632 0.70164 0.69907 0.65368 0.78972 0.77984 0.04292 > propbin[1:10] [1] (0.27,0.671] (0.0872,0.27] (0.671,1] (0.671,1] [5] (0.671,1] (0.671,1] (0.27,0.671] (0.671,1] [9] (0.671,1] (0.0401,0.0872] Levels: (0,0.0401] (0.0401,0.0872] (0.0872,0.27] (0.27,0.671] (0.671,1] > # levels are the 5 quintiles > table(propbin, treat) # either way you display it, we do not have good overlap in the bottom two quintiles, lower estimated probability for being in treatment for treatment cases treat propbin 0 1 (0,0.0401] 122 1 (0.0401,0.0872] 116 7 (0.0872,0.27] 101 21 (0.27,0.671] 53 71 (0.671,1] 37 85 > tapply(propbin, treat,table) $`0` (0,0.0401] (0.0401,0.0872] (0.0872,0.27] (0.27,0.671] (0.671,1] 122 116 101 53 37 $`1` (0,0.0401] (0.0401,0.0872] (0.0872,0.27] (0.27,0.671] (0.671,1] 1 7 21 71 85 > table(treat) treat 0 1 429 185 > #we have overlap only in the upper 3 quintiles of the propensity score > # lab text does a more sophisticated quintile function > # IMPLEMENT LAB TEXT 3.3 > quintilePropScores = quantile( propen, probs = seq( from = 0, to = 1, by = .2) ) > quintilePropScores 0% 20% 40% 60% 80% 100% 0.00908 0.04015 0.08721 0.26978 0.67085 0.85315 # has bin breaks to greater signif digits, that makes a small difference it turns out > lalonde$PropScores = propen # make var names consistent > table(propbin, treat) treat propbin 0 1 (0,0.0401] 122 1 (0.0401,0.0872] 116 7 (0.0872,0.27] 101 21 (0.27,0.671] 53 71 (0.671,1] 37 85 > tapply(re78, list(propbin, treat),mean) # here are the mean diffs in re78 the outcome stratified by propensity quintile # direction of mean diffs favors treatment, job training 0 1 (0,0.0401] 10467 0 (0.0401,0.0872] 5797 7919 (0.0872,0.27] 6043 9211 (0.27,0.671] 4977 5819 (0.671,1] 4666 6030 > tapply(re78, list(propbin, treat),t.test) # can't do a full set of t-tests Error in t.test.default(X[[6L]], ...) : not enough 'x' observations > # so for the 3 upper quintiles there is the mean difference significant? > propbin[1] [1] (0.27,0.671] Levels: (0,0.0401] (0.0401,0.0872] (0.0872,0.27] (0.27,0.671] (0.671,1] > propen[1] NSW1 0.6388 > bins = c("(0,0.0401]", "(0.0401,0.0872]", "(0.0872,0.27]", "(0.27,0.671]", "(0.671,1]") > # i.e. make a string out of the bin designations, then pull-off entries > t.test(re78[propbin == bins[5]] ~ treat[propbin == bins[5]]) # t-test for quintile 5 Welch Two Sample t-test data: re78[propbin == bins[5]] by treat[propbin == bins[5]] t = -0.9844, df = 100.7, p-value = 0.3273 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -4113 1385 sample estimates: mean in group 0 mean in group 1 4666 6030 # since we are doing 3 of these best practice would be to Bonferroni and do each at Type 1 .017 instead of .05. Also Karen did these one-sided which you might justify won't make a difference here; see Lab text version below > t.test(re78[propbin == bins[4]] ~ treat[propbin == bins[4]]) Welch Two Sample t-test data: re78[propbin == bins[4]] by treat[propbin == bins[4]] t = -0.7516, df = 108.7, p-value = 0.4539 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -3061 1378 sample estimates: mean in group 0 mean in group 1 4977 5819 > t.test(re78[propbin == bins[3]] ~ treat[propbin == bins[3]]) Welch Two Sample t-test data: re78[propbin == bins[3]] by treat[propbin == bins[3]] t = -1.536, df = 23.57, p-value = 0.1379 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -7428 1093 sample estimates: mean in group 0 mean in group 1 6043 9211 > # even one-sided versions of these tests are not going to be signif; should also Bonferroni the Type I error > t.test(re78[propbin == bins[2]] ~ treat[propbin == bins[2]]) # can also try bin 2 but really don't have enough data Welch Two Sample t-test data: re78[propbin == bins[2]] by treat[propbin == bins[2]] t = -1.083, df = 7.38, p-value = 0.3128 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -6708 2462 sample estimates: mean in group 0 mean in group 1 5797 7919 > # lab text does a more sophisticated quintile function > # IMPLEMENT LAB TEXT 3.3 > quintilePropScores = quantile( propen, probs = seq( from = 0, to = 1, by = .2) ) > quintilePropScores[1:10] 0% 20% 40% 60% 80% 100% 0.00908 0.04015 0.08721 0.26978 0.67085 0.85315 NA NA NA NA > quintilePropScores 0% 20% 40% 60% 80% 100% 0.00908 0.04015 0.08721 0.26978 0.67085 0.85315 > lalonde$PropScores = propen > lalonde$PropScores[1:10] [1] 0.63877 0.22463 0.67824 0.77632 0.70164 0.69907 0.65368 0.78972 0.77984 0.04292 > # here's a pretty fancy do-loop version from Lab script > treated = list(5) > controls = list(5) > for ( i in 1:5 ) { + treated[[i]] = subset( x = lalonde, + subset = PropScores >= quintilePropScores[i] & + PropScores < quintilePropScores[i+1] & + treat == 1) + controls[[i]] = subset( x = lalonde, + subset = PropScores >= quintilePropScores[i] & + PropScores < quintilePropScores[i+1] & + treat == 0) + } > for ( i in 1:5 ) { + t.test( treated[[i]]$re78, + controls[[i]]$re78, + alternative = "greater" ) + } Error in t.test.default(treated[[i]]$re78, controls[[i]]$re78, alternative = "greater") : not enough 'x' observations # so I change the loop in Lab text to exclude testing bin 1 > for ( i in 2:5 ) { + t.test( treated[[i]]$re78, + controls[[i]]$re78, + alternative = "greater" ) + } > ptests = for ( i in 2:5 ) { + t.test( treated[[i]]$re78, + controls[[i]]$re78, + alternative = "greater" ) + } > ptests Welch Two Sample t-test data: treated[[i]]$re78 and controls[[i]]$re78 t = 1.120, df = 108.3, p-value = 0.1326 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: -735.8 Inf sample estimates: mean of x mean of y 6027 4499 > t.test( treated[[5]]$re78, + controls[[5]]$re78, + alternative = "greater" ) Welch Two Sample t-test data: treated[[5]]$re78 and controls[[5]]$re78 t = 1.120, df = 108.3, p-value = 0.1326 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: -735.8 Inf sample estimates: mean of x mean of y 6027 4499 > t.test( treated[[4]]$re78, + controls[[4]]$re78, + alternative = "greater" ) Welch Two Sample t-test data: treated[[4]]$re78 and controls[[4]]$re78 t = 0.6149, df = 103.4, p-value = 0.27 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: -1193 Inf sample estimates: mean of x mean of y 5819 5117 > t.test( treated[[3]]$re78, + controls[[3]]$re78, + alternative = "greater" ) Welch Two Sample t-test data: treated[[3]]$re78 and controls[[3]]$re78 t = 1.536, df = 23.57, p-value = 0.06895 alternative hypothesis: true difference in means is greater than 0 95 percent confidence interval: -363.7 Inf sample estimates: mean of x mean of y 9211 6043 > length(treated[[3]]$re78) [1] 21 > mean(treated[[3]]$re78) [1] 9211 > length(controls[[3]]$re78) [1] 101 > mean(controls[[3]]$re78) [1] 6043 > # those match my bins > length(treated[[4]]$re78) [1] 71 > mean(treated[[4]]$re78) [1] 5819 > length(controls[[4]]$re78) [1] 51 > mean(controls[[4]]$re78) [1] 5117 > # our bins procedure are a little discrepant and that matters, 2 controls #*****end Lab script part 3 *********** > # Lab script PART 4, use MatchIt > #lab text left out "married" and "no degree" for some reason so I did also > # reason is that the Matchit docs omit also http://gking.harvard.edu/matchit/docs/Optimal_Matching.html # compare results for some of the matching options > m1.out = matchit(treat ~ re74 + re75 + educ + black + hispan + age, + data = lalonde, method = "nearest", ratio = 1) # the simplest old-fashioned matching function Nearest neighbor matching selects the r (default=1) best control matches for each individual in the treatment group (excluding those discarded using the discard option). Matching is done using a distance measure specified by the distance option (default=logit). Matches are chosen for each treated unit one at a time, with the order specified by the m.order command (default=largest to smallest). At each matching step we choose the control unit that is not yet matched but is closest to the treated unit on the distance measure. > m1.out Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "nearest", ratio = 1) Sample sizes: Control Treated All 429 185 Matched 185 185 Unmatched 244 0 Discarded 0 0 > summary(m1.out) Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "nearest", ratio = 1) Summary of balance for all data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.187 0.231 0.379 0.517 0.379 0.595 re74 2095.574 5619.237 6788.751 -3523.663 2425.572 3620.924 9216.500 re75 1532.055 2466.484 3291.996 -934.429 981.097 1060.658 6795.010 educ 10.346 10.235 2.855 0.111 1.000 0.703 4.000 black 0.843 0.203 0.403 0.640 1.000 0.643 1.000 hispan 0.059 0.142 0.350 -0.083 0.000 0.081 1.000 age 25.816 28.030 10.787 -2.214 1.000 3.265 10.000 Summary of balance for matched data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.365 0.260 0.201 0.106 0.201 0.49 re74 2095.574 2466.304 4245.694 -370.730 289.971 692.064 13121.75 re75 1532.055 1960.355 2948.255 -428.300 451.161 627.898 11365.71 educ 10.346 10.470 3.207 -0.124 1.000 0.924 4.00 black 0.843 0.470 0.500 0.373 0.000 0.373 1.00 hispan 0.059 0.276 0.448 -0.216 0.000 0.216 1.00 age 25.816 26.054 10.191 -0.238 2.000 2.735 8.00 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 46.94 79.44 46.90 17.62 re74 89.48 88.05 80.89 -42.37 re75 54.16 54.02 40.80 -67.27 educ -12.50 0.00 -31.54 0.00 black 41.76 100.00 42.02 0.00 hispan -161.35 0.00 -166.67 0.00 age 89.26 -100.00 16.23 20.00 Sample sizes: Control Treated All 429 185 Matched 185 185 Unmatched 244 0 Discarded 0 0 # so this is not so great, only use less than half of control group, reduce T/C diffs in most variables, slightly increase in educ, and increase hispan, i.e. matches in control group double the proportion of hispanics > m2.out = matchit(treat ~ re74 + re75 + educ + black + hispan + age, + data = lalonde, method = "nearest", ratio = 2) > m2.out # now try 2 control matches for each treatment still using nearest neighbor Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "nearest", ratio = 2) Sample sizes: Control Treated All 429 185 Matched 370 185 Unmatched 59 0 Discarded 0 0 > summary(m2.out) Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "nearest", ratio = 2) Summary of balance for all data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.187 0.231 0.379 0.517 0.379 0.595 re74 2095.574 5619.237 6788.751 -3523.663 2425.572 3620.924 9216.500 re75 1532.055 2466.484 3291.996 -934.429 981.097 1060.658 6795.010 educ 10.346 10.235 2.855 0.111 1.000 0.703 4.000 black 0.843 0.203 0.403 0.640 1.000 0.643 1.000 hispan 0.059 0.142 0.350 -0.083 0.000 0.081 1.000 age 25.816 28.030 10.787 -2.214 1.000 3.265 10.000 Summary of balance for matched data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.213 0.238 0.352 0.488 0.354 0.586 re74 2095.574 3689.453 4736.770 -1593.879 1030.574 1814.473 12434.050 re75 1532.055 2121.527 2915.592 -589.472 666.000 798.521 11365.710 educ 10.346 10.235 2.833 0.111 1.000 0.638 4.000 black 0.843 0.235 0.425 0.608 1.000 0.611 1.000 hispan 0.059 0.165 0.372 -0.105 0.000 0.103 1.000 age 25.816 26.281 9.961 -0.465 2.000 2.411 9.000 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 6.908 5.707 6.642 1.391 re74 54.766 57.512 49.889 -34.911 re75 36.916 32.117 24.715 -67.266 educ -0.268 0.000 9.231 0.000 black 5.049 0.000 5.042 0.000 hispan -27.406 0.000 -26.667 0.000 age 79.004 -100.000 26.159 10.000 Sample sizes: Control Treated All 429 185 Matched 370 185 Unmatched 59 0 Discarded 0 0 # In most resp ects the matching is worse here; mean diffs on many vars are more discrepant > m2opt.out = matchit(treat ~ re74 + re75 + educ + black + hispan + age, + data = lalonde, method = "optimal", ratio = 2) # Ben Hansen's method with 2:1 matching > m2opt.out #here, does the same as the nearest neighbor Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "optimal", ratio = 2) Sample sizes: Control Treated All 429 185 Matched 370 185 Unmatched 59 0 Discarded 0 0 > summary(m2opt.out) Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "optimal", ratio = 2) Summary of balance for all data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.187 0.231 0.379 0.517 0.379 0.595 re74 2095.574 5619.237 6788.751 -3523.663 2425.572 3620.924 9216.500 re75 1532.055 2466.484 3291.996 -934.429 981.097 1060.658 6795.010 educ 10.346 10.235 2.855 0.111 1.000 0.703 4.000 black 0.843 0.203 0.403 0.640 1.000 0.643 1.000 hispan 0.059 0.142 0.350 -0.083 0.000 0.081 1.000 age 25.816 28.030 10.787 -2.214 1.000 3.265 10.000 Summary of balance for matched data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.213 0.238 0.352 0.488 0.354 0.586 re74 2095.574 3689.453 4736.770 -1593.879 1030.574 1814.473 12434.050 re75 1532.055 2121.527 2915.592 -589.472 666.000 798.521 11365.710 educ 10.346 10.235 2.833 0.111 1.000 0.638 4.000 black 0.843 0.235 0.425 0.608 1.000 0.611 1.000 hispan 0.059 0.165 0.372 -0.105 0.000 0.103 1.000 age 25.816 26.281 9.961 -0.465 2.000 2.411 9.000 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 6.908 5.707 6.642 1.391 re74 54.766 57.512 49.889 -34.911 re75 36.916 32.117 24.715 -67.266 educ -0.268 0.000 9.231 0.000 black 5.049 0.000 5.042 0.000 hispan -27.406 0.000 -26.667 0.000 age 79.004 -100.000 26.159 10.000 Sample sizes: Control Treated All 429 185 Matched 370 185 Unmatched 59 0 Discarded 0 0 > m2full.out = matchit(treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "full") > summary(m2full.out) #let's try Ben's full matching Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age, data = lalonde, method = "full") Summary of balance for all data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.187 0.379 0.517 0.379 0.595 re74 2095.574 5619.237 -3523.663 2425.572 3620.924 9216.500 re75 1532.055 2466.484 -934.429 981.097 1060.658 6795.010 educ 10.346 10.235 0.111 1.000 0.703 4.000 black 0.843 0.203 0.640 1.000 0.643 1.000 hispan 0.059 0.142 -0.083 0.000 0.081 1.000 age 25.816 28.030 -2.214 1.000 3.265 10.000 Summary of balance for matched data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.566 0.566 0.000 0.001 0.005 0.144 re74 2095.574 2024.744 70.830 0.000 493.428 13121.750 re75 1532.055 1313.744 218.312 0.000 328.307 12746.050 educ 10.346 10.417 -0.071 0.000 0.348 3.000 black 0.843 0.838 0.005 0.000 0.010 1.000 hispan 0.059 0.069 -0.009 0.000 0.003 1.000 age 25.816 25.182 0.634 3.000 2.571 9.000 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 99.97 99.83 98.67 75.81 re74 97.99 100.00 86.37 -42.37 re75 76.64 100.00 69.05 -87.58 educ 35.31 100.00 50.53 25.00 black 99.16 100.00 98.44 0.00 hispan 88.79 0.00 96.05 0.00 age 71.35 -200.00 21.25 10.00 Sample sizes: Control Treated All 429 185 Matched 429 185 Unmatched 0 0 Discarded 0 0 # seems to have done much better on all mean comparisons # Lots of fun plots to do, try type=jitter to get propensity scores, and you can get ident for indiv cases by clicking > plot(m2full.out) Waiting to confirm page change... Waiting to confirm page change... > help.matchit("plot") > plot(m2full.out, type = "jitter") Waiting to confirm page change... [1] "To identify the units, use first mouse button; to stop, use second." warning: no point with 0.25 inches warning: no point with 0.25 inches > m2full.dat = match.data(m2full.out) # let's get the matched data set from full matching > dim(m2full.dat) [1] 614 13 > names(m2full.dat) [1] "treat" "age" "educ" "black" "hispan" "married" "nodegree" [8] "re74" "re75" "re78" "distance" "weights" "subclass" > m2full.dat[1:20,] # just a look at the data set treat age educ black hispan married nodegree re74 re75 re78 distance weights subclass NSW1 1 37 11 1 0 1 1 0 0 9930.0 0.69208 1 1 NSW2 1 22 9 0 1 0 1 0 0 3595.9 0.18249 1 108 NSW3 1 30 12 1 0 0 0 0 0 24909.5 0.70891 1 107 NSW4 1 27 11 1 0 0 1 0 0 7506.1 0.69294 1 91 NSW5 1 33 8 1 0 0 1 0 0 289.8 0.64088 1 95 NSW6 1 22 9 1 0 0 1 0 0 4056.5 0.65949 1 20 NSW7 1 23 12 1 0 0 0 0 0 0.0 0.70950 1 8 NSW8 1 32 11 1 0 0 1 0 0 8472.2 0.69251 1 10 NSW9 1 22 16 1 0 0 0 0 0 2164.0 0.76907 1 32 NSW10 1 33 12 0 0 1 0 0 0 12418.1 0.08978 1 37 NSW11 1 19 9 1 0 0 1 0 0 8173.9 0.65976 1 16 NSW12 1 21 13 1 0 0 0 0 0 17094.6 0.72535 1 41 NSW13 1 18 8 1 0 0 1 0 0 0.0 0.64226 1 50 NSW14 1 27 10 1 0 1 1 0 0 18739.9 0.67622 1 39 NSW15 1 17 7 1 0 0 1 0 0 3023.9 0.62438 1 71 NSW16 1 19 10 1 0 0 1 0 0 3228.5 0.67692 1 79 NSW17 1 27 13 1 0 0 0 0 0 14581.9 0.72487 1 90 NSW18 1 23 10 1 0 0 1 0 0 7693.4 0.67657 1 102 NSW19 1 40 12 1 0 0 0 0 0 10804.3 0.70808 1 107 NSW20 1 26 12 1 0 0 0 0 0 10747.4 0.70925 1 109 > # so you can see match.data appends 3 colums "distance" "weights" "subclass" > # to the original data set Instead of doing the Zelig in Lab Section 5, which has alot of overhead we take a closer look at the matches and correspondence to propensity matching; e.g. do the subclasses match well on propensity > table(m2full.dat$subclass) #the 109 subclasses have various sizes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 2 2 17 4 10 2 2 12 3 8 20 3 3 4 5 2 2 3 3 3 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 3 4 37 2 2 2 32 2 2 3 3 2 3 2 9 2 2 2 4 2 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 2 6 3 4 3 125 2 6 13 2 2 2 2 2 2 2 2 10 2 2 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 2 4 2 2 3 9 2 2 2 2 2 2 2 4 2 2 2 2 2 2 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 2 2 3 2 2 2 5 11 2 4 18 6 2 7 6 8 2 2 4 18 101 102 103 104 105 106 107 108 109 2 9 2 2 2 5 3 3 2 > # so in full matching multiple treatment cases can be grouped in a subclass > table(treat,m2full.dat$subclass) treat 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 0 1 1 16 3 9 1 1 1 1 1 19 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 11 2 7 1 2 1 3 4 1 1 2 2 treat 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 0 1 2 3 36 1 1 1 31 1 1 2 2 1 1 1 8 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 treat 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 0 1 1 1 1 1 1 1 124 1 5 12 1 1 1 1 1 1 1 1 1 3 1 1 5 2 3 2 1 1 1 1 1 1 1 1 1 1 1 1 treat 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 0 9 1 1 1 1 1 1 1 8 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 treat 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 0 1 1 1 1 1 1 1 1 1 1 4 10 1 1 1 5 1 6 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 3 17 1 1 1 5 treat 96 97 98 99 100 101 102 103 104 105 106 107 108 109 0 7 1 1 1 17 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 1 8 1 1 1 4 2 1 1 > sum(m2full.dat$subclass == 8) # 12 total cases in subclass 8 [1] 12 > m2full.dat$subclass == 8 # crude way to ident the cases in subclass 8 (propen was appended from below) > m2full.dat[subclass ==8,] # here's a way to get the listing treat age educ black hispan married nodegree re74 re75 re78 distance weights subclass propen NSW7 1 23 12 1 0 0 0 0 0.00 0 0.7095 1.00 8 0.7095 NSW34 1 20 12 1 0 0 0 0 0.00 0 0.7097 1.00 8 0.7097 NSW38 1 23 12 1 0 1 0 0 0.00 5912 0.7095 1.00 8 0.7095 NSW41 1 21 12 1 0 0 0 0 0.00 1255 0.7097 1.00 8 0.7097 NSW50 1 23 12 1 0 0 0 0 0.00 4843 0.7095 1.00 8 0.7095 NSW54 1 25 12 1 0 0 0 0 0.00 2349 0.7093 1.00 8 0.7093 NSW74 1 25 12 1 0 0 0 0 0.00 11966 0.7093 1.00 8 0.7093 NSW77 1 22 12 1 0 0 0 0 0.00 18678 0.7096 1.00 8 0.7096 NSW81 1 25 12 1 0 0 0 0 0.00 0 0.7093 1.00 8 0.7093 NSW82 1 18 12 1 0 0 0 0 0.00 2321 0.7099 1.00 8 0.7099 NSW94 1 21 12 1 0 0 0 0 0.00 9984 0.7097 1.00 8 0.7097 PSID293 0 31 12 1 0 1 0 0 42.97 11024 0.7091 25.51 8 0.7091 # As we are somewhat awkwardly not using two of the vars in this part of the exercise # recompute propensity score for the subset of vars and append to the m2full.dat > glm.lalondesub = glm( treat ~ age + educ + black + hispan + re74 + re75, data = lalonde, family = binomial) > propensub = fitted(glm.lalondesub) > m2full.dat$propen = propensub > m2full.dat$propen[subclass == 8] # for the 12 cases in subclass 8 about equal propen scores matching was done on the vars not propen [1] 0.7095 0.7097 0.7095 0.7097 0.7095 0.7093 0.7093 0.7096 0.7093 0.7099 0.7097 0.7091 > table(m2full.dat$propen[subclass == 8], treat[subclass == 8]) 0 1 0.709099144142846 1 0 0.709329320325358 0 3 0.709495198369488 0 3 0.709578116424806 0 1 0.709661020498057 0 2 0.70974391058607 0 1 0.709909648793713 0 1 # subclass 46 has 124 controls and 1 treatment and a much wider range on propensity > table(treat[subclass == 46], signif(m2full.dat$propen[subclass == 46],4)) 0.008977 0.01128 0.01176 0.01315 0.01317 0.01386 0.01388 0.01389 0.0151 0.01524 0.01613 0.01641 0.01673 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.017 0.01731 0.01808 0.01844 0.0185 0.01853 0.01935 0.01974 0.01979 0.02016 0.02068 0.02085 0.02237 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.02255 0.0239 0.02398 0.02434 0.02435 0.02445 0.02467 0.0251 0.02569 0.0257 0.02605 0.02651 0.02679 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.02734 0.02785 0.02856 0.02876 0.02919 0.0294 0.03022 0.03032 0.03087 0.03095 0.03115 0.03144 0.03156 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.03195 0.03207 0.0321 0.0323 0.0324 0.03242 0.03301 0.03338 0.03516 0.03546 0.03566 0.03568 0.03579 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.03588 0.03675 0.0371 0.03825 0.03844 0.03856 0.03863 0.03873 0.03895 0.03936 0.04041 0.04086 0.04148 0 1 1 1 1 1 1 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.04187 0.04218 0.04295 0.04306 0.04342 0.04352 0.04364 0.0438 0.04471 0.0449 0.04498 0.04499 0.04508 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.04517 0.04593 0.04598 0.04622 0.04638 0.04709 0.04719 0.04735 0.04762 0.04778 0.04781 0.04784 0.04785 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0.04805 0.0482 0.04827 0.04842 0.04916 0.04918 0.04972 0.04994 0.04997 0.05013 0.05085 0.05209 0.05223 0 1 1 1 1 1 1 1 1 1 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.05246 0.05314 0.05374 0.05401 0.05428 0.05458 0 1 1 1 1 1 1 1 0 0 0 0 0 0 > tapply(m2full.dat$propen[subclass == 46], treat[subclass == 46], mean) #compare means 0 1 0.03447 0.04785 look at a couple more subclasses or can loop through them > tapply(m2full.dat$propen[subclass == 109], treat[subclass == 109], mean) 0 1 0.7097 0.7092 > tapply(m2full.dat$propen[subclass == 108], treat[subclass == 108], mean) 0 1 0.1832 0.1825 > tapply(m2full.dat$propen, list(treat,subclass ), mean) # looks like a pretty good balance on propen by fullmatch 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 0.6913 0.7240 0.1057 0.07214 0.2001 0.6619 0.5648 0.7091 0.7086 0.6918 0.07444 0.6424 0.1705 0.6425 0.6591 1 0.6921 0.7249 0.1030 0.07238 0.1940 0.6599 0.5655 0.7095 0.7086 0.6925 0.07290 0.6419 0.1714 0.6421 0.6596 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 0 0.6600 0.6588 0.6368 0.5665 0.6597 0.08438 0.534 0.05877 0.7182 0.6074 0.1754 0.0812 0.6913 0.08928 0.1683 1 0.6598 0.6581 0.6372 0.5668 0.6592 0.08399 0.539 0.06300 0.7233 0.6060 0.1777 0.0839 0.6918 0.08961 0.1712 31 32 33 34 35 36 37 38 39 40 41 42 43 44 0 0.09135 0.7894 0.6802 0.6906 0.09055 0.6455 0.08997 0.06821 0.6762 0.7154 0.7291 0.6090 0.7460 0.6820 1 0.09017 0.7691 0.6796 0.6915 0.09004 0.6448 0.08978 0.06872 0.6751 0.7121 0.7254 0.6118 0.7492 0.6823 45 46 47 48 49 50 51 52 53 54 55 56 57 58 0 0.6656 0.03447 0.1828 0.06473 0.1246 0.6424 0.6342 0.6996 0.7012 0.6770 0.3044 0.6887 0.6675 0.07071 1 0.6653 0.04785 0.1819 0.06503 0.1177 0.6423 0.6344 0.7010 0.7019 0.6765 0.3787 0.6893 0.6701 0.07107 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 0 0.6501 0.5151 0.09011 0.4907 0.4390 0.6771 0.5918 0.09613 0.6666 0.6620 0.6670 0.7097 0.6233 0.6065 0.6937 1 0.6493 0.5102 0.08918 0.4870 0.4417 0.6763 0.5935 0.09650 0.6692 0.6604 0.6706 0.7104 0.6244 0.6004 0.6931 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 0 0.4550 0.6200 0.6772 0.5159 0.6807 0.6771 0.6526 0.6936 0.6434 0.4084 0.4757 0.7070 0.5652 0.09184 0.1557 1 0.4586 0.6139 0.6782 0.5088 0.6815 0.6769 0.6487 0.6937 0.6444 0.4035 0.4720 0.7044 0.5566 0.09073 0.1639 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 0 0.6864 0.7254 0.6937 0.5762 0.6913 0.08754 0.6405 0.2192 0.7024 0.2763 0.7400 0.06780 0.6579 0.6770 0.2349 1 0.6868 0.7248 0.6933 0.5749 0.6917 0.08782 0.6408 0.2291 0.7028 0.2507 0.7395 0.06796 0.6576 0.6769 0.2391 104 105 106 107 108 109 0 0.1848 0.5017 0.5860 0.7086 0.1832 0.7097 1 0.1799 0.4950 0.5863 0.7085 0.1825 0.7092 > tapply(m2full.dat$re78, list(treat,subclass ), mean) #mean differences in outcome by subclass 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 0 0 582.2 7233 3085 5274 3984 0 11024 0 0 4277 12760 9352 12719 14344 2159 0 422.6 14999 1 9930 34099.3 5150 6409 11143 1953 6552 5210 2824 5900 13189 4031 2788 0 1121 8174 1068 2722.6 4024 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 0 6084 9943 2727 4638 0.0 7310 1306 6854 3057 1491 18701 17156 18209 0 0 4147 0 1534 0 7544 1 7375 1048 8882 0 647.2 0 6211 18783 23006 4942 3881 8049 2164 4527 0 5588 0 12418 12558 15190 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 0 5496 116.7 94.57 461.1 3684 701.9 10386 6821 4753 5509.1 4520 1274 2285 9067 5307 959 7934 2449 1 1653 17094.6 3847.76 13706.8 8660 9729.3 0 26818 4322 559.4 0 1085 60308 1460 0 6943 4033 10363 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 0 5272 803.9 0 3903 14421 19330 1054 0.0 6314 8900 54.68 20243 0 6146 0 7146.3 0 1614 1730 1 4232 11141.4 0 13386 9071 0 9266 712.5 2485 4147 9970.68 0 26372 3024 5615 485.2 6168 8484 1294 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 0 9088 0 0 0 16486 1224 7315 17833 0 0 11354 7267 8154 17941 0 2285.7 2821 4639 648.7 1 0 0 3229 4280 4815 3463 11668 0 0 10977 13830 6788 13228 7354 3578 743.7 5523 1359 1061.6 96 97 98 99 100 101 102 103 104 105 106 107 108 109 0 5288.4 33.99 19438 10122 6170 17765 2282 1893 9453 0 187.7 18717 3702 11594 1 672.9 0.00 10093 13318 12591 0 4384 5112 36647 3787 7774.5 17857 3596 10747 > m2full.dat[subclass ==17,] #closer look at subclass 17 which shows 0 for control mean (only 1 each T/C) treat age educ black hispan married nodegree re74 re75 re78 distance weights subclass propen NSW57 1 37 9 1 0 0 1 0 0.0 1068 0.6581 1.000 17 0.6581 PSID277 0 19 10 1 0 0 1 1056 205.9 0 0.6588 2.319 17 0.6588 > #let's try Ben's full matching with all the vars; sould also compare with propensity in part 3 > m2fullvars.out = matchit(treat ~ re74 + re75 + educ + black + hispan + age + married + nodegree, + data = lalonde, method = "full") > m2fullvars.out Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age + married + nodegree, data = lalonde, method = "full") Sample sizes: Control Treated All 429 185 Matched 429 185 Unmatched 0 0 Discarded 0 0 > summary(m2fullvars.out) Call: matchit(formula = treat ~ re74 + re75 + educ + black + hispan + age + married + nodegree, data = lalonde, method = "full") Summary of balance for all data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.577 0.182 0.395 0.518 0.396 0.597 re74 2095.574 5619.237 -3523.663 2425.572 3620.924 9216.500 re75 1532.055 2466.484 -934.429 981.097 1060.658 6795.010 educ 10.346 10.235 0.111 1.000 0.703 4.000 black 0.843 0.203 0.640 1.000 0.643 1.000 hispan 0.059 0.142 -0.083 0.000 0.081 1.000 age 25.816 28.030 -2.214 1.000 3.265 10.000 married 0.189 0.513 -0.324 0.000 0.324 1.000 nodegree 0.708 0.597 0.111 0.000 0.114 1.000 Summary of balance for matched data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 0.577 0.576 0.001 0.003 0.006 0.087 re74 2095.574 2434.869 -339.295 311.523 659.367 13121.750 re75 1532.055 1577.728 -45.672 205.887 468.549 12746.050 educ 10.346 10.442 -0.096 0.000 0.392 4.000 black 0.843 0.835 0.009 0.000 0.000 1.000 hispan 0.059 0.061 -0.001 0.000 0.002 1.000 age 25.816 24.707 1.110 3.000 3.141 9.000 married 0.189 0.131 0.058 0.000 0.044 1.000 nodegree 0.708 0.695 0.013 0.000 0.011 1.000 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 99.64 99.50 98.493 85.49 re74 90.37 87.16 81.790 -42.37 re75 95.11 79.02 55.825 -87.58 educ 13.08 100.00 44.158 0.00 black 98.66 100.00 99.938 0.00 hispan 98.26 0.00 98.027 0.00 age 49.88 -200.00 3.788 10.00 married 82.06 0.00 86.557 0.00 nodegree 88.53 0.00 90.133 0.00 Sample sizes: Control Treated All 429 185 Matched 429 185 Unmatched 0 0 Discarded 0 0 =========================================== Section 6 #Try Mahalanobis # Lab text asks you to do "full" match, done below. For comparison with # propensity distance a "nearest" ratio=1 match may be easiest > matchit.mahal2 = matchit( treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, method = "nearest", + distance = "mahalanobis", data = lalonde ) > summary(matchit.mahal2) Call: matchit(formula = treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = lalonde, method = "nearest", distance = "mahalanobis") Summary of balance for all data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 6.571 8.598 5.202 -2.027 1.701 2.399 25.18 age 25.816 28.030 10.787 -2.214 1.000 3.265 10.00 educ 10.346 10.235 2.855 0.111 1.000 0.703 4.00 black 0.843 0.203 0.403 0.640 1.000 0.643 1.00 hispan 0.059 0.142 0.350 -0.083 0.000 0.081 1.00 married 0.189 0.513 0.500 -0.324 0.000 0.324 1.00 nodegree 0.708 0.597 0.491 0.111 0.000 0.114 1.00 re74 2095.574 5619.237 6788.751 -3523.663 2425.572 3620.924 9216.50 re75 1532.055 2466.484 3291.996 -934.429 981.097 1060.658 6795.01 Summary of balance for matched data: Means Treated Means Control SD Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 6.571 6.613 4.657 -0.042 0.079 0.377 25.18 age 25.816 25.346 9.597 0.470 3.000 2.665 7.00 educ 10.346 10.335 2.499 0.011 0.000 0.411 4.00 black 0.843 0.308 0.463 0.535 1.000 0.535 1.00 hispan 0.059 0.065 0.247 -0.005 0.000 0.005 1.00 married 0.189 0.395 0.490 -0.205 0.000 0.205 1.00 nodegree 0.708 0.665 0.473 0.043 0.000 0.043 1.00 re74 2095.574 4243.581 5931.698 -2148.007 1412.631 2278.350 9504.95 re75 1532.055 1817.187 2634.623 -285.132 300.774 520.283 10876.95 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 97.91 95.37 84.30 0.00 age 78.76 -200.00 18.38 30.00 educ 90.22 100.00 41.54 0.00 black 16.44 0.00 16.81 0.00 hispan 93.47 0.00 93.33 0.00 married 36.53 0.00 36.67 0.00 nodegree 61.17 0.00 61.91 0.00 re74 39.04 41.76 37.08 -3.13 re75 69.49 69.34 50.95 -60.07 Sample sizes: Control Treated All 429 185 Matched 185 185 Unmatched 244 0 Discarded 0 0 #Now do the lab text version > matchit.mahal = matchit( treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, method = "full", + distance = "mahalanobis", data = lalonde ) > summary(matchit.mahal) Call: matchit(formula = treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data = lalonde, method = "full", distance = "mahalanobis") Summary of balance for all data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 6.571 8.598 -2.027 1.701 2.399 25.18 age 25.816 28.030 -2.214 1.000 3.265 10.00 educ 10.346 10.235 0.111 1.000 0.703 4.00 black 0.843 0.203 0.640 1.000 0.643 1.00 hispan 0.059 0.142 -0.083 0.000 0.081 1.00 married 0.189 0.513 -0.324 0.000 0.324 1.00 nodegree 0.708 0.597 0.111 0.000 0.114 1.00 re74 2095.574 5619.237 -3523.663 2425.572 3620.924 9216.50 re75 1532.055 2466.484 -934.429 981.097 1060.658 6795.01 Summary of balance for matched data: Means Treated Means Control Mean Diff eQQ Med eQQ Mean eQQ Max distance 6.571 6.412 0.159 0.051 0.296 25.18 age 25.816 25.517 0.299 3.000 2.732 8.00 educ 10.346 10.300 0.045 0.000 0.339 4.00 black 0.843 0.444 0.399 0.000 0.407 1.00 hispan 0.059 0.065 -0.006 0.000 0.025 1.00 married 0.189 0.390 -0.200 0.000 0.198 1.00 nodegree 0.708 0.671 0.037 0.000 0.034 1.00 re74 2095.574 4088.903 -1993.329 1030.574 2172.700 12841.58 re75 1532.055 1620.410 -88.354 207.194 372.867 12516.89 Percent Balance Improvement: Mean Diff. eQQ Med eQQ Mean eQQ Max distance 92.16 97.03 87.64 0.00 age 86.50 -200.00 16.31 20.00 educ 58.84 100.00 51.79 0.00 black 37.70 100.00 36.76 0.00 hispan 92.70 0.00 69.41 0.00 married 38.09 0.00 39.07 0.00 nodegree 66.76 0.00 69.69 0.00 re74 43.43 57.51 40.00 -39.33 re75 90.55 78.88 64.85 -84.21 Sample sizes: Control Treated All 429 185 Matched 429 185 Unmatched 0 0 Discarded 0 0 Warning message: In sample(names(treat)[treat == 0], numdraws/2, replace = TRUE, : Walker's alias method used: results are different from R < 2.2.0 > mahabdat = match.data(matchit.mahal) > mahabdat[1:20,] treat age educ black hispan married nodegree re74 re75 re78 distance weights subclass NSW1 1 37 11 1 0 1 1 0 0 9930.0 9.237 1 1 NSW2 1 22 9 0 1 0 1 0 0 3595.9 9.083 1 136 NSW3 1 30 12 1 0 0 0 0 0 24909.5 4.768 1 9 NSW4 1 27 11 1 0 0 1 0 0 7506.1 3.558 1 96 NSW5 1 33 8 1 0 0 1 0 0 289.8 3.472 1 109 NSW6 1 22 9 1 0 0 1 0 0 4056.5 2.355 1 82 NSW7 1 23 12 1 0 0 0 0 0 0.0 4.553 1 38 NSW8 1 32 11 1 0 0 1 0 0 8472.2 4.552 1 26 NSW9 1 22 16 1 0 0 0 0 0 2164.0 7.048 1 58 NSW10 1 33 12 0 0 1 0 0 0 12418.1 6.548 1 63 NSW11 1 19 9 1 0 0 1 0 0 8173.9 2.661 1 25 NSW12 1 21 13 1 0 0 0 0 0 17094.6 4.354 1 67 NSW13 1 18 8 1 0 0 1 0 0 0.0 3.451 1 78 NSW14 1 27 10 1 0 1 1 0 0 18739.9 6.861 1 88 NSW15 1 17 7 1 0 0 1 0 0 3023.9 4.963 1 75 NSW16 1 19 10 1 0 0 1 0 0 3228.5 2.692 1 70 NSW17 1 27 13 1 0 0 0 0 0 14581.9 4.318 1 2 NSW18 1 23 10 1 0 0 1 0 0 7693.4 2.458 1 82 NSW19 1 40 12 1 0 0 0 0 0 10804.3 7.347 1 135 NSW20 1 26 12 1 0 0 0 0 0 10747.4 4.485 1 137 > attach(mahabdat) > table(treat, subclass) subclass treat 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 0 1 1 6 6 1 5 3 1 2 1 1 18 1 3 3 1 1 5 1 7 1 6 1 1 1 1 1 9 1 1 1 1 3 1 1 1 1 1 1 1 5 1 1 3 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 1 subclass treat 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 0 7 2 1 1 2 1 19 1 2 12 1 1 1 3 1 3 2 1 2 5 2 1 2 1 1 5 1 2 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 subclass treat 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 0 1 1 1 1 9 3 5 1 12 1 3 3 1 10 1 1 1 1 1 1 1 1 3 2 7 1 1 1 9 1 1 1 1 1 1 1 1 1 1 1 3 1 1 2 1 2 1 2 2 1 2 2 7 1 1 1 1 1 1 1 1 subclass treat 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 0 1 1 1 4 7 1 1 1 1 1 5 1 1 1 7 1 7 2 1 2 2 2 5 3 4 1 1 3 2 1 1 4 1 2 1 6 1 2 1 1 1 1 1 1 7 1 1 1 1 1 1 subclass treat 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 0 4 4 2 2 1 16 6 8 1 11 1 6 2 6 6 1 12 5 1 1 5 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 > tapply(re78, list(treat,subclass), mean) # look at outcomes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 0 12618 1745 4802 7542 2282 3834 7175 54.68 7819 0 14344 5815 9473.7 12370 8821 2862 1 9930 17782 5150 6409 11163 9643 11143 16218.04 24909 2463 0 6552 721.7 0 3192 20506 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 0 6399 5974 0 4584 12490 5756 0 12760 7544 7310 6495 1926 14053 2821 3644 2112 8154 1 9185 5912 3094 0 1255 13189 2788 0 8149 6658 2349 1068 0 7285 13168 1048 1924 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 0 22095.0 4123 2233 8894 2814 7589 9106.6 12108 5552 1067 8437 12419 7343 11111 422.6 10650 1 762.9 8882 0 8547 0 7480 647.2 0 11966 9599 6211 18783 18678 23006 6456.7 0 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 0 2237 5284 6848 5813 0 10746 4224 0 7215 1422 13256 5307 5345 1534 19516 5479 5611 8086 1 2321 4942 0 0 0 0 3881 8049 2164 9984 0 4483 0 12418 12558 22163 1653 17095 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 0 3057.4 4319 0 9062 15485 10122 7128 14065 14999 7236 7146 1054 14476 31.03 1224 9469 1 671.3 17815 4322 0 26818 4955 1773 1512 11233 959 8615 5445 30154 1442.65 3206 6943 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 0 17811 9706 8900 0 0 6787 4788 3372 6084 803.9 7482 8937 0 0 2159 33.99 7934 1 4033 4232 11141 0 18740 13386 4850 0 8984 830.3 0 2485 3051 26372 2808 3196.57 2890 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 0 5062 4520 7699 18133 1211 20243 3789 3838 0 2857 12957 6962 3893 11521 5735 7867 8336 1 6168 7799 8484 1294 0 5010 9371 4280 2441 3463 7383 0 0 10977 13830 6788 9559 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 0 13860 10085 0.0 6163 6411 16704.8 7987 6870 0 7245 8606 7859 8245 0 8815 14163 1 13228 7458 743.7 5523 0 672.9 6263 10093 6281 12591 0 5112 15953 36647 12804 3787 134 135 136 137 0 9923 7216 10509 648.7 1 4182 10804 3596 10747.4 ** that's (more than) enough for Lab 4**********************