#********************************************** # Task 0: Read in the data, fit multiple regression #********************************************** #### # 1. Read in the data #### babies = read.table("http://www-stat.stanford.edu/~rag/stat141/exs/babies.dat", header = T) # # Example to read data from local file: # babies = read.table( file = "C:\\Documents and Settings\\KK\\My Documents\\Courses\\TA Courses\\Stat209\\Stat209\\babies.dat", header = T ) # summary(babies) dim(babies) names(babies) # # gestation length of gestation in days # wt birth weight in ounces, 999 unknown # age mother’s age in years at termination of pregnancy, 99=unknown # ht mother’s height in inches to the last completed inch, 99=unknown # wt1 mother pre-pregnancy weight in pounds, 999=unknown # dht father's height, coding same as for mother's height # dwt father's weight coding same as for mother's weight # #### # 2. Extract subset of predictors #### subsetBabies = subset( babies, subset = gestation < 999 & wt1 < 999 & wt < 999 & ht < 99 & dwt < 999 & dht < 99, select = c(gestation, wt, age, ht, wt1, dht, dwt) ) dim(subsetBabies) summary( subsetBabies ) #### # 3. Run the multiple regression # We get coefficient estimates, standard errors, # r-squared, etc. #### lm.babies = lm( wt ~ age + ht + wt1 + dht + dwt, data = subsetBabies ) summary( lm.babies ) #********************************************** # Task 1: Demonstrate adjusted variables interpretation #********************************************** #### # a. age predictor #### lm.no.age = update( lm.babies, formula = wt ~ . -age ) summary( lm.no.age ) resid.no.age = lm.no.age$residuals lm.age.predictors = lm( age ~ ht + wt1 + dht + dwt -1, data = subsetBabies ) resid.age.predictors = lm.age.predictors$residuals lm.age.addvar = lm( resid.no.age ~ resid.age.predictors ) summary( lm.age.addvar ) lm.age.addvar$coef lm.babies$coef # The coefficient for age in the original regression equals # the coefficient for resid.age.predictors! #### # b. wt1 predictor #### lm.no.wt1 = update( lm.babies, formula = wt ~ . -wt1 ) summary( lm.no.wt1 ) resid.no.wt1 = lm.no.wt1$residuals lm.wt1.predictors = lm( wt1 ~ age + ht + dht + dwt -1, data = subsetBabies ) resid.wt1.predictors = lm.wt1.predictors$residuals lm.wt1.addvar = lm( resid.no.wt1 ~ resid.wt1.predictors ) summary( lm.wt1.addvar ) lm.wt1.addvar$coef lm.babies$coef # The coefficient for wt1 in the original regression equals # the coefficient for resid.wt1.predictors! #### # c. Added variable plots #### library( car ) # Added variable plot for age: # Plot the residuals from wt ~ ht + wt1 + dht + dwt # versus the residuals from age ~ ht + wt1 + dht + dwt avp( lm.babies ) 2 #********************************************** # Task 2: Standardized regression coeficients. # Can also obtain using covariances #********************************************** #### # a. Multiple regression using standardized variables #### standardizedVars = subsetBabies numVars = dim( standardizedVars )[2] for ( i in 1:numVars ) { meanVal = mean( standardizedVars[,i] ) sdVal = sd( standardizedVars[,i] ) standardizedVars[,i] = ( standardizedVars[,i] - meanVal ) / sdVal } lm.std = lm( wt ~ age + ht + wt1 + dht + dwt - 1, data = standardizedVars ) # no intercept, since response standardized summary( lm.std ) #### # b. Coefficient estimates using correlations #### attach( subsetBabies ) corPredictors = cor( cbind( age, ht, wt1, dht, dwt ) ) corPredResponse = cbind( cor( cbind(age, ht, wt1, dht, dwt), wt ) ) coefEsts = solve( corPredictors ) %*% corPredResponse # the "n" terms cancel # Notice that these are equal to lm.std$coef #### # c. Standard errors using correlations #### # Estimate sigma (derivation on p79 of Freedman) numObs = length( wt ) numPreds = 5 sigmaHatSq = ( 1 - t(coefEsts) %*% corPredictors %*% coefEsts ) * (numObs / (numObs - numPreds) ) coefCov = as.numeric(sigmaHatSq) * ( 1/(numObs - 1) ) * solve( corPredictors ) sqrt( coefCov[1,1] ) sqrt( coefCov[2,2] ) summary( lm.std ) # compare s.e. estimates #********************************************** # OPTIONAL # Task 3: Get coefficient estimates and standard errors # starting with the covariance matrix of Y and the predictors #********************************************** #### # a. Coefficient estimates using covariances #### # We can get coefficient estimates for unstandardized predictors # provided we have the correlation, the vector of means and the vector of standard deviations meanVec = apply( subsetBabies, 2, mean ) sdVec = apply( subsetBabies, 2, sd ) # If gam_i is the coef estimate of the standardized predictor z_i, then # the coef estimate beta_i of the unstandardized predictor x_i is # given by beta_i = gam_i * sd_y / sd_xi # The intercept beta_0 is given by # beta_0 = mean_y - ( gam_1 * mean_x1 * sd_y / sd_x1 + ... + gam_p * mean_xp * sd_y / sd_xp) # Modify the coefficient estimates from the standardized predictors coefEstsUnstd = sdVec[1] * coefEsts / sdVec[-1] interceptEstUnstd = meanVec[1] - sum( sdVec[1] * coefEsts * meanVec[-1] / sdVec[-1] ) c( interceptEstUnstd, coefEstsUnstd ) lm.babies$coef #### # b. Standard errors using covariances #### # cov( beta_i, beta_j ) = cov( gam_i * sd_y / sd_xi, gam_j * sd_y / sd_xj ) # = ( sd_y^2 / (sd_xi * sd_xj) ) cov( gam_i, gam_j ) coefCovUnstd = coefCov for ( i in 1:numPreds ) { for ( j in 1:numPreds ) { coefCovUnstd[i,j] = coefCovUnstd[i,j] * sdVec[1]^2 / ( sdVec[i+1] * sdVec[j+1] ) } } sqrt( coefCovUnstd[1,1] ) sqrt( coefCovUnstd[2,2] ) summary( lm.babies ) # Compare s.e. estimates # The covariance of the intercept can also be calculated but is omitted here. #********************************************** # OPTIONAL # Task 4: Logistic regression #********************************************** premie = babies$gestation < 37*7 # premature if born before 37 weeks subsetBabies$premie = premie #### # a. Do a logistic regression #### glm.premie = glm( premie ~ age + ht + wt1 + dht + dwt, family = binomial ) summary( glm.premie ) #### # b. Adjusted variable interpretation in logit scale for coefficient age #### glm.no.age = update( glm.premie, formula = premie ~ . -age ) resid.no.age = glm.no.age$residuals lm.age.predictors = lm( age ~ ht + wt1 + dht + dwt ) resid.age.predictors = lm.age.predictors$residuals lm.age.addvar = lm( resid.no.age ~ resid.age.predictors -1 ) lm.age.addvar$coef glm.premie$coef # The coefficients for age are close #### # c. Added variable plots #### avp( glm.premie ) 2