################################## # Import data and only keep 1995 # ################################## library(rms) data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta") data95 <- subset(data, year=="95") ################################################## # Convert string variables to numeric indicators # ################################################## data95$male <- as.numeric(data95$sex=="M") with(data95, table(male, sex)) #### Quesiton 2 m1 <- lm(salary ~ male + yrdeg, data=data95) plot(fitted(m1), resid(m1)) lines(lowess(resid(m1) ~ fitted(m1)), col="Red", lwd=3) plot(data95$yrdeg, resid(m1), xlab="Year of Degree") lines(lowess(resid(m1) ~ data95$yrdeg), col="Red", lwd=3) resid.m1 <- rstudent(m1) hist(resid.m1) qqnorm(resid.m1) abline(a=0,b=1) dfbeta.m1 <- dfbeta(m1) head(dfbeta.m1) plot(data95$id, dfbeta.m1[,2]) # Delta-betas for male plot(data95$id, dfbeta.m1[,3]) # Delta-betas for yrdeg # Question 3 m2 <- lm(salary ~ male + poly(yrdeg,2), data=data95) summary(m2) resid.m2 <- rstudent(m2) plot(lowess(resid.m1 ~ data95$yrdeg), type='l', ylim=c(-.4,.4), xlab="Year of Degree") lines(lowess(resid.m2 ~ data95$yrdeg), col='Red') # Note that the residual versus predictor lowess smooth from m2 is flatter than the smooth from m1 pred.1 <- predict(m2, data.frame(male=1, yrdeg=90), se.fit=TRUE) pred.2 <- predict(m2, data.frame(male=0, yrdeg=80), se.fit=TRUE) pred.2$fit + c(-1,0,1)*qt(.975,pred.2$df)*sqrt(pred.2$se.fit^2 + summary(m2)$sigma^2) # Modified dataset data95.mod <- data95 data95.mod$yrdeg[data95.mod$id==23] <- 110 data95.mod$salary[data95.mod$id==8] <- 25000 with(data95.mod, plot(yrdeg, salary)) m3 <- lm(salary ~ male + yrdeg, data=data95.mod) plot(fitted(m3), resid(m3)) lines(lowess(resid(m3) ~ fitted(m3)), lwd=3, col="Red") resid.m3 <- rstudent(m3) hist(resid.m3) resid.m3[data95.mod$id==8] resid.m3[data95.mod$id==23] dfbeta.m3 <- dfbeta(m3) head(dfbeta.m3) plot(data95.mod$id, dfbeta.m3[,2]) # Delta-betas for male plot(data95.mod$id, dfbeta.m3[,3]) # Delta-betas for yrdeg plot(m3) # Note this will provide several common diagnostics cook.m3 <- cooks.distance(m3) lev.m3 <- hatvalues(m3) cook.m3[data95.mod$id==8] lev.m3[data95.mod$id==8] cook.m3[data95.mod$id==23] lev.m3[data95.mod$id==23]