################################## # Import data and only keep 1995 # ################################## library(Design) library(sandwich) library(lmtest) library(car) data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta") data95 <- subset(data, year=="95") ################################################## # Convert string variables to numeric indicators # ################################################## data95$male <- as.numeric(data95$sex=="M") data95$arts <- as.numeric(data95$field=="Arts") data95$other <- as.numeric(data95$field=="Other") data95$degother <- as.numeric(data95$deg=="Other") data95$degprof <- as.numeric(data95$deg=="Prof") data95$fieldother <- as.numeric(data95$field=="Other") data95$fieldprof <- as.numeric(data95$field=="Prof") data95$assoc <- as.numeric(data95$rank=="Assoc") data95$full <- as.numeric(data95$rank=="Full") with(data95, table(male, sex)) with(data95, table(arts, field)) with(data95, table(other, field)) data95$pp <- with(data95, fieldprof*degprof) data95$po <- with(data95, fieldprof*degother) data95$op <- with(data95, fieldother*degprof) data95$oo <- with(data95, fieldother*degother) #### The regression models # Note: Need to drop op interaction in models because it is not estimable m.c <- lm(salary ~ male + yrdeg + startyr + degother + degprof + fieldother + fieldprof + pp + po + oo + admin, data=data95) coeftest(m.c, vcov=sandwich) fit.c <- predict(m.c) m.d <- lm(salary ~ male + poly(yrdeg,2) + poly(startyr,2) + degother + degprof + fieldother + fieldprof + pp + po + oo + admin, data=data95) coeftest(m.d, vcov=sandwich) fit.d <- predict(m.d) data95$startcat <- cut(data95$startyr, breaks=c(0,65,75,85,99)) data95$yrdegcat <- cut(data95$yrdeg, breaks=c(0,65,75,85,99)) table(data95$startcat) m.e <- lm(salary ~ male + factor(startcat) + factor(yrdegcat) + degother + degprof + fieldother + fieldprof + pp + po + oo + admin, data=data95) coeftest(m.e, vcov=sandwich) fit.e <- predict(m.e) m.f <- lm(salary ~ male + rcs(startyr, 4) + rcs(yrdeg,4) + degother + degprof + fieldother + fieldprof + pp + po + oo + admin, data=data95) coeftest(m.f, vcov=sandwich) fit.f <- predict(m.f) data95$startbin <- cut(data95$startyr, breaks=c(0,65,99)) data95$yrdegbin <- cut(data95$yrdeg, breaks=c(0,65,99)) table(data95$startbin) m.g <- lm(salary ~ male + factor(startbin) + factor(yrdegbin) + degother + degprof + fieldother + fieldprof + pp + po + oo + admin, data=data95) coeftest(m.g, vcov=sandwich) fit.g <- predict(m.g) plot(fit.c,fit.d) plot(fit.c,fit.e) plot(fit.c,fit.f) plot(fit.d,fit.f) plot(fit.c,fit.g)