################################## # Import data and keep only 1995 # ################################## library(rms) library(lattice) data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta") table(data$year) data.95 <- subset(data, year=="95") describe(data.95) ################################################# # Convert string variable to numeric indicators # ################################################# # Make a male indicator variable; female is reference data.95$male <- data.95$sex=="M" with(data.95, table(male, sex)) # Make a female indicator variable; male is reference data.95$female <- data.95$sex=="F" with(data.95, table(female,sex)) #################################### # Salary by sex: Unadjusted models # #################################### bwplot(salary ~ sex, data=data.95) summary(lm(salary ~ male, data=data.95)) summary(lm(salary ~ female, data=data.95)) t.test(salary ~ sex, data=data.95, var.equal=TRUE) # Calculate the pooled standard deviation s.male <- sd(data.95$salary[data.95$male]) s.female <- sd(data.95$salary[data.95$female]) sqrt((s.female^2 * 408 + s.male^2*1187)/ (408 + 1187)) # Other methods for specifying the regression model summary(lm(salary ~ male + female, data=data.95)) summary(lm(salary ~ -1 + male + female, data=data.95)) summary(lm(salary ~ -1 + as.numeric(male) + as.numeric(female), data=data.95)) # Aside: factor variables summary(lm(salary ~ factor(sex), data=data.95)) # If we were interested in modeling yearly salary data.95$salaryyr <- data.95$salary*12 summary(lm(salary ~ male, data=data.95)) summary(lm(salaryyr ~ male, data=data.95)) ##################################### # Salary by rank: Unadjusted models # ##################################### with(data.95, table(rank)) data.95$rankassist <- data.95$rank=="Assist" data.95$rankassoc <- data.95$rank=="Assoc" data.95$rankfull <- data.95$rank=="Full" bwplot(salary ~ rank, data=data.95) # Different regressions summary(lm(salary ~ rankassoc + rankfull, data=data.95)) summary(lm(salary ~ rankassist + rankfull, data=data.95)) summary(lm(salary ~ rankassoc + rankassist, data=data.95)) summary(lm(salary ~ factor(rank), data=data.95)) # Calculate RMSE by hand s.assist <- sd(data.95$salary[data.95$rankassist]) s.assoc <- sd(data.95$salary[data.95$rankassoc]) s.full <- sd(data.95$salary[data.95$rankfull]) n.assist <- sum(data.95$rankassist) n.assoc <- sum(data.95$rankassoc) n.full <- sum(data.95$rankfull) rmse <- sqrt(((n.assist-1)*s.assist^2 + (n.assoc-1)*s.assoc^2 + (n.full-1)*s.full^2)/(n.assist + n.assoc + n.full - 3)) print(rmse) # Cell means coding summary(lm(salary ~ -1 + as.numeric(rankassist) + as.numeric(rankassoc) + as.numeric(rankfull), data=data.95)) # Compare cell means coding with the following summary(lm(salary ~ -1 + rankassist + rankassoc + rankfull, data=data.95)) # Standard errors from cell means coding output rmse / sqrt(n.assist) rmse / sqrt(n.assoc) rmse / sqrt(n.full) ############################################### # Salary by year of degreee and starting year # ############################################### # Association of year of degree with starting year with(data.95, plot(startyr, yrdeg)) abline(lm(yrdeg ~ startyr, data=data.95), lwd=2) with(data.95, lines(lowess(yrdeg ~ startyr), col="Red", lwd=2)) legend("topleft", inset=0.05, c("Data", "Regression ine", "Lowess smooth"), lty=c(NA,1,1), pch=c(1,NA,NA), col=c("Black","Black", "Red"), lwd=2) summary(lm(yrdeg ~ startyr, data=data.95)) summary(lm(startyr ~ yrdeg, data=data.95)) with(data.95, cor.test(yrdeg, startyr)) lm(salary ~ yrdeg, data=data.95) lm(salary ~ startyr, data=data.95) lm(salary ~ yrdeg + startyr, data=data.95)