##################################
# Import data and keep only 1995 #
##################################

library(rms)      # rms loads Hmisc, which provides stata.get()
library(lattice)

data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta")
table(data$year)
data.95 <- subset(data, year == "95")
describe(data.95)

#################################################
# Convert string variable to numeric indicators #
#################################################

# Make a male indicator variable; female is the reference
data.95$male <- data.95$sex == "M"
with(data.95, table(male, sex))

# Attach the data frame data.95 so that the models below do not have to reference it explicitly.
# Normally this is not a good idea if you are working with multiple datasets.
attach(data.95)

m1 <- lm(salary ~ male)
m2 <- lm(yrdeg ~ male)
summary(m1)
summary(m2)

# Plot the residuals; add a lowess line
plot(resid(m2), resid(m1))
lines(lowess(resid(m1) ~ resid(m2)), lwd = 2, col = "red")

# A simple linear regression using the residuals
m3 <- lm(resid(m1) ~ resid(m2))
summary(m3)

# Now fit the multivariable model
m4 <- lm(salary ~ male + yrdeg)

# Compare m3 and m4
summary(m3)
summary(m4)

# Note that the residuals model doesn't get the residual degrees of freedom exactly right.
# The residual d.f. is n - p, where p is the number of estimated coefficients (intercept included).
# The residuals model thinks p = 2 (intercept plus one predictor), but the full model has p = 3.
# Thus, the RMSE and the standard error estimates for the betas are slightly off.
# (A numeric check appears after the 'extra' problem below.)

# The 'extra' problem
rankfull <- rank == "Full"   # rank here is the attached column data.95$rank, not base::rank
m5 <- lm(rankfull ~ male + yrdeg)
plot(resid(m5), resid(m4))
summary(lm(salary ~ male + yrdeg + rankfull))
summary(lm(resid(m4) ~ resid(m5)))
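# (Added sketch, not part of the original lab.) A quick check of the d.f. note
# above: the residual-on-residual slope from m3 matches the yrdeg slope from m4
# exactly, and rescaling the m3 standard error to the correct residual d.f.
# (n - 3 rather than n - 2) recovers the m4 standard error. This assumes no
# missing values in salary, yrdeg, or sex, so every model uses the same rows.
coef(m3)["resid(m2)"]                      # slope from the residuals model
coef(m4)["yrdeg"]                          # identical slope from the full model
n <- nobs(m3)
se.resid <- coef(summary(m3))["resid(m2)", "Std. Error"]
se.resid * sqrt((n - 2) / (n - 3))         # rescale to the correct d.f., n - 3
coef(summary(m4))["yrdeg", "Std. Error"]   # matches the rescaled value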
#################
# The questions #
#################

# Another way to think about regression is in terms of the amount of variability in the
# outcome (Y) that is explained by the predictors (X). In simple linear regression, we
# regress Y on X because we believe that X will explain some of the variability in Y.
# This leads to an alternate way of thinking about statistical tests for regression
# coefficients: under the null hypothesis, X explains none of the variability in Y;
# under the alternative, X explains more variability than would be expected by chance
# alone. In this lab we consider the interpretation of statistical tests in a
# multivariable model that adds another predictor (W). The following exercise details
# the interpretation of the test for yrdeg in a multivariable model with salary as the
# outcome and male and yrdeg as the predictors.

# 1. Model 1: Fit a simple linear regression model with salary as the outcome and male
#    as the predictor. Save the residuals from this model. Interpret these residuals in
#    terms of the unexplained variability in salary.
# 2. Model 2: Fit a simple linear regression model with yrdeg as the outcome and male
#    as the predictor. Save the residuals from this model. Interpret these residuals in
#    terms of the unexplained variability in yrdeg.
# 3. Plot the residuals from model 2 (x-axis) versus the residuals from model 1 (y-axis).
#    Describe any association you see. It may be helpful to add a lowess smooth or other
#    smooth line to the plot.
# 4. Fit a simple linear regression model using the residuals from model 1 as the outcome
#    and the residuals from model 2 as the predictor. Interpret the slope coefficient
#    from this model.
# 5. Fit a multivariable linear regression model with salary as the outcome using
#    predictors male and yrdeg. What is the interpretation of male in this model? What
#    is the interpretation of yrdeg?
# 6. Compare the slope estimate for yrdeg from the multivariable model in question 5 to
#    the slope estimate obtained in question 4. Explain your findings.
# 7. What implications does this have for fitting multivariable models with even more
#    predictors? Consider the implications in relation to the association between
#    (1) X and W and (2) the residuals of Y given X and the residuals of W given X.
#    (A short sketch of this idea follows below.)
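# (Added sketch, not part of the original lab.) The same residual logic extends
# to models with more predictors: the coefficient for rankfull in the
# three-predictor model equals the slope from regressing the residuals of
# salary given (male, yrdeg) on the residuals of rankfull given (male, yrdeg).
# m6 is a new name introduced here, not one used above.
m6 <- lm(salary ~ male + yrdeg + rankfull)
coef(m6)["rankfullTRUE"]                       # slope from the full model
coef(lm(resid(m4) ~ resid(m5)))["resid(m5)"]   # identical residual-on-residual slope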