library(rms) # Initial dataset manipulations data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta") data95 <- subset(data, year=="95") data95$male <- data95$sex=="M" data95$salhigh <- as.numeric(data95$salary > 7600) mean(data95$salhigh) tab1 <- with(data95, table(salhigh, male)) print(tab1) chisq.test(tab1, correct=FALSE) # Likelihood ratio test (fails) # I couldn't get the following package to install install.packages("Deducer") library(Deducer) likelihood.test(tab1) # I will try to do the LR test "by hand" tab.exp <- chisq.test(tab1,correct=FALSE)$expect 2*sum(tab1 * log(tab1/tab.exp)) a<- tab1[1,1] b<- tab1[1,2] c<- tab1[2,1] d<- tab1[2,2] a*d/(b*c) sqrt(1/a + 1/b + 1/c + 1/d) d/(d+b) c/(c+a) m1 <- lrm(salhigh ~ male, data=data95) # When using the Design library, need a data distribution dictionary for the dataset dd <- datadist(data95) options(datadist='dd') summary(m1) exp(m1$coef) b0 <- m1$coef[1] b1 <- m1$coef[2] exp(b0) / (1 + exp(b0)) exp(b0 + b1) / (1 + exp(b0 + b1)) phat <- predict(m1, type='fitted') data95$female <- abs(data95$male - 1) ############################ # Part 2: Salary and yrdeg # ############################ years <- sort(unique(data95$yrdeg)) summstats <- rep(NA, length(years)) for (i in 1:length(years)) { summstats[i] <- mean(data95$salhigh[data95$yrdeg==years[i]]) } plot(years, summstats) m2 <- lrm(salhigh ~ yrdeg, data=data95) m2 summary(m2) data95$yrssince95 <- 95 - data95$yrdeg dd <- datadist(data95) options(datadist='dd') m3 <-lrm(salhigh ~ yrssince95, data=data95) m3 summary(m3) phat2 <- predict(m3, type='fitted') odds2 <- phat2 / (1 - phat2) lnodds2 <- log(odds2) plot(lnodds2, data95$yrssince95) plot(odds2, data95$yrssince95) plot(phat2, data95$yrssince95) m4 <- glm(salhigh ~ 1, data=data95, family="binomial") m5 <- glm(salhigh ~ 1 + yrssince95, data=data95, family="binomial") m4 m5 deviance(m4) - deviance(m5)