library(rms)

# Initial dataset manipulations

data <- stata.get("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/CourseBios312/salary.dta")
data95 <- subset(data, year=="95")
data95$male <- data95$sex=="M"

data95$salhigh <- as.numeric(data95$salary > 7600)

mean(data95$salhigh)

tab1 <- with(data95, table(salhigh, male))
print(tab1)
chisq.test(tab1, correct=FALSE)

# Likelihood ratio test (fails)
# I couldn't get the following package to install
install.packages("Deducer")
library(Deducer)
likelihood.test(tab1)

# I will try to do the LR test "by hand"
tab.exp <- chisq.test(tab1,correct=FALSE)$expect
2*sum(tab1 * log(tab1/tab.exp))

a<- tab1[1,1]
b<- tab1[1,2]
c<- tab1[2,1]
d<- tab1[2,2]

a*d/(b*c)

sqrt(1/a + 1/b + 1/c + 1/d)

d/(d+b)

c/(c+a)


m1 <- lrm(salhigh ~ male, data=data95)

# When using the Design library, need a data distribution dictionary for the dataset
dd <- datadist(data95)
options(datadist='dd')

summary(m1)

exp(m1$coef)

b0 <- m1$coef[1]
b1 <- m1$coef[2]

exp(b0) / (1 + exp(b0))
exp(b0 + b1) / (1 + exp(b0 + b1))

phat <- predict(m1, type='fitted')

data95$female <- abs(data95$male - 1)


############################
# Part 2: Salary and yrdeg #
############################

years <- sort(unique(data95$yrdeg))

summstats <- rep(NA, length(years))

for (i in 1:length(years)) {
	summstats[i] <- mean(data95$salhigh[data95$yrdeg==years[i]])
	}
plot(years, summstats)

m2 <- lrm(salhigh ~ yrdeg, data=data95)
m2
summary(m2)

data95$yrssince95 <- 95 - data95$yrdeg
dd <- datadist(data95)
options(datadist='dd')

m3 <-lrm(salhigh ~ yrssince95, data=data95)
m3
summary(m3)

phat2 <- predict(m3, type='fitted')
odds2 <- phat2 / (1 - phat2)
lnodds2 <- log(odds2)

plot(lnodds2, data95$yrssince95)
plot(odds2, data95$yrssince95)
plot(phat2, data95$yrssince95)

m4 <- glm(salhigh ~ 1, data=data95, family="binomial")
m5 <- glm(salhigh ~ 1 + yrssince95, data=data95, family="binomial")

m4
m5
deviance(m4) - deviance(m5)