## Code to conduct analyses described in our Epidemiology paper using a simulated dataset.
##
## Users should also download ODSCodeFunctions.R and FullCohort.RData
##
## This script conducts analyses of the full cohort.  It also conducts analyses
## for an outcome dependent sample with the three methods described in the manuscript, 
## i.e., ascertainment corrected maximum likelihood (ACML), weighted likeihoood (WL), and a 
## multiple imputation extension of the ACML analysis

## Please search in the following places for updated code and examples:
## http://biostat.mc.vanderbilt.edu/wiki/bin/edit/Main/ODSandLDA
## https://github.com/mercaldo/MMLB

## This code will soon be available on the comprehensive R network with an accompanying 
## vignette to describe it further

rm(list=ls())
## Install all needed R packages
#install.packages("devtools")
#library(devtools)
#install_github('mercaldo/MMLB', force=TRUE)
#install.packages("MASS")
#install.package("mitools")

## Load R libraries
library(MMLB)
library(MASS)
library(mitools)

## Source functions used to conduct the MI analysis
source("ODSCodeFunctions.R")

## Load Full Cohort data that contains 5000 individuals
## The dataframe is named FullCohort.  
load("FullCohort.RData")

## Look at the data
head(FullCohort, n=50)
summary(FullCohort)

# Perform full cohort analysis
fc.fit   <- mm(Y~time*Xe+C,data=FullCohort, lv.formula=~1,t.formula=~1, id=id,q=50, verbose=FALSE)
summary(fc.fit)

## Unique ids in each sampling stratum
## The outcome is relatively rare and so proportionately few subjects are in stratum 3. 
IDinSS <- tapply(FullCohort$id, FullCohort$ss, unique)
NperStratum <- c(unlist(lapply(IDinSS, length)))
NperStratum

## Expected / desired ods sample size per stratum and induced sampling probabilities
NSampledPerStratum <- c(100, 1000, 100)
SampProbs          <- NSampledPerStratum / NperStratum
SampProbs

## Sample from the full cohort to create the subsample
SamplingList         <- data.frame(id=c(unlist(IDinSS)), SampProbi=rep(SampProbs, NperStratum))
SamplingList$Sampled <- rbinom(5000, 1, SamplingList$SampProbi)
SamplingList         <- SamplingList[order(SamplingList$id),]

## Merge the full cohort with the sampling list and set unsampled subjects' (Sampled=0) Xe value set to NA
FullCohort.na <- merge(SamplingList, FullCohort, by="id")
FullCohort.na$Xe[FullCohort.na$Sampled==0] <- NA

## Keep those who were sampled to create the ODS dataset
odsdat <- FullCohort.na[FullCohort.na$Sampled==1,]

# ODS analysis using ascertaintment corrected (maximum) likelihood
# We are fitting this with a marginalized transition and latent variable model (Schildcrout and Heagerty, 2007, Biometrics)
fit.acml        <- mm(Y~time*Xe+C, dat=odsdat, id=id, lv.formula=~1, t.formula=~1, 
                      cond.like=TRUE, samp.probs=SampProbs, iter.lim=1000, verbose=FALSE, q=20, return_args=TRUE)
summary(fit.acml)

# ODS analysis using the weighted likelihood
# Important note: Do not use model based covariances (mod.cov) if using weighted likelihood.  Use rob.cov for a robust, sandwich estimators
fit.wl        <- mm(Y~time*Xe+C, dat=odsdat, id=id, lv.formula=~1, t.formula=~1, 
                        cond.like=FALSE, samp.probi=odsdat$SampProbi, iter.lim=1000, verbose=FALSE, q=20)
summary(fit.wl)

## Imputation based analysis 
fit.mi <- mi.analysis(FullCohortData=FullCohort.na,
                   Sampled=Sampled,
                   mean.formula=Y~time*Xe+C,
                   lv.formula=~1,
                   t.formula=~1 ,
                   id=id,
                   cond.like=TRUE,
                   samp.probs=SampProbs,
                   iter.lim=1000,
                   verbose=TRUE,
                   q=20,
                   M=30,
                   marg.exp.formula=Xe~C,
                   EXPVAR = "Xe")
summary(fit.mi)