## Functions used to run the imputation analyses in ODSCode.R

Odds2Prob      <- function(odds) odds/(1+odds) #odds2prob <- function(ZZ) ZZ/(1+ZZ)
prep_data      <- function(mean.formula, lv.formula, t.formula, id, data, inits,
                           samp.probs, samp.probi, offset) {
    
    mean.f = model.frame(mean.formula, data)
    mean.t = attr(mean.f, "terms")
    
    y  = model.response(mean.f,'numeric') 
    uy = unique(y)
    #print(summary(y))
    #print(summary(id))
    x  = model.matrix(mean.formula,mean.f)
    
    x.t = x.lv = matrix(0, ncol=1, nrow=length(y))
    if(!is.null(t.formula))   x.t  = model.matrix(t.formula,model.frame(t.formula, data)) 
    if(!is.null(lv.formula))  x.lv = model.matrix(lv.formula, model.frame(lv.formula, data))
    
    if(!is.matrix(samp.probs)) samp.probs = matrix(samp.probs, nrow = length(y), ncol = 3, byrow = TRUE)
    if(is.null(samp.probi)) samp.probi = matrix(1,nrow=length(y),ncol=1) 
    if(!is.null(inits)) inits = unlist(inits)
    
    if(is.null(inits)) {
        inits = c(glm(mean.formula,family='binomial',data=data)$coef, rep(1, ncol(x.t) + ncol(x.lv)))
        if(any(is.na(inits))) {
            omit_dup_col = which(is.na(inits))
            x            = x[,-c(omit_dup_col)]
            inits        = inits[-c(omit_dup_col)]
        }
    }
    
    if(is.null(offset)) offset = rep(0, length(y))
    
    x.t = cbind(x.t, NULL)
    x.lv = cbind(x.lv, NULL)
    
    if ( length(inits) != ncol(x) + ncol(x.t) + ncol(x.lv) ) { stop("Parameter length incongruous with X, Xgam, and Xsig")}
    paramlengths <- c(ncol(x), ncol(x.t), ncol(x.lv))
    
    #print(length(id))
    #print(length(y))
    ## Lagged response
    Ylag = rep(0, length(y))
    for(i in 2:length(y)) { if(id[i]==id[i-1]) Ylag[i]<-y[i-1] }
    
    id.tmp        = split(id, id)
    X.tmp         = split(x,id)
    Y.tmp         = split(y,id)
    Ylag.tmp      = split(Ylag,id)
    Xgam.tmp      = split(x.t,id)
    Xsig.tmp      = split(x.lv,id)
    SampProbi.tmp = split(samp.probi,id)
    SampProbs.tmp = split(samp.probs,id)
    offset.tmp    = split(offset,id)
    
    subjectData <- vector('list', length=length(unique(id)))
    subjectData <- list()
    uid <- as.character(unique(id))
    for(j in seq(along=uid)){
        i <- uid[j]
        subjectData[[j]] <- list(id=as.character(unique(id.tmp[[i]])), 
                                 X=matrix(X.tmp[[i]], ncol=ncol(x)), 
                                 Y=as.double(Y.tmp[[i]]),
                                 Ylag=as.double(Ylag.tmp[[i]]),
                                 Xgam=matrix(Xgam.tmp[[i]], ncol=ncol(x.t)),
                                 Xsig=matrix(Xsig.tmp[[i]], ncol=ncol(x.lv)),
                                 SampProbi=unique(SampProbi.tmp[[i]]),
                                 SampProbs=matrix(SampProbs.tmp[[i]], ncol=ncol(samp.probs))[1,],
                                 Offset=as.double(offset.tmp[[i]]))
    }
    names(subjectData) <- uid
    
    beta_nms  <- colnames(x)
    alpha_nms <- c( if(!is.null(t.formula)){paste('gamma',colnames(x.t),sep=':')}, 
                    if(!is.null(lv.formula)){paste('log(sigma)',colnames(x.lv),sep=':')}) 
    
    list(params=inits, paramlengths=paramlengths, subjectData=subjectData, samp.probi=tapply(samp.probi, id, unique),
         nms=list(beta_nms, alpha_nms))
}
ImputeData     <- function(FIT, DAT, M=30, ROB=FALSE, MARG.EXP.FORMULA, verbose=FALSE,EXPVAR=EXPVAR, Sampled=Sampled) {
    # FIT - MMLongit object
    # DAT - complete data set with sampled flag (Sampled)
    # M   - # of imputations
    # ROB - indicator, if FALSE use model-based covariance
    # MARG.EXP.FORMULA - marginal exposure model formula: [Xe | Xo]
    # EXPVAR - exposure / expensive variable: Xe
    # Sampled - should be set to 1 if included in the sample and 0 otherwise
    #EXPVAR   = DAT$EXPVAR  = DAT[, as.character(substitute(EXPVAR))]
    Sampled  = DAT$Sampled = DAT[, as.character(substitute(Sampled))]
    
    imp_data <- vector('list', M)
    
    for(ix in seq(M)) { if (verbose) cat("\r","Imputing dataset number:", ix)
        # Step 0: Update date files
        # Create versions of DAT with updated Xe values
        DAT_Xe1 <- DAT_Xe0 <- DAT
        #DAT_Xe1$EXPVAR <- ifelse (!is.na(DAT_Xe1$EXPVAR), 1, 1 )
        #DAT_Xe0$EXPVAR <- ifelse (!is.na(DAT_Xe0$EXPVAR), 0, 0 )
        
        #print(summary(DAT_Xe1$EXPVAR))
        #print(summary(DAT_Xe0))
        DAT_Xe1[,EXPVAR][is.na(DAT_Xe1[,EXPVAR])] <- 1
        DAT_Xe0[,EXPVAR][is.na(DAT_Xe0[,EXPVAR])] <- 0
        DAT_Xe1[,EXPVAR] <- 1
        DAT_Xe0[,EXPVAR] <- 0
        #print(summary(DAT_Xe0))
        #DAT_Xe1[,EXPVAR & !is.na(EXPVAR)] <- 1
        #DAT_Xe0[,EXPVAR & !is.na(EXPVAR)] <- 0
        
        # Non-sampled subjects
        DAT_S0     <- DAT[Sampled==0,]
        DAT_S0_Xe1 <- DAT_Xe1[Sampled==0, ]
        DAT_S0_Xe0 <- DAT_Xe0[Sampled==0, ]
        dup        <- duplicated(DAT_S0$id)
        DAT_S0_v0  <- DAT_S0[!dup, ]
        
        # Sampled subjects
        DAT_S1     <- DAT[Sampled==1,]
        DAT_S1_Xe1 <- DAT_Xe1[Sampled==1, ]
        DAT_S1_Xe0 <- DAT_Xe0[Sampled==1, ]
        dup        <- duplicated(DAT_S1$id)
        DAT_S1_v0  <- DAT_S1[!dup,]
        
        #print(summary(DAT_S0_Xe1))
        #print(summary(DAT_S1_Xe1))
        # Step 1: extract estimates from FIT and draw m^th theta from MVN
        theta      <- c(FIT$beta, FIT$alpha)
        if(ROB) { cov_theta <- FIT$rob.cov
        } else { cov_theta <- FIT$mod.cov}
        
        theta.m <- mvrnorm(n = 1, mu=theta, Sigma=cov_theta)
        
        # Step 2: Using the estimated theta=(beta,alpha), calculate likelihood contributions for sampled subjects
        # for both Xe=1 and Xe=0.  
        #print(names(attr(FIT,'args')))
        prep_input <- attr(FIT,'args')
        tmp_S1_Xe1 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, 
                                t.formula=prep_input$t.formula, 
                                id=DAT_S1_Xe1[,prep_input$id], data=DAT_S1_Xe1, inits=theta.m,
                                samp.probs=prep_input$samp.probs, samp.probi=prep_input$samp.probi, 
                                offset=prep_input$offset)$subjectData
        tmp_S1_Xe0 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula,
                                t.formula=prep_input$t.formula, 
                                id=DAT_S1_Xe0[,prep_input$id], data=DAT_S1_Xe0, inits=theta.m,
                                samp.probs=prep_input$samp.probs, samp.probi=prep_input$samp.probi, 
                                offset=prep_input$offset)$subjectData
        
        # These functions calculate the conditional likelihood contributions by and the ascertainment correction
        # for sampled subjects.  The difference of ascertainment corrections (i.e. log of the ratio 
        # log(pr(S=1 | Xe=1, Xo)/pr(S=1 | Xe=1, Xo))= log(pr(S=1 | Xe=1, Xo))-log(pr(S=1 | Xe=0, Xo))) provides
        # the offset for the marginal exposure model 
        LLSC_1 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S1_Xe1, 
                                       Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, 
                                       ParamLengths = FIT$LLSC_args$ParamLengths,
                                       CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) 
        LLSC_0 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S1_Xe0, 
                                       Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z =  FIT$LLSC_args$Z, 
                                       ParamLengths = FIT$LLSC_args$ParamLengths,
                                       CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB)
        
        # Note: The ascertainment corrections used for the marginal exposure model offset 
        #       are already on the log scale, so log(a/b) = loga-logb
        logAC_1 <- attr(LLSC_1, "ACSubj") 
        logAC_0 <- attr(LLSC_0, "ACSubj")
        DAT_S1_v0$offset <- logAC_1-logAC_0
        
        # Perform offsetted logistic regression [Note, only using first visit info i.e. no time-varying information.
        fit.exp <- glm(MARG.EXP.FORMULA, offset=offset, data=DAT_S1_v0,family=binomial())
        
        # Extract alpha estimates and sample from the sampling distribution
        alpha      <- coef(fit.exp)
        cov_alpha  <- vcov(fit.exp)
        
        alpha.m <- mvrnorm(n = 1, mu=alpha, Sigma=cov_alpha)
        
        # Step 5: Apply the model and theta to calculate the ascertainment corrections for unsampled people, 
        
        nr <- nrow(DAT_S0_Xe1)
        
        tmp_S0_Xe1 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, 
                                t.formula=prep_input$t.formula, 
                                id=DAT_S0_Xe1[,prep_input$id], data=DAT_S0_Xe1, inits=theta.m,
                                samp.probs=prep_input$samp.probs[rep(1,nr),], 
                                samp.probi=prep_input$samp.probi[rep(1,nr)], 
                                offset=prep_input$offset[rep(1,nr)])$subjectData
        tmp_S0_Xe0 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula,
                                t.formula=prep_input$t.formula, 
                                id=DAT_S0_Xe0[,prep_input$id], data=DAT_S0_Xe0, inits=theta.m,
                                samp.probs=prep_input$samp.probs[rep(1,nr),], 
                                samp.probi=prep_input$samp.probi[rep(1,nr)], 
                                offset=prep_input$offset[rep(1,nr)])$subjectData
        
        LLSC_1 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S0_Xe1, 
                                       Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, 
                                       ParamLengths = FIT$LLSC_args$ParamLengths,
                                       CondLike =  FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) 
        LLSC_0 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S0_Xe0, 
                                       Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z =  FIT$LLSC_args$Z, 
                                       ParamLengths = FIT$LLSC_args$ParamLengths,
                                       CondLike =  FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB)
        
        ## Applied to the non-sampled subjects: log(pr(S=1 | Xe=1, Xo)/pr(S=1 | Xe=1, Xo)).
        logAC_1 <- attr(LLSC_1, "ACSubj") # Ascertainment corrections already log transformed
        logAC_0 <- attr(LLSC_0, "ACSubj")
        offset_log_S0 <- logAC_1-logAC_0
        
        # Create a temporary dataframe for non-sampled subjects in order to create the
        # model.matrix that is used to calculate marginal exposure odds
        tmp          <- DAT_S0_v0
        tmp[,EXPVAR] <- sample(c(0,1),size=nrow(tmp),replace=TRUE)
        mm_tmp       <- model.matrix(MARG.EXP.FORMULA, tmp)
        
        # Compute the conditional exposure odds in non-sampled subjects
        # Notice that all conditional odds are equal: [Xe | Y, Xo, S=1] = [Xe | Y, Xo, S=0] = [Xe | Y, Xo] since we sampled based on Y
        # So, we multiply the odds (conditional on being sampled), pr[Xe=1 | Xo, S=1]/pr[Xe=0 | Xo, S=1] 
        # by the likelihood ratio (conditional on being sampled), pr[Y | Xe=1, Xo, S=1]/ pr[Y | Xe=1, Xo, S=1]
        # in the unsampled subjects in order to obtain conditional odds in unsampled: pr[Xe=1 | Y, Xo, S=1]/pr[Xe=0 | Y, Xo, S=1]
        marg.odds.Xe.Xo.S1   <- exp(mm_tmp %*% alpha.m)*exp(offset_log_S0)  
        LR.Y.Xe.Xo.S1        <- exp( attr(LLSC_1, "LogLikeSubj")-attr(LLSC_0, "LogLikeSubj"))
        cond.odds.Xe.Y.Xo.S0 <-  LR.Y.Xe.Xo.S1*marg.odds.Xe.Xo.S1  
        
        # Convert odds to probability, log(odds) = log(1/[1-p]) ---> p = odds/(1+odds)
        # Impute Xe for non-sampled subjects
        prXe1     <- Odds2Prob(cond.odds.Xe.Y.Xo.S0)
        XeS0      <- rbinom(nrow(tmp), size=1, prob=prXe1)
        
        # Add XeS0 info to DAT_S0_v0
        DAT_S0_v0[,EXPVAR] <- XeS0
        DAT_S0 <- DAT_S0[,-which(colnames(DAT_S0)==EXPVAR)]
        DAT_S0 <- merge(DAT_S0, DAT_S0_v0[,c('id',EXPVAR)],by='id')
        DAT_imp <- rbind(DAT_S1, DAT_S0[,colnames(DAT_S1)])
        
        # Store imputation data
        imp_data[[ix]] <- DAT_imp
        #if(verbose) cat('\r',ix)
    } # end of for ix loop
    
    imp_data
}
FitImputedData <- function(FIT, ImputedData, verbose=FALSE){
    M <- length(ImputedData)
    prep_input <- attr(FIT,'args')
    imp.mods <- ests <- vcovs <- list()
    for (i in 1:M){ if (verbose) cat("\r", "Beginning imputed data analysis number:", i); 
        #print(date())
        imp.mods[[i]]        <- mm(prep_input$mean.formula, 
                                   dat=ImputedData[[i]], 
                                   id=id, 
                                   lv.formula=prep_input$lv.formula, 
                                   t.formula=prep_input$lv.formula, 
                                   iter.lim=1000, verbose=FALSE, q=50)
        ests[[i]]  <- unlist(coef(imp.mods[[i]]))
        vcovs[[i]] <- imp.mods[[i]]$mod.cov
    }
    out <- MIcombine(ests, vcovs)
    out
}
mi.analysis <- function(FullCohortData, Sampled, mean.formula,lv.formula, t.formula, id, cond.like, samp.probs, 
                        iter.lim=100, q=50, M=20, marg.exp.formula, EXPVAR, verbose=FALSE){
    ##  FullCohortData    Full cohort data.  This dataset should not have missing values for variables other than EXPVAR
    ##  mean.formula      Of the form: Y ~ EXPVAR+X1+X2...
    ##  lv.formula        Latent variable formula.  For a random intercept, use ~1.  If no random intercept is needed, set to NULL
    ##  t.formula         Transition formula.  For a standard first order transition model, use ~1.  
    ##                    NOTE: lv.formula and t.formula should not both be set to 0.
    ##  Sampled           Sampling indicator variable name: 1 if sampled and 0 if not
    ##  id                id variable.
    ##  cond.like         If using an ODS design, set this to TRUE.  If imputing from a random sampling design, set to FALSE
    ##  samp.probs        Sampling probabilities for the three strata (\pi_0, \pi_1, \pi_2) 
    ##  iter.lim=100      Limits the number of newton raphson iterations 
    ##  q=50,             Number of Gauss-Hermite quadrature points for integrating over random effects.  If no random intercept is use, set this to 3.
    ##  M=5               Number of imputations
    ##  marg.exp.formula  Marginal exposure model of the form: EXPVAR ~ Xo
    ##  EXPVAR            Expensive exposure variable that should be set to missing for those not included in the subsample.
    ##                    When Sampled=1, this should be non-missing and when Sampled=0 it should be missing                      
    ##  verbose=FALSE     Prints fitting progress
    
    id       = FullCohortData$id      = FullCohortData[, as.character(substitute(id))]
    #EXPVAR   = FullCohortData$EXPVAR  = FullCohortData[, as.character(substitute(EXPVAR))]
    Sampled  = FullCohortData$Sampled = FullCohortData[, as.character(substitute(Sampled))]
    ## Keep those who were sampled to create the ODS dataset
    #odsdat <- FullCohortData[FullCohortData[,Sampled]==1,]
    odsdat <- FullCohortData[Sampled==1,]
    
    if (verbose) cat("Fit ACML...", "\n")
    FIT  <- mm(mean.formula, dat=odsdat, id=id, lv.formula=lv.formula, t.formula=t.formula, 
               cond.like=cond.like, samp.probs=samp.probs, iter.lim=iter.lim, verbose=FALSE, q=q, return_args=TRUE)
    imps <- ImputeData(FIT=FIT, DAT=FullCohortData, M=M, ROB=FALSE, MARG.EXP.FORMULA=marg.exp.formula, 
                       verbose=verbose, EXPVAR=EXPVAR, Sampled=Sampled)
    res  <- FitImputedData(FIT=FIT, ImputedData=imps, verbose=verbose) 
    res
}