## Functions used to run the imputation analyses in ODSCode.R Odds2Prob <- function(odds) odds/(1+odds) #odds2prob <- function(ZZ) ZZ/(1+ZZ) prep_data <- function(mean.formula, lv.formula, t.formula, id, data, inits, samp.probs, samp.probi, offset) { mean.f = model.frame(mean.formula, data) mean.t = attr(mean.f, "terms") y = model.response(mean.f,'numeric') uy = unique(y) #print(summary(y)) #print(summary(id)) x = model.matrix(mean.formula,mean.f) x.t = x.lv = matrix(0, ncol=1, nrow=length(y)) if(!is.null(t.formula)) x.t = model.matrix(t.formula,model.frame(t.formula, data)) if(!is.null(lv.formula)) x.lv = model.matrix(lv.formula, model.frame(lv.formula, data)) if(!is.matrix(samp.probs)) samp.probs = matrix(samp.probs, nrow = length(y), ncol = 3, byrow = TRUE) if(is.null(samp.probi)) samp.probi = matrix(1,nrow=length(y),ncol=1) if(!is.null(inits)) inits = unlist(inits) if(is.null(inits)) { inits = c(glm(mean.formula,family='binomial',data=data)$coef, rep(1, ncol(x.t) + ncol(x.lv))) if(any(is.na(inits))) { omit_dup_col = which(is.na(inits)) x = x[,-c(omit_dup_col)] inits = inits[-c(omit_dup_col)] } } if(is.null(offset)) offset = rep(0, length(y)) x.t = cbind(x.t, NULL) x.lv = cbind(x.lv, NULL) if ( length(inits) != ncol(x) + ncol(x.t) + ncol(x.lv) ) { stop("Parameter length incongruous with X, Xgam, and Xsig")} paramlengths <- c(ncol(x), ncol(x.t), ncol(x.lv)) #print(length(id)) #print(length(y)) ## Lagged response Ylag = rep(0, length(y)) for(i in 2:length(y)) { if(id[i]==id[i-1]) Ylag[i]<-y[i-1] } id.tmp = split(id, id) X.tmp = split(x,id) Y.tmp = split(y,id) Ylag.tmp = split(Ylag,id) Xgam.tmp = split(x.t,id) Xsig.tmp = split(x.lv,id) SampProbi.tmp = split(samp.probi,id) SampProbs.tmp = split(samp.probs,id) offset.tmp = split(offset,id) subjectData <- vector('list', length=length(unique(id))) subjectData <- list() uid <- as.character(unique(id)) for(j in seq(along=uid)){ i <- uid[j] subjectData[[j]] <- list(id=as.character(unique(id.tmp[[i]])), X=matrix(X.tmp[[i]], ncol=ncol(x)), Y=as.double(Y.tmp[[i]]), Ylag=as.double(Ylag.tmp[[i]]), Xgam=matrix(Xgam.tmp[[i]], ncol=ncol(x.t)), Xsig=matrix(Xsig.tmp[[i]], ncol=ncol(x.lv)), SampProbi=unique(SampProbi.tmp[[i]]), SampProbs=matrix(SampProbs.tmp[[i]], ncol=ncol(samp.probs))[1,], Offset=as.double(offset.tmp[[i]])) } names(subjectData) <- uid beta_nms <- colnames(x) alpha_nms <- c( if(!is.null(t.formula)){paste('gamma',colnames(x.t),sep=':')}, if(!is.null(lv.formula)){paste('log(sigma)',colnames(x.lv),sep=':')}) list(params=inits, paramlengths=paramlengths, subjectData=subjectData, samp.probi=tapply(samp.probi, id, unique), nms=list(beta_nms, alpha_nms)) } ImputeData <- function(FIT, DAT, M=30, ROB=FALSE, MARG.EXP.FORMULA, verbose=FALSE,EXPVAR=EXPVAR, Sampled=Sampled) { # FIT - MMLongit object # DAT - complete data set with sampled flag (Sampled) # M - # of imputations # ROB - indicator, if FALSE use model-based covariance # MARG.EXP.FORMULA - marginal exposure model formula: [Xe | Xo] # EXPVAR - exposure / expensive variable: Xe # Sampled - should be set to 1 if included in the sample and 0 otherwise #EXPVAR = DAT$EXPVAR = DAT[, as.character(substitute(EXPVAR))] Sampled = DAT$Sampled = DAT[, as.character(substitute(Sampled))] imp_data <- vector('list', M) for(ix in seq(M)) { if (verbose) cat("\r","Imputing dataset number:", ix) # Step 0: Update date files # Create versions of DAT with updated Xe values DAT_Xe1 <- DAT_Xe0 <- DAT #DAT_Xe1$EXPVAR <- ifelse (!is.na(DAT_Xe1$EXPVAR), 1, 1 ) #DAT_Xe0$EXPVAR <- ifelse (!is.na(DAT_Xe0$EXPVAR), 0, 0 ) #print(summary(DAT_Xe1$EXPVAR)) #print(summary(DAT_Xe0)) DAT_Xe1[,EXPVAR][is.na(DAT_Xe1[,EXPVAR])] <- 1 DAT_Xe0[,EXPVAR][is.na(DAT_Xe0[,EXPVAR])] <- 0 DAT_Xe1[,EXPVAR] <- 1 DAT_Xe0[,EXPVAR] <- 0 #print(summary(DAT_Xe0)) #DAT_Xe1[,EXPVAR & !is.na(EXPVAR)] <- 1 #DAT_Xe0[,EXPVAR & !is.na(EXPVAR)] <- 0 # Non-sampled subjects DAT_S0 <- DAT[Sampled==0,] DAT_S0_Xe1 <- DAT_Xe1[Sampled==0, ] DAT_S0_Xe0 <- DAT_Xe0[Sampled==0, ] dup <- duplicated(DAT_S0$id) DAT_S0_v0 <- DAT_S0[!dup, ] # Sampled subjects DAT_S1 <- DAT[Sampled==1,] DAT_S1_Xe1 <- DAT_Xe1[Sampled==1, ] DAT_S1_Xe0 <- DAT_Xe0[Sampled==1, ] dup <- duplicated(DAT_S1$id) DAT_S1_v0 <- DAT_S1[!dup,] #print(summary(DAT_S0_Xe1)) #print(summary(DAT_S1_Xe1)) # Step 1: extract estimates from FIT and draw m^th theta from MVN theta <- c(FIT$beta, FIT$alpha) if(ROB) { cov_theta <- FIT$rob.cov } else { cov_theta <- FIT$mod.cov} theta.m <- mvrnorm(n = 1, mu=theta, Sigma=cov_theta) # Step 2: Using the estimated theta=(beta,alpha), calculate likelihood contributions for sampled subjects # for both Xe=1 and Xe=0. #print(names(attr(FIT,'args'))) prep_input <- attr(FIT,'args') tmp_S1_Xe1 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, t.formula=prep_input$t.formula, id=DAT_S1_Xe1[,prep_input$id], data=DAT_S1_Xe1, inits=theta.m, samp.probs=prep_input$samp.probs, samp.probi=prep_input$samp.probi, offset=prep_input$offset)$subjectData tmp_S1_Xe0 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, t.formula=prep_input$t.formula, id=DAT_S1_Xe0[,prep_input$id], data=DAT_S1_Xe0, inits=theta.m, samp.probs=prep_input$samp.probs, samp.probi=prep_input$samp.probi, offset=prep_input$offset)$subjectData # These functions calculate the conditional likelihood contributions by and the ascertainment correction # for sampled subjects. The difference of ascertainment corrections (i.e. log of the ratio # log(pr(S=1 | Xe=1, Xo)/pr(S=1 | Xe=1, Xo))= log(pr(S=1 | Xe=1, Xo))-log(pr(S=1 | Xe=0, Xo))) provides # the offset for the marginal exposure model LLSC_1 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S1_Xe1, Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, ParamLengths = FIT$LLSC_args$ParamLengths, CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) LLSC_0 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S1_Xe0, Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, ParamLengths = FIT$LLSC_args$ParamLengths, CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) # Note: The ascertainment corrections used for the marginal exposure model offset # are already on the log scale, so log(a/b) = loga-logb logAC_1 <- attr(LLSC_1, "ACSubj") logAC_0 <- attr(LLSC_0, "ACSubj") DAT_S1_v0$offset <- logAC_1-logAC_0 # Perform offsetted logistic regression [Note, only using first visit info i.e. no time-varying information. fit.exp <- glm(MARG.EXP.FORMULA, offset=offset, data=DAT_S1_v0,family=binomial()) # Extract alpha estimates and sample from the sampling distribution alpha <- coef(fit.exp) cov_alpha <- vcov(fit.exp) alpha.m <- mvrnorm(n = 1, mu=alpha, Sigma=cov_alpha) # Step 5: Apply the model and theta to calculate the ascertainment corrections for unsampled people, nr <- nrow(DAT_S0_Xe1) tmp_S0_Xe1 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, t.formula=prep_input$t.formula, id=DAT_S0_Xe1[,prep_input$id], data=DAT_S0_Xe1, inits=theta.m, samp.probs=prep_input$samp.probs[rep(1,nr),], samp.probi=prep_input$samp.probi[rep(1,nr)], offset=prep_input$offset[rep(1,nr)])$subjectData tmp_S0_Xe0 <- prep_data(mean.formula=prep_input$mean.formula, lv.formula=prep_input$lv.formula, t.formula=prep_input$t.formula, id=DAT_S0_Xe0[,prep_input$id], data=DAT_S0_Xe0, inits=theta.m, samp.probs=prep_input$samp.probs[rep(1,nr),], samp.probi=prep_input$samp.probi[rep(1,nr)], offset=prep_input$offset[rep(1,nr)])$subjectData LLSC_1 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S0_Xe1, Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, ParamLengths = FIT$LLSC_args$ParamLengths, CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) LLSC_0 <- MMLB:::LogLScoreCalc(params = theta.m, subjectData = tmp_S0_Xe0, Q = FIT$LLSC_args$Q, W = FIT$LLSC_args$W, Z = FIT$LLSC_args$Z, ParamLengths = FIT$LLSC_args$ParamLengths, CondLike = FIT$control['cond.like']==1, EmpiricalCheeseCalc = ROB) ## Applied to the non-sampled subjects: log(pr(S=1 | Xe=1, Xo)/pr(S=1 | Xe=1, Xo)). logAC_1 <- attr(LLSC_1, "ACSubj") # Ascertainment corrections already log transformed logAC_0 <- attr(LLSC_0, "ACSubj") offset_log_S0 <- logAC_1-logAC_0 # Create a temporary dataframe for non-sampled subjects in order to create the # model.matrix that is used to calculate marginal exposure odds tmp <- DAT_S0_v0 tmp[,EXPVAR] <- sample(c(0,1),size=nrow(tmp),replace=TRUE) mm_tmp <- model.matrix(MARG.EXP.FORMULA, tmp) # Compute the conditional exposure odds in non-sampled subjects # Notice that all conditional odds are equal: [Xe | Y, Xo, S=1] = [Xe | Y, Xo, S=0] = [Xe | Y, Xo] since we sampled based on Y # So, we multiply the odds (conditional on being sampled), pr[Xe=1 | Xo, S=1]/pr[Xe=0 | Xo, S=1] # by the likelihood ratio (conditional on being sampled), pr[Y | Xe=1, Xo, S=1]/ pr[Y | Xe=1, Xo, S=1] # in the unsampled subjects in order to obtain conditional odds in unsampled: pr[Xe=1 | Y, Xo, S=1]/pr[Xe=0 | Y, Xo, S=1] marg.odds.Xe.Xo.S1 <- exp(mm_tmp %*% alpha.m)*exp(offset_log_S0) LR.Y.Xe.Xo.S1 <- exp( attr(LLSC_1, "LogLikeSubj")-attr(LLSC_0, "LogLikeSubj")) cond.odds.Xe.Y.Xo.S0 <- LR.Y.Xe.Xo.S1*marg.odds.Xe.Xo.S1 # Convert odds to probability, log(odds) = log(1/[1-p]) ---> p = odds/(1+odds) # Impute Xe for non-sampled subjects prXe1 <- Odds2Prob(cond.odds.Xe.Y.Xo.S0) XeS0 <- rbinom(nrow(tmp), size=1, prob=prXe1) # Add XeS0 info to DAT_S0_v0 DAT_S0_v0[,EXPVAR] <- XeS0 DAT_S0 <- DAT_S0[,-which(colnames(DAT_S0)==EXPVAR)] DAT_S0 <- merge(DAT_S0, DAT_S0_v0[,c('id',EXPVAR)],by='id') DAT_imp <- rbind(DAT_S1, DAT_S0[,colnames(DAT_S1)]) # Store imputation data imp_data[[ix]] <- DAT_imp #if(verbose) cat('\r',ix) } # end of for ix loop imp_data } FitImputedData <- function(FIT, ImputedData, verbose=FALSE){ M <- length(ImputedData) prep_input <- attr(FIT,'args') imp.mods <- ests <- vcovs <- list() for (i in 1:M){ if (verbose) cat("\r", "Beginning imputed data analysis number:", i); #print(date()) imp.mods[[i]] <- mm(prep_input$mean.formula, dat=ImputedData[[i]], id=id, lv.formula=prep_input$lv.formula, t.formula=prep_input$lv.formula, iter.lim=1000, verbose=FALSE, q=50) ests[[i]] <- unlist(coef(imp.mods[[i]])) vcovs[[i]] <- imp.mods[[i]]$mod.cov } out <- MIcombine(ests, vcovs) out } mi.analysis <- function(FullCohortData, Sampled, mean.formula,lv.formula, t.formula, id, cond.like, samp.probs, iter.lim=100, q=50, M=20, marg.exp.formula, EXPVAR, verbose=FALSE){ ## FullCohortData Full cohort data. This dataset should not have missing values for variables other than EXPVAR ## mean.formula Of the form: Y ~ EXPVAR+X1+X2... ## lv.formula Latent variable formula. For a random intercept, use ~1. If no random intercept is needed, set to NULL ## t.formula Transition formula. For a standard first order transition model, use ~1. ## NOTE: lv.formula and t.formula should not both be set to 0. ## Sampled Sampling indicator variable name: 1 if sampled and 0 if not ## id id variable. ## cond.like If using an ODS design, set this to TRUE. If imputing from a random sampling design, set to FALSE ## samp.probs Sampling probabilities for the three strata (\pi_0, \pi_1, \pi_2) ## iter.lim=100 Limits the number of newton raphson iterations ## q=50, Number of Gauss-Hermite quadrature points for integrating over random effects. If no random intercept is use, set this to 3. ## M=5 Number of imputations ## marg.exp.formula Marginal exposure model of the form: EXPVAR ~ Xo ## EXPVAR Expensive exposure variable that should be set to missing for those not included in the subsample. ## When Sampled=1, this should be non-missing and when Sampled=0 it should be missing ## verbose=FALSE Prints fitting progress id = FullCohortData$id = FullCohortData[, as.character(substitute(id))] #EXPVAR = FullCohortData$EXPVAR = FullCohortData[, as.character(substitute(EXPVAR))] Sampled = FullCohortData$Sampled = FullCohortData[, as.character(substitute(Sampled))] ## Keep those who were sampled to create the ODS dataset #odsdat <- FullCohortData[FullCohortData[,Sampled]==1,] odsdat <- FullCohortData[Sampled==1,] if (verbose) cat("Fit ACML...", "\n") FIT <- mm(mean.formula, dat=odsdat, id=id, lv.formula=lv.formula, t.formula=t.formula, cond.like=cond.like, samp.probs=samp.probs, iter.lim=iter.lim, verbose=FALSE, q=q, return_args=TRUE) imps <- ImputeData(FIT=FIT, DAT=FullCohortData, M=M, ROB=FALSE, MARG.EXP.FORMULA=marg.exp.formula, verbose=verbose, EXPVAR=EXPVAR, Sampled=Sampled) res <- FitImputedData(FIT=FIT, ImputedData=imps, verbose=verbose) res }