#####  Bootstrap of the NA-ACCORD and ART-CC approaches


rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

library(survival)


##################  FUNCTIONS NECESSARY FOR DOING THE COLE APPROACH  #################################

##########  Simulating generalized gamma data

qggamma <- function(beta, sigma, lambda, p) {
  
  s<-lambda^(-2)
  r<-s*exp((-beta*lambda)/sigma)
  b<-lambda/sigma
  quan<-(lambda>0)*((qgamma(p, shape=s, rate=r))^(1/b))
  quan<- quan+(lambda<0)*((qgamma(1-p,shape=s, rate=r))^(1/b))
  quan
  
}


###### Parameters received from Sterne, May, and Cole

par.ests.d<-  matrix(c(-0.01976,1.4140,0.8073,-0.3376,0.9424,0.9799,0.004661,450,
                       0.1292,1.4136,0.8473,0.1036,1.2702,0.8694,0.006369,425,
                       0.1378,1.4581,0.7093,-0.1678,1.2230,0.7157,0.006523,400,
                       0.1985,1.4208,0.7989,-0.1855,1.2914,0.7565,0.006381,375,
                       0.2142,1.3600,0.8139,0.3155,1.4237,0.6343,0.007307,350,
                       0.2717,1.2890,0.9180,0.005815,0.9229,0.6943,0.007987,325,
                       0.2535,1.2752,0.8687,-0.2549,1.2515,-0.09287,0.01010,300,
                       0.3532,1.2246,1.0406,-0.05621,1.2546,0.4186,0.01115,275,
                       0.3387,1.2135,0.9881,-0.1256,1.6811,-0.08963,0.01367,250,
                       0.3131,1.2104,0.9577,-0.2491,1.7802,-0.08660,0.01483,225,
                       0.3287,1.1857,0.9952,0.1319,1.4342,0.7271,0.01549,200,
                       0.2843,1.1348,0.9281,0.4699,0.7067,1.9816,0.01810,175,
                       0.3027,1.0971,0.9425,0.4677,0.6664,1.8550,0.02228,150,
                       0.2842,1.0680,0.9363,0.4622,0.7501,1.6866,0.02700,125,
                       0.2362,1.0575,0.8927,0.2233,1.1631,0.7395,0.03686,100),ncol=8,byrow=TRUE)


######                 mu1,   sig1,   del1,  mu2,  sig2,  del2,   pi,   threshold    
par.ests.e<-matrix(c(-0.03244,1.4193,0.7969,0.4088,1.0706,1.2097,0.01610,450,
                     0.1132,1.4260,0.8284,0.5776,0.9573,1.4753,0.02273,425,
                     0.1211,1.4686,0.6908,0.5509,1.0282,1.3532,0.02691,400,
                     0.1938,1.4293,0.7916,0.1801,1.1994,0.8842,0.02830,375,
                     0.2147,1.3643,0.8099,0.2086,2.3632,0.8688,0.03197,350,
                     0.2830,1.2848,0.9275,0.002276,1.1950,0.8346,0.03249,325,
                     0.2590,1.2710,0.8762,0.07611,1.3065,0.6431,0.03828,300,
                     0.3509,1.2137,1.0532,0.3388,1.4096,0.7650,0.05289,275,
                     0.3211,1.2222,0.9638,0.5177,1.2067,1.1048,0.06128,250,
                     0.2951,1.2273,0.9177,0.4338,1.1252,1.2341,0.06974,225,
                     0.3231,1.1839,0.9771,0.3187,1.2826,1.0666,0.08119,200,
                     0.2906,1.1298,0.9100,0.2165,1.1235,1.1676,0.09472,175,
                     0.3235,1.0885,0.9236,0.1419,1.0550,1.2139,0.1079,150,
                     0.2989,1.0653,0.8941,0.1774,1.0114,1.2725,0.1309,125,
                     0.2417,1.0461,0.8189,0.1073,1.1676,1.0102,0.1759,100),ncol=8,byrow=TRUE)


#####  This function performs the analysis of Cole et al.

hr.compute.lt<-function(lower,upper,threshold.cd4,n.impute,cd4,t.event,event,par.ests,truncate=NA,discrete=FALSE,cd4.3=FALSE,age.haart=NA,age.cd4=NA) {
  
  select1<-ifelse(cd4>lower & cd4<=upper,1,0)
  select<-ifelse(is.na(select1),0,select1)
  if (cd4.3){
    select<-ifelse(is.na(age.cd4),select,
                   ifelse(age.haart-age.cd4>0.25,0,select))
  }
  
  t<-t.event[select==1]
  event.orig<-event[select==1]
  defer.orig<-ifelse(cd4[select==1]<=threshold.cd4,1,0)
  row<-which(par.ests[,8]==threshold.cd4)
  
  pi.est<-par.ests[row,7]                       
  n<-sum(defer.orig)
  
  beta<-se<-NULL
  for (i in 1:n.impute) {
    
    events<-event.orig
    defer<-defer.orig
    t.f<-ifelse(defer==0,0,qggamma(par.ests[row,1],par.ests[row,2],par.ests[row,3],runif(n,0,1)))
    t.star<-t+t.f
    
#    u<-rbinom(1,n,pi.est/(1-pi.est))
    u<-rnbinom(1,n,1-pi.est)
    if (u>0) {
      t.g<-qggamma(par.ests[row,4],par.ests[row,5],par.ests[row,6],runif(u,0,1))
      t.star<-c(t.star,t.g)
      defer<-c(defer,rep(1,u))
      events<-c(events,rep(1,u))
    }
    
    if (!is.na(truncate)) {
      t.star<-ifelse(t.star>truncate,truncate,t.star)
      events<-ifelse(t.star>truncate,0,events)
    }
    
    S<-Surv(t.star,events)
    mod<-coxph(S~defer)
    beta[i]<-summary(mod)$coeff[1]
    se[i]<-summary(mod)$coeff[3]
    
    if (discrete) {
      t.star0<-t.star[t.star>0]
      defer0<-defer[t.star>0]
      events0<-events[t.star>0]
      t.star.months<-ceiling(t.star0*12)
      t.months<-1:t.star.months[1]
      defer.m<-rep(defer0[1],t.star.months[1])
      events.m<-rep(0,t.star.months[1])
      events.m[t.star.months[1]]<-events0[1]
      for (j in 2:length(t.star0)) {
        t.months<-c(1:t.star.months[j],t.months)
        defer.m<-c(rep(defer0[j],t.star.months[j]),defer.m)
        stuff<-rep(0,t.star.months[j])
        stuff[t.star.months[j]]<-events0[j]
        events.m<-c(stuff,events.m)
      }
      mod<-glm(events.m~rcs(t.months,4) + defer.m, family="binomial")  
      beta[i]<-summary(mod)$coeff["defer.m",1]
      se[i]<-summary(mod)$coeff["defer.m",2]
    }
    
    
  }
  
  hr<-exp(mean(beta))
  var<-mean(se^2)+var(beta)*(1+1/n.impute)
  ci<-exp(mean(beta)+1.96*sqrt(var)*c(-1,1))
  label<-paste(paste(lower,threshold.cd4-1,sep="-"),paste(threshold.cd4,upper-1,sep="-"),sep=" vs. ")
  return(c(label,hr,ci))
}


#########  FUNCTIONS NECESSARY FOR DOING THE HERNAN APPROACH    ######################

#------------------------#
#       Functions        #
#------------------------#
#---------------------------------------------------------------------#
#                           add.fun                                   #
#---------------------------------------------------------------------#
# In creating monthly (time-dependent) data frame, records are inserted
# that do not exist in the original data.  This creates missing values for
# CD4, viral loads, CD4 %, etc.  Add.fun fills in these values with the 
# last non-missing observation and records the age at which that non-missing
# value occurred.
add.fun <- function(labage.1, labage.2){
  var <- names(labage.1)[3]
  labage.1$order <- 1
  labage.2$order <- 0
  pullData <- rbind(labage.1,labage.2)
  pullData$age <- round(pullData$age, 4)
  
  pullData <- pullData[order(pullData$pid, pullData$age, pullData$order),]
  
  hasVal <- diff(c(0L,as.integer(as.factor(pullData$pid)))) != 0 |
    !is.na(pullData[,var]) |
    (pullData$order == 0)
  
  index <- rep(which(hasVal), times=table(cumsum(hasVal)))
  
  pullData[,c(var, paste("age", var, sep='.'))] <- pullData[index,
                                                            c(var, "age")]
  
  return(pullData)
}

     
#######  Analysis function that does pretty much everything

hernan.analysis<-function(d2, u.limit, l.limit, filename, start.within=180/365.25,  
                          LOST=FALSE, COVARIATES=FALSE, SITE.YEAR=FALSE, EXCLUDE.IDU=FALSE, 
                          DEATH.ONLY=FALSE, CENSOR.TIME=FALSE, EXCLUDE.DEATHS=FALSE,
                          ADD2=FALSE, EXCLUDE.NON.HAART.ART=TRUE, TRUNC.LEVEL=0.99) {
  
  if (EXCLUDE.NON.HAART.ART) {  ###  Currently, this is the only option.  At some point, I should censor but I'm not sure VU gave me this data.
    d2<-d2[d2$non.haart.art==0,]
    d2$non.haart.art<-NULL
  }  
  
  if (ADD2==TRUE){
    d2$last.age<-with(d2, ifelse(deceased==0, last.age+60/365.25, last.age))
  }
  
  # Identifying age at which CD4 is less than ulimit by PID
  less.ulimit <- lapply(split(subset(d2, select=c(pid,age,cd4)), d2$pid),
                        FUN=function(y){
                          y[min(which(y[,"cd4"] < u.limit),na.rm=TRUE),"age"]})
  less.ulimit.df <- data.frame("pid"=names(less.ulimit),
                               "age.less.ulimit"=unlist(less.ulimit))
  less.ulimit.df$pid <- as.character(less.ulimit.df$pid)
  d2[,"age.less.ulimit"] <- less.ulimit.df[match(d2$pid,less.ulimit.df$pid,nomatch=NA),
                                           "age.less.ulimit"]
  d2 <- d2[order(d2$pid, d2$age),]
  
  
  # If a PID has a missing age.less.ulimit, then they did not have a CD4 below the upper
  # limit and should be deleted from the analysis
  d2 <- subset(d2, !is.na(age.less.ulimit))
  
  # Time.in.care is the time between the first recorded clinic visit and the age a PIDs CD4 fell
  # below the upper limit
  d2$time.in.care <- d2$age.less.ulimit - d2$first.age
  
  # If someone died, their true last age should be the age of death
  # Otherwise it is their last recorded age.
  d2$true.last.age<-with(d2,ifelse(is.na(age_at_death),last.age,age_at_death))
  
  # Total follow-up time is the time from when a PIDs CD4 fell below the upper limit to their true.last.age
  d2$tot.fu <- d2$true.last.age - d2$age.less.ulimit
  
  d2<-d2[d2$tot.fu>=0,]   #Doesn't appear to be needed  (2014-03-07)
  
  # In the original data, each record at which an OI occurred had a non-missing age.oi.  All other records
  # had a missing age.oi.  For analysis purposes, those OIs that occurred between a PIDs first age and
  # age.less.ulimit were recorded as prior.oi and the age recorded.  For those OIs that occurred after
  # age.less.ulimit, only the first occurrence of an OI will be included in the analysis and the variable
  # age.oi will repeat the age at which the first OI after age.less.ulimit occurred.  The following
  # code addresses both of these issues.
  # Defining prior.oi (OI between first.age and age.less.ulimit)
  age.oi <- subset(d2, !is.na(age.oi), select=c(pid, age.oi, age.less.ulimit))
  age.oi$prior.oi <- ifelse(round(age.oi$age.oi,4) <=
                              round(age.oi$age.less.ulimit,4),1,0)
  
  #####  Added by Bryan to try to simplify
  prior.oi.df<-data.frame(pid=unique(age.oi$pid[age.oi$prior.oi==1]),prior.oi=1)
  d2<-merge(d2,prior.oi.df,by="pid",all.x=TRUE)
  d2 <- d2[order(d2$pid,d2$age),]
  d2$prior.oi <- ifelse(is.na(d2$prior.oi),0,d2$prior.oi)
  
  
  # Grabbing first incidence of OI after age.less.ulimit if more than one oi
  age.oi.af <- lapply(split(age.oi, age.oi$pid), FUN=function(y){
    ifelse(all(y[,"prior.oi"]==1),NA,min(y[y[,"prior.oi"]==0,"age.oi"]))})
  age.oi.af.df <- data.frame("pid"=unlist(names(age.oi.af)),
                             "age.oi.af"=unlist(age.oi.af))
  age.oi.af.df$pid <- as.character(age.oi.af.df$pid)
  age.oi.af.df<-age.oi.af.df[!is.na(age.oi.af.df$age.oi.af),]
  
  
  d2 <- merge(d2, age.oi.af.df, by="pid", all.x=TRUE)
  d2 <- d2[order(d2$pid,d2$age),]
  
  d2$age.oi<-d2$age.oi.af
  d2$age.oi.af<-NULL
  
  # Subsetting out PIDs with prior.oi
  d2 <- subset(d2, prior.oi != 1)
  
  
  unique.id<-unique(d2$pid)
  
  
  gt.llimit.df <- subset(d2, round(age,4) == round(age.less.ulimit,4),
                         select=c(pid,age,age.less.ulimit,cd4))
  gt.llimit.df$gt.llimit <- ifelse(gt.llimit.df$cd4 >= l.limit,1,0)
  d2[,"gt.llimit"] <- gt.llimit.df[match(d2$pid,
                                         gt.llimit.df$pid,nomatch=NA),
                                   "gt.llimit"]
  sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit |              
                          is.na(age.fhaart)) &
                     gt.llimit == 1)
  sub.d2 <- sub.d2[order(sub.d2$pid,sub.d2$age),]
  sub.d2$gt.llimit<-NULL
  
  sub.unique.id <- unique(sub.d2$pid)
  
  d2 <- sub.d2
  
  
  #######   I'm being bold, and changing the structure of the code here. I'm going to expand to months first and then I'll define 
  #######  treatment groups.
  
  ####  I'm going to keep all of the repeating covariates to avoid changing them later, but I'll grab baseline values
  
  tmp<-d2[!duplicated(d2$pid),]
  tmp$age.rx.start<-tmp$haart_before_first_visit<-NULL
  tmp$age<-tmp$oi<-tmp$last.age<-tmp$prior.oi<-tmp$haart<-tmp$fhaart<-tmp$age.approx<-NULL
  
  max.follow <- 12*max(tmp$tot.fu, na.rm=TRUE)
  months <- seq(0, ceiling(max.follow),by=1)
  
  tmp$month <- 0
  
  tmp2<-data.frame(pid=rep(tmp$pid,each=length(months[-1])),
                   site=rep(tmp$site,each=length(months[-1])),
                   black=rep(tmp$black,each=length(months[-1])),
                   white=rep(tmp$white,each=length(months[-1])),
                   deceased=rep(tmp$deceased,each=length(months[-1])),
                   age_at_death=rep(tmp$age_at_death,each=length(months[-1])),
                   idu=rep(tmp$idu,each=length(months[-1])),
                   age.oi=rep(tmp$age.oi,each=length(months[-1])),
                   cd4=NA,
                   male=rep(tmp$male,each=length(months[-1])),
                   first.age=rep(tmp$first.age,each=length(months[-1])),
                   first.year=rep(tmp$first.year,each=length(months[-1])),
                   ltfu=rep(tmp$ltfu,each=length(months[-1])),
                   age.fhaart=rep(tmp$age.fhaart,each=length(months[-1])),
                   age.less.ulimit=rep(tmp$age.less.ulimit,each=length(months[-1])),
                   time.in.care=rep(tmp$time.in.care,each=length(months[-1])),
                   true.last.age=rep(tmp$true.last.age,each=length(months[-1])),
                   tot.fu=rep(tmp$tot.fu,each=length(months[-1])),
                   month=rep(months[-1],times=length(tmp$pid)))
  
  surv.df<-rbind(tmp,tmp2)
  surv.df <- surv.df[order(surv.df$pid, surv.df$month),]
  surv.df$age <- round(surv.df$age.less.ulimit + surv.df$month/12,4)
  
  
  ######  Defining month of first HAART using the larger dataframe
  
  locate.fhaart <- lapply(split(subset(surv.df, select=c(pid,age,age.fhaart)),surv.df$pid),
                          FUN=function(y){
                            y[min(which(y[,"age"] >= y[,"age.fhaart"])),"age"]
                          })
  locate.fhaart.df <- data.frame("pid"=names(locate.fhaart)[!is.na(locate.fhaart)],
                                 "new.age.fhaart"=unlist(locate.fhaart[!is.na(locate.fhaart)]))
  surv.df[,"new.age.fhaart"] <- locate.fhaart.df[match(surv.df$pid,locate.fhaart.df$pid,nomatch=NA),
                                                 "new.age.fhaart"]
  surv.df$fhaart <- ifelse(round(surv.df$age,4) == round(surv.df$new.age.fhaart,4) & !is.na(surv.df$new.age.fhaart),1,
                           ifelse(is.na(surv.df$new.age.fhaart) | 
                                    round(surv.df$age,4) != round(surv.df$new.age.fhaart,4),0,NA))
  
  
  locate.oi <- lapply(split(subset(surv.df, select=c(pid,age,age.oi)),surv.df$pid),
                      FUN=function(y){
                        y[min(which(y[,"age"] >= y[,"age.oi"])),"age"]
                      })
  locate.oi.df <- data.frame("pid"=names(locate.oi)[!is.na(locate.oi)],
                             "new.age.oi"=unlist(locate.oi[!is.na(locate.oi)]))
  surv.df[,"new.age.oi"] <- locate.oi.df[match(surv.df$pid,locate.oi.df$pid,nomatch=NA),
                                         "new.age.oi"]
  surv.df$oi <- ifelse(round(surv.df$age,4) == round(surv.df$new.age.oi,4) & !is.na(surv.df$new.age.oi),1,
                       ifelse(is.na(surv.df$new.age.oi) | 
                                round(surv.df$age,4) != round(surv.df$new.age.oi,4),0,NA))
  
  
  ######  Getting rid of time after follow-up.  I did this here, because some of the above code needed it.
  
  # # #surv.df <- subset(surv.df, round(age,4) <=round(true.last.age,4))   ###  Commented out on 2014-03-07
  # # 
  # # 
  # # surv.df$death <- ifelse(is.na(surv.df$age_at_death),0,
  # #                         ifelse((round(surv.df$age,4) == 
  # #                                   round(surv.df$true.last.age,4)) | 
  # #                                  (surv.df$age + 1/12 > surv.df$true.last.age),1,0))
  # # 
  # # 
  # # surv.df$oi <- ifelse(is.na(surv.df$age.oi),0,
  # #                      ifelse((surv.df$age + 1/12) > surv.df$true.last.age & 
  # #                               (surv.df$age.oi + 1/12) > surv.df$true.last.age,1,
  #                             surv.df$oi))
  
  surv.df <- subset(surv.df, round(age,4) < round(true.last.age,4) + 1/12)   ###  Changed on 2014-03-07
  
  
  surv.df$death <- ifelse(is.na(surv.df$age_at_death),0,
                          ifelse((round(surv.df$age,4) == 
                                    round(surv.df$true.last.age,4)) | 
                                   (surv.df$age > surv.df$true.last.age),1,0))
  
  
  surv.df$oi <- ifelse(is.na(surv.df$age.oi),0,
                       ifelse((surv.df$age ) > surv.df$true.last.age & 
                                (surv.df$age.oi) > surv.df$true.last.age,1,
                              surv.df$oi))
  
  surv.df$month<-ifelse(surv.df$month==0&surv.df$death==1,1,surv.df$month)
  
  if (DEATH.ONLY) {
    surv.df$event <- ifelse(surv.df$death == 0, 0,1)
  }
  if (!DEATH.ONLY) {
    surv.df$event <- ifelse(surv.df$death == 0 & surv.df$oi == 0,0,1)
  }
  
  ######  Adding CD4 counts
  
  ref.cd4 <- data.frame("pid"=d2$pid[!duplicated(d2$pid)],
                        # "ref.cd4"=d2$cd4[round(d2$age,4)==round(d2$age.less.ulimit,4)],   ####  Bryan changed because I'll use CD4 at first visit as covariate  (This was probably a poor choice, I fix back later - 2014-03-10)
                        "ref.cd4"=d2$cd4[!duplicated(d2$pid)],    
                        "age.ref.cd4"=d2$age.less.ulimit[!duplicated(d2$pid)],
                        "age.fhaart"=d2$age.fhaart[!duplicated(d2$pid)])
  
  surv.df <- merge(surv.df, ref.cd4, all=TRUE)
  surv.df <- surv.df[order(surv.df$pid,surv.df$age),]
  
  
  labage.1 <- surv.df[,c("pid","age","cd4")]
  labage.1$cd4<-NA
  
  labage.2 <- subset(d2, pid %in% surv.df$pid,
                     select=c(pid,age,cd4))
  names(labage.2) <- c("pid","age","cd4")
  labage.2 <- subset(labage.2, !is.na(cd4))
  
  pullData <- add.fun(labage.1, labage.2)
  
  surv.df$cd4 <- pullData[pullData$order==1,"cd4"]
  surv.df$age <- round(surv.df$age, 4)
  surv.df$age.fhaart <- round(surv.df$age.fhaart,4)
  surv.df$age.cd4 <- pullData[pullData$order==1, "age.cd4"]
  surv.df$time.to.cd4 <- 12*(surv.df$age - surv.df$age.cd4)
  
  junk<-subset(surv.df[!duplicated(surv.df$pid),],select=c(pid,cd4))     ######  The wrong ref.cd4 was saved above.  I'm fixing it here to be the CD4 when first going below upper limit.  2014-03-10
  surv.df<-merge(surv.df,junk,all.x=TRUE)
  surv.df<-surv.df[order(surv.df$pid,surv.df$age),]
  
  ## Need to ensure that for cd4  that age.cd4
  ## is less than age.fhaart when fhaart==1
  # Adjusting those who started HAART and had a CD4 count in that month after
  # the age.fhaart
  
  which.cd4 <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.cd4,4) &
                       surv.df$fhaart == 1 & 
                       round(surv.df$age.cd4,4) > round(surv.df$age.less.ulimit,4))
  surv.df$cd4[which.cd4] <- surv.df$cd4[(which.cd4-1)]
  surv.df$age.cd4[which.cd4] <- surv.df$age.cd4[(which.cd4-1)]
  surv.df$time.to.cd4[which.cd4] <- 12*(surv.df$age[which.cd4]-surv.df$age.cd4[which.cd4])  ###  Line added 12/12/14 (should have minor impact)
  
  #### do the same thing for oi: adjusting those who had oi and CD4 count in the same month
  which.cd4.oi <- which(round(surv.df$age.oi,4) < round(surv.df$age.cd4,4) &
                          surv.df$oi== 1 & 
                          round(surv.df$age.oi,4) > round(surv.df$age.less.ulimit,4))
  surv.df$cd4[which.cd4.oi] <- surv.df$cd4[(which.cd4.oi-1)]
  surv.df$age.cd4[which.cd4.oi] <- surv.df$age.cd4[(which.cd4.oi-1)]
  surv.df$time.to.cd4[which.cd4.oi] <- 12*(surv.df$age[which.cd4.oi]-surv.df$age.cd4[which.cd4.oi])  ###  Line added 12/12/14 (should have minor impact)
  
  
  if (!DEATH.ONLY){
    surv.df<-surv.df[is.na(surv.df$new.age.oi) | surv.df$age<=surv.df$new.age.oi,]
  }
  
  surv.df$haart<-with(surv.df, ifelse(is.na(new.age.fhaart),0,
                                      ifelse(age<new.age.fhaart,0,1)))
  
  surv.df$time.f.baseline<-with(surv.df,as.numeric(age-age.less.ulimit))
  
  #####  Locating if and when a patient falls below the lower CD4 threshold
  unique.id<-unique(surv.df$pid)
  locate.cd4.threshold<-rep(NA,length(unique.id))
  for (i in 1:length(unique.id)) {
    if (sum(surv.df$cd4[surv.df$pid==unique.id[i]]<l.limit,na.rm=TRUE)>=1) {
      #     locate.cd4.threshold[i]<-which(surv.df$age.cd4==min(surv.df$age.cd4[surv.df$cd4<l.limit & surv.df$pid==unique.id[i]]) & 
      #                                      surv.df$cd4<l.limit & surv.df$pid==unique.id[i])[1]
      locate.cd4.threshold[i]<-which(surv.df$age==min(surv.df$age[surv.df$cd4<l.limit & surv.df$pid==unique.id[i]]) & 
                                       surv.df$cd4<l.limit & surv.df$pid==unique.id[i])[1]
      
    }
  }
  age.cd4.llimit<-surv.df$age[locate.cd4.threshold]  
  d3<-data.frame(pid=unique.id,age.cd4.llimit)
  d4<-merge(surv.df,d3,all=TRUE)
  d4<-d4[order(d4$pid, d4$month),]
  surv.df<-d4
  rm(d3,d4)
  
  surv.df$immediate1<-with(surv.df,ifelse((haart==0 & time.f.baseline<=start.within)|
                                            (!is.na(age.fhaart) & haart==1 & age.fhaart-age.less.ulimit<=start.within),1,0))
  surv.df$deferred1<-with(surv.df,ifelse(haart==0 | 
                                           (haart==1 & age.fhaart-age.less.ulimit>start.within) |
                                           (haart==1 & age.fhaart-age.less.ulimit<start.within & !is.na(age.cd4.llimit) & 
                                              age.cd4.llimit < age.fhaart), 1,0))
  
  
  #######  Artificially censoring
  surv.df$cens.no.start<-with(surv.df, ifelse(!is.na(age.cd4.llimit) & 
                                                age>age.cd4.llimit+start.within & 
                                                (age.fhaart>age.cd4.llimit+start.within | is.na(age.fhaart)) & 
                                                deferred1==1,1,0))
  
  surv.df$cens.start.early<-with(surv.df, ifelse(haart==1 & (age.fhaart<age.cd4.llimit | is.na(age.cd4.llimit)) & deferred1==1,1,0))
  
  
  surv.df$cens<-with(surv.df, 1-(1-cens.no.start)*(1-cens.start.early))
  
  if (LOST) {
    unique.id<-unique(surv.df$pid)
    locate.last<-NULL
    for (i in 1:length(unique.id)) {
      if (surv.df$ltfu[surv.df$pid==unique.id[i]][1]==1) {
        locate.last[i]<-which(surv.df$month==max(surv.df$month[surv.df$pid==unique.id[i]])&surv.df$pid==unique.id[i])
      } 
    }
    surv.df$lost<-0
    surv.df$lost[locate.last]<-1
  }
  
  
  if (EXCLUDE.IDU) {
    surv.df<-surv.df[is.na(surv.df$idu) | surv.df$idu==0,]
  }
  
  if (EXCLUDE.DEATHS){
    exclude<-with(surv.df, ifelse(is.na(age.fhaart),0,
                                  ifelse(is.na(age_at_death),0,
                                         ifelse(age_at_death-age.fhaart<=14/365.25,1,0))))
    surv.df<-surv.df[exclude==0,]
  }
  
  if (CENSOR.TIME){
    surv.df<-surv.df[surv.df$month<=72,]
  }
  
  surv.df$year.less.ulimit<-with(surv.df, round(first.year+age.less.ulimit-first.age))
  
  
  #### Qi is trying to fix for time 0 (2014-03-31)
  ### if patient start haart at time 0, assign cd4 at time=0 to cd4 at time=1 
  #tmp<-surv.df[!duplicated(surv.df$pid),]
  locate <- with(surv.df, which(month==0&haart==1))
  surv.df[locate+1, "cd4"] <- surv.df[locate, "cd4"]
  surv.df[locate+1, "fhaart"] <- surv.df[locate, "fhaart"]
  surv.df[locate+1, "haart"] <- surv.df[locate, "haart"]
  ### no event (death or oi) or lost of follow-up at time 0, remove row of time=0 might be the easiest fix
  surv.df <- subset(surv.df, month!=0)
  
  
  #####  Now I'm computing the weights
  
  library(rms)
  
  if (LOST) {
    if (!COVARIATES & !SITE.YEAR){
      mod.lost<-glm(lost~rcs(month,4)+cd4+haart,family="binomial",data=surv.df)                        ### Simple model
    }
    if (COVARIATES & !SITE.YEAR){
      mod.lost<-glm(lost~rcs(month,4)+cd4+haart+age.less.ulimit+male, family="binomial",data=surv.df)  ### Model with covariates
    }
    if (!COVARIATES & SITE.YEAR){
      mod.lost<-glm(lost~rcs(month,4)+cd4+haart+site+year.less.ulimit, family="binomial",data=surv.df)  ### Model with site and year
    }
    if (COVARIATES & SITE.YEAR){
      mod.lost<-glm(lost~rcs(month,4)+cd4+haart+age.less.ulimit+male+site+year.less.ulimit, family="binomial",data=surv.df)  ### Model with covariates, site, and year
    }
    summary(mod.lost)
    
    surv.df$pred.lost<-predict(mod.lost,type="response")
  }
  
  ####  Probability of starting HAART among those who have not yet started
  
  remove<-with(surv.df, ifelse(fhaart==0 & haart==1,1,0))
  trt.df<-surv.df[remove==0,]
  removed<-surv.df[remove==1,]
  
  
  ############## This is where I estimate the probability of starting treatment given they haven't yet started it.
  
  #mod.haart<-glm(fhaart~rcs(month,4)+cd4,family="binomial",data=trt.df,weight=w.missing)      ### weighting for those missing.
  
  
  if (!COVARIATES & !SITE.YEAR){
    mod.haart<-glm(haart~rcs(month,4)+cd4,family="binomial",data=trt.df)                        ### Simple model
  }
  if (COVARIATES & !SITE.YEAR){
    mod.haart<-glm(haart~rcs(month,4)+cd4+age.less.ulimit+male, family="binomial",data=trt.df)  ### Model with covariates
  }
  if (!COVARIATES & SITE.YEAR){
    mod.haart<-glm(haart~rcs(month,4)+cd4+site+year.less.ulimit, family="binomial",data=trt.df)  ### Model with site and year
  }
  if (COVARIATES & SITE.YEAR){
    mod.haart<-glm(haart~rcs(month,4)+cd4+age.less.ulimit+male+site+year.less.ulimit, family="binomial",data=trt.df)  ### Model with covariates, site, and year
  }
  
  summary(mod.haart)
  
  
  pred.trt<-predict(mod.haart,type="response")
  trt.df$p.obs.trt<-ifelse(trt.df$fhaart==0,1-pred.trt,pred.trt)
  
  ##########################################################################################################
  ###########  New stuff based on the paper I reviewed and using uniform censoring probability.
  ###########  I need to go through carefully still and look at the <= vs < and the order of variables to make sure it's sensible.
  ###########  Qi and I did this in the spring of 2014.
  g<-round(start.within*12)
  # m<-c(0:(g-1))
  trt.df$num.imm1<-with(trt.df, #ifelse(time.f.baseline == 0, 1,          # I'm not sure about this last line, but I decided to add it.
                        ifelse(time.f.baseline < start.within & fhaart==0, 1-1/(g+1-month),
                               ifelse(time.f.baseline < start.within & fhaart==1, 1/(g+1-month),NA))) #)
  
  trt.df$num.def1<-with(trt.df, ifelse(is.na(age.cd4.llimit) | age < age.cd4.llimit, 1,    ### I considered changing from < to <= (and then everywhere else)
                                       ifelse(age>=age.cd4.llimit & age<age.cd4.llimit+start.within & fhaart==0, 
                                              1-1/(g+1-(round((age-age.cd4.llimit)*12))),
                                              ifelse(age>=age.cd4.llimit & age<age.cd4.llimit+start.within & fhaart==1,
                                                     1/(g+1-round((age-age.cd4.llimit)*12)),
                                                     NA))))       
  
  
  removed$p.obs.trt<-removed$num.imm1<-removed$num.def1<-NA
  
  d.new<-rbind(trt.df,removed)
  d.new<-d.new[order(d.new$pid, d.new$month),]
  
  d.new$p.obs.trt2<-with(d.new, ifelse(is.na(p.obs.trt),1,p.obs.trt))
  d.new$num.imm2<-with(d.new, ifelse(is.na(num.imm1),1,num.imm1))
  d.new$num.def2<-with(d.new, ifelse(is.na(num.def1),1,num.def1))  
  
  d.new$w.imm2<-with(d.new, num.imm2/p.obs.trt2)
  d.new$w.def2<-with(d.new, num.def2/p.obs.trt2)
  
  d.new$w.imm <- unlist(tapply(d.new$w.imm2, INDEX=d.new$pid, FUN=cumprod))
  d.new$w.def <- unlist(tapply(d.new$w.def2, INDEX=d.new$pid, FUN=cumprod))
  
  
  ########  Adding a little piece on LTFU here
  if (LOST){
    d.new$w.lost <- unlist(tapply(1/(1-d.new$pred.lost), INDEX=d.new$pid, FUN=cumprod))
  }
  
  surv.df.def<-d.new[d.new$deferred1==1,]
  surv.df.def$weight<-surv.df.def$w.def
  surv.df.def$deferred<-1
  surv.df.imm<-d.new[d.new$immediate1==1,]
  surv.df.imm$weight<-surv.df.imm$w.imm
  surv.df.imm$deferred<-0
  
  d.new.stacked<-rbind(surv.df.def,surv.df.imm)
  d.analysis<-d.new.stacked[d.new.stacked$cens==0,]
  
  if (LOST) {
    d.analysis$weight<-with(d.analysis, weight*w.lost)
  }
  
  
  sum(d.analysis$event)
  
  
  #####  Truncating the weights
  w.trunc<-quantile(d.analysis$weight,TRUNC.LEVEL)
  #w.trunc<-10
  d.analysis$weight.trunc<-ifelse(d.analysis$weight>w.trunc,w.trunc,d.analysis$weight)
  
  with(d.analysis, table(deferred, event))
  with(d.analysis, mean(event[deferred==0]))
  with(d.analysis, mean(event[deferred==1]))
  chisq.test(with(d.analysis, table(deferred, event)))
  
  
  if (!COVARIATES & !SITE.YEAR){  ### Simple model
    mod<-glm(event~rcs(month,4) + deferred , weights=weight.trunc, data=d.analysis, family=binomial) 
  }
  if (COVARIATES & !SITE.YEAR){   ### Model with covariates
    mod<-glm(event~rcs(month,4) + deferred + ref.cd4 + age.less.ulimit + male, weights=weight.trunc, family="binomial", data=d.analysis)  
  }
  if (!COVARIATES & SITE.YEAR){  ### Model stratifying by site and year
    mod<-glm(event~rcs(month,4) + site + year.less.ulimit + deferred, weights=weight.trunc, family="binomial",data=d.analysis)  
  }
  if (COVARIATES & SITE.YEAR){  ### Model with covariates and stratifying by site and year
    mod<-glm(event~rcs(month,4) + site + year.less.ulimit + deferred + ref.cd4 + age.less.ulimit + male,       ####  Model may be too rich
             weights=weight.trunc, family="binomial", data=d.analysis)  
  }
  
  #mod<-glm(event~rcs(month,4)+site+year.less.ulimit + deferred + ref.cd4 + age.less.ulimit + male, 
  #         weights=weight.trunc, family="binomial", data=d.analysis)  
  
  library(sandwich)
  
  se<-sqrt(sandwich(mod)["deferred","deferred"])
  
  hr<-exp(summary(mod)$coeff["deferred","Estimate"])
  lower<-exp(mod$coeff[names(mod$coeff)=="deferred"]-1.96*se)
  upper<-exp(mod$coeff[names(mod$coeff)=="deferred"]+1.96*se)
  
  ests<-c(hr,lower,upper)
  
#  save(ests,u.limit,l.limit,LOST,COVARIATES,SITE.YEAR,EXCLUDE.IDU,DEATH.ONLY,CENSOR.TIME,EXCLUDE.DEATHS,start.within,file=filename)
  ests

}


########################   Here I do the analyses

library(rms)

load("cnics-hernan-stuff5c.Rda")     ####  Dataset for Hernan analyses
load("cnics-cole-stuff5.Rda")        ####  Dataset for Cole analyses

#####  Listing all of the ids
all.ids<-unique(d2$pid)
all.ids<-all.ids[order(all.ids)]


Nboot<-200
for (ii in 195:199) {

set.seed(ii)
  
#####  Sampling ids.  First sample numbers and then find corresponding id.
boot.samp<-sample(1:length(all.ids), length(all.ids), replace=TRUE)
ids.boot<-all.ids[boot.samp]

#####  Creating new bootstrap dataframe d.boot for Cole approach. (Fast)
d.junk<-data.frame(id=all.ids)
d.plus<-merge(d,d.junk,by="id",all.y=TRUE)
d.plus<-d.plus[order(d.plus$id),]
d.boot1<-d.plus[boot.samp,]
d.boot<-d.boot1[!is.na(d.boot1$male),]     

#####  Creating new bootstrap dataframe d2.boot for Hernan approach.  (Slow - 2:12)

index.grouped <- split(seq_len(nrow(d2)), d2$pid)
index <- unlist(index.grouped[ids.boot])

d2.boot <- d2[index, ]
d2.boot$count <- 1
while(sum(duplicated(d2.boot))){
  d2.boot$count<-ifelse(duplicated(d2.boot),d2.boot$count+1,d2.boot$count)
}
d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
d2.boot$count<-NULL

# df.bs<-sapply(ids.boot,function(x)which(d2$pid==x))
# d2.boot<-d2[unlist(df.bs),]
# d2.boot$count<-ifelse(duplicated(d2.boot),2,1)
# d2.boot$count<-ifelse(duplicated(d2.boot),3,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),4,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),5,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),6,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),7,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),8,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),9,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),10,d2.boot$count)
# d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
# d2.boot$count<-NULL
# 
# d2.boot1<-d2.boot   # saving table for checking
# 
# library(data.table)
# d2<-data.table(d2)
# stuff<-date()
# 
# 
# d2.boot<-d2[d2$pid==ids.boot[1],]
# for (i in 2:length(ids.boot)){
#   d2.boot<-rbind(d2.boot,d2[d2$pid==ids.boot[i],])
# }
# d2.boot$count<-ifelse(duplicated(d2.boot),2,1)
# d2.boot$count<-ifelse(duplicated(d2.boot),3,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),4,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),5,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),6,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),7,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),8,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),9,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),10,d2.boot$count)
# d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
# #d2.boot$count<-NULL
# stuff
# date()


####  Basic Analyses
hernan.100b<-hernan.analysis(d2.boot, u.limit=201, l.limit=101, filename="basic-200vs100.Rda")  
hernan.200b<-hernan.analysis(d2.boot, u.limit=301, l.limit=201, filename="basic-300vs200.Rda")
hernan.300b<-hernan.analysis(d2.boot, u.limit=401, l.limit=301, filename="basic-400vs300.Rda")
hernan.400b<-hernan.analysis(d2.boot, u.limit=501, l.limit=401, filename="basic-500vs400.Rda")

####  Basic Analyses using death as outcome
hernan.d.100b<-hernan.analysis(d2.boot, u.limit=201, l.limit=101, DEATH.ONLY=TRUE, filename="death-200vs100.Rda")
hernan.d.200b<-hernan.analysis(d2.boot, u.limit=301, l.limit=201, DEATH.ONLY=TRUE, filename="death-300vs200.Rda")
hernan.d.300b<-hernan.analysis(d2.boot, u.limit=401, l.limit=301, DEATH.ONLY=TRUE, filename="death-400vs300.Rda")
hernan.d.400b<-hernan.analysis(d2.boot, u.limit=501, l.limit=401, DEATH.ONLY=TRUE, filename="death-500vs400.Rda")


######  Original had 20 imputations.  Because of computation, I am only going to do 5.
sterne.d.100b<-with(d.boot,hr.compute.lt(lower=  -1,upper=200,threshold.cd4=100,n.impute=5,cd4=cd4,t.event=t.death,event=death,par.ests=par.ests.d,discrete=TRUE)) 
sterne.d.200b<-with(d.boot,hr.compute.lt(lower=100,upper=300,threshold.cd4=200,n.impute=5,cd4=cd4,t.event=t.death,event=death,par.ests=par.ests.d,discrete=TRUE))
sterne.d.300b<-with(d.boot,hr.compute.lt(lower=200,upper=400,threshold.cd4=300,n.impute=5,cd4=cd4,t.event=t.death,event=death,par.ests=par.ests.d,discrete=TRUE))
sterne.d.400b<-with(d.boot,hr.compute.lt(lower=300,upper=500,threshold.cd4=400,n.impute=5,cd4=cd4,t.event=t.death,event=death,par.ests=par.ests.d,discrete=TRUE))


sterne.100b<-with(d.boot,hr.compute.lt(lower=  -1,upper=200,threshold.cd4=100,n.impute=5,cd4=cd4,t.event=t.event,event=event,par.ests=par.ests.d,discrete=TRUE)) 
sterne.200b<-with(d.boot,hr.compute.lt(lower=100,upper=300,threshold.cd4=200,n.impute=5,cd4=cd4,t.event=t.event,event=event,par.ests=par.ests.d,discrete=TRUE))
sterne.300b<-with(d.boot,hr.compute.lt(lower=200,upper=400,threshold.cd4=300,n.impute=5,cd4=cd4,t.event=t.event,event=event,par.ests=par.ests.d,discrete=TRUE))
sterne.400b<-with(d.boot,hr.compute.lt(lower=300,upper=500,threshold.cd4=400,n.impute=5,cd4=cd4,t.event=t.event,event=event,par.ests=par.ests.d,discrete=TRUE))

BootAns<-rbind(c(0,1,100,as.numeric(sterne.d.100b)[2:4]),
                c(0,1,200,as.numeric(sterne.d.200b)[2:4]),
                c(0,1,300,as.numeric(sterne.d.300b)[2:4]),
                c(0,1,400,as.numeric(sterne.d.400b)[2:4]),
                c(0,0,100,as.numeric(sterne.100b)[2:4]),
                c(0,0,200,as.numeric(sterne.200b)[2:4]),
                c(0,0,300,as.numeric(sterne.300b)[2:4]),
                c(0,0,400,as.numeric(sterne.400b)[2:4]),
                c(1,1,100,hernan.d.100b),
                c(1,1,200,hernan.d.200b),
                c(1,1,300,hernan.d.300b),
                c(1,1,400,hernan.d.400b),
                c(1,0,100,hernan.100b),
                c(1,0,200,hernan.200b),
                c(1,0,300,hernan.300b),
                c(1,0,400,hernan.400b))
colnames(BootAns)<-c("hernan","death","threshold","hr","lower","upper")

filename<-paste("bootstrap-results/BootAns",ii,".Rda",sep="")
save(BootAns,file=filename)

}


load("bootstrap-results/BootAns1.Rda")
h0d1.100<-BootAns[1,4]
h0d1.200<-BootAns[2,4]
h0d1.300<-BootAns[3,4]
h0d1.400<-BootAns[4,4]
h0d0.100<-BootAns[5,4]
h0d0.200<-BootAns[6,4]
h0d0.300<-BootAns[7,4]
h0d0.400<-BootAns[8,4]
h1d1.100<-BootAns[9,4]
h1d1.200<-BootAns[10,4]
h1d1.300<-BootAns[11,4]
h1d1.400<-BootAns[12,4]
h1d0.100<-BootAns[13,4]
h1d0.200<-BootAns[14,4]
h1d0.300<-BootAns[15,4]
h1d0.400<-BootAns[16,4]

Nboot<-200
for (ii in 2:Nboot) {
  input.file<-paste("bootstrap-results/BootAns",ii,".Rda",sep="")
  load(input.file)
  h0d1.100<-c(h0d1.100,BootAns[1,4])
  h0d1.200<-c(h0d1.200,BootAns[2,4])
  h0d1.300<-c(h0d1.300,BootAns[3,4])
  h0d1.400<-c(h0d1.400,BootAns[4,4])
  h0d0.100<-c(h0d0.100,BootAns[5,4])
  h0d0.200<-c(h0d0.200,BootAns[6,4])
  h0d0.300<-c(h0d0.300,BootAns[7,4])
  h0d0.400<-c(h0d0.400,BootAns[8,4])
  h1d1.100<-c(h1d1.100,BootAns[9,4])
  h1d1.200<-c(h1d1.200,BootAns[10,4])
  h1d1.300<-c(h1d1.300,BootAns[11,4])
  h1d1.400<-c(h1d1.400,BootAns[12,4])
  h1d0.100<-c(h1d0.100,BootAns[13,4])
  h1d0.200<-c(h1d0.200,BootAns[14,4])
  h1d0.300<-c(h1d0.300,BootAns[15,4])
  h1d0.400<-c(h1d0.400,BootAns[16,4])
}


hist(log(h1d0.100)-log(h0d0.100))
hist(log(h1d0.200)-log(h0d0.200))
hist(log(h1d0.300)-log(h0d0.300))
hist(log(h1d0.400)-log(h0d0.400))
hist(log(h1d1.100)-log(h0d1.100))
hist(log(h1d1.200)-log(h0d1.200))
hist(log(h1d1.300)-log(h0d1.300))
hist(log(h1d1.400)-log(h0d1.400))


quantile(log(h1d0.100)-log(h0d0.100),c(.025,.975))
quantile(log(h1d0.200)-log(h0d0.200),c(.025,.975))
quantile(log(h1d0.300)-log(h0d0.300),c(.025,.975))
quantile(log(h1d0.400)-log(h0d0.400),c(.025,.975))
quantile(log(h1d1.100)-log(h0d1.100),c(.025,.975))
quantile(log(h1d1.200)-log(h0d1.200),c(.025,.975))
quantile(log(h1d1.300)-log(h0d1.300),c(.025,.975))
quantile(log(h1d1.400)-log(h0d1.400),c(.025,.975))

exp(quantile(log(h1d0.100)-log(h0d0.100),c(.025,.975)))
exp(quantile(log(h1d0.200)-log(h0d0.200),c(.025,.975)))
exp(quantile(log(h1d0.300)-log(h0d0.300),c(.025,.975)))
exp(quantile(log(h1d0.400)-log(h0d0.400),c(.025,.975)))
exp(quantile(log(h1d1.100)-log(h0d1.100),c(.025,.975)))
exp(quantile(log(h1d1.200)-log(h0d1.200),c(.025,.975)))
exp(quantile(log(h1d1.300)-log(h0d1.300),c(.025,.975)))
exp(quantile(log(h1d1.400)-log(h0d1.400),c(.025,.975)))


#### Analyses with original (non-bootstrap) data
load("basic-200vs100.Rda")
h1d0.100.orig<-ests[1]
load("basic-300vs200.Rda")
h1d0.200.orig<-ests[1]
load("basic-400vs300.Rda")
h1d0.300.orig<-ests[1]
load("basic-500vs400.Rda")
h1d0.400.orig<-ests[1]
load("death-200vs100.Rda")
h1d1.100.orig<-ests[1]
load("death-300vs200.Rda")
h1d1.200.orig<-ests[1]
load("death-400vs300.Rda")
h1d1.300.orig<-ests[1]
load("death-500vs400.Rda")
h1d1.400.orig<-ests[1]

load("basic-sterne-results.Rda")
h0d0.100.orig<-as.numeric(sterne.b[1,2])
h0d0.200.orig<-as.numeric(sterne.b[2,2])
h0d0.300.orig<-as.numeric(sterne.b[3,2])
h0d0.400.orig<-as.numeric(sterne.b[4,2])
h0d1.100.orig<-as.numeric(sterne.d.b[1,2])
h0d1.200.orig<-as.numeric(sterne.d.b[2,2])
h0d1.300.orig<-as.numeric(sterne.d.b[3,2])
h0d1.400.orig<-as.numeric(sterne.d.b[4,2])


sd0.100<-sd(log(h1d0.100)-log(h0d0.100))
sd0.200<-sd(log(h1d0.200)-log(h0d0.200))
sd0.300<-sd(log(h1d0.300)-log(h0d0.300))
sd0.400<-sd(log(h1d0.400)-log(h0d0.400))
sd1.100<-sd(log(h1d1.100)-log(h0d1.100))
sd1.200<-sd(log(h1d1.200)-log(h0d1.200))
sd1.300<-sd(log(h1d1.300)-log(h0d1.300))
sd1.400<-sd(log(h1d1.400)-log(h0d1.400))


plot(c(log(1/3),log(3)),c(0,1),type="n",xlab="Ratio of Hazard Ratios",ylab="",axes=FALSE)
axis(1,at=log(c(1/3,1/2,1,2,3)),c(0.3,0.5,1,2,3))
mtext("Death/ADE")
lines(quantile(log(h1d0.100)-log(h0d0.100),c(.025,.975)),rep(.8,2))
lines(quantile(log(h1d0.200)-log(h0d0.200),c(.025,.975)),rep(.6,2))
lines(quantile(log(h1d0.300)-log(h0d0.300),c(.025,.975)),rep(.4,2))
lines(quantile(log(h1d0.400)-log(h0d0.400),c(.025,.975)),rep(.2,2))
points(log(h1d0.100.orig)-log(h0d0.100.orig),.8,pch=3)
points(log(h1d0.200.orig)-log(h0d0.200.orig),.6,pch=3)
points(log(h1d0.300.orig)-log(h0d0.300.orig),.4,pch=3)
points(log(h1d0.400.orig)-log(h0d0.400.orig),.2,pch=3)

lines(log(h1d0.100.orig)-log(h0d0.100.orig)+c(-1,1)*1.96*sd0.100,rep(.75,2),col=2)
lines(log(h1d0.200.orig)-log(h0d0.200.orig)+c(-1,1)*1.96*sd0.200,rep(.55,2),col=2)
lines(log(h1d0.300.orig)-log(h0d0.300.orig)+c(-1,1)*1.96*sd0.300,rep(.35,2),col=2)
lines(log(h1d0.400.orig)-log(h0d0.400.orig)+c(-1,1)*1.96*sd0.400,rep(.15,2),col=2)
points(log(h1d0.100.orig)-log(h0d0.100.orig),.75,pch=3,col=2)
points(log(h1d0.200.orig)-log(h0d0.200.orig),.55,pch=3,col=2)
points(log(h1d0.300.orig)-log(h0d0.300.orig),.35,pch=3,col=2)
points(log(h1d0.400.orig)-log(h0d0.400.orig),.15,pch=3,col=2)

abline(v=0,lty=2)


plot(c(log(1/3),log(3)),c(0,1),type="n",xlab="Ratio of Hazard Ratios",ylab="",axes=FALSE)
axis(1,at=log(c(1/3,1/2,1,2,3)),c(0.3,0.5,1,2,3))
mtext("Death")
lines(quantile(log(h1d1.100)-log(h0d1.100),c(.025,.975)),rep(.8,2))
lines(quantile(log(h1d1.200)-log(h0d1.200),c(.025,.975)),rep(.6,2))
lines(quantile(log(h1d1.300)-log(h0d1.300),c(.025,.975)),rep(.4,2))
lines(quantile(log(h1d1.400)-log(h0d1.400),c(.025,.975)),rep(.2,2))
points(log(h1d1.100.orig)-log(h0d1.100.orig),.8,pch=3)
points(log(h1d1.200.orig)-log(h0d1.200.orig),.6,pch=3)
points(log(h1d1.300.orig)-log(h0d1.300.orig),.4,pch=3)
points(log(h1d1.400.orig)-log(h0d1.400.orig),.2,pch=3)

lines(log(h1d1.100.orig)-log(h0d1.100.orig)+c(-1,1)*1.96*sd1.100,rep(.75,2),col=2)
lines(log(h1d1.200.orig)-log(h0d1.200.orig)+c(-1,1)*1.96*sd1.200,rep(.55,2),col=2)
lines(log(h1d1.300.orig)-log(h0d1.300.orig)+c(-1,1)*1.96*sd1.300,rep(.35,2),col=2)
lines(log(h1d1.400.orig)-log(h0d1.400.orig)+c(-1,1)*1.96*sd1.400,rep(.15,2),col=2)
points(log(h1d1.100.orig)-log(h0d1.100.orig),.75,pch=3,col=2)
points(log(h1d1.200.orig)-log(h0d1.200.orig),.55,pch=3,col=2)
points(log(h1d1.300.orig)-log(h0d1.300.orig),.35,pch=3,col=2)
points(log(h1d1.400.orig)-log(h0d1.400.orig),.15,pch=3,col=2)

abline(v=0,lty=2)


pdf("plot-comparison-bootstrap-death.pdf",height=7,width=6)
par(mar=c(5,.1,.5,.1))
plot(c(log(.25),log(3)),c(.1,.9),xlab="Ratio of Hazard Ratios",ylab="",axes=FALSE,type="n")
abline(v=0,col=gray(.5))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
text(log(.25),.8,"200 vs. 100", pos=4, cex=1)
text(log(.25),.6,"300 vs. 200", pos=4, cex=1)
text(log(.25),.4,"400 vs. 300", pos=4, cex=1)
text(log(.25),.2,"500 vs. 400", pos=4, cex=1)
lines(log(h1d1.100.orig)-log(h0d1.100.orig)+c(-1,1)*1.96*sd1.100,rep(.8,2))
lines(log(h1d1.200.orig)-log(h0d1.200.orig)+c(-1,1)*1.96*sd1.200,rep(.6,2))
lines(log(h1d1.300.orig)-log(h0d1.300.orig)+c(-1,1)*1.96*sd1.300,rep(.4,2))
lines(log(h1d1.400.orig)-log(h0d1.400.orig)+c(-1,1)*1.96*sd1.400,rep(.2,2))
points(log(h1d1.100.orig)-log(h0d1.100.orig),.8,pch=3)
points(log(h1d1.200.orig)-log(h0d1.200.orig),.6,pch=3)
points(log(h1d1.300.orig)-log(h0d1.300.orig),.4,pch=3)
points(log(h1d1.400.orig)-log(h0d1.400.orig),.2,pch=3)
dev.off()


pdf("plot-comparison-bootstrap.pdf",height=7,width=6)
par(mar=c(5,.1,.5,.1))
plot(c(log(.25),log(3)),c(.1,.9),xlab="Ratio of Hazard Ratios",ylab="",axes=FALSE,type="n")
abline(v=0,col=gray(.5))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
text(log(.25),.8,"200 vs. 100", pos=4, cex=1)
text(log(.25),.6,"300 vs. 200", pos=4, cex=1)
text(log(.25),.4,"400 vs. 300", pos=4, cex=1)
text(log(.25),.2,"500 vs. 400", pos=4, cex=1)
lines(log(h1d0.100.orig)-log(h0d0.100.orig)+c(-1,1)*1.96*sd0.100,rep(.8,2))
lines(log(h1d0.200.orig)-log(h0d0.200.orig)+c(-1,1)*1.96*sd0.200,rep(.6,2))
lines(log(h1d0.300.orig)-log(h0d0.300.orig)+c(-1,1)*1.96*sd0.300,rep(.4,2))
lines(log(h1d0.400.orig)-log(h0d0.400.orig)+c(-1,1)*1.96*sd0.400,rep(.2,2))
points(log(h1d0.100.orig)-log(h0d0.100.orig),.8,pch=3)
points(log(h1d0.200.orig)-log(h0d0.200.orig),.6,pch=3)
points(log(h1d0.300.orig)-log(h0d0.300.orig),.4,pch=3)
points(log(h1d0.400.orig)-log(h0d0.400.orig),.2,pch=3)
dev.off()


####  Combined plot.  (plot-comparison-death.pdf + plot-comparison.pdf [both from cnics-cole.R]
####                   + plot-comparison-bootstrap-death and plot-comparison-bootstrap). 
####  Figure 3


load("basic-sterne-results.Rda")

mi.answers<-function(filename){  
  load(paste(filename,".Rda",sep=""))
  est<-ests
  hr<-est[1]
  lower<-est[2]
  upper<-est[3]
  type<-strsplit(filename,"-")[[1]][1]
  type.long<-ifelse(type=="basic","Basic, ADE/death",
                    ifelse(type=="censor6","Censor after 6 years",
                           ifelse(type=="exdeath","Exclude deaths w/in 2 wks of ART",
                                  ifelse(type=="exIDU","Exclude IDU",
                                         ifelse(type=="cov","Adjust for sex, baseline age and CD4",
                                                ifelse(type=="lost","Account for LTFU with weights",
                                                       ifelse(type=="strat","Stratify by site and year",
                                                              ifelse(type=="death","Basic, death",
                                                                     ifelse(type=="add2","Add 2 mos. to those w/out death",type)))))))))
  stuff<-c(hr,lower,upper,u.limit,l.limit,type,type.long)
  return(stuff)
}


ans.d.b<-rbind(mi.answers("death-200vs100"),mi.answers("death-300vs200"),mi.answers("death-400vs300"),
               mi.answers("death-500vs400"))

pdf("figure3-combined.pdf",height=8,width=10)
par(mfrow=c(2,2),mar=c(5,.1,3,.1),xpd=NA)

# pdf("plot-comparison-death.pdf",height=7,width=6)
# par(mar=c(5,.1,.5,.1))
plot(c(log(.25),log(3)),c(.1,1),xlab="",ylab="",axes=FALSE,type="n")
lines(c(0,0),c(.05,.95),col=gray(.7))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
points(log(as.numeric(sterne.d.b[1,2])),.8,pch=3)
lines(log(as.numeric(sterne.d.b[1,3:4])),c(.8,.8))
text(log(.25),.8,"100 vs. 200", pos=4, cex=1)
points(log(as.numeric(sterne.d.b[2,2])),.6,pch=3)
lines(log(as.numeric(sterne.d.b[2,3:4])),c(.6,.6))
text(log(.25),.6,"200 vs. 300", pos=4, cex=1)
points(log(as.numeric(sterne.d.b[3,2])),.4,pch=3)
lines(log(as.numeric(sterne.d.b[3,3:4])),c(.4,.4))
text(log(.25),.4,"300 vs. 400", pos=4, cex=1)
points(log(as.numeric(sterne.d.b[4,2])),.2,pch=3)
lines(log(as.numeric(sterne.d.b[4,3:4])),c(.2,.2))
text(log(.25),.2,"400 vs. 500", pos=4, cex=1)
points(log(as.numeric(ans.d.b[1,1])),.75,pch=3,col=4)
lines(log(as.numeric(ans.d.b[1,2:3])),c(.75,.75),col=4)
points(log(as.numeric(ans.d.b[2,1])),.55,pch=3,col=4)
lines(log(as.numeric(ans.d.b[2,2:3])),c(.55,.55),col=4)
points(log(as.numeric(ans.d.b[3,1])),.35,pch=3,col=4)
lines(log(as.numeric(ans.d.b[3,2:3])),c(.35,.35),col=4)
points(log(as.numeric(ans.d.b[4,1])),.15,pch=3,col=4)
lines(log(as.numeric(ans.d.b[4,2:3])),c(.15,.15),col=4)

text(0,-.1,"Hazard Ratio")
text(0,1,"Death", cex=1.3)
#dev.off()


ans.b<-rbind(mi.answers("basic-200vs100"),mi.answers("basic-300vs200"),mi.answers("basic-400vs300"),
             mi.answers("basic-500vs400"))

#pdf("plot-comparison.pdf",height=7,width=6)
#par(mar=c(5,.1,.5,.1))
plot(c(log(.25),log(3)),c(.1,1),xlab="",ylab="",axes=FALSE,type="n")
#abline(v=0,col=gray(.7))
lines(c(0,0),c(.05,.95),col=gray(.7))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
points(log(as.numeric(sterne.b[1,2])),.8,pch=3)
lines(log(as.numeric(sterne.b[1,3:4])),c(.8,.8))
text(log(.25),.8,"100 vs. 200", pos=4, cex=1)
points(log(as.numeric(sterne.b[2,2])),.6,pch=3)
lines(log(as.numeric(sterne.b[2,3:4])),c(.6,.6))
text(log(.25),.6,"200 vs. 300", pos=4, cex=1)
points(log(as.numeric(sterne.b[3,2])),.4,pch=3)
lines(log(as.numeric(sterne.b[3,3:4])),c(.4,.4))
text(log(.25),.4,"300 vs. 400", pos=4, cex=1)
points(log(as.numeric(sterne.b[4,2])),.2,pch=3)
lines(log(as.numeric(sterne.b[4,3:4])),c(.2,.2))
text(log(.25),.2,"400 vs. 500", pos=4, cex=1)
points(log(as.numeric(ans.b[1,1])),.75,pch=3,col=4)
lines(log(as.numeric(ans.b[1,2:3])),c(.75,.75),col=4)
points(log(as.numeric(ans.b[2,1])),.55,pch=3,col=4)
lines(log(as.numeric(ans.b[2,2:3])),c(.55,.55),col=4)
points(log(as.numeric(ans.b[3,1])),.35,pch=3,col=4)
lines(log(as.numeric(ans.b[3,2:3])),c(.35,.35),col=4)
points(log(as.numeric(ans.b[4,1])),.15,pch=3,col=4)
lines(log(as.numeric(ans.b[4,2:3])),c(.15,.15),col=4)

text(log(.25),1.1,"Hazard Ratios",cex=1.5)
text(0,-.1,"Hazard Ratio")
text(0,1,"AIDS or Death", cex=1.3)

#dev.off()


plot(c(log(.25),log(3)),c(.1,1),xlab="",ylab="",axes=FALSE,type="n")
#abline(v=0,col=gray(.7))
lines(c(0,0),c(.05,.95),col=gray(.7))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
text(log(.25),.8,"100 vs. 200", pos=4, cex=1)
text(log(.25),.6,"200 vs. 300", pos=4, cex=1)
text(log(.25),.4,"300 vs. 400", pos=4, cex=1)
text(log(.25),.2,"400 vs. 500", pos=4, cex=1)
lines(log(h1d1.100.orig)-log(h0d1.100.orig)+c(-1,1)*1.96*sd1.100,rep(.8,2))
lines(log(h1d1.200.orig)-log(h0d1.200.orig)+c(-1,1)*1.96*sd1.200,rep(.6,2))
lines(log(h1d1.300.orig)-log(h0d1.300.orig)+c(-1,1)*1.96*sd1.300,rep(.4,2))
lines(log(h1d1.400.orig)-log(h0d1.400.orig)+c(-1,1)*1.96*sd1.400,rep(.2,2))
points(log(h1d1.100.orig)-log(h0d1.100.orig),.8,pch=3)
points(log(h1d1.200.orig)-log(h0d1.200.orig),.6,pch=3)
points(log(h1d1.300.orig)-log(h0d1.300.orig),.4,pch=3)
points(log(h1d1.400.orig)-log(h0d1.400.orig),.2,pch=3)
text(0,1,"Death", cex=1.3)
text(0,-.1,"Ratio of Hazard Ratios")

plot(c(log(.25),log(3)),c(.1,1),xlab="",ylab="",axes=FALSE,type="n")
#abline(v=0,col=gray(.7))
lines(c(0,0),c(.05,.95),col=gray(.7))
axis(1,at=log(c(.33,.5,1,2,3)),c(.3,.5,1,2,3))
text(log(.25),.8,"100 vs. 200", pos=4, cex=1)
text(log(.25),.6,"200 vs. 300", pos=4, cex=1)
text(log(.25),.4,"300 vs. 400", pos=4, cex=1)
text(log(.25),.2,"400 vs. 500", pos=4, cex=1)
lines(log(h1d0.100.orig)-log(h0d0.100.orig)+c(-1,1)*1.96*sd0.100,rep(.8,2))
lines(log(h1d0.200.orig)-log(h0d0.200.orig)+c(-1,1)*1.96*sd0.200,rep(.6,2))
lines(log(h1d0.300.orig)-log(h0d0.300.orig)+c(-1,1)*1.96*sd0.300,rep(.4,2))
lines(log(h1d0.400.orig)-log(h0d0.400.orig)+c(-1,1)*1.96*sd0.400,rep(.2,2))
points(log(h1d0.100.orig)-log(h0d0.100.orig),.8,pch=3)
points(log(h1d0.200.orig)-log(h0d0.200.orig),.6,pch=3)
points(log(h1d0.300.orig)-log(h0d0.300.orig),.4,pch=3)
points(log(h1d0.400.orig)-log(h0d0.400.orig),.2,pch=3)


text(0,1,"AIDS or Death", cex=1.3)
text(0,-.1,"Ratio of Hazard Ratios")
text(log(.25),1.1,"Ratios of Hazard Ratios",cex=1.5)


dev.off()


####  Playing with bootstrap approaches
library(data.table)
# stuff<-date()
# d2.junk<-data.table(pid=rep(1:10000,each=20),x=rnorm(200000))
# #d2.junk<-data.frame(pid=rep(1:50,each=10),x=rnorm(500))
# 
# all.ids.junk<-unique(d2.junk$pid)
# ids.boot<-sample(all.ids.junk,length(all.ids.junk),replace=TRUE)
# 
# d2.boot<-d2.junk[d2.junk$pid==ids.boot[1],]
# for (i in 2:length(ids.boot)){
#   d2.boot<-rbind(d2.boot,d2.junk[d2.junk$pid==ids.boot[i],])
# }
# 
# d2.boot$count<-ifelse(duplicated(d2.boot),2,1)
# d2.boot$count<-ifelse(duplicated(d2.boot),3,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),4,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),5,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),6,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),7,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),8,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),9,d2.boot$count)
# d2.boot$count<-ifelse(duplicated(d2.boot),10,d2.boot$count)
# d2.boot$pid<-paste(d2.boot$pid,count,sep="-")
# d2.boot$count<-NULL
# stuff
# date()

set.seed(1)
stuff<-date()
d2.junk<-data.frame(pid=rep(1:10000,each=20),x=rnorm(200000))
clusters<-unique(d2.junk$pid)
units<-sample(clusters,size=length(clusters),replace=T)
df.bs<-sapply(units,function(x)which(d2.junk$pid==x))
d2.boot<-d2.junk[unlist(df.bs),]
d2.boot$count<-ifelse(duplicated(d2.boot),2,1)
d2.boot$count<-ifelse(duplicated(d2.boot),3,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),4,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),5,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),6,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),7,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),8,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),9,d2.boot$count)
d2.boot$count<-ifelse(duplicated(d2.boot),10,d2.boot$count)
d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
#d2.boot$count<-NULL
stuff
date()


library(data.table)
set.seed(1)
stuff.qi1<-date()
d2.junk<-data.frame(pid=rep(1:10000,each=20),x=rnorm(200000))
d2.junk.t <- data.table(d2.junk)
setkey(d2.junk.t, pid)

all.ids.junk<-unique(d2.junk.t$pid)
ids.boot<-sample(all.ids.junk,length(all.ids.junk),replace=TRUE)

d2.boot <- do.call("rbind", lapply(ids.boot, FUN=function(id) d2.junk.t[pid==id,]))

d2.boot$count <- 1
while(sum(duplicated(d2.boot))){
  d2.boot$count<-ifelse(duplicated(d2.boot),d2.boot$count+1,d2.boot$count)
}

d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
d2.boot<- data.frame(d2.boot)
d2.boot$count<-NULL

stuff.qi2<-date()

d2.boot.qi<-d2.boot

set.seed(1)
stuff.bryan1<-date()
d2.junk<-data.frame(pid=rep(1:10000,each=20),x=rnorm(200000))
clusters<-unique(d2.junk$pid)
units<-sample(clusters,size=length(clusters),replace=T)
df.bs<-sapply(units,function(x)which(d2.junk$pid==x))
d2.boot<-d2.junk[unlist(df.bs),]
d2.boot$count <- 1
while(sum(duplicated(d2.boot))){
  d2.boot$count<-ifelse(duplicated(d2.boot),d2.boot$count+1,d2.boot$count)
}
d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
d2.boot$count<-NULL
stuff.bryan2<-date()

d2.boot.bryan<-d2.boot

set.seed(1)
stuff.thomas1<-date()
d2.junk<-data.frame(pid=rep(1:10000,each=20),x=rnorm(200000))
clusters<-unique(d2.junk$pid)
units<-sample(clusters,size=length(clusters),replace=T)

index.grouped <- split(seq_len(nrow(d2.junk)), d2.junk$pid)
index <- unlist(index.grouped[units])

d2.boot <- d2.junk[index, ]
d2.boot$count <- 1
while(sum(duplicated(d2.boot))){
  d2.boot$count<-ifelse(duplicated(d2.boot),d2.boot$count+1,d2.boot$count)
}
d2.boot$pid<-paste(d2.boot$pid,d2.boot$count,sep="-")
d2.boot$count<-NULL
stuff.thomas2<-date()

d2.boot.thomas<-d2.boot


set.seed(1)
stuff.cole1<-date()
 x<-data.frame(pid=rep(1:10000,each=20),x=rnorm(200000))
  ids <- sort(unique(x$pid))
  ids.boot <- sample(ids, replace=TRUE)
  len <- length(ids.boot)
  cnt <- sapply(seq(len), function(i) {
    sum(ids.boot[seq(i)] == ids.boot[i])
  })
  index <- lapply(ids, function(i) {
    which(x$pid == i)
  })
  x.boot <- lapply(seq_along(ids.boot), function(i) {
    ix <- ids.boot[i]
    cbind(pid=sprintf("%s-%s", ix, cnt[i]), x[index[[ix]], -1,
                                              drop=FALSE], stringsAsFactors=FALSE)
  })
  x.boot <- do.call(rbind, x.boot)
  # I like to reset rownames
  row.names(x.boot) <- NULL
stuff.cole2<-date()


# 
# 
# clusterBootSE<-function(formula,data,clustervar,reps){
#   reg1 <- glm(formula, data)#, family=binomial(link="logit"))
#   #1. generate the bootstrap dataset
#   clusters<-unique(d2.junk$pid)
#   #sterrs <- matrix(NA, nrow=reps, ncol=length(coef(reg1)))
#   for(i in 1:reps){
#     units<-sample(clusters,size=length(clusters),replace=T)
#     df.bs<-sapply(units,function(x)which(d2.junk$pid==x))
#     df.bs<-d2.junk[unlist(df.bs),]
#     sterrs[i,]<-coef(glm(formula, df.bs))#, family=binomial(link="logit")))
#     cat("\r",i*100/reps," % done ");flush.console()
#   }
#   val <- cbind(coef(reg1),apply(sterrs,2,sd))
#   rownames(val)<-names(coef(reg1)); colnames(val) <- c("Estimate","Std. Error")
#   cat("\n")
#   return(val)
# }
# 
# 
# 
# clusterBootSE(x~1,d2.junk,pid,5)