rm(list=ls())


#------------------------#
#       Libraries        #
#------------------------#
library(foreign)
library(Hmisc)
library(Design)

#------------------------#
#       Functions        #
#------------------------#
#---------------------------------------------------------------------#
#                           add.fun                                   #
#---------------------------------------------------------------------#
# In creating monthly (time-dependent) data frame, records are inserted
# that do not exist in the original data.  This creates missing values for
# CD4, viral loads, CD4 %, etc.  Add.fun fills in these values with the 
# last non-missing observation and records the age at which that non-missing
# value occurred.
add.fun <- function(labage.1, labage.2){
   var <- names(labage.1)[3]
   labage.1$order <- 1
   labage.2$order <- 0
   pullData <- rbind(labage.1,labage.2)
   pullData$age <- round(pullData$age, 4)

   pullData <- pullData[order(pullData$pid, pullData$age, -pullData$order),]

   hasVal <- diff(c(0L,as.integer(as.factor(pullData$pid)))) != 0 |
             !is.na(pullData[,var]) |
             (pullData$order == 0)

   index <- rep(which(hasVal), times=table(cumsum(hasVal)))
   
   pullData[,c(var, paste("age", var, sep='.'))] <- pullData[index,
                                                             c(var, "age")]

   return(pullData)
}

#---------------------------------------------------------------------#
#                           create.df                                   #
#---------------------------------------------------------------------#
# Creates monthly (time-dependent) data frame for analysis
create.df <-
       function(start.within,restrict,u.limit,l.limit,d2,old.code,original){

   # Identifying age at which CD4 is less than ulimit by PID
   less.ulimit <- lapply(split(subset(d2, select=c(pid,age,cd4)), d2$pid),
                         FUN=function(y){
                            y[min(which(y[,"cd4"] < u.limit)),"age"]})
   less.ulimit.df <- data.frame("pid"=names(less.ulimit),
                                "age.less.ulimit"=unlist(less.ulimit))
   less.ulimit.df$pid <- as.character(less.ulimit.df$pid)
   d2[,"age.less.ulimit"] <- less.ulimit.df[match(d2$pid,less.ulimit.df$pid,nomatch=NA),
                                                  "age.less.ulimit"]
   d2 <- d2[order(d2$pid, d2$age),]


   # If a PID has a missing age.less.ulimit, then they did not have a CD4 below the upper
   # limit and should be deleted from the analysis
   d2 <- subset(d2, !is.na(age.less.ulimit))

   # Time.in.care is the time between the first recorded clinic visit and the age a PIDs CD4 fell
   # below the upper limit
   d2$time.in.care <- d2$age.less.ulimit - d2$first.age

   # If someone died, their true last age should be the age of death
   # Otherwise it is their last recorded age.
   true.last.age <- lapply(split(subset(d2, select=c(pid,age,age_at_death)),
                                 d2$pid),
                    FUN=function(y){
                       unique(ifelse(is.na(y[,3]),max(y[,2]),unique(y[,3])))})
   tla.df <- data.frame("pid"=names(true.last.age),
                        "true.last.age"=unlist(true.last.age))
   tla.df <- tla.df[order(as.character(tla.df$pid)),]
   tla.df$pid <- as.character(tla.df$pid)

   d2 <- merge(d2, tla.df, all=TRUE)
   d2 <- d2[order(d2$pid, d2$age),]

   # Total follow-up time is the time from when a PIDs CD4 fell below the upper limit to their true.last.age
   d2$tot.fu <- d2$true.last.age - d2$age.less.ulimit

   # In the original data, each record at which an OI occurred had a non-missing age.oi.  All other records
   # had a missing age.oi.  For analysis purposes, those OIs that occurred between a PIDs first age and
   # age.less.ulimit were recorded as prior.oi and the age recorded.  For those OIs that occurred after
   # age.less.ulimit, only the first occurrence of an OI will be included in the analysis and the variable
   # age.oi will repeat the age at which the first OI after age.less.ulimit occurred.  The following
   # code addresses both of these issues.
   # Defining prior.oi (OI between first.age and age.less.ulimit)
   age.oi <- subset(d2, !is.na(age.oi), select=c(pid, age.oi, age.less.ulimit))
   age.oi$prior.oi <- ifelse(round(age.oi$age.oi,4) <=
                                  round(age.oi$age.less.ulimit,4),1,0)
   # Grabbing first incidence of OI prior to age.less.ulimit if more than one prior.oi
   age.prior.oi <- unlist(lapply(split(age.oi, age.oi$pid),
                         FUN=function(y){
                          ifelse(all(y[,"prior.oi"]==0),NA,min(y[y[,"prior.oi"]==1,
                                                                   "age.oi"]))}))
   age.prior.oi.df <- data.frame("pid"=names(age.prior.oi),
                                 "age.prior.oi"=age.prior.oi)
   age.prior.oi.df$pid <- as.character(age.prior.oi.df$pid)
   age.oi <- merge(age.oi, age.prior.oi.df, all=TRUE)

   # Grabbing first incidence of OI after age.less.ulimit if more than one oi
   age.oi.af <- lapply(split(age.oi, age.oi$pid), FUN=function(y){
                       ifelse(all(y[,"prior.oi"]==1),NA,min(y[y[,"prior.oi"]==0,"age.oi"]))})
   age.oi.af.df <- data.frame("pid"=unlist(names(age.oi.af)),
                              "age.oi"=unlist(age.oi.af))
   age.oi.af.df$pid <- as.character(age.oi.af.df$pid)
   age.oi.2 <- merge(subset(age.oi, !duplicated(pid),
                          select=c(pid,age.prior.oi)),
                     age.oi.af.df, all=TRUE)    
   dup.oi <- subset(age.oi, prior.oi == 0)
   dup.oi$age.oi <- round(dup.oi$age.oi,4)
   oi.df$age.oi <- round(oi.df$age.oi,4)
   oi.df$pid <- as.character(oi.df$pid)
   oi.df <- oi.df[order(oi.df$pid,oi.df$age.oi),]
   dup.oi <- merge(dup.oi, subset(oi.df, pid %in% dup.oi$pid),all=TRUE)
   dup.oi <- subset(dup.oi, prior.oi == 0)
   dup.oi <- upData(dup.oi, drop=c("year.1st.visit","age.less.ulimit","oi",
                                   "prior.oi","age.prior.oi"))
   dup.oi$oi.1 <- tolower(dup.oi$oi.1)
   dup.oi$oi.2 <- tolower(dup.oi$oi.2)
   dup.oi$oi.2 <- ifelse(dup.oi$oi.2 == " ",NA,as.character(dup.oi$oi.2))
   dup.oi$oi.3 <- tolower(dup.oi$oi.3)
   dup.oi$oi.3 <- ifelse(dup.oi$oi.3 == " ",NA,as.character(dup.oi$oi.3))
   dup.oi$oi.4 <- tolower(dup.oi$oi.4)
   dup.oi$oi.4 <- ifelse(dup.oi$oi.4 == " ",NA,as.character(dup.oi$oi.4))

   # Getting class of OIs from oi.class
   dup.oi[,"oi.1.class"] <- oi.class[match(dup.oi$oi.1,
                                           oi.class$oi,nomatch=NA),"class"]
   dup.oi[,"oi.2.class"] <- oi.class[match(dup.oi$oi.2,
                                           oi.class$oi,nomatch=NA),"class"]
   dup.oi[,"oi.3.class"] <- oi.class[match(dup.oi$oi.3,
                                           oi.class$oi,nomatch=NA),"class"]
   dup.oi[,"oi.4.class"] <- oi.class[match(dup.oi$oi.4,
                                           oi.class$oi,nomatch=NA),"class"]
   d2$age.oi <- round(d2$age.oi,4)
   d2 <- merge(d2, dup.oi, all=TRUE)
   d2 <- d2[order(d2$pid,d2$age),]

   # Dropping current age.oi from d2 and adding in age.oi from age.oi.2
   # data.  Rather than have all OI ages repeated, will just repeat age of first
   # OI after age.less.ulimit
   d2 <- upData(d2, drop="age.oi")
   d2[,"age.oi"] <- age.oi.2[match(d2$pid,
                                   age.oi.2$pid,
                                   nomatch=NA),"age.oi"]
   d2[,"age.prior.oi"] <- age.oi.2[match(d2$pid,
                                         age.oi.2$pid,
                                         nomatch=NA),"age.prior.oi"]
   d2$prior.oi <- ifelse(!is.na(d2$age.prior.oi),1,0)


   # Defining prior.art (This is ART before age.less.ulimit but during a clinic
   # visit.  Different from has_art_before_first_visit
   prior.art.df <- subset(d2, age < age.less.ulimit)
   prior.art <- tapply(prior.art.df$art, INDEX=prior.art.df$pid, FUN=max)
   prior.art.df <- data.frame("pid"=names(prior.art),
                              "prior.art"=prior.art)
   d2[,"prior.art"] <- prior.art.df[match(d2$pid,
                                         prior.art.df$pid,
                                         nomatch=NA),"prior.art"]
   d2$prior.art <- ifelse(is.na(d2$prior.art),0,
                               d2$prior.art)


   # Subsetting out PIDs with prior.oi or prior.art
   d2 <- subset(d2, prior.oi != 1 & prior.art != 1 &
                     has_art_before_first_visit != 1 &
                     has_oi_before_first_visit != 1)

#    d2 <- subset(d2, prior.oi != 1 & prior.art != 1)

   # Had issues in the propensity scores with missing CD4 % but non-missing
   # CD4 values.  Gave different lengths for the predicted probabilities
   # of time-dependent vs non-time-dependent models.  Imputing values
   # for CD4_%
   cd4.pct.mod.beta <- lm(d2$cd4_percent ~ d2$cd4)$coeff[-1]
   d2$cd4_percent <- apply(d2[,c("cd4_percent","cd4")],1,
                               FUN=function(x){
                                  if(is.na(x[1]) & !is.na(x[2])){
                                     x[2]*cd4.pct.mod.beta
                                  } else {
                                     x[1]
                                  }})
   pre.d2 <- d2
#    save(pre.d2, file=paste("pre.d2.",k,".nonsus500ADE.rda",sep=""))


   unique.id<-unique(d2$pid)


   if(restrict){
      ## Below..should it be <= or just < ???
      ##################### Will this include those w/ CD4 at or below 200 (before else)???
      gt.ulimit <- tapply(d2$cd4[round(d2$age,4) <=
                                      round(d2$age.less.ulimit,4)],
                          INDEX=d2$pid[round(d2$age,4) <=
                                            round(d2$age.less.ulimi,4)],
                          FUN=function(y){
                             as.numeric(any(y[!is.na(y)] > u.limit))})
      gt.ulimit.df <- data.frame("pid"=names(gt.ulimit),
                                 "gt.ulimit"=gt.ulimit)
      d2[,"gt.ulimit"] <- gt.ulimit.df[match(d2$pid,
                                            gt.ulimit.df$pid,
                                            nomatch=NA),"gt.ulimit"]
      sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit |
                            is.na(age.fhaart)) & gt.ulimit == 1)
      sub.d2 <- upData(sub.d2, drop=c("gtkmths","gt.ulimit"))

   } else {
      gt.llimit.df <- subset(d2, round(age,4) == round(age.less.ulimit,4),
                             select=c(pid,age,age.less.ulimit,cd4))
      gt.llimit.df$gt.llimit <- ifelse(gt.llimit.df$cd4 >= l.limit,1,0)
      d2[,"gt.llimit"] <- gt.llimit.df[match(d2$pid,
                                            gt.llimit.df$pid,nomatch=NA),
                                            "gt.llimit"]
      sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit |
                                      is.na(age.fhaart)) &
                                      gt.llimit == 1)
      sub.d2 <- sub.d2[order(sub.d2$pid,sub.d2$age),]
      sub.d2 <- upData(sub.d2, drop=c("gtkmths","gt.llimit"))
   }

   sub.d2 <- sub.d2[order(sub.d2$pid, sub.d2$age),]
   sub.unique.id <- unique(sub.d2$pid)

   d2 <- sub.d2

   # Defining trt200 group
   if(!original){
      d2$trt.group <- ifelse(is.na(d2$age.fhaart),0,
                ifelse((d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0))

      # Excluding those who have an event:
      # Trt group 1:  Those with an OI between age.less.ulimit & age.fhaart
      #               (Note:  Group 1 could not have a death because then
      #                       they would not have started HAART; had to start
      #                       HAART to be in Group 1).
      # Trt group 2:  Those with an OI or death within start.within of
      #               age.less.ulimit.
      grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 &
                                        !is.na(d2$age.oi) &
                                        d2$age.oi >= d2$age.less.ulimit &
                                        d2$age.oi <= d2$age.fhaart)])
      grp.2.oi.del <- unique(d2$pid[which(d2$trt.group == 0 &
                                          !is.na(d2$age.oi) &
                                          d2$age.oi >= d2$age.less.ulimit &
                             d2$age.oi <= (d2$age.less.ulimit + start.within))])
      grp.2.dth.del <- unique(d2$pid[which(d2$trt.group == 0 &
                                           !is.na(d2$age_at_death) &
                                         d2$age_at_death >= d2$age.less.ulimit &
                         d2$age_at_death <= (d2$age.less.ulimit +
                                             start.within))])
      d2 <- subset(d2, pid %nin% c(grp1.1.del,grp.2.oi.del,grp.2.dth.del))

      # Defining avg start time of trt200 group
      tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart",
                             "cd4","cd4_percent","vl","lost_to_fu",
                             "age.less.ulimit","trt.group"))
      tmp <- subset(tmp, round(age,4) >= round(age.less.ulimit,4))
      tmp$baseline.age <- tmp$age.less.ulimit
   }

   if(original){
      d2$trt.group <- ifelse(is.na(d2$age.fhaart),0,
                      ifelse(d2$age < d2$age.fhaart,0,
                      ifelse(d2$age >= d2$age.fhaart &
                      (d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0)))
      # For those in trt group 1 who had an event before age.fhaart
      # need to reset them to trt group 0
      grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 &
                                        !is.na(d2$age.oi) &
                                        d2$age.oi >= d2$age.less.ulimit &
                                        d2$age.oi <= d2$age.fhaart)])
      d2$trt.group <- ifelse(d2$pid %in% grp1.1.del,0,d2$trt.group)
      tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart",
                             "cd4","cd4_percent","vl","lost_to_fu",
                             "age.less.ulimit","trt.group"))
      tmp <- subset(tmp, round(age,4) >= round(age.less.ulimit,4))
      tmp$baseline.age <- tmp$age.less.ulimit
   }

   tmp.200 <- subset(tmp, trt.group==1)
   tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),]

#    if(old.code){
#       tmp.200$diff <- with(tmp.200, age.fhaart - baseline.age)
#       avg.start <- mean(tmp.200$diff[!duplicated(tmp.200$pid)], na.rm=TRUE)
#       # Now subset tmp.200 so that it starts when they first start HAART
#       tmp.200 <- subset(tmp.200, round(age,4) >= round(age.fhaart,4))
#       # Getting new baseline age for trt200 group
#       tmp.200 <- upData(tmp.200,
#                         drop=c("age.less.ulimit","baseline.age","diff"))
#       baseline <- aggregate(tmp.200$age, list(pid=tmp.200$pid), FUN=head,n=1)
#       names(baseline)[2] <- "baseline.age"
#       tmp.200 <- merge(tmp.200, baseline, all=TRUE)
#       tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),]
#    }  else {
      tmp.200 <- upData(tmp.200, drop=c("age.less.ulimit"))
#    }
   ## Just taking record of baseline.age???
   if(!original){
      tmp.200 <- subset(tmp.200, round(age,4) == round(baseline.age,4))
   } else {
      tmp.200 <- tmp.200[!duplicated(tmp.200$pid),]
   }

   tmp.100 <- subset(tmp, trt.group==0)
#    if(old.code){
#       tmp.100$start.age <- with(tmp.100, baseline.age + avg.start)
#    } else {
#       avg.start <- 0
      tmp.100$start.age <- tmp.100$baseline.age
#    }
   new.rows <- data.frame(pid=unique(tmp.100$pid),
                          age=tmp.100$start.age[!duplicated(tmp.100$pid)],
                          fhaart=0,
#                           cd4=NA,
#                           cd4_percent=NA,
#                           vl=NA,
                       lost_to_fu=tmp.100$lost_to_fu[!duplicated(tmp.100$pid)],
                       age.fhaart=tmp.100$age.fhaart[!duplicated(tmp.100$pid)],
             age.less.ulimit=tmp.100$age.less.ulimit[!duplicated(tmp.100$pid)],
                          trt.group=0,
                    baseline.age=tmp.100$baseline.age[!duplicated(tmp.100$pid)],
                    start.age=tmp.100$start.age[!duplicated(tmp.100$pid)])
   tmp.100 <- merge(tmp.100, new.rows, all=TRUE)
   tmp.100 <- tmp.100[order(tmp.100$pid, tmp.100$age),]
   tmp.100 <- subset(tmp.100, round(age,4) >= round(start.age,4))

   # Getting new baseline for trt.group=0
   tmp.100 <- upData(tmp.100,
                     drop=c("age.less.ulimit","baseline.age","start.age"))
   baseline <- aggregate(tmp.100$age, list(pid=tmp.100$pid),FUN=head,n=1)
   names(baseline)[2] <- "baseline.age"
   tmp.100 <- merge(tmp.100, baseline, all=TRUE)
   tmp.100 <- tmp.100[order(tmp.100$pid,tmp.100$age),]
   tmp.100 <- subset(tmp.100, round(age,4) == round(baseline.age,4))


   # Creating data frame for survival analysis
   # If original=TRUE, this has 2 records for those in trt.group=1 who did 
   # not start HAART at age.less.ulimit
   tmp.tot <- merge(tmp.200, tmp.100, all=TRUE)
   tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),]
   true.last.age.df <- data.frame("pid"=d2$pid[!duplicated(d2$pid)],
                          "true.last.age"=d2$true.last.age[!duplicated(d2$pid)])
   tmp.tot <- merge(tmp.tot, true.last.age.df, all=TRUE)
   tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),]

#    sub.id <- unique(d2$pid[d2$pid %in% tmp.tot$pid])
#    sub.unique.id <- unique(sub.id)
#    last.age <- tapply(d2$age, INDEX=d2$pid, FUN=max,na.rm=TRUE)
#    last.age.df <- data.frame("pid"=names(last.age),
#                              "last.age"=
   tmp.tot$follow.up <- tmp.tot$true.last.age - tmp.tot$baseline.age
   max.follow <- 12*max(tmp.tot$follow.up, na.rm=TRUE)
   months <- seq(0, ceiling(max.follow),by=1)

   tmp.tot$month <- 0
   if(!original){
      tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid),
                                   each=length(months[-1])),
                     "month"=rep(months[-1],times=length(unique(tmp.tot$pid))),
                             "age"=rep(tmp.tot$age, each=length(months[-1])),
                   "age.fhaart"=rep(tmp.tot$age.fhaart,each=length(months[-1])),
                    "trt.group"=rep(tmp.tot$trt.group,each=length(months[-1])),
             "baseline.age"=rep(tmp.tot$baseline.age,each=length(months[-1])),
             "true.last.age"=rep(tmp.tot$true.last.age,each=length(months[-1])),
              "lost_to_fu"=rep(tmp.tot$lost_to_fu,each=length(months[-1])))
      tmp.tot2$pid <- as.character(tmp.tot2$pid)

      surv.df <- merge(tmp.tot, tmp.tot2, all=TRUE)
      surv.df <- surv.df[order(surv.df$pid, surv.df$month),]
      surv.df$age <- round(surv.df$age + surv.df$month/12,4)
   } else {
      tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid),
                                       each=length(months[-1])),
                     "month"=rep(months[-1],times=length(unique(tmp.tot$pid))),
                 "age.fhaart"=rep(tmp.tot$age.fhaart[!duplicated(tmp.tot$pid)],
                                    each=length(months[-1])),
             "baseline.age"=rep(tmp.tot$baseline.age[!duplicated(tmp.tot$pid)],
                                    each=length(months[-1])),
           "true.last.age"=rep(tmp.tot$true.last.age[!duplicated(tmp.tot$pid)],
                                    each=length(months[-1])),
              "lost_to_fu"=rep(tmp.tot$lost_to_fu[!duplicated(tmp.tot$pid)],
                                    each=length(months[-1])))
      tmp.tot2$pid <- as.character(tmp.tot2$pid)
      tmp.tot2$month <- as.integer(tmp.tot2$month)

      surv.df <- merge(tmp.tot[!duplicated(tmp.tot$pid),
                      -which(names(tmp.tot) %in% 
#                             c("age","trt.group","cd4","cd4_percent","vl"))],
                               c("age","trt.group"))],
                       tmp.tot2,all=TRUE)
      surv.df <- surv.df[order(surv.df$pid,surv.df$age),]
      surv.df$age <- surv.df$baseline.age
      surv.df$age <- surv.df$age + surv.df$month/12

      surv.df$trt.group <- ifelse(is.na(surv.df$age.fhaart),0,
                           ifelse(surv.df$age >= surv.df$age.fhaart & 
                           (surv.df$age.fhaart - surv.df$baseline.age) <
                           start.within,1,0))
      # For those in trt group 1 who had an event before age.fhaart
      # need to reset them to trt group 0
      surv.df$trt.group <- ifelse(surv.df$pid %in% grp1.1.del,0,
                                  surv.df$trt.group)
   }

   ref.cd4 <- data.frame("pid"=d2$pid[!duplicated(d2$pid)],
                         "ref.cd4"=d2$cd4[round(d2$age,4)==
                                          round(d2$age.less.ulimit,4)],
                         "age.ref.cd4"=d2$age.less.ulimit[!duplicated(d2$pid)],
                         "ref.cd4_percent"=d2$cd4_percent[round(d2$age,4)==
                                          round(d2$age.less.ulimit,4)],
                "age.ref.cd4_percent"=d2$age.less.ulimit[!duplicated(d2$pid)],
                         "ref.vl"=d2$vl[round(d2$age,4)==
                                          round(d2$age.less.ulimit,4)],
                "age.ref.vl"=d2$age.less.ulimit[!duplicated(d2$pid)],
                "age.fhaart"=d2$age.fhaart[!duplicated(d2$pid)])
####HERE!!!!!
   locate.fhaart <- lapply(split(subset(surv.df, select=c(pid,age,age.fhaart)),surv.df$pid),
                           FUN=function(y){
                              y[min(which(y[,"age"] >= y[,"age.fhaart"])),"age"]
                           })
   locate.fhaart.df <- data.frame("pid"=names(locate.fhaart)[!is.na(locate.fhaart)],
                                  "new.age.fhaart"=unlist(locate.fhaart[!is.na(locate.fhaart)]))
   surv.df[,"new.age.fhaart"] <- locate.fhaart.df[match(surv.df$pid,locate.fhaart.df$pid,nomatch=NA),
                                                  "new.age.fhaart"]
   surv.df$fhaart <- ifelse(round(surv.df$age,4) == round(surv.df$new.age.fhaart,4) & !is.na(surv.df$new.age.fhaart),1,
                        ifelse(is.na(surv.df$new.age.fhaart) | 
                               round(surv.df$age,4) != round(surv.df$new.age.fhaart,4),0,NA))
   
   dem <- data.frame("pid"=d2$pid[!duplicated(d2$pid)],
                     "black"=d2$black[!duplicated(d2$pid)],
                     "male"=d2$male[!duplicated(d2$pid)],
                     "idu"=d2$idu[!duplicated(d2$pid)],
                   "age_at_death"=d2$age_at_death[!duplicated(d2$pid)],
                   "time.in.care"=d2$time.in.care[!duplicated(d2$pid)])

   surv.df <- merge(surv.df, ref.cd4, all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]
   surv.df <- merge(surv.df, dem, all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]

#    tmp <- data.frame("pid"=unique(surv.df$pid),
#                      "age.last"=d2$true.last.age[!duplicated(d2$pid)])
#    tmp$age.last <- round(tmp$age.last,4)
   surv.df$age.last <- surv.df$true.last.age

#    surv.df <- merge(surv.df, tmp, all=TRUE)

   surv.df <- subset(surv.df, round(age,4) <= round(age.last,4))

   surv.df$death <- ifelse(is.na(surv.df$age_at_death),0,
                    ifelse((round(surv.df$age,4) == 
                           round(surv.df$age.last,4)) | 
                           (surv.df$age + 1/12 > surv.df$age.last),1,0))

   ## May need to remove??
   d2.new <- subset(d2, pid %in% surv.df$pid)
   oi.df <- data.frame("pid"=unique(d2.new$pid),
                       "age.oi"=d2.new$age.oi[!duplicated(d2.new$pid)])
   surv.df <- merge(surv.df, oi.df, all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]
   locate.oi <- NULL
   for(i in 1:length(unique(surv.df$pid))){
      locate.oi[i] <- min(which(surv.df$pid == unique(surv.df$pid)[i] &
                                round(surv.df$age ,4) >= 
                                round(surv.df$age.oi, 4)))
   }
   locate.oi <- locate.oi[locate.oi != Inf]
   surv.df$oi <- 0
   surv.df$oi[locate.oi] <- 1
   ## Temporary fix...how to capture in above loop??
   surv.df$oi <- ifelse(is.na(surv.df$age.oi),0,
                  ifelse((surv.df$age + 1/12) > surv.df$age.last & 
                        (surv.df$age.oi + 1/12) > surv.df$age.last,1,
                        surv.df$oi))

   surv.df$event <- ifelse(surv.df$death == 0 & surv.df$oi == 0,0,1)

   #Adding CD4 counts
   labage.1 <- surv.df[,c("pid","age","cd4")]
   labage.2 <- subset(d2, pid %in% surv.df$pid,
                      select=c(pid,age,cd4))
   names(labage.2) <- c("pid","age","cd4")
   labage.2 <- subset(labage.2, !is.na(cd4))

   pullData <- add.fun(labage.1, labage.2)

   surv.df$cd4 <- pullData[pullData$order==1,"cd4"]
#    surv.df$age.cd4 <- pullData[pullData$order==1,"age.cd4."]
   surv.df$age <- round(surv.df$age, 4)
   surv.df$age.fhaart <- round(surv.df$age.fhaart,4)
   surv.df$age.cd4 <- pullData[pullData$order==1, "age.cd4"]
   surv.df$time.to.cd4 <- 12*(surv.df$age - surv.df$age.cd4)


   #Adding CD4 percents
   labage.1 <- surv.df[,c("pid","age","cd4_percent")]
   labage.2 <- subset(d2, pid %in% surv.df$pid,
                      select=c(pid,age,cd4_percent))
   names(labage.2) <- c("pid","age","cd4_percent")
   labage.2 <- subset(labage.2, !is.na(cd4_percent))

   pullData <- add.fun(labage.1, labage.2)

   surv.df$cd4_percent <- pullData[pullData$order==1,"cd4_percent"]
#    surv.df$age.cd4_percent <- pullData[pullData$order==1,"age.cd4_percent"]
   surv.df$age <- round(surv.df$age, 4)
   surv.df$age.cd4_percent <- pullData[pullData$order==1, "age.cd4_percent"]
   surv.df$time.to.cd4_percent <- 12*(surv.df$age - surv.df$age.cd4_percent)

   #Adding VL
   labage.1 <- surv.df[,c("pid","age","vl")]
   labage.2 <- subset(d2, pid %in% surv.df$pid,
                      select=c(pid,age,vl))
   names(labage.2) <- c("pid","age","vl")
   labage.2 <- subset(labage.2, !is.na(vl))

   pullData <- add.fun(labage.1, labage.2)

   surv.df$vl <- pullData[pullData$order==1,"vl"]
   surv.df$age <- round(surv.df$age, 4)
   surv.df$age.vl <- pullData[pullData$order==1, "age.vl"]
   surv.df$time.to.vl <- 12*(surv.df$age - surv.df$age.vl)


   ## Need to ensure that for both cd4 and vl that age.cd4 and age.vl
   ## are less than age.fhaart when fhaart==1
   # Adjusting those who started HAART and had a CD4 count in that month after
   # the age.fhaart
   which.cd4 <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.cd4,4) &
                      surv.df$fhaart == 1 & 
                      round(surv.df$age.cd4,4) > round(surv.df$baseline.age,4))
   surv.df$cd4[which.cd4] <- surv.df$cd4[(which.cd4-1)]
   surv.df$age.cd4[which.cd4] <- surv.df$age.cd4[(which.cd4-1)]

   # Adjusting those who started HAART and had a CD4 count in that month after
   # the age.fhaart
   which.vl <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.vl,4) &
                      surv.df$fhaart == 1 & 
                      round(surv.df$age.vl,4) > round(surv.df$baseline.age,4))
   surv.df$vl[which.vl] <- surv.df$vl[(which.vl-1)]
   surv.df$age.vl[which.vl] <- surv.df$age.vl[(which.vl-1)]

   # Adjusting those who started HAART and had a CD4 count in that month after
   # the age.fhaart
   which.cd4.pct <- which(round(surv.df$age.fhaart,4) <
                          round(surv.df$age.cd4_percent,4) & 
                          surv.df$fhaart == 1 & 
                      round(surv.df$age.cd4_percent,4) >
                      round(surv.df$baseline.age,4))
   surv.df$cd4_percent[which.cd4.pct] <- surv.df$cd4_percent[(which.cd4.pct-1)]
   surv.df$age.cd4_percent[which.cd4.pct] <-
                               surv.df$age.cd4_percent[(which.cd4.pct-1)]


   ## Last row no longer row where they fell below 200.  It is now
   ## either:
   ## 1) where fhaart=1 for trt200=1 group, or
#    ## 2) age.bl200 + avg.start for trt200=0 group
   ## !! Need to change this code here !!!
   ## Using locate.startrow...getting Inf for startrow locations????
   locate.firstrow <- locate.rowlt200 <- NULL
   locate.startrow <- locate.lastrow <- NULL
   for(i in 1:length(unique(surv.df$pid))){
      locate.firstrow[i] <- min(which(surv.df$pid==
                                unique(surv.df$pid)[i]))
   #    &                                !is.na(sub.cd4)))
      locate.startrow[i] <- max(which(surv.df$pid==
                                unique(surv.df$pid)[i] & 
                               round(surv.df$age,4) <=
                               round(surv.df$baseline.age,4)),na.rm=TRUE)
   }


   ## Need to informatively censor subjects in trt200=0 group who either started
   ## HAART before falling below 100 or who didn't start HAART within 
   ## "start.within" months of falling below 100
   locate.cd4lt100 <- NULL
   for(i in 1:length(unique(surv.df$pid))){
      locate.cd4lt100[i] <- min(which(surv.df$pid==unique(surv.df$pid)[i] & 
                                   !is.na(surv.df$cd4) & surv.df$trt.group==0 &
                                   surv.df$pid %nin% unique(tmp.200$pid) &
                                   surv.df$cd4 < l.limit), na.rm=TRUE)
   }

   tmp <- data.frame("pid"=unique(as.character(surv.df$pid)),
          "age.less.llimit"=round(surv.df$age.cd4[locate.cd4lt100],4))
   tmp$age.less.llimit <- as.numeric(as.character(tmp$age.less.llimit))


   surv.df <- merge(surv.df, tmp, all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$month),]

   surv.df$inform.censor <- ifelse(surv.df$trt.group == 0 & 
                                   surv.df$pid %nin% tmp.200$pid &
   ((!is.na(surv.df$age.fhaart) & is.na(surv.df$age.less.llimit) & 
     surv.df$age >= surv.df$age.fhaart) | 
   (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & 
   surv.df$age.fhaart < surv.df$age.less.llimit & 
   surv.df$age >= surv.df$age.fhaart) |
   (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) &
   surv.df$age.fhaart - surv.df$age.less.llimit > start.within &
   surv.df$age > surv.df$age.less.llimit + start.within) |
   (is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & 
   surv.df$age - surv.df$age.less.llimit > start.within)),1,0)


   # (all(is.na(surv.df$age.fhaart)))), 1, 0)
   ## If I were to keep the above line of code, then that censors folks who
   ## were in the trt200=0 group who never fell below 100 and never started 
   ## HAART.  They are not non-compliant.

   ## Removing rows from surv.df after:
   ## 1)  A patient had an event
   ## 2)  A patient was informatively censored
   ## 3)  Age in surv.df exceeds that actual age of the last observation
   ##     for that patient. (last one already done

   ## (1)
   id_list <- split(surv.df, surv.df$pid)

   #pat <- id_list[[1]]

   #dest <- which(diff(pat$event) < 0) + 1
   #if(length(dest<- which(diff(pat$event) < 0) + 1)) pat[seq(from=dest,
   #to=nrow(pat)),]
   junk3 <- do.call(rbind, 
           lapply(id_list, function(pat)
           pat[(seq(along=pat$event) < c(which(pat$event | pat$inform.censor),
                                          nrow(pat))[1] + 1) & 
   pat$age <= (pat$age.last+1/12),]))

   ## For those records where inform.censor and event both = 1
   ## need to determine which occurred first -- change in trt
   ## or event.
   junk3.b <- subset(junk3, event==1 & inform.censor==1)
   unique.id.b <- unique(junk3.b$pid)

   min.age <- NULL
   for(i in 1:length(unique.id.b)){
      min.age[i] <- min(c(junk3.b$age.fhaart[junk3.b$pid==unique.id.b[i]],
                          junk3.b$age.oi[junk3.b$pid==unique.id.b[i]],
                          junk3.b$age.at.death[junk3.b$pid==unique.id.b[i]]),
                                        na.rm=TRUE)
   }

   junk3.b$event <- ifelse(round(junk3.b$age.oi,4)==round(min.age,4) | 
                           round(junk3.b$age_at_death,4)==
                             round(min.age,4), 1, 0)
   junk3.b$inform.censor <- ifelse(junk3.b$age.fhaart==min.age & 
                                   !is.na(junk3.b$age.fhaart),1,
                             ifelse((is.na(junk3.b$age.fhaart) &
                                    junk3.b$age > min.age),0,1))

   row.2.dl <- which(junk3$event==1 & junk3$inform.censor==1)
   if(length(row.2.dl) > 0){
      junk3 <- junk3[-row.2.dl,]
      junk3 <- merge(junk3, junk3.b, all=TRUE)
   }

   junk3 <- junk3[order(junk3$pid,junk3$age),]

   surv.df <- junk3

   return(surv.df=surv.df)

#    return(list(surv.df=surv.df,
#                avg.start=avg.start))
}

# analysis <- function(start.within=start.within,avg.start=avg.start, surv.df,
#                      No.CD4,IPW,No.PS){
analysis <- function(start.within=start.within,surv.df,No.CD4,IPW,No.PS){

   ######### Pooled Logistic regression for censoring

#    library(Design)

#    surv.df$vl<-ifelse(surv.df$vl<400,399,surv.df$vl)

   surv.df$logvl<-log10(surv.df$vl)

   first.age<-unlist(tapply(d2$age[d2$pid %in% surv.df$pid],
      INDEX=d2$pid[d2$pid %in% surv.df$pid], FUN=head,n=1))

   data.junk<-data.frame(pid=names(first.age),first.age=as.numeric(first.age))

   surv.df<-merge(surv.df,data.junk,all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]

   surv.df$time.in.care<-surv.df$baseline.age-surv.df$first.age


   baseline.cd4 <- unlist(tapply(surv.df$cd4, INDEX=surv.df$pid, FUN=head,n=1))
   baseline.time.to.cd4 <- unlist(tapply(surv.df$time.to.cd4, 
                                         INDEX=surv.df$pid,
                                         FUN=head,n=1))
   baseline.cd4_percent <- unlist(tapply(surv.df$cd4_percent, INDEX=surv.df$pid,
                                     FUN=head,n=1))
   baseline.time.to.cd4_percent <- unlist(tapply(surv.df$time.to.cd4_percent, 
                                         INDEX=surv.df$pid,
                                         FUN=head,n=1))
   baseline.logvl <- unlist(tapply(surv.df$logvl, INDEX=surv.df$pid,
                                     FUN=head,n=1))
   baseline.time.to.logvl <- unlist(tapply(surv.df$time.to.vl,
                                    INDEX=surv.df$pid,
                                    FUN=head,n=1))

   ## Imputing baseline logvl based on their cd4 count
   impute.lm <- lm(baseline.logvl ~ sqrt(baseline.cd4))
   baseline.logvl <- ifelse(is.na(baseline.logvl),
             impute.lm$coeff[1]+impute.lm$coeff[2]*sqrt(baseline.cd4),
                                baseline.logvl)
   tmp.logvl <- data.frame(pid=names(baseline.logvl),
                         baseline.logvl=as.numeric(baseline.logvl))
   surv.df <- merge(surv.df, tmp.logvl, all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]

   surv.df$logvl <- ifelse(is.na(surv.df$logvl), surv.df$baseline.logvl,
                                                   surv.df$logvl)

#    mod200.pl.1<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+
#                     prior.oi+prior.art+sqrt(cd4)+ cd4.pct +
#                     sqrt(time.to.cd4)+logvl+sqrt(time.to.vl)+
#                     sqrt(time.in.care),
#                     subset=month > (3-avg.start*12) & trt.group==0,
#                     data=surv.df)
   trt200.ids <- unique(surv.df$pid[surv.df$trt.group == 1])
#    mod200.pl.1 <- lrm(inform.censor ~ rcs(month,4) + black + 
#                                       idu + male + baseline.age + 
#                                       rcs(sqrt(cd4),4) + 
# #                                       cd4_percent +
#                                       logvl + sqrt(time.in.care),
#                                       data=surv.df,
#                                       subset=month >
#                                              (start.within*12-avg.start*12) & 
#                                              trt.group==0 &
#                                              pid %nin% trt200.ids)
   mod200.pl.1 <- lrm(inform.censor ~ rcs(month,4) + black + 
                                      idu + male + baseline.age + 
                                      rcs(sqrt(cd4),4) + 
                                      logvl + sqrt(time.in.care),
                                      data=surv.df,
                                      subset=month >
                                             start.within*12 & 
                                             trt.group==0 &
                                             pid %nin% trt200.ids)

   p200.cens.1<-exp(predict(mod200.pl.1))/(1+exp(predict(mod200.pl.1)))

#    mod200.pl.2<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+
#                     prior.oi+ prior.art,subset=month >(3-avg.start*12) &
#                     trt.group==0,
#                     data=surv.df)
#    mod200.pl.2 <- lrm(inform.censor ~ rcs(month,4)+black+idu+male+
#                                       baseline.age,
#                       subset=month > (start.within*12-avg.start*12) &
#                                       trt.group==0 & 
#                                       pid %nin% trt200.ids,data=surv.df)
   mod200.pl.2 <- lrm(inform.censor ~ rcs(month,4)+black+idu+male+
                                      baseline.age,
                      subset=month >  start.within*12 &
                                      trt.group==0 & 
                                      pid %nin% trt200.ids,data=surv.df)

   p200.cens.2<-exp(predict(mod200.pl.2))/(1+exp(predict(mod200.pl.2)))

   sw200.init<-(1-p200.cens.2)/(1-p200.cens.1)

   sw.init.df<-data.frame(row.id=names(sw200.init),sw200.init)
   surv.200.junk<-data.frame(row.id=rownames(surv.df),surv.df)

   junk4<-merge(surv.200.junk,sw.init.df,by.x="row.id",by.y="row.id",all=TRUE)
   junk4 <- junk4[order(junk4$pid,junk4$age),]

   sw200.temp<-ifelse(is.na(junk4$sw200.init),1,junk4$sw200.init)

   sw200 <- unlist(tapply(sw200.temp, INDEX=surv.df$pid, FUN=cumprod))


   library(geepack)
#    surv.df$rate.cd4.chg <- -with(surv.df,
#                      (ref.cd4-cd4.gt200)/(age.ref.cd4-age.gt200))
#    surv.df$lograte.cd4.chg <- log10(surv.df$rate.cd4.chg)

   tmp.cd4 <- data.frame(pid=names(baseline.cd4),
                         baseline.cd4=as.numeric(baseline.cd4))
   tmp.time.to.cd4 <- data.frame(pid=names(baseline.time.to.cd4),
                         baseline.time.to.cd4=as.numeric(baseline.time.to.cd4))
   tmp.cd4.pct <- data.frame(pid=names(baseline.cd4_percent),
                         baseline.cd4_percent=as.numeric(baseline.cd4_percent))
   tmp.time.to.cd4.pct <- data.frame(pid=names(baseline.time.to.cd4_percent),
         baseline.time.to.cd4_percent=as.numeric(baseline.time.to.cd4_percent))
   tmp.logvl <- data.frame(pid=names(baseline.logvl),
                         baseline.vl=as.numeric(baseline.logvl))
   tmp.time.to.logvl <- data.frame(pid=names(baseline.time.to.logvl),
                     baseline.time.to.logvl=as.numeric(baseline.time.to.logvl))

   junk<-unique(subset(surv.df,select=c(pid,trt.group,baseline.age,idu,
                       black,male,
                       time.in.care,baseline.logvl)))
   junk <- junk[!duplicated(junk$pid),]
   junk$trt.group <- ifelse(junk$pid %in% trt200.ids,1,junk$trt.group)

   junk <- merge(merge(junk,tmp.cd4,all=TRUE),tmp.cd4.pct,all=TRUE)
   junk <- merge(junk, tmp.time.to.cd4,all=TRUE)
   junk <- merge(junk, tmp.time.to.cd4.pct,all=TRUE)
   junk <- merge(junk, tmp.logvl,all=TRUE)
   junk <- merge(junk, tmp.time.to.logvl,all=TRUE)
                  ## add cd4, cd4pct, and logvl at baseline and then include in 
                  ## glm below...a few id's that have NA's for vl so don't
                  ## get values for predict so error when creating merge.junk

#    mod200.ps<-glm(trt.group~baseline.age+idu+black+male+prior.oi+prior.art+
#                   sqrt(time.in.care)+baseline.cd4+baseline.cd4.pct+
#                   baseline.logvl+lograte.cd4.chg,
#                   data=junk,family=binomial)
   if(No.CD4){
      mod200.ps <- glm(trt.group~black+idu+male+baseline.age+
                              baseline.logvl+
                              sqrt(time.in.care),data=junk,family=binomial)
   } else {
      mod200.ps <- glm(trt.group~black+idu+male+baseline.age+
                                 rcs(baseline.cd4,3) +
                                 baseline.logvl+
                                 sqrt(time.in.care),data=junk,family=binomial)
   }

   ps200<-exp(predict(mod200.ps))/(1+exp(predict(mod200.ps)))

   merge.junk<-data.frame(pid=junk$pid,ps200=ps200)
   surv.df<-merge(surv.df,merge.junk,all=TRUE)
   surv.df <- surv.df[order(surv.df$pid,surv.df$age),]

   if(IPW){
      surv.df$ps200.new <- ifelse(surv.df$trt.group == 1,surv.df$ps200,
                                  (1-surv.df$ps200))
   }

   surv.df$pid1<-factor(surv.df$pid)

   event<-surv.df$event
   month<-surv.df$month
   id<-factor(surv.df$pid)
   sw200<-as.numeric(sw200)
   if(IPW){
      ps200.new <- surv.df$ps200.new
      #-------------------------------------------#
      # If using IPW and no PS
      sw200 <- sw200/ps200.new
      #-------------------------------------------#
   }
   sw200.new <- ifelse(sw200 < quantile(sw200,probs=0.05),
                           quantile(sw200,probs=0.05),
                   ifelse(sw200 > quantile(sw200,probs=0.95),
                          quantile(sw200,probs=0.95),
                          sw200))
   ps200<-surv.df$ps200
   trt.group <- surv.df$trt.group

   rcs.month<-rcs(sqrt(month),3)

   rcs.month1<-as.vector(rcs.month[,1])
   rcs.month2<-as.vector(rcs.month[,2])

   library(sandwich)
   if(IPW | No.PS){
      mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group,
                    family=binomial, weights=sw200.new)
   } else {
      mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group + 
                    ps200,family=binomial, weights=sw200.new)
   }

   vbeta <- diag(sandwich(mod200))[names(mod200$coeff)=="trt.group"]

   # Taking reciprocal since interested in trt.group = 1 as reference
   beta <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"])
   uci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]-qnorm(0.975)*sqrt(vbeta))
   lci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]+qnorm(0.975)*sqrt(vbeta))

   stuff <- unique(subset(surv.df, select=c(pid,event)) )
   locate <- NULL 
   for(i in 1:length(unique(stuff$pid))){ 
      locate[i] <- which(stuff$pid==unique(stuff$pid)[i] &
            stuff$event==max(stuff$event[stuff$pid==unique(stuff$pid)[i]])) 
   }
   stuff3 <- stuff[locate,] 

   stuff <- unique(subset(surv.df, select=c(pid,trt.group)))
   locate <- NULL
   for(i in 1:length(unique(stuff$pid))){
      locate[i] <- which(stuff$pid == unique(stuff$pid)[i] & 
         stuff$trt.group==max(stuff$trt.group[stuff$pid==unique(stuff$pid)[i]]))
   }
   stuff4 <- stuff[locate,]

   stuff3 <- merge(stuff3,stuff4,all=TRUE)

   table.trtgroup <- table(stuff3$trt.group,stuff3$event) 

   n.ic <- sum(surv.df$inform.censor)


   return(list(beta=beta,
               lci=lci,
               uci=uci,
               table.trtgroup=table.trtgroup,
               n.ic=n.ic,
               surv.df=surv.df))
}


   #------------------------#
   # Loading data           #
   #------------------------#
#    d2 <- read.dta("when_to_start_01oct2007.dta")
#    d2 <- read.dta("when_to_start_19nov2009.dta")
   d2 <- read.dta("when_to_start_03dec2009.dta")
   d2 <- upData(d2, rename=c(cfar_pid="pid",
                             ivdu="idu",
                             indicator_first_haart_0_1="fhaart",
#                              age_at_first_haart="age.fhaart",
                             cd4_count="cd4",
                             indicator_oi_0_1="oi",
                             age_at_oi="age.oi",
                             indicator_non_haart_art_0_1="art"),
                     drop=c("age_at_first_haart",
                            "fhaart"))
   d2 <- d2[!duplicated(d2),]

   d2$male <- d2$sex - 1
#    non.ade <- read.table("w2s_non_ades_dx_17feb2009.csv",header=TRUE,sep=",")
   non.ade <- read.table("w2s_non_ades_dx_18nov2009.csv",header=TRUE,sep=",")
   names(non.ade) <- tolower(names(non.ade))
   non.ade.list <- c("CORONARY ARTERY DISEASE, NOS",
                     "ACUTE MYOCARDIAL INFARCTION",
                     "CIRRHOSIS OF LIVER, NOS",
                     "ESOPHAGEAL VARICES, NOS",
                     "HEPATIC FAILURE, NOS",
                     "ADENOCARCINOMA, INTESTINAL TYPE",
                     "MALIGNANT NEOPLASM OF COLON, UNSPECIFIED",
                     "SMALL CELL CARCINOMA, NOS",
                     "MELANOMA OF SKIN, SITE UNSPECIFIED",
                     "SQUAMOUS CELL CARCINOMA OF ANUS",
                     "END STAGE RENAL DISEASE",
                     "ACUTE VIRAL HEPATITIS C WITH HEPATIC COMA",
                     "ALCOHOLIC CIRRHOSIS OF LIVER",
            "CEREBRAL ARTERY OCCLUSION, UNSPECIFIED, WITH CEREBRAL INFARCTION",
                     "CEREBRAL HEMORRHAGE",
                     "CHRONIC HEPATITIS C WITH HEPATIC COMA",
                     "CHRONIC LIVER DISEASE AND CIRRHOSIS",
                     "CIRRHOSIS OF LIVER",
                     "CORONARY ARTERIOSCLEROSIS",
                     "CORONARY ARTERY DISEASE",
                     "CORONARY ATHEROSCLEROSIS",
                     "HEPATIC ENCEPHALOPATHY",
                     "MELANOMA OF SKIN, SITE UNSPECIFIED",
                     "MYOCARDIAL INFARCTION, NOS",
                     "HEPATOCELLULAR CARCINOMA, NOS",
                     "SQUAMOUS CELL CARCINOMA, KERATINIZING, NOS",
                     "SUBARACHNOID HEMORRHAGE",
                     "ANGINA PECTORIS",
                     "ASCITES, NOS",
                     "CARDIOVASCULAR DISEASE, NOS",
                     "CEREBROVASCULAR ACCIDENT, NOS",
                     "CHRONIC CONGESTIVE HEART FAILURE",
                     "CHRONIC HYPERTENSION IN OBSTETRIC CONTEXT, NOS",
                     "CHRONIC KIDNEY DISEASE, STAGE V",
                     "CONGESTIVE HEART DISEASE",
                     "CONGESTIVE HEART FAILURE",
paste("DIABETES MELLITUS WITHOUT MENTION OF COMPLICATION,",
      " TYPE II OR UNSPECIFIED TYPE, NOT STATES AS UNCONTROLLED",sep=""),
                      "ESOPHAGEAL VARICES WITHOUT MENTION OF BLEEDING",
                      "HYPERTENSION IN THE OBSTETRIC CONTEXT, NOS",
                      "MALIGNANT NEOPLASM OF RECTUM",
                      "PRE-ECLAMPSIA, NOS",
                      "SQUAMOUS CELL CARCINOMA, NOS")
   non.ade <- subset(non.ade, non_ade_diagnosis %in% non.ade.list)

   non.ade <- upData(non.ade, rename=c(cfar_pid="pid",
                                       age_at_non_ade_diagnosis_onset=
                                          "age.nonade.onset"))
#    map.icd9 <- read.table("Non-AIDS CCC vs Africa_14Nov08.csv",
#                           header=TRUE, sep=",")
#    names(map.icd9) <- tolower(names(map.icd9))
#    map.icd9 <- upData(map.icd9, drop="x")


#    trt <- read.table("when_to_start_art_16oct2008.csv",header=TRUE,sep=",")
#    trt <- read.table("when_to_start_art_18nov2009.csv",header=TRUE,sep=",")
   trt <- read.table("when_to_start_art_03dec2009.csv",header=TRUE,sep=",")
   names(trt) <- tolower(names(trt))
   trt$art.0 <- ifelse(trt$haart == 1,0,
                       ifelse(trt$haart == 0 & trt$regimen == " ",0,1))

   trt <- upData(trt, rename=list(cfar_pid="pid",
                                  age_at_rx_end="age.rx.end",
                                  age_at_rx_start="age.rx.start",
                                  first_haart="fhaart"))

#    oi.df <- read.csv("when_to_start_year1_oi_01dec2008.csv", header=TRUE,
#                        sep=",")
   oi.df <- read.csv("when_to_start_year1_oi_18nov2009.csv", header=TRUE,
                       sep=",")
   names(oi.df) <- tolower(names(oi.df))
   oi.df <- upData(oi.df, rename=c(cfar_pid="pid",
                                   year_of_first_visit="year.1st.visit",
                                   indicator_oi_0_1="oi",
                                   age_at_oi="age.oi",
                                   oi_1="oi.1",
                                   oi_2="oi.2",
                                   oi_3="oi.3",
                                   oi_4="oi.4"))
   oi.df$oi.2 <- ifelse(oi.df$oi.2 == "",NA,as.character(oi.df$oi.2))
   oi.df$oi.3 <- ifelse(oi.df$oi.3 == "",NA,as.character(oi.df$oi.3))
   oi.df$oi.4 <- ifelse(oi.df$oi.4 == "",NA,as.character(oi.df$oi.4))
   oi.class <- read.csv("OIList_final.csv", header=TRUE)
   names(oi.class) <- tolower(names(oi.class))
   oi.class$oi <- tolower(as.character(oi.class$oi))
   oi.class$class <- as.character(oi.class$class)
   oi.class$class <-  gsub("^([ \t\n\r\f\v]+)|([ \t\n\r\f\v]+)$", "", 
                           oi.class$class)
   oi.tmp <- unlist(strsplit(oi.class$oi, split="\""))
   oi.tmp <- oi.tmp[oi.tmp != ""]
   oi.class$oi <- oi.tmp

   #-------------------------------------------#
   # Prior ART use
   art <- read.csv("when_to_start_prior_art_20nov09.csv",sep=",",header=TRUE)
   names(art) <- tolower(names(art))
   names(art) <- c("pid","has_art_before_first_visit")
   #-------------------------------------------#

   #-------------------------------------------#
   # Prior OI
   oi <- read.csv("when_to_start_prior_oi_19apr2010.csv",sep=",",header=TRUE)
   names(oi) <- tolower(names(oi))
   names(oi) <- c("pid","has_oi_before_first_visit")
   #-------------------------------------------#


#    # Some PIDs were not HAART naive at start of study
#    delete.pids <-trt$pid[trt$haart_before_first_visit==1]
#    d2 <- subset(d2, pid %nin% delete.pids)
#    trt <- subset(trt, pid %nin% delete.pids)
#    oi.df <- subset(oi.df, pid %nin% delete.pids)
#    oi.df <-subset(oi.df, oi == 1)
#    non.ade <- subset(non.ade, pid %nin% delete.pids)
   save(trt,file="trt.rda",compress=TRUE)


#    ## Need to remove first record of ID=993147H
#    tmp.d2 <- subset(d2, pid != "993147H")
#    tmp2.d2 <- subset(d2, pid == "993147H")
#    tmp2.d2 <- tmp2.d2[duplicated(tmp2.d2$pid),]
#    # Also need to remove this info from oi.df
#    which.row <- which(oi.df$pid == "993147H" & round(oi.df$age.oi ==
# 21.457,5))
#    oi.df <- oi.df[-which.row,]
# 
#    d2 <- merge(tmp.d2,tmp2.d2,all=TRUE)
#    d2 <- d2[order(d2$pid, d2$age),]

   # Having to do this because one pid had an age.oi > age.death
   d2$age_at_death <- ifelse(is.na(d2$age_at_death), NA,
                             ifelse(!is.na(d2$age.oi) &
                                    d2$age.oi > d2$age_at_death,
                                    d2$age.oi, d2$age_at_death))

#    save(d2, file="d2.rda", compress=TRUE)

#    age.fhaart <- subset(d2, !is.na(age.fhaart), select=c(pid,age.fhaart))
#    d2 <- upData(d2, drop=c("age.fhaart"))
#    d2 <- merge(d2, age.fhaart, all=TRUE)
#    d2 <- d2[order(d2$pid, d2$age),]


   # Collapsing duplicate ages for a given PID
#    d2$age <- as.integer(round(d2$age*365.25))
#    source("collapseSolution.R")
#    save(new.d2, file="newd2.rda")
#    d2 <- new.d2
#    d2$age <- d2$age/365.25
   save(d2, file="d2.rda")


   no.visits <- tapply(d2$pid, INDEX=d2$pid, FUN=length)
   #---------------------------------------#
   #     Adding in treatment data          #
   #---------------------------------------#
   source("solution.R")
   save(new.sub, file="newsub.rda")
   new.sub <- new.sub[order(new.sub$pid, new.sub$age),]
   age.fhaart.df <- data.frame("pid"=trt$pid[trt$fhaart==1 &
                                             !is.na(trt$fhaart)],
                               "age.fhaart"=trt$age.rx.start[trt$fhaart == 1 &
                                                         !is.na(trt$fhaart)])
   d2 <- new.sub
   d2$age <- round(d2$age/365.25,4)
   d2$age.rx.start <- round(d2$age.rx.start/365.25,4)
   d2$age.rx.end <- round(d2$age.rx.end/365.25,4)
   d2 <- merge(d2, age.fhaart.df, all=TRUE)
   d2 <- d2[order(d2$pid, d2$age),]
   d2$age.fhaart <- round(d2$age.fhaart/365.25,4)
   d2$oi <- ifelse(is.na(d2$oi), 0, d2$oi)
   d2 <- upData(d2, drop="art")
   d2 <- upData(d2, rename=c(art.0="art"))

   # Merging in ART use prior to first clinic visit
   d2 <- merge(d2, art, all=TRUE)
   d2 <- d2[order(d2$pid,d2$age),]

   # Merging in OI event prior to first clinic visit
   d2 <- merge(d2, oi, all=TRUE)
   d2 <- d2[order(d2$pid,d2$age),]


   no.visits.2 <- tapply(d2$pid, INDEX=d2$pid, FUN=length)

   save(d2, file="postsold2.rda")
   age.lt.age.fhaart <- unique(d2$pid[which(d2$age < d2$age.fhaart)])

   first.age <- unlist(tapply(d2$age, INDEX=d2$pid, FUN=head, n=1))
   tmp <- data.frame("pid"=names(first.age),
                  "first.age"=first.age)
   d2 <- merge(d2, tmp, all=TRUE)
   d2 <- d2[order(d2$pid, d2$age),]

   unique.id<-unique(d2$pid)


#----------------------------------------------------------#
# start.within is expressed in years, not months, and is how treatment
# groups are assigned (ie., start HAART within "start.within" months of
# falling below the upper limit
# original is a logical.  When TRUE, all assigned to trt 0 when they fall
# below the upper limit until they start HAART within start.within years.
# When FALSE, all assigned to trt group based on whether the time between
# age.less.ulimit and age.fhaart within start.within years.
# No.CD4 is a logical indicating whether CD4 should be used in computing
# the propensity scores.  IPW is a logical indicating whether no
# propensity score should be used in the model but rather incorporate them
# into the weights.
start.within <- 1/4
old.code <- FALSE
original <- FALSE
No.CD4 <- TRUE
IPW <- FALSE
No.PS <- FALSE
restrict <- FALSE

save(d2, file="d2.rda",compress=TRUE)

df.500.400 <- create.df(start.within=start.within,restrict=restrict,
                        u.limit=501,l.limit=401,d2=d2,old.code=old.code,
                        original=original)

save(df.500.400,file="df500400.rda",compress=TRUE)
load("df500400.rda")
df.500.400 <- df.500.400[order(df.500.400$pid,df.500.400$month),]
# avg.start <- df.500.400$avg.start

# analysis1 <- analysis(start.within=start.within,avg.start=avg.start,
#                       surv.df=surv.500.400,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
analysis1 <- analysis(start.within=start.within,surv.df=df.500.400, No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
save(analysis1, file="analysis500400.rda",compress=TRUE)

beta.500.400 <- analysis1[[1]]
lci.500.400 <- analysis1[[2]]
uci.500.400 <- analysis1[[3]]
table.500.400 <- analysis1[[4]]
n.ic.500.400 <- analysis1[[5]]
# surv.500.400 <- analysis1[[6]]

#----------------------------------------------------------#
df.400.300 <- create.df(start.within=start.within,restrict=FALSE,
                        u.limit=401,l.limit=301,d2=d2,old.code=old.code,
                        original=original)

save(df.400.300,file="df400300.rda",compress=TRUE)
load("df400300.rda")
# surv.400.300 <- df.400.300$surv.df
df.400.300 <- df.400.300[order(df.400.300$pid,df.400.300$month),]
# avg.start <- df.400.300$avg.start

# analysis2 <- analysis(start.within=start.within,avg.start=avg.start,
#                       surv.df=surv.400.300,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
analysis2 <- analysis(start.within=start.within,surv.df=df.400.300,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
save(analysis2, file="analysis400300.rda",compress=TRUE)

beta.400.300 <- analysis2[[1]]
lci.400.300 <- analysis2[[2]]
uci.400.300 <- analysis2[[3]]
table.400.300 <- analysis2[[4]]
n.ic.400.300 <- analysis2[[5]]
# surv.400.300 <- analysis2[[6]]

#----------------------------------------------------------#
df.300.200 <- create.df(start.within=start.within,restrict=FALSE,
                        u.limit=301,l.limit=201,d2=d2,old.code=old.code,
                        original=original)

save(df.300.200,file="df300200.rda",compress=TRUE)
load("df300200.rda")
# surv.300.200 <- df.300.200$surv.df
df.300.200 <- df.300.200[order(df.300.200$pid,df.300.200$month),]
# avg.start <- df.300.200$avg.start

# analysis3 <- analysis(start.within=start.within,avg.start=avg.start,
#                       surv.df=surv.300.200,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
analysis3 <- analysis(start.within=start.within,surv.df=df.300.200,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS)
save(analysis3, file="analysis300200.rda",compress=TRUE)

beta.300.200 <- analysis3[[1]]
lci.300.200 <- analysis3[[2]]
uci.300.200 <- analysis3[[3]]
table.300.200 <- analysis3[[4]]
n.ic.300.200 <- analysis3[[5]]
# surv.300.200 <- analysis3[[6]]