rm(list=ls()) #------------------------# # Libraries # #------------------------# library(foreign) library(Hmisc) library(Design) #------------------------# # Functions # #------------------------# #---------------------------------------------------------------------# # add.fun # #---------------------------------------------------------------------# # In creating monthly (time-dependent) data frame, records are inserted # that do not exist in the original data. This creates missing values for # CD4, viral loads, CD4 %, etc. Add.fun fills in these values with the # last non-missing observation and records the age at which that non-missing # value occurred. add.fun <- function(labage.1, labage.2){ var <- names(labage.1)[3] labage.1$order <- 1 labage.2$order <- 0 pullData <- rbind(labage.1,labage.2) pullData$age <- round(pullData$age, 4) pullData <- pullData[order(pullData$pid, pullData$age, -pullData$order),] hasVal <- diff(c(0L,as.integer(as.factor(pullData$pid)))) != 0 | !is.na(pullData[,var]) | (pullData$order == 0) index <- rep(which(hasVal), times=table(cumsum(hasVal))) pullData[,c(var, paste("age", var, sep='.'))] <- pullData[index, c(var, "age")] return(pullData) } #---------------------------------------------------------------------# # create.df # #---------------------------------------------------------------------# # Creates monthly (time-dependent) data frame for analysis create.df <- function(start.within,restrict,u.limit,l.limit,d2,old.code,original){ # Identifying age at which CD4 is less than ulimit by PID less.ulimit <- lapply(split(subset(d2, select=c(pid,age,cd4)), d2$pid), FUN=function(y){ y[min(which(y[,"cd4"] < u.limit)),"age"]}) less.ulimit.df <- data.frame("pid"=names(less.ulimit), "age.less.ulimit"=unlist(less.ulimit)) less.ulimit.df$pid <- as.character(less.ulimit.df$pid) d2[,"age.less.ulimit"] <- less.ulimit.df[match(d2$pid,less.ulimit.df$pid,nomatch=NA), "age.less.ulimit"] d2 <- d2[order(d2$pid, d2$age),] # If a PID has a missing age.less.ulimit, then they did not have a CD4 below the upper # limit and should be deleted from the analysis d2 <- subset(d2, !is.na(age.less.ulimit)) # Time.in.care is the time between the first recorded clinic visit and the age a PIDs CD4 fell # below the upper limit d2$time.in.care <- d2$age.less.ulimit - d2$first.age # If someone died, their true last age should be the age of death # Otherwise it is their last recorded age. true.last.age <- lapply(split(subset(d2, select=c(pid,age,age_at_death)), d2$pid), FUN=function(y){ unique(ifelse(is.na(y[,3]),max(y[,2]),unique(y[,3])))}) tla.df <- data.frame("pid"=names(true.last.age), "true.last.age"=unlist(true.last.age)) tla.df <- tla.df[order(as.character(tla.df$pid)),] tla.df$pid <- as.character(tla.df$pid) d2 <- merge(d2, tla.df, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] # Total follow-up time is the time from when a PIDs CD4 fell below the upper limit to their true.last.age d2$tot.fu <- d2$true.last.age - d2$age.less.ulimit # In the original data, each record at which an OI occurred had a non-missing age.oi. All other records # had a missing age.oi. For analysis purposes, those OIs that occurred between a PIDs first age and # age.less.ulimit were recorded as prior.oi and the age recorded. For those OIs that occurred after # age.less.ulimit, only the first occurrence of an OI will be included in the analysis and the variable # age.oi will repeat the age at which the first OI after age.less.ulimit occurred. The following # code addresses both of these issues. # Defining prior.oi (OI between first.age and age.less.ulimit) age.oi <- subset(d2, !is.na(age.oi), select=c(pid, age.oi, age.less.ulimit)) age.oi$prior.oi <- ifelse(round(age.oi$age.oi,4) <= round(age.oi$age.less.ulimit,4),1,0) # Grabbing first incidence of OI prior to age.less.ulimit if more than one prior.oi age.prior.oi <- unlist(lapply(split(age.oi, age.oi$pid), FUN=function(y){ ifelse(all(y[,"prior.oi"]==0),NA,min(y[y[,"prior.oi"]==1, "age.oi"]))})) age.prior.oi.df <- data.frame("pid"=names(age.prior.oi), "age.prior.oi"=age.prior.oi) age.prior.oi.df$pid <- as.character(age.prior.oi.df$pid) age.oi <- merge(age.oi, age.prior.oi.df, all=TRUE) # Grabbing first incidence of OI after age.less.ulimit if more than one oi age.oi.af <- lapply(split(age.oi, age.oi$pid), FUN=function(y){ ifelse(all(y[,"prior.oi"]==1),NA,min(y[y[,"prior.oi"]==0,"age.oi"]))}) age.oi.af.df <- data.frame("pid"=unlist(names(age.oi.af)), "age.oi"=unlist(age.oi.af)) age.oi.af.df$pid <- as.character(age.oi.af.df$pid) age.oi.2 <- merge(subset(age.oi, !duplicated(pid), select=c(pid,age.prior.oi)), age.oi.af.df, all=TRUE) dup.oi <- subset(age.oi, prior.oi == 0) dup.oi$age.oi <- round(dup.oi$age.oi,4) oi.df$age.oi <- round(oi.df$age.oi,4) oi.df$pid <- as.character(oi.df$pid) oi.df <- oi.df[order(oi.df$pid,oi.df$age.oi),] dup.oi <- merge(dup.oi, subset(oi.df, pid %in% dup.oi$pid),all=TRUE) dup.oi <- subset(dup.oi, prior.oi == 0) dup.oi <- upData(dup.oi, drop=c("year.1st.visit","age.less.ulimit","oi", "prior.oi","age.prior.oi")) dup.oi$oi.1 <- tolower(dup.oi$oi.1) dup.oi$oi.2 <- tolower(dup.oi$oi.2) dup.oi$oi.2 <- ifelse(dup.oi$oi.2 == " ",NA,as.character(dup.oi$oi.2)) dup.oi$oi.3 <- tolower(dup.oi$oi.3) dup.oi$oi.3 <- ifelse(dup.oi$oi.3 == " ",NA,as.character(dup.oi$oi.3)) dup.oi$oi.4 <- tolower(dup.oi$oi.4) dup.oi$oi.4 <- ifelse(dup.oi$oi.4 == " ",NA,as.character(dup.oi$oi.4)) # Getting class of OIs from oi.class dup.oi[,"oi.1.class"] <- oi.class[match(dup.oi$oi.1, oi.class$oi,nomatch=NA),"class"] dup.oi[,"oi.2.class"] <- oi.class[match(dup.oi$oi.2, oi.class$oi,nomatch=NA),"class"] dup.oi[,"oi.3.class"] <- oi.class[match(dup.oi$oi.3, oi.class$oi,nomatch=NA),"class"] dup.oi[,"oi.4.class"] <- oi.class[match(dup.oi$oi.4, oi.class$oi,nomatch=NA),"class"] d2$age.oi <- round(d2$age.oi,4) d2 <- merge(d2, dup.oi, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] # Dropping current age.oi from d2 and adding in age.oi from age.oi.2 # data. Rather than have all OI ages repeated, will just repeat age of first # OI after age.less.ulimit d2 <- upData(d2, drop="age.oi") d2[,"age.oi"] <- age.oi.2[match(d2$pid, age.oi.2$pid, nomatch=NA),"age.oi"] d2[,"age.prior.oi"] <- age.oi.2[match(d2$pid, age.oi.2$pid, nomatch=NA),"age.prior.oi"] d2$prior.oi <- ifelse(!is.na(d2$age.prior.oi),1,0) # Defining prior.art (This is ART before age.less.ulimit but during a clinic # visit. Different from has_art_before_first_visit prior.art.df <- subset(d2, age < age.less.ulimit) prior.art <- tapply(prior.art.df$art, INDEX=prior.art.df$pid, FUN=max) prior.art.df <- data.frame("pid"=names(prior.art), "prior.art"=prior.art) d2[,"prior.art"] <- prior.art.df[match(d2$pid, prior.art.df$pid, nomatch=NA),"prior.art"] d2$prior.art <- ifelse(is.na(d2$prior.art),0, d2$prior.art) # Subsetting out PIDs with prior.oi or prior.art d2 <- subset(d2, prior.oi != 1 & prior.art != 1 & has_art_before_first_visit != 1 & has_oi_before_first_visit != 1) # d2 <- subset(d2, prior.oi != 1 & prior.art != 1) # Had issues in the propensity scores with missing CD4 % but non-missing # CD4 values. Gave different lengths for the predicted probabilities # of time-dependent vs non-time-dependent models. Imputing values # for CD4_% cd4.pct.mod.beta <- lm(d2$cd4_percent ~ d2$cd4)$coeff[-1] d2$cd4_percent <- apply(d2[,c("cd4_percent","cd4")],1, FUN=function(x){ if(is.na(x[1]) & !is.na(x[2])){ x[2]*cd4.pct.mod.beta } else { x[1] }}) pre.d2 <- d2 # save(pre.d2, file=paste("pre.d2.",k,".nonsus500ADE.rda",sep="")) unique.id<-unique(d2$pid) if(restrict){ ## Below..should it be <= or just < ??? ##################### Will this include those w/ CD4 at or below 200 (before else)??? gt.ulimit <- tapply(d2$cd4[round(d2$age,4) <= round(d2$age.less.ulimit,4)], INDEX=d2$pid[round(d2$age,4) <= round(d2$age.less.ulimi,4)], FUN=function(y){ as.numeric(any(y[!is.na(y)] > u.limit))}) gt.ulimit.df <- data.frame("pid"=names(gt.ulimit), "gt.ulimit"=gt.ulimit) d2[,"gt.ulimit"] <- gt.ulimit.df[match(d2$pid, gt.ulimit.df$pid, nomatch=NA),"gt.ulimit"] sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit | is.na(age.fhaart)) & gt.ulimit == 1) sub.d2 <- upData(sub.d2, drop=c("gtkmths","gt.ulimit")) } else { gt.llimit.df <- subset(d2, round(age,4) == round(age.less.ulimit,4), select=c(pid,age,age.less.ulimit,cd4)) gt.llimit.df$gt.llimit <- ifelse(gt.llimit.df$cd4 >= l.limit,1,0) d2[,"gt.llimit"] <- gt.llimit.df[match(d2$pid, gt.llimit.df$pid,nomatch=NA), "gt.llimit"] sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit | is.na(age.fhaart)) & gt.llimit == 1) sub.d2 <- sub.d2[order(sub.d2$pid,sub.d2$age),] sub.d2 <- upData(sub.d2, drop=c("gtkmths","gt.llimit")) } sub.d2 <- sub.d2[order(sub.d2$pid, sub.d2$age),] sub.unique.id <- unique(sub.d2$pid) d2 <- sub.d2 # Defining trt200 group if(!original){ d2$trt.group <- ifelse(is.na(d2$age.fhaart),0, ifelse((d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0)) # Excluding those who have an event: # Trt group 1: Those with an OI between age.less.ulimit & age.fhaart # (Note: Group 1 could not have a death because then # they would not have started HAART; had to start # HAART to be in Group 1). # Trt group 2: Those with an OI or death within start.within of # age.less.ulimit. grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= d2$age.fhaart)]) grp.2.oi.del <- unique(d2$pid[which(d2$trt.group == 0 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= (d2$age.less.ulimit + start.within))]) grp.2.dth.del <- unique(d2$pid[which(d2$trt.group == 0 & !is.na(d2$age_at_death) & d2$age_at_death >= d2$age.less.ulimit & d2$age_at_death <= (d2$age.less.ulimit + start.within))]) d2 <- subset(d2, pid %nin% c(grp1.1.del,grp.2.oi.del,grp.2.dth.del)) # Defining avg start time of trt200 group tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart", "cd4","cd4_percent","vl","lost_to_fu", "age.less.ulimit","trt.group")) tmp <- subset(tmp, round(age,4) >= round(age.less.ulimit,4)) tmp$baseline.age <- tmp$age.less.ulimit } if(original){ d2$trt.group <- ifelse(is.na(d2$age.fhaart),0, ifelse(d2$age < d2$age.fhaart,0, ifelse(d2$age >= d2$age.fhaart & (d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0))) # For those in trt group 1 who had an event before age.fhaart # need to reset them to trt group 0 grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= d2$age.fhaart)]) d2$trt.group <- ifelse(d2$pid %in% grp1.1.del,0,d2$trt.group) tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart", "cd4","cd4_percent","vl","lost_to_fu", "age.less.ulimit","trt.group")) tmp <- subset(tmp, round(age,4) >= round(age.less.ulimit,4)) tmp$baseline.age <- tmp$age.less.ulimit } tmp.200 <- subset(tmp, trt.group==1) tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),] # if(old.code){ # tmp.200$diff <- with(tmp.200, age.fhaart - baseline.age) # avg.start <- mean(tmp.200$diff[!duplicated(tmp.200$pid)], na.rm=TRUE) # # Now subset tmp.200 so that it starts when they first start HAART # tmp.200 <- subset(tmp.200, round(age,4) >= round(age.fhaart,4)) # # Getting new baseline age for trt200 group # tmp.200 <- upData(tmp.200, # drop=c("age.less.ulimit","baseline.age","diff")) # baseline <- aggregate(tmp.200$age, list(pid=tmp.200$pid), FUN=head,n=1) # names(baseline)[2] <- "baseline.age" # tmp.200 <- merge(tmp.200, baseline, all=TRUE) # tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),] # } else { tmp.200 <- upData(tmp.200, drop=c("age.less.ulimit")) # } ## Just taking record of baseline.age??? if(!original){ tmp.200 <- subset(tmp.200, round(age,4) == round(baseline.age,4)) } else { tmp.200 <- tmp.200[!duplicated(tmp.200$pid),] } tmp.100 <- subset(tmp, trt.group==0) # if(old.code){ # tmp.100$start.age <- with(tmp.100, baseline.age + avg.start) # } else { # avg.start <- 0 tmp.100$start.age <- tmp.100$baseline.age # } new.rows <- data.frame(pid=unique(tmp.100$pid), age=tmp.100$start.age[!duplicated(tmp.100$pid)], fhaart=0, # cd4=NA, # cd4_percent=NA, # vl=NA, lost_to_fu=tmp.100$lost_to_fu[!duplicated(tmp.100$pid)], age.fhaart=tmp.100$age.fhaart[!duplicated(tmp.100$pid)], age.less.ulimit=tmp.100$age.less.ulimit[!duplicated(tmp.100$pid)], trt.group=0, baseline.age=tmp.100$baseline.age[!duplicated(tmp.100$pid)], start.age=tmp.100$start.age[!duplicated(tmp.100$pid)]) tmp.100 <- merge(tmp.100, new.rows, all=TRUE) tmp.100 <- tmp.100[order(tmp.100$pid, tmp.100$age),] tmp.100 <- subset(tmp.100, round(age,4) >= round(start.age,4)) # Getting new baseline for trt.group=0 tmp.100 <- upData(tmp.100, drop=c("age.less.ulimit","baseline.age","start.age")) baseline <- aggregate(tmp.100$age, list(pid=tmp.100$pid),FUN=head,n=1) names(baseline)[2] <- "baseline.age" tmp.100 <- merge(tmp.100, baseline, all=TRUE) tmp.100 <- tmp.100[order(tmp.100$pid,tmp.100$age),] tmp.100 <- subset(tmp.100, round(age,4) == round(baseline.age,4)) # Creating data frame for survival analysis # If original=TRUE, this has 2 records for those in trt.group=1 who did # not start HAART at age.less.ulimit tmp.tot <- merge(tmp.200, tmp.100, all=TRUE) tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),] true.last.age.df <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "true.last.age"=d2$true.last.age[!duplicated(d2$pid)]) tmp.tot <- merge(tmp.tot, true.last.age.df, all=TRUE) tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),] # sub.id <- unique(d2$pid[d2$pid %in% tmp.tot$pid]) # sub.unique.id <- unique(sub.id) # last.age <- tapply(d2$age, INDEX=d2$pid, FUN=max,na.rm=TRUE) # last.age.df <- data.frame("pid"=names(last.age), # "last.age"= tmp.tot$follow.up <- tmp.tot$true.last.age - tmp.tot$baseline.age max.follow <- 12*max(tmp.tot$follow.up, na.rm=TRUE) months <- seq(0, ceiling(max.follow),by=1) tmp.tot$month <- 0 if(!original){ tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid), each=length(months[-1])), "month"=rep(months[-1],times=length(unique(tmp.tot$pid))), "age"=rep(tmp.tot$age, each=length(months[-1])), "age.fhaart"=rep(tmp.tot$age.fhaart,each=length(months[-1])), "trt.group"=rep(tmp.tot$trt.group,each=length(months[-1])), "baseline.age"=rep(tmp.tot$baseline.age,each=length(months[-1])), "true.last.age"=rep(tmp.tot$true.last.age,each=length(months[-1])), "lost_to_fu"=rep(tmp.tot$lost_to_fu,each=length(months[-1]))) tmp.tot2$pid <- as.character(tmp.tot2$pid) surv.df <- merge(tmp.tot, tmp.tot2, all=TRUE) surv.df <- surv.df[order(surv.df$pid, surv.df$month),] surv.df$age <- round(surv.df$age + surv.df$month/12,4) } else { tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid), each=length(months[-1])), "month"=rep(months[-1],times=length(unique(tmp.tot$pid))), "age.fhaart"=rep(tmp.tot$age.fhaart[!duplicated(tmp.tot$pid)], each=length(months[-1])), "baseline.age"=rep(tmp.tot$baseline.age[!duplicated(tmp.tot$pid)], each=length(months[-1])), "true.last.age"=rep(tmp.tot$true.last.age[!duplicated(tmp.tot$pid)], each=length(months[-1])), "lost_to_fu"=rep(tmp.tot$lost_to_fu[!duplicated(tmp.tot$pid)], each=length(months[-1]))) tmp.tot2$pid <- as.character(tmp.tot2$pid) tmp.tot2$month <- as.integer(tmp.tot2$month) surv.df <- merge(tmp.tot[!duplicated(tmp.tot$pid), -which(names(tmp.tot) %in% # c("age","trt.group","cd4","cd4_percent","vl"))], c("age","trt.group"))], tmp.tot2,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$age <- surv.df$baseline.age surv.df$age <- surv.df$age + surv.df$month/12 surv.df$trt.group <- ifelse(is.na(surv.df$age.fhaart),0, ifelse(surv.df$age >= surv.df$age.fhaart & (surv.df$age.fhaart - surv.df$baseline.age) < start.within,1,0)) # For those in trt group 1 who had an event before age.fhaart # need to reset them to trt group 0 surv.df$trt.group <- ifelse(surv.df$pid %in% grp1.1.del,0, surv.df$trt.group) } ref.cd4 <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "ref.cd4"=d2$cd4[round(d2$age,4)== round(d2$age.less.ulimit,4)], "age.ref.cd4"=d2$age.less.ulimit[!duplicated(d2$pid)], "ref.cd4_percent"=d2$cd4_percent[round(d2$age,4)== round(d2$age.less.ulimit,4)], "age.ref.cd4_percent"=d2$age.less.ulimit[!duplicated(d2$pid)], "ref.vl"=d2$vl[round(d2$age,4)== round(d2$age.less.ulimit,4)], "age.ref.vl"=d2$age.less.ulimit[!duplicated(d2$pid)], "age.fhaart"=d2$age.fhaart[!duplicated(d2$pid)]) ####HERE!!!!! locate.fhaart <- lapply(split(subset(surv.df, select=c(pid,age,age.fhaart)),surv.df$pid), FUN=function(y){ y[min(which(y[,"age"] >= y[,"age.fhaart"])),"age"] }) locate.fhaart.df <- data.frame("pid"=names(locate.fhaart)[!is.na(locate.fhaart)], "new.age.fhaart"=unlist(locate.fhaart[!is.na(locate.fhaart)])) surv.df[,"new.age.fhaart"] <- locate.fhaart.df[match(surv.df$pid,locate.fhaart.df$pid,nomatch=NA), "new.age.fhaart"] surv.df$fhaart <- ifelse(round(surv.df$age,4) == round(surv.df$new.age.fhaart,4) & !is.na(surv.df$new.age.fhaart),1, ifelse(is.na(surv.df$new.age.fhaart) | round(surv.df$age,4) != round(surv.df$new.age.fhaart,4),0,NA)) dem <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "black"=d2$black[!duplicated(d2$pid)], "male"=d2$male[!duplicated(d2$pid)], "idu"=d2$idu[!duplicated(d2$pid)], "age_at_death"=d2$age_at_death[!duplicated(d2$pid)], "time.in.care"=d2$time.in.care[!duplicated(d2$pid)]) surv.df <- merge(surv.df, ref.cd4, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df <- merge(surv.df, dem, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] # tmp <- data.frame("pid"=unique(surv.df$pid), # "age.last"=d2$true.last.age[!duplicated(d2$pid)]) # tmp$age.last <- round(tmp$age.last,4) surv.df$age.last <- surv.df$true.last.age # surv.df <- merge(surv.df, tmp, all=TRUE) surv.df <- subset(surv.df, round(age,4) <= round(age.last,4)) surv.df$death <- ifelse(is.na(surv.df$age_at_death),0, ifelse((round(surv.df$age,4) == round(surv.df$age.last,4)) | (surv.df$age + 1/12 > surv.df$age.last),1,0)) ## May need to remove?? d2.new <- subset(d2, pid %in% surv.df$pid) oi.df <- data.frame("pid"=unique(d2.new$pid), "age.oi"=d2.new$age.oi[!duplicated(d2.new$pid)]) surv.df <- merge(surv.df, oi.df, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] locate.oi <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.oi[i] <- min(which(surv.df$pid == unique(surv.df$pid)[i] & round(surv.df$age ,4) >= round(surv.df$age.oi, 4))) } locate.oi <- locate.oi[locate.oi != Inf] surv.df$oi <- 0 surv.df$oi[locate.oi] <- 1 ## Temporary fix...how to capture in above loop?? surv.df$oi <- ifelse(is.na(surv.df$age.oi),0, ifelse((surv.df$age + 1/12) > surv.df$age.last & (surv.df$age.oi + 1/12) > surv.df$age.last,1, surv.df$oi)) surv.df$event <- ifelse(surv.df$death == 0 & surv.df$oi == 0,0,1) #Adding CD4 counts labage.1 <- surv.df[,c("pid","age","cd4")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,cd4)) names(labage.2) <- c("pid","age","cd4") labage.2 <- subset(labage.2, !is.na(cd4)) pullData <- add.fun(labage.1, labage.2) surv.df$cd4 <- pullData[pullData$order==1,"cd4"] # surv.df$age.cd4 <- pullData[pullData$order==1,"age.cd4."] surv.df$age <- round(surv.df$age, 4) surv.df$age.fhaart <- round(surv.df$age.fhaart,4) surv.df$age.cd4 <- pullData[pullData$order==1, "age.cd4"] surv.df$time.to.cd4 <- 12*(surv.df$age - surv.df$age.cd4) #Adding CD4 percents labage.1 <- surv.df[,c("pid","age","cd4_percent")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,cd4_percent)) names(labage.2) <- c("pid","age","cd4_percent") labage.2 <- subset(labage.2, !is.na(cd4_percent)) pullData <- add.fun(labage.1, labage.2) surv.df$cd4_percent <- pullData[pullData$order==1,"cd4_percent"] # surv.df$age.cd4_percent <- pullData[pullData$order==1,"age.cd4_percent"] surv.df$age <- round(surv.df$age, 4) surv.df$age.cd4_percent <- pullData[pullData$order==1, "age.cd4_percent"] surv.df$time.to.cd4_percent <- 12*(surv.df$age - surv.df$age.cd4_percent) #Adding VL labage.1 <- surv.df[,c("pid","age","vl")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,vl)) names(labage.2) <- c("pid","age","vl") labage.2 <- subset(labage.2, !is.na(vl)) pullData <- add.fun(labage.1, labage.2) surv.df$vl <- pullData[pullData$order==1,"vl"] surv.df$age <- round(surv.df$age, 4) surv.df$age.vl <- pullData[pullData$order==1, "age.vl"] surv.df$time.to.vl <- 12*(surv.df$age - surv.df$age.vl) ## Need to ensure that for both cd4 and vl that age.cd4 and age.vl ## are less than age.fhaart when fhaart==1 # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.cd4 <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.cd4,4) & surv.df$fhaart == 1 & round(surv.df$age.cd4,4) > round(surv.df$baseline.age,4)) surv.df$cd4[which.cd4] <- surv.df$cd4[(which.cd4-1)] surv.df$age.cd4[which.cd4] <- surv.df$age.cd4[(which.cd4-1)] # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.vl <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.vl,4) & surv.df$fhaart == 1 & round(surv.df$age.vl,4) > round(surv.df$baseline.age,4)) surv.df$vl[which.vl] <- surv.df$vl[(which.vl-1)] surv.df$age.vl[which.vl] <- surv.df$age.vl[(which.vl-1)] # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.cd4.pct <- which(round(surv.df$age.fhaart,4) < round(surv.df$age.cd4_percent,4) & surv.df$fhaart == 1 & round(surv.df$age.cd4_percent,4) > round(surv.df$baseline.age,4)) surv.df$cd4_percent[which.cd4.pct] <- surv.df$cd4_percent[(which.cd4.pct-1)] surv.df$age.cd4_percent[which.cd4.pct] <- surv.df$age.cd4_percent[(which.cd4.pct-1)] ## Last row no longer row where they fell below 200. It is now ## either: ## 1) where fhaart=1 for trt200=1 group, or # ## 2) age.bl200 + avg.start for trt200=0 group ## !! Need to change this code here !!! ## Using locate.startrow...getting Inf for startrow locations???? locate.firstrow <- locate.rowlt200 <- NULL locate.startrow <- locate.lastrow <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.firstrow[i] <- min(which(surv.df$pid== unique(surv.df$pid)[i])) # & !is.na(sub.cd4))) locate.startrow[i] <- max(which(surv.df$pid== unique(surv.df$pid)[i] & round(surv.df$age,4) <= round(surv.df$baseline.age,4)),na.rm=TRUE) } ## Need to informatively censor subjects in trt200=0 group who either started ## HAART before falling below 100 or who didn't start HAART within ## "start.within" months of falling below 100 locate.cd4lt100 <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.cd4lt100[i] <- min(which(surv.df$pid==unique(surv.df$pid)[i] & !is.na(surv.df$cd4) & surv.df$trt.group==0 & surv.df$pid %nin% unique(tmp.200$pid) & surv.df$cd4 < l.limit), na.rm=TRUE) } tmp <- data.frame("pid"=unique(as.character(surv.df$pid)), "age.less.llimit"=round(surv.df$age.cd4[locate.cd4lt100],4)) tmp$age.less.llimit <- as.numeric(as.character(tmp$age.less.llimit)) surv.df <- merge(surv.df, tmp, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$month),] surv.df$inform.censor <- ifelse(surv.df$trt.group == 0 & surv.df$pid %nin% tmp.200$pid & ((!is.na(surv.df$age.fhaart) & is.na(surv.df$age.less.llimit) & surv.df$age >= surv.df$age.fhaart) | (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age.fhaart < surv.df$age.less.llimit & surv.df$age >= surv.df$age.fhaart) | (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age.fhaart - surv.df$age.less.llimit > start.within & surv.df$age > surv.df$age.less.llimit + start.within) | (is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age - surv.df$age.less.llimit > start.within)),1,0) # (all(is.na(surv.df$age.fhaart)))), 1, 0) ## If I were to keep the above line of code, then that censors folks who ## were in the trt200=0 group who never fell below 100 and never started ## HAART. They are not non-compliant. ## Removing rows from surv.df after: ## 1) A patient had an event ## 2) A patient was informatively censored ## 3) Age in surv.df exceeds that actual age of the last observation ## for that patient. (last one already done ## (1) id_list <- split(surv.df, surv.df$pid) #pat <- id_list[[1]] #dest <- which(diff(pat$event) < 0) + 1 #if(length(dest<- which(diff(pat$event) < 0) + 1)) pat[seq(from=dest, #to=nrow(pat)),] junk3 <- do.call(rbind, lapply(id_list, function(pat) pat[(seq(along=pat$event) < c(which(pat$event | pat$inform.censor), nrow(pat))[1] + 1) & pat$age <= (pat$age.last+1/12),])) ## For those records where inform.censor and event both = 1 ## need to determine which occurred first -- change in trt ## or event. junk3.b <- subset(junk3, event==1 & inform.censor==1) unique.id.b <- unique(junk3.b$pid) min.age <- NULL for(i in 1:length(unique.id.b)){ min.age[i] <- min(c(junk3.b$age.fhaart[junk3.b$pid==unique.id.b[i]], junk3.b$age.oi[junk3.b$pid==unique.id.b[i]], junk3.b$age.at.death[junk3.b$pid==unique.id.b[i]]), na.rm=TRUE) } junk3.b$event <- ifelse(round(junk3.b$age.oi,4)==round(min.age,4) | round(junk3.b$age_at_death,4)== round(min.age,4), 1, 0) junk3.b$inform.censor <- ifelse(junk3.b$age.fhaart==min.age & !is.na(junk3.b$age.fhaart),1, ifelse((is.na(junk3.b$age.fhaart) & junk3.b$age > min.age),0,1)) row.2.dl <- which(junk3$event==1 & junk3$inform.censor==1) if(length(row.2.dl) > 0){ junk3 <- junk3[-row.2.dl,] junk3 <- merge(junk3, junk3.b, all=TRUE) } junk3 <- junk3[order(junk3$pid,junk3$age),] surv.df <- junk3 return(surv.df=surv.df) # return(list(surv.df=surv.df, # avg.start=avg.start)) } # analysis <- function(start.within=start.within,avg.start=avg.start, surv.df, # No.CD4,IPW,No.PS){ analysis <- function(start.within=start.within,surv.df,No.CD4,IPW,No.PS){ ######### Pooled Logistic regression for censoring # library(Design) # surv.df$vl<-ifelse(surv.df$vl<400,399,surv.df$vl) surv.df$logvl<-log10(surv.df$vl) first.age<-unlist(tapply(d2$age[d2$pid %in% surv.df$pid], INDEX=d2$pid[d2$pid %in% surv.df$pid], FUN=head,n=1)) data.junk<-data.frame(pid=names(first.age),first.age=as.numeric(first.age)) surv.df<-merge(surv.df,data.junk,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$time.in.care<-surv.df$baseline.age-surv.df$first.age baseline.cd4 <- unlist(tapply(surv.df$cd4, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.cd4 <- unlist(tapply(surv.df$time.to.cd4, INDEX=surv.df$pid, FUN=head,n=1)) baseline.cd4_percent <- unlist(tapply(surv.df$cd4_percent, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.cd4_percent <- unlist(tapply(surv.df$time.to.cd4_percent, INDEX=surv.df$pid, FUN=head,n=1)) baseline.logvl <- unlist(tapply(surv.df$logvl, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.logvl <- unlist(tapply(surv.df$time.to.vl, INDEX=surv.df$pid, FUN=head,n=1)) ## Imputing baseline logvl based on their cd4 count impute.lm <- lm(baseline.logvl ~ sqrt(baseline.cd4)) baseline.logvl <- ifelse(is.na(baseline.logvl), impute.lm$coeff[1]+impute.lm$coeff[2]*sqrt(baseline.cd4), baseline.logvl) tmp.logvl <- data.frame(pid=names(baseline.logvl), baseline.logvl=as.numeric(baseline.logvl)) surv.df <- merge(surv.df, tmp.logvl, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$logvl <- ifelse(is.na(surv.df$logvl), surv.df$baseline.logvl, surv.df$logvl) # mod200.pl.1<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+ # prior.oi+prior.art+sqrt(cd4)+ cd4.pct + # sqrt(time.to.cd4)+logvl+sqrt(time.to.vl)+ # sqrt(time.in.care), # subset=month > (3-avg.start*12) & trt.group==0, # data=surv.df) trt200.ids <- unique(surv.df$pid[surv.df$trt.group == 1]) # mod200.pl.1 <- lrm(inform.censor ~ rcs(month,4) + black + # idu + male + baseline.age + # rcs(sqrt(cd4),4) + # # cd4_percent + # logvl + sqrt(time.in.care), # data=surv.df, # subset=month > # (start.within*12-avg.start*12) & # trt.group==0 & # pid %nin% trt200.ids) mod200.pl.1 <- lrm(inform.censor ~ rcs(month,4) + black + idu + male + baseline.age + rcs(sqrt(cd4),4) + logvl + sqrt(time.in.care), data=surv.df, subset=month > start.within*12 & trt.group==0 & pid %nin% trt200.ids) p200.cens.1<-exp(predict(mod200.pl.1))/(1+exp(predict(mod200.pl.1))) # mod200.pl.2<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+ # prior.oi+ prior.art,subset=month >(3-avg.start*12) & # trt.group==0, # data=surv.df) # mod200.pl.2 <- lrm(inform.censor ~ rcs(month,4)+black+idu+male+ # baseline.age, # subset=month > (start.within*12-avg.start*12) & # trt.group==0 & # pid %nin% trt200.ids,data=surv.df) mod200.pl.2 <- lrm(inform.censor ~ rcs(month,4)+black+idu+male+ baseline.age, subset=month > start.within*12 & trt.group==0 & pid %nin% trt200.ids,data=surv.df) p200.cens.2<-exp(predict(mod200.pl.2))/(1+exp(predict(mod200.pl.2))) sw200.init<-(1-p200.cens.2)/(1-p200.cens.1) sw.init.df<-data.frame(row.id=names(sw200.init),sw200.init) surv.200.junk<-data.frame(row.id=rownames(surv.df),surv.df) junk4<-merge(surv.200.junk,sw.init.df,by.x="row.id",by.y="row.id",all=TRUE) junk4 <- junk4[order(junk4$pid,junk4$age),] sw200.temp<-ifelse(is.na(junk4$sw200.init),1,junk4$sw200.init) sw200 <- unlist(tapply(sw200.temp, INDEX=surv.df$pid, FUN=cumprod)) library(geepack) # surv.df$rate.cd4.chg <- -with(surv.df, # (ref.cd4-cd4.gt200)/(age.ref.cd4-age.gt200)) # surv.df$lograte.cd4.chg <- log10(surv.df$rate.cd4.chg) tmp.cd4 <- data.frame(pid=names(baseline.cd4), baseline.cd4=as.numeric(baseline.cd4)) tmp.time.to.cd4 <- data.frame(pid=names(baseline.time.to.cd4), baseline.time.to.cd4=as.numeric(baseline.time.to.cd4)) tmp.cd4.pct <- data.frame(pid=names(baseline.cd4_percent), baseline.cd4_percent=as.numeric(baseline.cd4_percent)) tmp.time.to.cd4.pct <- data.frame(pid=names(baseline.time.to.cd4_percent), baseline.time.to.cd4_percent=as.numeric(baseline.time.to.cd4_percent)) tmp.logvl <- data.frame(pid=names(baseline.logvl), baseline.vl=as.numeric(baseline.logvl)) tmp.time.to.logvl <- data.frame(pid=names(baseline.time.to.logvl), baseline.time.to.logvl=as.numeric(baseline.time.to.logvl)) junk<-unique(subset(surv.df,select=c(pid,trt.group,baseline.age,idu, black,male, time.in.care,baseline.logvl))) junk <- junk[!duplicated(junk$pid),] junk$trt.group <- ifelse(junk$pid %in% trt200.ids,1,junk$trt.group) junk <- merge(merge(junk,tmp.cd4,all=TRUE),tmp.cd4.pct,all=TRUE) junk <- merge(junk, tmp.time.to.cd4,all=TRUE) junk <- merge(junk, tmp.time.to.cd4.pct,all=TRUE) junk <- merge(junk, tmp.logvl,all=TRUE) junk <- merge(junk, tmp.time.to.logvl,all=TRUE) ## add cd4, cd4pct, and logvl at baseline and then include in ## glm below...a few id's that have NA's for vl so don't ## get values for predict so error when creating merge.junk # mod200.ps<-glm(trt.group~baseline.age+idu+black+male+prior.oi+prior.art+ # sqrt(time.in.care)+baseline.cd4+baseline.cd4.pct+ # baseline.logvl+lograte.cd4.chg, # data=junk,family=binomial) if(No.CD4){ mod200.ps <- glm(trt.group~black+idu+male+baseline.age+ baseline.logvl+ sqrt(time.in.care),data=junk,family=binomial) } else { mod200.ps <- glm(trt.group~black+idu+male+baseline.age+ rcs(baseline.cd4,3) + baseline.logvl+ sqrt(time.in.care),data=junk,family=binomial) } ps200<-exp(predict(mod200.ps))/(1+exp(predict(mod200.ps))) merge.junk<-data.frame(pid=junk$pid,ps200=ps200) surv.df<-merge(surv.df,merge.junk,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] if(IPW){ surv.df$ps200.new <- ifelse(surv.df$trt.group == 1,surv.df$ps200, (1-surv.df$ps200)) } surv.df$pid1<-factor(surv.df$pid) event<-surv.df$event month<-surv.df$month id<-factor(surv.df$pid) sw200<-as.numeric(sw200) if(IPW){ ps200.new <- surv.df$ps200.new #-------------------------------------------# # If using IPW and no PS sw200 <- sw200/ps200.new #-------------------------------------------# } sw200.new <- ifelse(sw200 < quantile(sw200,probs=0.05), quantile(sw200,probs=0.05), ifelse(sw200 > quantile(sw200,probs=0.95), quantile(sw200,probs=0.95), sw200)) ps200<-surv.df$ps200 trt.group <- surv.df$trt.group rcs.month<-rcs(sqrt(month),3) rcs.month1<-as.vector(rcs.month[,1]) rcs.month2<-as.vector(rcs.month[,2]) library(sandwich) if(IPW | No.PS){ mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group, family=binomial, weights=sw200.new) } else { mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group + ps200,family=binomial, weights=sw200.new) } vbeta <- diag(sandwich(mod200))[names(mod200$coeff)=="trt.group"] # Taking reciprocal since interested in trt.group = 1 as reference beta <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]) uci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]-qnorm(0.975)*sqrt(vbeta)) lci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]+qnorm(0.975)*sqrt(vbeta)) stuff <- unique(subset(surv.df, select=c(pid,event)) ) locate <- NULL for(i in 1:length(unique(stuff$pid))){ locate[i] <- which(stuff$pid==unique(stuff$pid)[i] & stuff$event==max(stuff$event[stuff$pid==unique(stuff$pid)[i]])) } stuff3 <- stuff[locate,] stuff <- unique(subset(surv.df, select=c(pid,trt.group))) locate <- NULL for(i in 1:length(unique(stuff$pid))){ locate[i] <- which(stuff$pid == unique(stuff$pid)[i] & stuff$trt.group==max(stuff$trt.group[stuff$pid==unique(stuff$pid)[i]])) } stuff4 <- stuff[locate,] stuff3 <- merge(stuff3,stuff4,all=TRUE) table.trtgroup <- table(stuff3$trt.group,stuff3$event) n.ic <- sum(surv.df$inform.censor) return(list(beta=beta, lci=lci, uci=uci, table.trtgroup=table.trtgroup, n.ic=n.ic, surv.df=surv.df)) } #------------------------# # Loading data # #------------------------# # d2 <- read.dta("when_to_start_01oct2007.dta") # d2 <- read.dta("when_to_start_19nov2009.dta") d2 <- read.dta("when_to_start_03dec2009.dta") d2 <- upData(d2, rename=c(cfar_pid="pid", ivdu="idu", indicator_first_haart_0_1="fhaart", # age_at_first_haart="age.fhaart", cd4_count="cd4", indicator_oi_0_1="oi", age_at_oi="age.oi", indicator_non_haart_art_0_1="art"), drop=c("age_at_first_haart", "fhaart")) d2 <- d2[!duplicated(d2),] d2$male <- d2$sex - 1 # non.ade <- read.table("w2s_non_ades_dx_17feb2009.csv",header=TRUE,sep=",") non.ade <- read.table("w2s_non_ades_dx_18nov2009.csv",header=TRUE,sep=",") names(non.ade) <- tolower(names(non.ade)) non.ade.list <- c("CORONARY ARTERY DISEASE, NOS", "ACUTE MYOCARDIAL INFARCTION", "CIRRHOSIS OF LIVER, NOS", "ESOPHAGEAL VARICES, NOS", "HEPATIC FAILURE, NOS", "ADENOCARCINOMA, INTESTINAL TYPE", "MALIGNANT NEOPLASM OF COLON, UNSPECIFIED", "SMALL CELL CARCINOMA, NOS", "MELANOMA OF SKIN, SITE UNSPECIFIED", "SQUAMOUS CELL CARCINOMA OF ANUS", "END STAGE RENAL DISEASE", "ACUTE VIRAL HEPATITIS C WITH HEPATIC COMA", "ALCOHOLIC CIRRHOSIS OF LIVER", "CEREBRAL ARTERY OCCLUSION, UNSPECIFIED, WITH CEREBRAL INFARCTION", "CEREBRAL HEMORRHAGE", "CHRONIC HEPATITIS C WITH HEPATIC COMA", "CHRONIC LIVER DISEASE AND CIRRHOSIS", "CIRRHOSIS OF LIVER", "CORONARY ARTERIOSCLEROSIS", "CORONARY ARTERY DISEASE", "CORONARY ATHEROSCLEROSIS", "HEPATIC ENCEPHALOPATHY", "MELANOMA OF SKIN, SITE UNSPECIFIED", "MYOCARDIAL INFARCTION, NOS", "HEPATOCELLULAR CARCINOMA, NOS", "SQUAMOUS CELL CARCINOMA, KERATINIZING, NOS", "SUBARACHNOID HEMORRHAGE", "ANGINA PECTORIS", "ASCITES, NOS", "CARDIOVASCULAR DISEASE, NOS", "CEREBROVASCULAR ACCIDENT, NOS", "CHRONIC CONGESTIVE HEART FAILURE", "CHRONIC HYPERTENSION IN OBSTETRIC CONTEXT, NOS", "CHRONIC KIDNEY DISEASE, STAGE V", "CONGESTIVE HEART DISEASE", "CONGESTIVE HEART FAILURE", paste("DIABETES MELLITUS WITHOUT MENTION OF COMPLICATION,", " TYPE II OR UNSPECIFIED TYPE, NOT STATES AS UNCONTROLLED",sep=""), "ESOPHAGEAL VARICES WITHOUT MENTION OF BLEEDING", "HYPERTENSION IN THE OBSTETRIC CONTEXT, NOS", "MALIGNANT NEOPLASM OF RECTUM", "PRE-ECLAMPSIA, NOS", "SQUAMOUS CELL CARCINOMA, NOS") non.ade <- subset(non.ade, non_ade_diagnosis %in% non.ade.list) non.ade <- upData(non.ade, rename=c(cfar_pid="pid", age_at_non_ade_diagnosis_onset= "age.nonade.onset")) # map.icd9 <- read.table("Non-AIDS CCC vs Africa_14Nov08.csv", # header=TRUE, sep=",") # names(map.icd9) <- tolower(names(map.icd9)) # map.icd9 <- upData(map.icd9, drop="x") # trt <- read.table("when_to_start_art_16oct2008.csv",header=TRUE,sep=",") # trt <- read.table("when_to_start_art_18nov2009.csv",header=TRUE,sep=",") trt <- read.table("when_to_start_art_03dec2009.csv",header=TRUE,sep=",") names(trt) <- tolower(names(trt)) trt$art.0 <- ifelse(trt$haart == 1,0, ifelse(trt$haart == 0 & trt$regimen == " ",0,1)) trt <- upData(trt, rename=list(cfar_pid="pid", age_at_rx_end="age.rx.end", age_at_rx_start="age.rx.start", first_haart="fhaart")) # oi.df <- read.csv("when_to_start_year1_oi_01dec2008.csv", header=TRUE, # sep=",") oi.df <- read.csv("when_to_start_year1_oi_18nov2009.csv", header=TRUE, sep=",") names(oi.df) <- tolower(names(oi.df)) oi.df <- upData(oi.df, rename=c(cfar_pid="pid", year_of_first_visit="year.1st.visit", indicator_oi_0_1="oi", age_at_oi="age.oi", oi_1="oi.1", oi_2="oi.2", oi_3="oi.3", oi_4="oi.4")) oi.df$oi.2 <- ifelse(oi.df$oi.2 == "",NA,as.character(oi.df$oi.2)) oi.df$oi.3 <- ifelse(oi.df$oi.3 == "",NA,as.character(oi.df$oi.3)) oi.df$oi.4 <- ifelse(oi.df$oi.4 == "",NA,as.character(oi.df$oi.4)) oi.class <- read.csv("OIList_final.csv", header=TRUE) names(oi.class) <- tolower(names(oi.class)) oi.class$oi <- tolower(as.character(oi.class$oi)) oi.class$class <- as.character(oi.class$class) oi.class$class <- gsub("^([ \t\n\r\f\v]+)|([ \t\n\r\f\v]+)$", "", oi.class$class) oi.tmp <- unlist(strsplit(oi.class$oi, split="\"")) oi.tmp <- oi.tmp[oi.tmp != ""] oi.class$oi <- oi.tmp #-------------------------------------------# # Prior ART use art <- read.csv("when_to_start_prior_art_20nov09.csv",sep=",",header=TRUE) names(art) <- tolower(names(art)) names(art) <- c("pid","has_art_before_first_visit") #-------------------------------------------# #-------------------------------------------# # Prior OI oi <- read.csv("when_to_start_prior_oi_19apr2010.csv",sep=",",header=TRUE) names(oi) <- tolower(names(oi)) names(oi) <- c("pid","has_oi_before_first_visit") #-------------------------------------------# # # Some PIDs were not HAART naive at start of study # delete.pids <-trt$pid[trt$haart_before_first_visit==1] # d2 <- subset(d2, pid %nin% delete.pids) # trt <- subset(trt, pid %nin% delete.pids) # oi.df <- subset(oi.df, pid %nin% delete.pids) # oi.df <-subset(oi.df, oi == 1) # non.ade <- subset(non.ade, pid %nin% delete.pids) save(trt,file="trt.rda",compress=TRUE) # ## Need to remove first record of ID=993147H # tmp.d2 <- subset(d2, pid != "993147H") # tmp2.d2 <- subset(d2, pid == "993147H") # tmp2.d2 <- tmp2.d2[duplicated(tmp2.d2$pid),] # # Also need to remove this info from oi.df # which.row <- which(oi.df$pid == "993147H" & round(oi.df$age.oi == # 21.457,5)) # oi.df <- oi.df[-which.row,] # # d2 <- merge(tmp.d2,tmp2.d2,all=TRUE) # d2 <- d2[order(d2$pid, d2$age),] # Having to do this because one pid had an age.oi > age.death d2$age_at_death <- ifelse(is.na(d2$age_at_death), NA, ifelse(!is.na(d2$age.oi) & d2$age.oi > d2$age_at_death, d2$age.oi, d2$age_at_death)) # save(d2, file="d2.rda", compress=TRUE) # age.fhaart <- subset(d2, !is.na(age.fhaart), select=c(pid,age.fhaart)) # d2 <- upData(d2, drop=c("age.fhaart")) # d2 <- merge(d2, age.fhaart, all=TRUE) # d2 <- d2[order(d2$pid, d2$age),] # Collapsing duplicate ages for a given PID # d2$age <- as.integer(round(d2$age*365.25)) # source("collapseSolution.R") # save(new.d2, file="newd2.rda") # d2 <- new.d2 # d2$age <- d2$age/365.25 save(d2, file="d2.rda") no.visits <- tapply(d2$pid, INDEX=d2$pid, FUN=length) #---------------------------------------# # Adding in treatment data # #---------------------------------------# source("solution.R") save(new.sub, file="newsub.rda") new.sub <- new.sub[order(new.sub$pid, new.sub$age),] age.fhaart.df <- data.frame("pid"=trt$pid[trt$fhaart==1 & !is.na(trt$fhaart)], "age.fhaart"=trt$age.rx.start[trt$fhaart == 1 & !is.na(trt$fhaart)]) d2 <- new.sub d2$age <- round(d2$age/365.25,4) d2$age.rx.start <- round(d2$age.rx.start/365.25,4) d2$age.rx.end <- round(d2$age.rx.end/365.25,4) d2 <- merge(d2, age.fhaart.df, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] d2$age.fhaart <- round(d2$age.fhaart/365.25,4) d2$oi <- ifelse(is.na(d2$oi), 0, d2$oi) d2 <- upData(d2, drop="art") d2 <- upData(d2, rename=c(art.0="art")) # Merging in ART use prior to first clinic visit d2 <- merge(d2, art, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] # Merging in OI event prior to first clinic visit d2 <- merge(d2, oi, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] no.visits.2 <- tapply(d2$pid, INDEX=d2$pid, FUN=length) save(d2, file="postsold2.rda") age.lt.age.fhaart <- unique(d2$pid[which(d2$age < d2$age.fhaart)]) first.age <- unlist(tapply(d2$age, INDEX=d2$pid, FUN=head, n=1)) tmp <- data.frame("pid"=names(first.age), "first.age"=first.age) d2 <- merge(d2, tmp, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] unique.id<-unique(d2$pid) #----------------------------------------------------------# # start.within is expressed in years, not months, and is how treatment # groups are assigned (ie., start HAART within "start.within" months of # falling below the upper limit # original is a logical. When TRUE, all assigned to trt 0 when they fall # below the upper limit until they start HAART within start.within years. # When FALSE, all assigned to trt group based on whether the time between # age.less.ulimit and age.fhaart within start.within years. # No.CD4 is a logical indicating whether CD4 should be used in computing # the propensity scores. IPW is a logical indicating whether no # propensity score should be used in the model but rather incorporate them # into the weights. start.within <- 1/4 old.code <- FALSE original <- FALSE No.CD4 <- TRUE IPW <- FALSE No.PS <- FALSE restrict <- FALSE save(d2, file="d2.rda",compress=TRUE) df.500.400 <- create.df(start.within=start.within,restrict=restrict, u.limit=501,l.limit=401,d2=d2,old.code=old.code, original=original) save(df.500.400,file="df500400.rda",compress=TRUE) load("df500400.rda") df.500.400 <- df.500.400[order(df.500.400$pid,df.500.400$month),] # avg.start <- df.500.400$avg.start # analysis1 <- analysis(start.within=start.within,avg.start=avg.start, # surv.df=surv.500.400,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) analysis1 <- analysis(start.within=start.within,surv.df=df.500.400, No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis1, file="analysis500400.rda",compress=TRUE) beta.500.400 <- analysis1[[1]] lci.500.400 <- analysis1[[2]] uci.500.400 <- analysis1[[3]] table.500.400 <- analysis1[[4]] n.ic.500.400 <- analysis1[[5]] # surv.500.400 <- analysis1[[6]] #----------------------------------------------------------# df.400.300 <- create.df(start.within=start.within,restrict=FALSE, u.limit=401,l.limit=301,d2=d2,old.code=old.code, original=original) save(df.400.300,file="df400300.rda",compress=TRUE) load("df400300.rda") # surv.400.300 <- df.400.300$surv.df df.400.300 <- df.400.300[order(df.400.300$pid,df.400.300$month),] # avg.start <- df.400.300$avg.start # analysis2 <- analysis(start.within=start.within,avg.start=avg.start, # surv.df=surv.400.300,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) analysis2 <- analysis(start.within=start.within,surv.df=df.400.300,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis2, file="analysis400300.rda",compress=TRUE) beta.400.300 <- analysis2[[1]] lci.400.300 <- analysis2[[2]] uci.400.300 <- analysis2[[3]] table.400.300 <- analysis2[[4]] n.ic.400.300 <- analysis2[[5]] # surv.400.300 <- analysis2[[6]] #----------------------------------------------------------# df.300.200 <- create.df(start.within=start.within,restrict=FALSE, u.limit=301,l.limit=201,d2=d2,old.code=old.code, original=original) save(df.300.200,file="df300200.rda",compress=TRUE) load("df300200.rda") # surv.300.200 <- df.300.200$surv.df df.300.200 <- df.300.200[order(df.300.200$pid,df.300.200$month),] # avg.start <- df.300.200$avg.start # analysis3 <- analysis(start.within=start.within,avg.start=avg.start, # surv.df=surv.300.200,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) analysis3 <- analysis(start.within=start.within,surv.df=df.300.200,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis3, file="analysis300200.rda",compress=TRUE) beta.300.200 <- analysis3[[1]] lci.300.200 <- analysis3[[2]] uci.300.200 <- analysis3[[3]] table.300.200 <- analysis3[[4]] n.ic.300.200 <- analysis3[[5]] # surv.300.200 <- analysis3[[6]]