rm(list=ls()) #------------------------# # Libraries # #------------------------# library(foreign) library(Hmisc) library(Design) #------------------------# # Functions # #------------------------# # Adds CD4, viral loads, CD4 % to time-dependent data frame add.fun <- function(labage.1, labage.2){ var <- names(labage.1)[3] labage.1$order <- 1 labage.2$order <- 0 pullData <- rbind(labage.1,labage.2) pullData$age <- round(pullData$age, 5) pullData <- pullData[order(pullData$pid, pullData$age, -pullData$order),] id. <- -1 for(i in 1:length(pullData$pid)){ # if(i %% 100 == 0) print(i) if(pullData$pid[i] != id.){ var. <- pullData[,var][i] age.var. <- pullData$age[i] id. <- pullData$pid[i] } if(pullData$order[i] == 0){ var. <- pullData[,var][i] age.var. <- pullData$age[i] } if(pullData$order[i] == 1 & !is.na(pullData[,var][i])){ var. <- pullData[,var][i] age.var. <- pullData$age[i] } pullData[,var][i] <- var. pullData$age.var.[i] <- age.var. } return(pullData) } create.df <- function(start.within,restrict,u.limit,l.limit,d2,old.code,original){ # Identifying age at which less than ulimit less.ulimit <- NULL for(i in 1:length(unique.id)){ less.ulimit[i] <- min(which(d2$pid == unique.id[i] & d2$cd4 <= u.limit)) } age.less.ulimit <- d2$age[less.ulimit] df.sub <- data.frame("pid"=unique.id, "age.less.ulimit"=age.less.ulimit) d2 <- merge(d2, df.sub, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] # Deleting PIDs who had missing age.less.ulimit values d2 <- subset(d2, !is.na(age.less.ulimit)) d2$time.in.care <- d2$age.less.ulimit - d2$first.age # If someone died, their true last age should be the age of death # Otherwise it is their last recorded age. true.last.age.nd <- unlist(tapply(d2$age[is.na(d2$age_at_death)], INDEX=d2$pid[is.na(d2$age_at_death)], FUN=tail, n=1)) nd.df <- data.frame("pid"=names(true.last.age.nd), "true.last.age"=as.numeric(true.last.age.nd)) true.last.age.d <- d2$age_at_death[!is.na(d2$age_at_death) & !duplicated(d2$pid)] d.df <- data.frame("pid"=unique(d2$pid[!is.na(d2$age_at_death)]), "true.last.age"=true.last.age.d) tmp <- merge(nd.df, d.df, all=TRUE) tmp <- tmp[order(tmp$pid),] d2 <- merge(d2, tmp, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] d2$tot.fu <- d2$true.last.age - d2$age.less.ulimit # Reworking age.oi so that repeats for every record of individual rather # than NA for all records but record at which oi occurred. # In addition, had some PIDs who had multiple OIs after age.less.ulimit so # want to take first occurrence of an OI after age.less.ulimit. # If all they had was an OI prior to falling below age.less.ulimit, will # record this as a prior.oi later. #### age.oi <- subset(d2, !is.na(age.oi), select=c(pid,age.oi,age.less.ulimit)) age.oi$prior.oi <- ifelse(age.oi$age.oi <= age.oi$age.less.ulimit,1,0) oi.after.ulimit <- which(!duplicated(interaction(age.oi$pid,age.oi$prior.oi)) & age.oi$prior.oi == 0) oi.after.ulimit.2 <- which(age.oi$prior.oi == 0) oi.bf.ulimit <- which(!duplicated(interaction(age.oi$pid,age.oi$prior.oi)) & age.oi$prior.oi == 1) age.oi$age.prior.oi <- NA age.oi$age.prior.oi[oi.bf.ulimit] <- age.oi$age.oi[oi.bf.ulimit] age.oi.bf <- age.oi[oi.bf.ulimit,c("pid","age.prior.oi")] age.oi.af <- age.oi[oi.after.ulimit,c("pid","age.oi")] age.oi.af2 <- age.oi[oi.after.ulimit.2,c("pid","age.oi")] age.oi <- merge(age.oi.af,age.oi.bf,all=TRUE) age.oi <- age.oi[order(age.oi$pid),] age.oi.dup <- age.oi.af2[order(age.oi.af2$pid),] age.oi.dup$age.oi <- round(age.oi.dup$age.oi, 5) oi.df$age.oi <- round(oi.df$age.oi,5) age.oi.dup <- merge(age.oi.dup, subset(oi.df, pid %in% age.oi.dup$pid), all=TRUE) age.oi.dup <- age.oi.dup[order(age.oi.dup$pid,age.oi.dup$age.oi),] age.oi.dup <- merge(age.oi.dup, subset(d2, !duplicated(pid) & pid %in% age.oi.dup$pid, select=c(pid, age.less.ulimit)), all=TRUE) age.oi.dup <- age.oi.dup[order(age.oi.dup$pid,age.oi.dup$age.oi),] age.oi.dup <- subset(age.oi.dup, round(age.oi,5) > round(age.less.ulimit,5)) age.oi.dup <- upData(age.oi.dup, drop=c("year.1st.visit", "age.less.ulimit", "oi")) age.oi.dup$oi.1 <- tolower(age.oi.dup$oi.1) age.oi.dup$oi.2 <- tolower(age.oi.dup$oi.2) age.oi.dup$oi.2 <- ifelse(age.oi.dup$oi.2 == " ",NA, as.character(age.oi.dup$oi.2)) age.oi.dup$oi.3 <- tolower(age.oi.dup$oi.3) age.oi.dup$oi.3 <- ifelse(age.oi.dup$oi.3 == " ",NA, as.character(age.oi.dup$oi.3)) age.oi.dup$oi.4 <- tolower(age.oi.dup$oi.4) age.oi.dup$oi.4 <- ifelse(age.oi.dup$oi.4 == " ",NA, as.character(age.oi.dup$oi.4)) dup.oi.1 <- apply(subset(age.oi.dup, select=oi.1), 1, FUN=function(x){ if(!is.na(x)){ oi.class$class[which(oi.class$oi == x)] } else { NA }}) dup.oi.2 <- apply(subset(age.oi.dup, select=oi.2), 1, FUN=function(x){ if(!is.na(x)){ oi.class$class[which(oi.class$oi == x)] } else { NA }}) dup.oi.3 <- apply(subset(age.oi.dup, select=oi.3), 1, FUN=function(x){ if(!is.na(x)){ oi.class$class[which(oi.class$oi == x)] } else { NA }}) dup.oi.4 <- apply(subset(age.oi.dup, select=oi.4), 1, FUN=function(x){ if(!is.na(x)){ oi.class$class[which(oi.class$oi == x)] } else { NA }}) age.oi.dup <- cbind(age.oi.dup, dup.oi.1,dup.oi.2,dup.oi.3,dup.oi.4) names(age.oi.dup)[2:10] <- c("cum.age.oi","cum.oi.1","cum.oi.2", "cum.oi.3","cum.oi.4", "cum.oi.1.class","cum.oi.2.class", "cum.oi.3.class","cum.oi.4.class") d2$age.oi <- round(d2$age.oi,5) d2 <- merge(d2, age.oi.dup, by.x=c("pid","age.oi"), by.y=c("pid","cum.age.oi"),all=TRUE) d2 <- d2[order(d2$pid, d2$age),] d2$cum.age.oi <- ifelse(!is.na(d2$age.oi),d2$age.oi,NA) d2$cum.oi <- ifelse(round(d2$age,5) == round(d2$cum.age.oi,5) & !is.na(d2$cum.age.oi),1, ifelse(is.na(d2$cum.age.oi),0,NA)) # # Had some PIDs who had multiple OIs after age.less.ulimit so want to # # take first occurrence of an OI after age.less.ulimit # oi.after.ulimit <- ifelse(d2$age.oi[!is.na(d2$age.oi)] >= # d2$age.less.ulimit[!is.na(d2$age.oi)],1,0) # pid.age.oi <- d2$pid[!is.na(d2$age.oi)] # dup.pid <- ifelse(duplicated(pid.age.oi),1,0) # age.oi <- age.oi[!dup.pid == 1,] # age.oi <- age.oi[!is.na(age.oi$pid),] d2 <- upData(d2, drop=c("age.oi")) d2 <- merge(d2, age.oi, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] d2$age.oi <- round(d2$age.oi, 3) d2$prior.oi <- ifelse(is.na(d2$age.prior.oi), 0, 1) d2$oi <- ifelse(round(d2$age,5) == round(d2$age.oi,5) & !is.na(d2$age.oi),1,0) # Need to add in OI information regarding type of OI # To do this, first need to eliminate OIs before age.less.ulimit # so have to add this to oi.df age.less.ulimit <- unique(subset(d2, pid %in% oi.df$pid, select=c(pid, age.less.ulimit))) oi.df <- subset(oi.df, pid %in% d2$pid) oi.df <- merge(oi.df, age.less.ulimit, all=TRUE) oi.df <- oi.df[order(oi.df$pid,oi.df$age.oi),] oi.df <- subset(oi.df, round(age.oi,5) > round(age.less.ulimit,5)) oi.df <- oi.df[!duplicated(oi.df$pid),] cath <- subset(oi.df, oi == 1, select=c(pid,age.oi,oi.1,oi.2,oi.3,oi.4)) cath$age.oi <- round(cath$age.oi, 3) d2 <- merge(d2, cath, by=c("pid","age.oi"),all=TRUE) d2 <- d2[order(d2$pid,d2$age),] ## Temporary fix...rounding issue...adding rows for age oi that is 0.001 off ## from other...need to address...12/4/09 d2 <- subset(d2, !is.na(prior.oi)) # Defining prior.art # d2$art <- ifelse(is.na(d2$art),0,as.numeric(d2$art)) prior.art.df <- subset(d2, age < age.less.ulimit) prior.art <- tapply(prior.art.df$art, INDEX=prior.art.df$pid, FUN=max) prior.art.df <- data.frame("pid"=names(prior.art), "prior.art"=prior.art) d2 <- merge(d2, prior.art.df, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] d2$prior.art <- ifelse(is.na(d2$prior.art), 0, d2$prior.art) # Adding in class information to OI information in d2 data frame d2$oi.1 <- tolower(d2$oi.1) d2$oi.2 <- tolower(d2$oi.2) d2$oi.2 <- ifelse(d2$oi.2 == " ", NA, d2$oi.2) d2$oi.3 <- tolower(d2$oi.3) d2$oi.3 <- ifelse(d2$oi.3 == " ", NA, d2$oi.3) d2$oi.4 <- tolower(d2$oi.4) d2$oi.4 <- ifelse(d2$oi.4 == " ", NA, d2$oi.4) oi.1.class <- oi.2.class <- oi.3.class <- oi.4.class <- NULL oi.1.tmp <- oi.2.tmp <- oi.3.tmp <- oi.4.tmp <- NULL for(i in 1:length(unique(d2$pid))){ if(length(unique(d2$oi.1[d2$pid == unique(d2$pid)[i]])) == 1){ oi.1.tmp[i] <- unique(d2$oi.1[d2$pid == unique(d2$pid)[i]]) } else { oi.1.tmp[i] <- unique(d2$oi.1[d2$pid == unique(d2$pid)[i] & !is.na(d2$oi.1)]) } if(!is.na(oi.1.tmp[i])){ oi.1.class[i] <- oi.class$class[which(oi.class$oi == oi.1.tmp[i])] } else { oi.1.class[i] <- NA } if(length(unique(d2$oi.2[d2$pid == unique(d2$pid)[i]])) == 1){ oi.2.tmp[i] <- unique(d2$oi.2[d2$pid == unique(d2$pid)[i]]) } else { oi.2.tmp[i] <- unique(d2$oi.2[d2$pid == unique(d2$pid)[i] & !is.na(d2$oi.2)]) } if(!is.na(oi.2.tmp[i])){ oi.2.class[i] <- oi.class$class[which(oi.class$oi == oi.2.tmp[i])] } else { oi.2.class[i] <- NA } oi.3.tmp[i] <- unique(d2$oi.3[d2$pid == unique(d2$pid)[i]]) if(!is.na(oi.3.tmp[i])){ oi.3.class[i] <- oi.class$class[which(oi.class$oi == oi.3.tmp[i])] } else { oi.3.class[i] <- NA } oi.4.tmp[i] <- unique(d2$oi.4[d2$pid == unique(d2$pid)[i]]) if(!is.na(oi.4.tmp[i])){ oi.4.class[i] <- oi.class$class[which(oi.class$oi == oi.4.tmp[i])] } else { oi.4.class[i] <- NA } } df.tmp <- data.frame("pid"=unique(d2$pid), "oi.1.class"=oi.1.class, "oi.2.class"=oi.2.class, "oi.3.class"=oi.3.class, "oi.4.class"=oi.4.class) d2 <- merge(d2, df.tmp, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] # Subsetting out PIDs with prior.oi or prior.art d2 <- subset(d2, prior.oi != 1 & prior.art != 1) # Had issues in the propensity scores with missing CD4 % but non-missing # CD4 values. Gave different lengths for the predicted probabilities # of time-dependent vs non-time-dependent models. Imputing values # for CD4_% cd4.pct.mod.beta <- lm(d2$cd4_percent ~ d2$cd4)$coeff[-1] d2$cd4_percent <- apply(d2[,c("cd4_percent","cd4")],1, FUN=function(x){ if(is.na(x[1]) & !is.na(x[2])){ x[2]*cd4.pct.mod.beta } else { x[1] }}) pre.d2 <- d2 # save(pre.d2, file=paste("pre.d2.",k,".nonsus500ADE.rda",sep="")) unique.id<-unique(d2$pid) # If restricting to PIDs with CD4 > ulimit prior to age.less.ulimit or not... if(restrict){ d2.sub <- subset(d2, round(age,5) < round(age.less.ulimit,5)) gt.ulimit <- tapply(d2.sub$cd4, INDEX=d2.sub$pid, FUN=function(x){ as.numeric(any(x[!is.na(x)] > u.limit))}) gt.ulimit.df <- data.frame("pid"=names(gt.ulimit), "gt.ulimit"=gt.ulimit) d2.sub <- merge(d2.sub, gt.ulimit.df, all=TRUE) d2.sub <- d2.sub[order(d2.sub$pid,d2.sub$age),] d2.sub <- d2.sub[!duplicated(d2.sub$pid),c("pid","gt.ulimit")] d2 <- merge(d2, d2.sub, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] d2$gt.ulimit <- ifelse(is.na(d2$gt.ulimit),0,d2$gt.ulimit) sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit | is.na(age.fhaart)) & gt.ulimit == 1) sub.d2 <- upData(sub.d2, drop=c("gt.ulimit")) } else{ # NOTE: using <= age.less.ulimit here and not in restricted part # since at CD4 at age.less.ulimit could be 1st record and # that has to at least be > 200 whereas in restrict can't # have CD4 at that age be both <= 500 and > 500 d2.sub <- subset(d2, round(age,5) <= round(age.less.ulimit,5)) gt.llimit <- tapply(d2.sub$cd4, INDEX=d2.sub$pid, FUN=function(x){ as.numeric(any(x[!is.na(x)] >= l.limit))}) gt.llimit.df <- data.frame("pid"=names(gt.llimit), "gt.llimit"=gt.llimit) d2.sub <- merge(d2.sub, gt.llimit.df, all=TRUE) d2.sub <- d2.sub[order(d2.sub$pid,d2.sub$age),] d2.sub <- d2.sub[!duplicated(d2.sub$pid),c("pid","gt.llimit")] d2 <- merge(d2, d2.sub, all=TRUE) d2 <- d2[order(d2$pid,d2$age),] sub.d2 <- subset(d2, (age.fhaart >= age.less.ulimit | is.na(age.fhaart)) & gt.llimit == 1) sub.d2 <- upData(sub.d2, drop=c("gt.llimit")) } sub.d2 <- sub.d2[order(sub.d2$pid, sub.d2$age),] sub.unique.id <- unique(sub.d2$pid) d2 <- sub.d2 # Defining trt200 group if(!original){ d2$trt.group <- ifelse(is.na(d2$age.fhaart),0, ifelse((d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0)) # Excluding those who have an event: # Trt group 1: Those with an OI between age.less.ulimit & age.fhaart # (Note: Group 1 could not have a death because then # they would not have started HAART; had to start # HAART to be in Group 1). # Trt group 2: Those with an OI or death within start.within of # age.less.ulimit. grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= d2$age.fhaart)]) grp.2.oi.del <- unique(d2$pid[which(d2$trt.group == 0 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= (d2$age.less.ulimit + start.within))]) grp.2.dth.del <- unique(d2$pid[which(d2$trt.group == 0 & !is.na(d2$age_at_death) & d2$age_at_death >= d2$age.less.ulimit & d2$age_at_death <= (d2$age.less.ulimit + start.within))]) d2 <- subset(d2, pid %nin% c(grp1.1.del,grp.2.oi.del,grp.2.dth.del)) # Defining avg start time of trt200 group tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart", "cd4","cd4_percent","vl","lost_to_fu", "age.less.ulimit","trt.group")) tmp <- subset(tmp, round(age,5) >= round(age.less.ulimit,5)) tmp$baseline.age <- tmp$age.less.ulimit } if(original){ d2$trt.group <- ifelse(is.na(d2$age.fhaart),0, ifelse(d2$age < d2$age.fhaart,0, ifelse(d2$age >= d2$age.fhaart & (d2$age.fhaart - d2$age.less.ulimit) < start.within,1,0))) # For those in trt group 1 who had an event before age.fhaart # need to reset them to trt group 0 grp1.1.del <- unique(d2$pid[which(d2$trt.group == 1 & !is.na(d2$age.oi) & d2$age.oi >= d2$age.less.ulimit & d2$age.oi <= d2$age.fhaart)]) d2$trt.group <- ifelse(d2$pid %in% grp1.1.del,0,d2$trt.group) tmp <- subset(d2, select=c("pid","age","fhaart","age.fhaart", "cd4","cd4_percent","vl","lost_to_fu", "age.less.ulimit","trt.group")) tmp <- subset(tmp, round(age,5) >= round(age.less.ulimit,5)) tmp$baseline.age <- tmp$age.less.ulimit } tmp.200 <- subset(tmp, trt.group==1) tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),] if(old.code){ tmp.200$diff <- with(tmp.200, age.fhaart - baseline.age) avg.start <- mean(tmp.200$diff[!duplicated(tmp.200$pid)], na.rm=TRUE) # Now subset tmp.200 so that it starts when they first start HAART tmp.200 <- subset(tmp.200, round(age,5) >= round(age.fhaart,5)) # Getting new baseline age for trt200 group tmp.200 <- upData(tmp.200, drop=c("age.less.ulimit","baseline.age","diff")) baseline <- aggregate(tmp.200$age, list(pid=tmp.200$pid), FUN=head,n=1) names(baseline)[2] <- "baseline.age" tmp.200 <- merge(tmp.200, baseline, all=TRUE) tmp.200 <- tmp.200[order(tmp.200$pid,tmp.200$age),] } else { tmp.200 <- upData(tmp.200, drop=c("age.less.ulimit")) } ## Just taking record of baseline.age??? if(!original){ tmp.200 <- subset(tmp.200, round(age,5) == round(baseline.age,5)) } else { tmp.200 <- tmp.200[!duplicated(tmp.200$pid),] } tmp.100 <- subset(tmp, trt.group==0) if(old.code){ tmp.100$start.age <- with(tmp.100, baseline.age + avg.start) } else { avg.start <- 0 tmp.100$start.age <- tmp.100$baseline.age } new.rows <- data.frame(pid=unique(tmp.100$pid), age=tmp.100$start.age[!duplicated(tmp.100$pid)], fhaart=0, # cd4=NA, # cd4_percent=NA, # vl=NA, lost_to_fu=tmp.100$lost_to_fu[!duplicated(tmp.100$pid)], age.fhaart=tmp.100$age.fhaart[!duplicated(tmp.100$pid)], age.less.ulimit=tmp.100$age.less.ulimit[!duplicated(tmp.100$pid)], trt.group=0, baseline.age=tmp.100$baseline.age[!duplicated(tmp.100$pid)], start.age=tmp.100$start.age[!duplicated(tmp.100$pid)]) tmp.100 <- merge(tmp.100, new.rows, all=TRUE) tmp.100 <- tmp.100[order(tmp.100$pid, tmp.100$age),] tmp.100 <- subset(tmp.100, round(age,5) >= round(start.age,5)) # Getting new baseline for trt.group=0 tmp.100 <- upData(tmp.100, drop=c("age.less.ulimit","baseline.age","start.age")) baseline <- aggregate(tmp.100$age, list(pid=tmp.100$pid),FUN=head,n=1) names(baseline)[2] <- "baseline.age" tmp.100 <- merge(tmp.100, baseline, all=TRUE) tmp.100 <- tmp.100[order(tmp.100$pid,tmp.100$age),] tmp.100 <- subset(tmp.100, round(age,5) == round(baseline.age,5)) # Creating data frame for survival analysis # If original=TRUE, this has 2 records for those in trt.group=1 who did # not start HAART at age.less.ulimit tmp.tot <- merge(tmp.200, tmp.100, all=TRUE) tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),] true.last.age.df <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "true.last.age"=d2$true.last.age[!duplicated(d2$pid)]) tmp.tot <- merge(tmp.tot, true.last.age.df, all=TRUE) tmp.tot <- tmp.tot[order(tmp.tot$pid,tmp.tot$age),] # sub.id <- unique(d2$pid[d2$pid %in% tmp.tot$pid]) # sub.unique.id <- unique(sub.id) # last.age <- tapply(d2$age, INDEX=d2$pid, FUN=max,na.rm=TRUE) # last.age.df <- data.frame("pid"=names(last.age), # "last.age"= tmp.tot$follow.up <- tmp.tot$true.last.age - tmp.tot$baseline.age max.follow <- 12*max(tmp.tot$follow.up, na.rm=TRUE) months <- seq(0, ceiling(max.follow),by=1) tmp.tot$month <- 0 if(!original){ tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid), each=length(months[-1])), "month"=rep(months[-1],times=length(unique(tmp.tot$pid))), "age"=rep(tmp.tot$age, each=length(months[-1])), "age.fhaart"=rep(tmp.tot$age.fhaart,each=length(months[-1])), "trt.group"=rep(tmp.tot$trt.group,each=length(months[-1])), "baseline.age"=rep(tmp.tot$baseline.age,each=length(months[-1])), "true.last.age"=rep(tmp.tot$true.last.age,each=length(months[-1])), "lost_to_fu"=rep(tmp.tot$lost_to_fu,each=length(months[-1]))) tmp.tot2$pid <- as.character(tmp.tot2$pid) surv.df <- merge(tmp.tot, tmp.tot2, all=TRUE) surv.df <- surv.df[order(surv.df$pid, surv.df$month),] surv.df$age <- round(surv.df$age + surv.df$month/12,5) } else { tmp.tot2 <- data.frame("pid"=rep(unique(tmp.tot$pid), each=length(months[-1])), "month"=rep(months[-1],times=length(unique(tmp.tot$pid))), "age.fhaart"=rep(tmp.tot$age.fhaart[!duplicated(tmp.tot$pid)], each=length(months[-1])), "baseline.age"=rep(tmp.tot$baseline.age[!duplicated(tmp.tot$pid)], each=length(months[-1])), "true.last.age"=rep(tmp.tot$true.last.age[!duplicated(tmp.tot$pid)], each=length(months[-1])), "lost_to_fu"=rep(tmp.tot$lost_to_fu[!duplicated(tmp.tot$pid)], each=length(months[-1]))) tmp.tot2$pid <- as.character(tmp.tot2$pid) tmp.tot2$month <- as.integer(tmp.tot2$month) surv.df <- merge(tmp.tot[!duplicated(tmp.tot$pid), -which(names(tmp.tot) %in% # c("age","trt.group","cd4","cd4_percent","vl"))], c("age","trt.group"))], tmp.tot2,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$age <- surv.df$baseline.age surv.df$age <- surv.df$age + surv.df$month/12 surv.df$trt.group <- ifelse(is.na(surv.df$age.fhaart),0, ifelse(surv.df$age >= surv.df$age.fhaart & (surv.df$age.fhaart - surv.df$baseline.age) < start.within,1,0)) # For those in trt group 1 who had an event before age.fhaart # need to reset them to trt group 0 surv.df$trt.group <- ifelse(surv.df$pid %in% grp1.1.del,0, surv.df$trt.group) } ref.cd4 <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "ref.cd4"=d2$cd4[round(d2$age,5)== round(d2$age.less.ulimit,5)], "age.ref.cd4"=d2$age.less.ulimit[!duplicated(d2$pid)], "ref.cd4_percent"=d2$cd4_percent[round(d2$age,5)== round(d2$age.less.ulimit,5)], "age.ref.cd4_percent"=d2$age.less.ulimit[!duplicated(d2$pid)], "ref.vl"=d2$vl[round(d2$age,5)== round(d2$age.less.ulimit,5)], "age.ref.vl"=d2$age.less.ulimit[!duplicated(d2$pid)], "age.fhaart"=d2$age.fhaart[!duplicated(d2$pid)]) locate.fhaart <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.fhaart[i] <- min(which(surv.df$pid == unique(surv.df$pid)[i] & surv.df$age >= surv.df$age.fhaart)) } locate.fhaart <- locate.fhaart[locate.fhaart != Inf] surv.df$fhaart <- 0 surv.df$fhaart[locate.fhaart] <- 1 dem <- data.frame("pid"=d2$pid[!duplicated(d2$pid)], "black"=d2$black[!duplicated(d2$pid)], "male"=d2$male[!duplicated(d2$pid)], "idu"=d2$idu[!duplicated(d2$pid)], "age_at_death"=d2$age_at_death[!duplicated(d2$pid)], "time.in.care"=d2$time.in.care[!duplicated(d2$pid)]) surv.df <- merge(surv.df, ref.cd4, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df <- merge(surv.df, dem, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] # tmp <- data.frame("pid"=unique(surv.df$pid), # "age.last"=d2$true.last.age[!duplicated(d2$pid)]) # tmp$age.last <- round(tmp$age.last,5) surv.df$age.last <- surv.df$true.last.age # surv.df <- merge(surv.df, tmp, all=TRUE) surv.df <- subset(surv.df, round(age,5) <= round(age.last,5)) surv.df$death <- ifelse(is.na(surv.df$age_at_death),0, ifelse((round(surv.df$age,5) == round(surv.df$age.last,5)) | (surv.df$age + 1/12 > surv.df$age.last),1,0)) ## May need to remove?? d2.new <- subset(d2, pid %in% surv.df$pid) oi.df <- data.frame("pid"=unique(d2.new$pid), "age.oi"=d2.new$age.oi[!duplicated(d2.new$pid)]) surv.df <- merge(surv.df, oi.df, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] locate.oi <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.oi[i] <- min(which(surv.df$pid == unique(surv.df$pid)[i] & round(surv.df$age ,5) >= round(surv.df$age.oi, 5))) } locate.oi <- locate.oi[locate.oi != Inf] surv.df$oi <- 0 surv.df$oi[locate.oi] <- 1 ## Temporary fix...how to capture in above loop?? surv.df$oi <- ifelse(is.na(surv.df$age.oi),0, ifelse((surv.df$age + 1/12) > surv.df$age.last & (surv.df$age.oi + 1/12) > surv.df$age.last,1, surv.df$oi)) surv.df$event <- ifelse(surv.df$death == 0 & surv.df$oi == 0,0,1) #Adding CD4 counts labage.1 <- surv.df[,c("pid","age","cd4")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,cd4)) names(labage.2) <- c("pid","age","cd4") labage.2 <- subset(labage.2, !is.na(cd4)) pullData <- add.fun(labage.1, labage.2) surv.df$cd4 <- pullData[pullData$order==1,"cd4"] # surv.df$age.cd4 <- pullData[pullData$order==1,"age.cd4."] surv.df$age <- round(surv.df$age, 5) surv.df$age.fhaart <- round(surv.df$age.fhaart,5) surv.df$age.cd4 <- pullData[pullData$order==1, "age.var."] surv.df$time.to.cd4 <- 12*(surv.df$age - surv.df$age.cd4) #Adding CD4 percents labage.1 <- surv.df[,c("pid","age","cd4_percent")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,cd4_percent)) names(labage.2) <- c("pid","age","cd4_percent") labage.2 <- subset(labage.2, !is.na(cd4_percent)) pullData <- add.fun(labage.1, labage.2) surv.df$cd4_percent <- pullData[pullData$order==1,"cd4_percent"] # surv.df$age.cd4_percent <- pullData[pullData$order==1,"age.cd4_percent"] surv.df$age <- round(surv.df$age, 5) surv.df$age.cd4_percent <- pullData[pullData$order==1, "age.var."] surv.df$time.to.cd4_percent <- 12*(surv.df$age - surv.df$age.cd4_percent) #Adding VL labage.1 <- surv.df[,c("pid","age","vl")] labage.2 <- subset(d2, pid %in% surv.df$pid, select=c(pid,age,vl)) names(labage.2) <- c("pid","age","vl") labage.2 <- subset(labage.2, !is.na(vl)) pullData <- add.fun(labage.1, labage.2) surv.df$vl <- pullData[pullData$order==1,"vl"] surv.df$age <- round(surv.df$age, 5) surv.df$age.vl <- pullData[pullData$order==1, "age.var."] surv.df$time.to.vl <- 12*(surv.df$age - surv.df$age.vl) ## Need to ensure that for both cd4 and vl that age.cd4 and age.vl ## are less than age.fhaart when fhaart==1 # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.cd4 <- which(round(surv.df$age.fhaart,5) < round(surv.df$age.cd4,5) & surv.df$fhaart == 1 & round(surv.df$age.cd4,5) > round(surv.df$baseline.age,5)) surv.df$cd4[which.cd4] <- surv.df$cd4[(which.cd4-1)] surv.df$age.cd4[which.cd4] <- surv.df$age.cd4[(which.cd4-1)] # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.vl <- which(round(surv.df$age.fhaart,5) < round(surv.df$age.vl,5) & surv.df$fhaart == 1 & round(surv.df$age.vl,5) > round(surv.df$baseline.age,5)) surv.df$vl[which.vl] <- surv.df$vl[(which.vl-1)] surv.df$age.vl[which.vl] <- surv.df$age.vl[(which.vl-1)] # Adjusting those who started HAART and had a CD4 count in that month after # the age.fhaart which.cd4.pct <- which(round(surv.df$age.fhaart,5) < round(surv.df$age.cd4_percent,5) & surv.df$fhaart == 1 & round(surv.df$age.cd4_percent,5) > round(surv.df$baseline.age,5)) surv.df$cd4_percent[which.cd4.pct] <- surv.df$cd4_percent[(which.cd4.pct-1)] surv.df$age.cd4_percent[which.cd4.pct] <- surv.df$age.cd4_percent[(which.cd4.pct-1)] ## Last row no longer row where they fell below 200. It is now ## either: ## 1) where fhaart=1 for trt200=1 group, or ## 2) age.bl200 + avg.start for trt200=0 group ## !! Need to change this code here !!! ## Using locate.startrow...getting Inf for startrow locations???? locate.firstrow <- locate.rowlt200 <- NULL locate.startrow <- locate.lastrow <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.firstrow[i] <- min(which(surv.df$pid== unique(surv.df$pid)[i])) # & !is.na(sub.cd4))) locate.startrow[i] <- max(which(surv.df$pid== unique(surv.df$pid)[i] & round(surv.df$age,5) <= round(surv.df$baseline.age,5)),na.rm=TRUE) } ## Need to informatively censor subjects in trt200=0 group who either started ## HAART before falling below 100 or who didn't start HAART within ## "start.within" months of falling below 100 locate.cd4lt100 <- NULL for(i in 1:length(unique(surv.df$pid))){ locate.cd4lt100[i] <- min(which(surv.df$pid==unique(surv.df$pid)[i] & !is.na(surv.df$cd4) & surv.df$trt.group==0 & surv.df$pid %nin% unique(tmp.200$pid) & surv.df$cd4 < l.limit), na.rm=TRUE) } tmp <- data.frame("pid"=unique(as.character(surv.df$pid)), "age.less.llimit"=round(surv.df$age.cd4[locate.cd4lt100],5)) tmp$age.less.llimit <- as.numeric(as.character(tmp$age.less.llimit)) surv.df <- merge(surv.df, tmp, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$month),] surv.df$inform.censor <- ifelse(surv.df$trt.group == 0 & surv.df$pid %nin% tmp.200$pid & ((!is.na(surv.df$age.fhaart) & is.na(surv.df$age.less.llimit) & surv.df$age >= surv.df$age.fhaart) | (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age.fhaart < surv.df$age.less.llimit & surv.df$age >= surv.df$age.fhaart) | (!is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age.fhaart - surv.df$age.less.llimit > start.within & surv.df$age > surv.df$age.less.llimit + start.within) | (is.na(surv.df$age.fhaart) & !is.na(surv.df$age.less.llimit) & surv.df$age - surv.df$age.less.llimit > start.within)),1,0) # (all(is.na(surv.df$age.fhaart)))), 1, 0) ## If I were to keep the above line of code, then that censors folks who ## were in the trt200=0 group who never fell below 100 and never started ## HAART. They are not non-compliant. ## Removing rows from surv.df after: ## 1) A patient had an event ## 2) A patient was informatively censored ## 3) Age in surv.df exceeds that actual age of the last observation ## for that patient. (last one already done ## (1) id_list <- split(surv.df, surv.df$pid) #pat <- id_list[[1]] #dest <- which(diff(pat$event) < 0) + 1 #if(length(dest<- which(diff(pat$event) < 0) + 1)) pat[seq(from=dest, #to=nrow(pat)),] junk3 <- do.call(rbind, lapply(id_list, function(pat) pat[(seq(along=pat$event) < c(which(pat$event | pat$inform.censor), nrow(pat))[1] + 1) & pat$age <= (pat$age.last+1/12),])) ## For those records where inform.censor and event both = 1 ## need to determine which occurred first -- change in trt ## or event. junk3.b <- subset(junk3, event==1 & inform.censor==1) unique.id.b <- unique(junk3.b$pid) min.age <- NULL for(i in 1:length(unique.id.b)){ min.age[i] <- min(c(junk3.b$age.fhaart[junk3.b$pid==unique.id.b[i]], junk3.b$age.oi[junk3.b$pid==unique.id.b[i]], junk3.b$age.at.death[junk3.b$pid==unique.id.b[i]]), na.rm=TRUE) } junk3.b$event <- ifelse(round(junk3.b$age.oi,5)==round(min.age,5) | round(junk3.b$age_at_death,5)== round(min.age,5), 1, 0) junk3.b$inform.censor <- ifelse(junk3.b$age.fhaart==min.age & !is.na(junk3.b$age.fhaart),1, ifelse((is.na(junk3.b$age.fhaart) & junk3.b$age > min.age),0,1)) row.2.dl <- which(junk3$event==1 & junk3$inform.censor==1) if(length(row.2.dl) > 0){ junk3 <- junk3[-row.2.dl,] junk3 <- merge(junk3, junk3.b, all=TRUE) } junk3 <- junk3[order(junk3$pid,junk3$age),] surv.df <- junk3 return(list(surv.df=surv.df, avg.start=avg.start)) } analysis <- function(start.within=start.within,avg.start=avg.start, surv.df, No.CD4,IPW,No.PS){ ######### Pooled Logistic regression for censoring # library(Design) # surv.df$vl<-ifelse(surv.df$vl<400,399,surv.df$vl) surv.df$logvl<-log10(surv.df$vl) first.age<-unlist(tapply(d2$age[d2$pid %in% surv.df$pid], INDEX=d2$pid[d2$pid %in% surv.df$pid], FUN=head,n=1)) data.junk<-data.frame(pid=names(first.age),first.age=as.numeric(first.age)) surv.df<-merge(surv.df,data.junk,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$time.in.care<-surv.df$baseline.age-surv.df$first.age baseline.cd4 <- unlist(tapply(surv.df$cd4, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.cd4 <- unlist(tapply(surv.df$time.to.cd4, INDEX=surv.df$pid, FUN=head,n=1)) baseline.cd4_percent <- unlist(tapply(surv.df$cd4_percent, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.cd4_percent <- unlist(tapply(surv.df$time.to.cd4_percent, INDEX=surv.df$pid, FUN=head,n=1)) baseline.logvl <- unlist(tapply(surv.df$logvl, INDEX=surv.df$pid, FUN=head,n=1)) baseline.time.to.logvl <- unlist(tapply(surv.df$time.to.vl, INDEX=surv.df$pid, FUN=head,n=1)) ## Imputing baseline logvl based on their cd4 count impute.lm <- lm(baseline.logvl ~ sqrt(baseline.cd4)) baseline.logvl <- ifelse(is.na(baseline.logvl), impute.lm$coeff[1]+impute.lm$coeff[2]*sqrt(baseline.cd4), baseline.logvl) tmp.logvl <- data.frame(pid=names(baseline.logvl), baseline.logvl=as.numeric(baseline.logvl)) surv.df <- merge(surv.df, tmp.logvl, all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] surv.df$logvl <- ifelse(is.na(surv.df$logvl), surv.df$baseline.logvl, surv.df$logvl) # mod200.pl.1<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+ # prior.oi+prior.art+sqrt(cd4)+ cd4.pct + # sqrt(time.to.cd4)+logvl+sqrt(time.to.vl)+ # sqrt(time.in.care), # subset=month > (3-avg.start*12) & trt.group==0, # data=surv.df) trt200.ids <- unique(surv.df$pid[surv.df$trt.group == 1]) mod200.pl.1 <- lrm(inform.censor ~ rcs(month,4) + black + idu + male + baseline.age + rcs(sqrt(cd4),4) + # cd4_percent + logvl + sqrt(time.in.care), data=surv.df, subset=month > (start.within*12-avg.start*12) & trt.group==0 & pid %nin% trt200.ids) p200.cens.1<-exp(predict(mod200.pl.1))/(1+exp(predict(mod200.pl.1))) # mod200.pl.2<-lrm(inform.censor~rcs(month,4)+black+idu+male+baseline.age+ # prior.oi+ prior.art,subset=month >(3-avg.start*12) & # trt.group==0, # data=surv.df) mod200.pl.2 <- lrm(inform.censor ~ rcs(month,4)+black+idu+male+ baseline.age, subset=month > (start.within*12-avg.start*12) & trt.group==0 & pid %nin% trt200.ids,data=surv.df) p200.cens.2<-exp(predict(mod200.pl.2))/(1+exp(predict(mod200.pl.2))) sw200.init<-(1-p200.cens.2)/(1-p200.cens.1) sw.init.df<-data.frame(row.id=names(sw200.init),sw200.init) surv.200.junk<-data.frame(row.id=rownames(surv.df),surv.df) junk4<-merge(surv.200.junk,sw.init.df,by.x="row.id",by.y="row.id",all=TRUE) junk4 <- junk4[order(junk4$pid,junk4$age),] sw200.temp<-ifelse(is.na(junk4$sw200.init),1,junk4$sw200.init) sw200 <- unlist(tapply(sw200.temp, INDEX=surv.df$pid, FUN=cumprod)) library(geepack) # surv.df$rate.cd4.chg <- -with(surv.df, # (ref.cd4-cd4.gt200)/(age.ref.cd4-age.gt200)) # surv.df$lograte.cd4.chg <- log10(surv.df$rate.cd4.chg) tmp.cd4 <- data.frame(pid=names(baseline.cd4), baseline.cd4=as.numeric(baseline.cd4)) tmp.time.to.cd4 <- data.frame(pid=names(baseline.time.to.cd4), baseline.time.to.cd4=as.numeric(baseline.time.to.cd4)) tmp.cd4.pct <- data.frame(pid=names(baseline.cd4_percent), baseline.cd4_percent=as.numeric(baseline.cd4_percent)) tmp.time.to.cd4.pct <- data.frame(pid=names(baseline.time.to.cd4_percent), baseline.time.to.cd4_percent=as.numeric(baseline.time.to.cd4_percent)) tmp.logvl <- data.frame(pid=names(baseline.logvl), baseline.vl=as.numeric(baseline.logvl)) tmp.time.to.logvl <- data.frame(pid=names(baseline.time.to.logvl), baseline.time.to.logvl=as.numeric(baseline.time.to.logvl)) junk<-unique(subset(surv.df,select=c(pid,trt.group,baseline.age,idu, black,male, time.in.care,baseline.logvl))) junk <- junk[!duplicated(junk$pid),] junk$trt.group <- ifelse(junk$pid %in% trt200.ids,1,junk$trt.group) junk <- merge(merge(junk,tmp.cd4,all=TRUE),tmp.cd4.pct,all=TRUE) junk <- merge(junk, tmp.time.to.cd4,all=TRUE) junk <- merge(junk, tmp.time.to.cd4.pct,all=TRUE) junk <- merge(junk, tmp.logvl,all=TRUE) junk <- merge(junk, tmp.time.to.logvl,all=TRUE) ## add cd4, cd4pct, and logvl at baseline and then include in ## glm below...a few id's that have NA's for vl so don't ## get values for predict so error when creating merge.junk # mod200.ps<-glm(trt.group~baseline.age+idu+black+male+prior.oi+prior.art+ # sqrt(time.in.care)+baseline.cd4+baseline.cd4.pct+ # baseline.logvl+lograte.cd4.chg, # data=junk,family=binomial) if(No.CD4){ mod200.ps <- glm(trt.group~black+idu+male+baseline.age+ baseline.logvl+ sqrt(time.in.care),data=junk,family=binomial) } else { mod200.ps <- glm(trt.group~black+idu+male+baseline.age+ rcs(baseline.cd4,3) + baseline.logvl+ sqrt(time.in.care),data=junk,family=binomial) } ps200<-exp(predict(mod200.ps))/(1+exp(predict(mod200.ps))) merge.junk<-data.frame(pid=junk$pid,ps200=ps200) surv.df<-merge(surv.df,merge.junk,all=TRUE) surv.df <- surv.df[order(surv.df$pid,surv.df$age),] if(IPW){ surv.df$ps200.new <- ifelse(surv.df$trt.group == 1,surv.df$ps200, (1-surv.df$ps200)) } surv.df$pid1<-factor(surv.df$pid) event<-surv.df$event month<-surv.df$month id<-factor(surv.df$pid) sw200<-as.numeric(sw200) if(IPW){ ps200.new <- surv.df$ps200.new #-------------------------------------------# # If using IPW and no PS sw200 <- sw200/ps200.new #-------------------------------------------# } sw200.new <- ifelse(sw200 < quantile(sw200,probs=0.05), quantile(sw200,probs=0.05), ifelse(sw200 > quantile(sw200,probs=0.95), quantile(sw200,probs=0.95), sw200)) ps200<-surv.df$ps200 trt.group <- surv.df$trt.group rcs.month<-rcs(sqrt(month),3) rcs.month1<-as.vector(rcs.month[,1]) rcs.month2<-as.vector(rcs.month[,2]) library(sandwich) if(IPW | No.PS){ mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group, family=binomial, weights=sw200.new) } else { mod200 <- glm(event ~ rcs.month1 + rcs.month2 + trt.group + ps200,family=binomial, weights=sw200.new) } vbeta <- diag(sandwich(mod200))[names(mod200$coeff)=="trt.group"] # Taking reciprocal since interested in trt.group = 1 as reference beta <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]) uci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]-1.96*sqrt(vbeta)) lci <- 1/exp(mod200$coeff[names(mod200$coeff)=="trt.group"]+1.96*sqrt(vbeta)) stuff <- unique(subset(surv.df, select=c(pid,event)) ) locate <- NULL for(i in 1:length(unique(stuff$pid))){ locate[i] <- which(stuff$pid==unique(stuff$pid)[i] & stuff$event==max(stuff$event[stuff$pid==unique(stuff$pid)[i]])) } stuff3 <- stuff[locate,] stuff <- unique(subset(surv.df, select=c(pid,trt.group))) locate <- NULL for(i in 1:length(unique(stuff$pid))){ locate[i] <- which(stuff$pid == unique(stuff$pid)[i] & stuff$trt.group==max(stuff$trt.group[stuff$pid==unique(stuff$pid)[i]])) } stuff4 <- stuff[locate,] stuff3 <- merge(stuff3,stuff4,all=TRUE) table.trtgroup <- table(stuff3$trt.group,stuff3$event) n.ic <- sum(surv.df$inform.censor) return(list(beta=beta, lci=lci, uci=uci, table.trtgroup=table.trtgroup, n.ic=n.ic, surv.df=surv.df)) } #------------------------# # Loading data # #------------------------# sustiva <- FALSE # d2 <- read.dta("when_to_start_01oct2007.dta") # d2 <- read.dta("when_to_start_19nov2009.dta") d2 <- read.dta("when_to_start_03dec2009.dta") d2 <- upData(d2, rename=c(cfar_pid="pid", ivdu="idu", indicator_first_haart_0_1="fhaart", # age_at_first_haart="age.fhaart", cd4_count="cd4", indicator_oi_0_1="oi", age_at_oi="age.oi", indicator_non_haart_art_0_1="art"), drop=c("age_at_first_haart", "fhaart")) d2$male <- d2$sex - 1 # non.ade <- read.table("w2s_non_ades_dx_17feb2009.csv",header=TRUE,sep=",") non.ade <- read.table("w2s_non_ades_dx_18nov2009.csv",header=TRUE,sep=",") names(non.ade) <- tolower(names(non.ade)) non.ade.list <- c("CORONARY ARTERY DISEASE, NOS", "ACUTE MYOCARDIAL INFARCTION", "CIRRHOSIS OF LIVER, NOS", "ESOPHAGEAL VARICES, NOS", "HEPATIC FAILURE, NOS", "ADENOCARCINOMA, INTESTINAL TYPE", "MALIGNANT NEOPLASM OF COLON, UNSPECIFIED", "SMALL CELL CARCINOMA, NOS", "MELANOMA OF SKIN, SITE UNSPECIFIED", "SQUAMOUS CELL CARCINOMA OF ANUS", "END STAGE RENAL DISEASE", "ACUTE VIRAL HEPATITIS C WITH HEPATIC COMA", "ALCOHOLIC CIRRHOSIS OF LIVER", "CEREBRAL ARTERY OCCLUSION, UNSPECIFIED, WITH CEREBRAL INFARCTION", "CEREBRAL HEMORRHAGE", "CHRONIC HEPATITIS C WITH HEPATIC COMA", "CHRONIC LIVER DISEASE AND CIRRHOSIS", "CIRRHOSIS OF LIVER", "CORONARY ARTERIOSCLEROSIS", "CORONARY ARTERY DISEASE", "CORONARY ATHEROSCLEROSIS", "HEPATIC ENCEPHALOPATHY", "MELANOMA OF SKIN, SITE UNSPECIFIED", "MYOCARDIAL INFARCTION, NOS", "HEPATOCELLULAR CARCINOMA, NOS", "SQUAMOUS CELL CARCINOMA, KERATINIZING, NOS", "SUBARACHNOID HEMORRHAGE", "ANGINA PECTORIS", "ASCITES, NOS", "CARDIOVASCULAR DISEASE, NOS", "CEREBROVASCULAR ACCIDENT, NOS", "CHRONIC CONGESTIVE HEART FAILURE", "CHRONIC HYPERTENSION IN OBSTETRIC CONTEXT, NOS", "CHRONIC KIDNEY DISEASE, STAGE V", "CONGESTIVE HEART DISEASE", "CONGESTIVE HEART FAILURE", paste("DIABETES MELLITUS WITHOUT MENTION OF COMPLICATION,", " TYPE II OR UNSPECIFIED TYPE, NOT STATES AS UNCONTROLLED",sep=""), "ESOPHAGEAL VARICES WITHOUT MENTION OF BLEEDING", "HYPERTENSION IN THE OBSTETRIC CONTEXT, NOS", "MALIGNANT NEOPLASM OF RECTUM", "PRE-ECLAMPSIA, NOS", "SQUAMOUS CELL CARCINOMA, NOS") non.ade <- subset(non.ade, non_ade_diagnosis %in% non.ade.list) non.ade <- upData(non.ade, rename=c(cfar_pid="pid", age_at_non_ade_diagnosis_onset= "age.nonade.onset")) # map.icd9 <- read.table("Non-AIDS CCC vs Africa_14Nov08.csv", # header=TRUE, sep=",") # names(map.icd9) <- tolower(names(map.icd9)) # map.icd9 <- upData(map.icd9, drop="x") # trt <- read.table("when_to_start_art_16oct2008.csv",header=TRUE,sep=",") # trt <- read.table("when_to_start_art_18nov2009.csv",header=TRUE,sep=",") trt <- read.table("when_to_start_art_03dec2009.csv",header=TRUE,sep=",") names(trt) <- tolower(names(trt)) trt$art.0 <- ifelse(trt$haart == 1,0, ifelse(trt$haart == 0 & trt$regimen == " ",0,1)) trt <- upData(trt, rename=list(cfar_pid="pid", age_at_rx_end="age.rx.end", age_at_rx_start="age.rx.start", first_haart="fhaart")) # oi.df <- read.csv("when_to_start_year1_oi_01dec2008.csv", header=TRUE, # sep=",") oi.df <- read.csv("when_to_start_year1_oi_18nov2009.csv", header=TRUE, sep=",") names(oi.df) <- tolower(names(oi.df)) oi.df <- upData(oi.df, rename=c(cfar_pid="pid", year_of_first_visit="year.1st.visit", indicator_oi_0_1="oi", age_at_oi="age.oi", oi_1="oi.1", oi_2="oi.2", oi_3="oi.3", oi_4="oi.4")) oi.df$oi.2 <- ifelse(oi.df$oi.2 == "",NA,as.character(oi.df$oi.2)) oi.df$oi.3 <- ifelse(oi.df$oi.3 == "",NA,as.character(oi.df$oi.3)) oi.df$oi.4 <- ifelse(oi.df$oi.4 == "",NA,as.character(oi.df$oi.4)) oi.class <- read.csv("OIList_final.csv", header=TRUE) names(oi.class) <- tolower(names(oi.class)) oi.class$oi <- tolower(as.character(oi.class$oi)) oi.class$class <- as.character(oi.class$class) oi.class$class <- gsub("^([ \t\n\r\f\v]+)|([ \t\n\r\f\v]+)$", "", oi.class$class) oi.tmp <- unlist(strsplit(oi.class$oi, split="\"")) oi.tmp <- oi.tmp[oi.tmp != ""] oi.class$oi <- oi.tmp # # Some PIDs were not HAART naive at start of study # delete.pids <-trt$pid[trt$haart_before_first_visit==1] # d2 <- subset(d2, pid %nin% delete.pids) # trt <- subset(trt, pid %nin% delete.pids) # oi.df <- subset(oi.df, pid %nin% delete.pids) # oi.df <-subset(oi.df, oi == 1) # non.ade <- subset(non.ade, pid %nin% delete.pids) save(trt,file="trt.rda",compress=TRUE) # ## Need to remove first record of ID=993147H # tmp.d2 <- subset(d2, pid != "993147H") # tmp2.d2 <- subset(d2, pid == "993147H") # tmp2.d2 <- tmp2.d2[duplicated(tmp2.d2$pid),] # # Also need to remove this info from oi.df # which.row <- which(oi.df$pid == "993147H" & round(oi.df$age.oi == # 21.457,5)) # oi.df <- oi.df[-which.row,] # # d2 <- merge(tmp.d2,tmp2.d2,all=TRUE) # d2 <- d2[order(d2$pid, d2$age),] # Having to do this because one pid had an age.oi > age.death d2$age_at_death <- ifelse(is.na(d2$age_at_death), NA, ifelse(!is.na(d2$age.oi) & d2$age.oi > d2$age_at_death, d2$age.oi, d2$age_at_death)) # save(d2, file="d2.rda", compress=TRUE) # age.fhaart <- subset(d2, !is.na(age.fhaart), select=c(pid,age.fhaart)) # d2 <- upData(d2, drop=c("age.fhaart")) # d2 <- merge(d2, age.fhaart, all=TRUE) # d2 <- d2[order(d2$pid, d2$age),] # Collapsing duplicate ages for a given PID d2$age <- as.integer(round(d2$age*365.25)) source("collapseSolution.R") save(new.d2, file="newd2.rda") d2 <- new.d2 d2$age <- d2$age/365.25 save(d2, file="d2.rda") no.visits <- tapply(d2$pid, INDEX=d2$pid, FUN=length) #---------------------------------------# # Adding in treatment data # #---------------------------------------# source("solution.R") save(new.sub, file="newsub.rda") new.sub <- new.sub[order(new.sub$pid, new.sub$age),] age.fhaart.df <- data.frame("pid"=trt$pid[trt$fhaart==1 & !is.na(trt$fhaart)], "age.fhaart"=trt$age.rx.start[trt$fhaart == 1 & !is.na(trt$fhaart)]) d2 <- new.sub d2$age <- round(d2$age/365.25,3) d2$age.rx.start <- round(d2$age.rx.start/365.25,3) d2$age.rx.end <- round(d2$age.rx.end/365.25,3) d2 <- merge(d2, age.fhaart.df, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] d2$age.fhaart <- round(d2$age.fhaart/365.25,3) d2$oi <- ifelse(is.na(d2$oi), 0, d2$oi) d2 <- upData(d2, drop="art") d2 <- upData(d2, rename=c(art.0="art")) no.visits.2 <- tapply(d2$pid, INDEX=d2$pid, FUN=length) save(d2, file="postsold2.rda") age.lt.age.fhaart <- unique(d2$pid[which(d2$age < d2$age.fhaart)]) if(sustiva){ d2$sustiva <- sapply(as.character(d2$regimen), FUN=function(y){ as.numeric("SUSTIVA" %in% unlist(strsplit(y, split=" ")))}, simplify=TRUE) locate.sus <- NULL for(i in 1:length(unique(d2$pid))){ locate.sus[i] <- min(which(d2$pid == unique(d2$pid)[i] & d2$sustiva == 0 & d2$regimen != " " & round(d2$age,5) == round(d2$age.fhaart,5))) } sus.df <- data.frame("pid"=unique(d2$pid), "sustiva.age.censor"=d2$age[locate.sus]) } else { sus.df <- data.frame("pid"=unique(d2$pid), "sustiva.age.censor"=NA) } d2 <- merge(d2, sus.df, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] first.age <- unlist(tapply(d2$age, INDEX=d2$pid, FUN=head, n=1)) tmp <- data.frame("pid"=names(first.age), "first.age"=first.age) d2 <- merge(d2, tmp, all=TRUE) d2 <- d2[order(d2$pid, d2$age),] unique.id<-unique(d2$pid) #----------------------------------------------------------# # start.within is expressed in years, not months, and is how treatment # groups are assigned (ie., start HAART within "start.within" months of # falling below the upper limit # original is a logical. When TRUE, all assigned to trt 0 when they fall # below the upper limit until they start HAART within start.within years. # When FALSE, all assigned to trt group based on whether the time between # age.less.ulimit and age.fhaart within start.within years. # No.CD4 is a logical indicating whether CD4 should be used in computing # the propensity scores. IPW is a logical indicating whether no # propensity score should be used in the model but rather incorporate them # into the weights. start.within <- 1/4 old.code <- FALSE original <- FALSE No.CD4 <- FALSE IPW <- FALSE No.PS <- FALSE save(d2, file="d2.rda",compress=TRUE) df.500.400 <- create.df(start.within=start.within,restrict=FALSE, u.limit=500,l.limit=400,d2=d2,old.code=old.code, original=original) save(df.500.400,file="df500400.rda",compress=TRUE) load("df500400.rda") surv.500.400 <- df.500.400$surv.df surv.500.400 <- surv.500.400[order(surv.500.400$pid,surv.500.400$month),] avg.start <- df.500.400$avg.start analysis1 <- analysis(start.within=start.within,avg.start=avg.start, surv.df=surv.500.400,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis1, file="analysis500400.rda",compress=TRUE) beta.500.400 <- analysis1[[1]] lci.500.400 <- analysis1[[2]] uci.500.400 <- analysis1[[3]] table.500.400 <- analysis1[[4]] n.ic.500.400 <- analysis1[[5]] surv.500.400 <- analysis1[[6]] #----------------------------------------------------------# df.400.300 <- create.df(start.within=start.within,restrict=FALSE, u.limit=400,l.limit=300,d2=d2,old.code=old.code, original=original) save(df.400.300,file="df400300.rda",compress=TRUE) load("df400300.rda") surv.400.300 <- df.400.300$surv.df surv.400.300 <- surv.400.300[order(surv.400.300$pid,surv.400.300$month),] avg.start <- df.400.300$avg.start analysis2 <- analysis(start.within=start.within,avg.start=avg.start, surv.df=surv.400.300,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis2, file="analysis400300.rda",compress=TRUE) beta.400.300 <- analysis2[[1]] lci.400.300 <- analysis2[[2]] uci.400.300 <- analysis2[[3]] table.400.300 <- analysis2[[4]] n.ic.400.300 <- analysis2[[5]] surv.400.300 <- analysis2[[6]] #----------------------------------------------------------# df.300.200 <- create.df(start.within=start.within,restrict=FALSE, u.limit=300,l.limit=200,d2=d2,old.code=old.code, original=original) save(df.300.200,file="df300200.rda",compress=TRUE) load("df300200.rda") surv.300.200 <- df.300.200$surv.df surv.300.200 <- surv.300.200[order(surv.300.200$pid,surv.300.200$month),] avg.start <- df.300.200$avg.start analysis3 <- analysis(start.within=start.within,avg.start=avg.start, surv.df=surv.300.200,No.CD4=No.CD4,IPW=IPW,No.PS=No.PS) save(analysis3, file="analysis300200.rda",compress=TRUE) beta.300.200 <- analysis3[[1]] lci.300.200 <- analysis3[[2]] uci.300.200 <- analysis3[[3]] table.300.200 <- analysis3[[4]] n.ic.300.200 <- analysis3[[5]] surv.300.200 <- analysis3[[6]]