##### This file sets up the CNICS and Vanderbilt datasets in preparation for
##### dynamic marginal structural model analyses.

##### Reading in CNICS data.
rm(list = ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

rf <- read.csv("RiskFactor2.csv")
mort <- read.csv("Mortality2.csv")
med <- read.csv("Medication2.csv")
lab <- read.csv("Lab2.csv")
dia <- read.csv("Diagnosis2.csv")
dems <- read.csv("Demographic2.csv")

dim(dems)
length(unique(dems$StudyId))

head(med)
table(med$MedicationName)
dim(med)
med <- med[med$MedicationName != "ADEFOVIR", ]
dim(med)
length(unique(med$StudyId))

##### Translating medication names into comma-separated component drug codes.
##### Names that are not in the table are passed through unchanged.
##### Fixed relative to the earlier version of this table:
#####   EPZICOM is abacavir + lamivudine   -> "3TC,ABC" (was "3TC,AZT")
#####   TRUVADA is emtricitabine + tenofovir -> "FTC,TDF" (was "FTC,3TC")
art.codes <- c(
  "ABACAVIR" = "ABC",
  "ADEFOVIR" = "hep B drug",
  "AMPRENAVIR" = "APV",
  "ATAZANAVIR" = "ATZ",
  "ATRIPLA" = "EFV,FTC,TDF",
  "COMBIVIR" = "3TC,AZT",
  "COMPLERA" = "FTC,RPV,TDF",
  "DARUNAVIR" = "DRV",
  "DELAVIRDINE" = "DLV",
  "DIDANOSINE" = "DDI",
  "EFAVIRENZ" = "EFV",
  "EMTRICITABINE" = "FTC",
  "ENFUVIRTIDE" = "INN",
  "EPZICOM" = "3TC,ABC",
  "ETRAVIRINE" = "ETR",
  "FOSAMPRENAVIR" = "FPV",
  "Hx ARV Treatment, Med unknown" = "unknown ARV",
  "Hx PI Treatment, Med unknown" = "unknown PI",
  "INDINAVIR" = "IDV",
  "KALETRA" = "LPV,RTV",
  "LAMIVUDINE" = "3TC",
  "LOPINAVIR" = "LPV",
  "MARAVIROC" = "MRV",
  "NELFINAVIR" = "NFV",
  "NEVIRAPINE" = "NVP",
  "RALTEGRAVIR" = "RAL",
  "RILPIVIRINE" = "RPV",
  "RITONAVIR" = "RTV",
  "RITONAVIR BOOSTED" = "RTV",
  "SAQUINAVIR HARD CAP" = "SQV",
  "SAQUINAVIR SOFT GEL" = "SQV",
  "SAQUINAVIR UNSPECIFIED" = "SQV",
  "STAVUDINE" = "D4T",
  "STRIBILD" = "EVG,FTC,TDF",
  "TENOFOVIR" = "TDF",
  "TIPRANAVIR" = "TPV",
  "TRIZIVIR" = "3TC,ABC,AZT",
  "TRUVADA" = "FTC,TDF",
  "ZALCITABINE" = "DDC",
  "ZIDOVUDINE" = "AZT"
)
art <- med$MedicationName
art.name <- as.character(art)
art.mapped <- unname(art.codes[art.name])
art1 <- ifelse(is.na(art.mapped), art.name, art.mapped)

#### Many people only have data up to the month-level in accuracy; this makes
#### me do the following work-around stuff: an unknown day ("00") becomes the
#### 15th and an unknown month ("00") becomes July.
start.date1 <- med$StartDate
stuff <- matrix(unlist(strsplit(as.character(start.date1), "-")),
                ncol = 3, byrow = TRUE)
start.year <- stuff[, 1]
start.month <- stuff[, 2]
start.day <- stuff[, 3]
start.day1 <- ifelse(start.day == "00", "15", start.day)
start.month1 <- ifelse(start.month == "00", "07", start.month)
start.date.approx <- ifelse(start.month == "00", "y",
                     ifelse(start.month != "00" & start.day == "00", "m", "d"))
start.date <- paste(start.year, start.month1, start.day1, sep = "-")

####### This new date variable includes an approximation variable next to it
####### (y=known up to year, m=known up to month, d=known up to day/no
####### approximation, n=unknown date)
id <- med$StudyId
unique.id <- unique(id)
start.date <- as.Date(start.date)

####### Finding date of first ARV regimen: for each patient, the first row
####### that carries that patient's earliest start date.
locate1 <- rep(NA_integer_, length(unique.id))
for (i in seq_along(unique.id)) {
  mine <- id == unique.id[i]
  locate1[i] <- which(mine & start.date == min(start.date[mine], na.rm = TRUE))[1]
}
init.date <- as.Date(start.date[locate1])
init.date.approx <- start.date.approx[locate1]
#### I should exclude those with init.date.approx=="y" because I only know
#### start date up to the year. This is done later.

#### Finding initial regimens: every drug code started on init.date.
#### which() drops rows whose start date is NA; indexing with the raw logical
#### vector would insert a literal "NA" token into the regimen string.
reg <- rep(NA_character_, length(unique.id))
for (i in seq_along(unique.id)) {
  on.init <- which(id == unique.id[i] & start.date == init.date[i])
  reg[i] <- paste(art1[on.init], collapse = ",", sep = "")
}
#### Split combination products into components, de-duplicate, and sort so
#### that the same regimen always yields the same string.
init.reg <- vapply(strsplit(as.character(reg), ","),
                   function(drugs) paste(sort(unique(drugs)), collapse = ","),
                   character(1))

#### Checking to see if they are HAART. This function excludes those that are
#### obviously not HAART.
####   init.reg: character vector of sorted, comma-separated drug codes.
####   Returns 0 for a regimen on the non-HAART list (including the empty
####   regimen ""), 1 for any other regimen, and NA where init.reg is NA.
reg.haart <- function(init.reg) {
  non.haart <- c(
    "3TC", "3TC,ABC", "3TC,AZT", "3TC,D4T", "3TC,EFV", "3TC,FTC", "3TC,TDF",
    "ATZ", "ATZ,RTV", "AZT,IDV", "D4T,DDI", "D4T,FTC", "DDI,EFV", "DDI,IDV",
    "DDI,TDF", "DLV,EFV", "DLV,NFV", "DRV", "DRV,RTV", "EFV", "EFV,FTC",
    "EFV,TDF", "FPV,RTV", "FTC", "IDV", "LPV,RTV", "MRV", "NVP", "RAL", "RTV",
    "unknown ARV", "3TC,DDI", "3TC,NVP", "3TC,NFV", "3TC,RAL", "ABC",
    "ATZ,EFV", "AZT", "D4T,EFV", "D4T,NFV", "DDI", "ETR", "ETR,RAL", "NFV",
    "RTV,SQV", "TDF", "D4T,NVP", "D4T", "ABC,NFV",
    # Truvada on its own (two NRTIs) now expands to "FTC,TDF"; it must stay
    # non-HAART, as it was when it was mis-coded as "3TC,FTC".
    "FTC,TDF",
    ""
  )
  ifelse(is.na(init.reg), NA, ifelse(init.reg %in% non.haart, 0, 1))
}
haart1 <- reg.haart(init.reg)

##### Some people who started a non-HAART regimen may have started a HAART
##### regimen shortly thereafter. For example, sometimes people are put on a
##### single drug at first and then receive the full regimen after one week.
##### The following code is capturing these people, using a <32 day cutoff.
##### I end up including roughly 20 more patients.
##### For patients whose first regimen is not HAART and who have more than one
##### distinct start date, find the first row carrying their next start date.
locate2 <- rep(NA, length(unique.id))
for (i in seq_along(unique.id)) {
  mine <- id == unique.id[i]
  if (isTRUE(haart1[i] == 0) && length(unique(start.date[mine])) > 1) {
    later <- start.date[mine & start.date != init.date[i]]
    locate2[i] <- which(mine & start.date == min(later, na.rm = TRUE))[1]
  }
}
second.date <- as.Date(start.date[locate2])
second.date.approx <- start.date.approx[locate2]
diff.dates <- as.numeric(second.date - init.date)

##### Stop dates: unknown day -> 15th, unknown month -> July, unknown year
##### ("0000") -> sentinel year 3000 with approximation code "n".
stop.date1 <- med$EndDate
stuff <- matrix(unlist(strsplit(as.character(stop.date1), "-")),
                ncol = 3, byrow = TRUE)
stop.year <- stuff[, 1]
stop.month <- stuff[, 2]
stop.day <- stuff[, 3]
stop.day1 <- ifelse(stop.day == "00", "15", stop.day)
stop.month1 <- ifelse(stop.month == "00", "07", stop.month)
stop.year1 <- ifelse(stop.year == "0000", "3000", stop.year)
stop.date.approx <- ifelse(stop.year == "0000", "n",
                    ifelse(stop.year != "0000" & stop.month == "00", "y",
                    ifelse(stop.year != "0000" & stop.month != "00" & stop.day == "00", "m", "d")))
stop.date <- paste(stop.year1, stop.month1, stop.day1, sep = "-")

init.stop.date <- as.Date(stop.date[locate1])
init.stop.date.approx <- stop.date.approx[locate1]

##### Drugs started on the second start date (NA when there is none).
reg2 <- rep(NA, length(unique.id))
for (i in seq_along(unique.id)) {
  if (!is.na(second.date[i])) {
    on.second <- which(id == unique.id[i] & start.date == second.date[i])
    reg2[i] <- paste(art1[on.second], collapse = ",", sep = "")
  }
}
second.reg <- vapply(strsplit(as.character(reg2), ","),
                     function(drugs) paste(sort(unique(drugs)), collapse = ","),
                     character(1))

table(second.reg[diff.dates < 32 & second.date.approx != "y"])
table(second.reg[diff.dates < 32 & second.date.approx != "y" & init.stop.date > second.date])
##### NOTE(review): the source was truncated between "init.stop.date" and
##### "second.date&init.stop.date.approx" (text between a "<" and a ">" was
##### lost). The table() call below and the definition of `junk` are a
##### reconstruction from the surviving fragments -- confirm against the
##### original script.
table(second.reg[diff.dates < 32 & second.date.approx != "y" & init.stop.date < second.date])

##### junk = 1 when the second regimen began within 32 days, its date is known
##### at least to the month, and the first regimen was still running when it
##### began. Missing comparisons count as 0 so that no NA reaches the test.
overlap <- diff.dates < 32 & second.date.approx != "y" &
  init.stop.date > second.date & init.stop.date.approx != "y"
junk <- ifelse(!is.na(overlap) & overlap, 1, 0)

junk1 <- rep(NA, length(unique.id))
junk1[junk == 1] <- paste(init.reg[junk == 1], second.reg[junk == 1], sep = ",")
reg1and2 <- vapply(strsplit(as.character(junk1), ","),
                   function(drugs) paste(sort(unique(drugs)), collapse = ","),
                   character(1))

##### Upgrade to HAART when the combined first + second regimen is HAART.
##### (The source read "haart1==0®.haart(...)": "&reg" had been converted to
##### the registered-trademark character.)
haart <- ifelse(haart1 == 0 & reg.haart(reg1and2) == 1, 1, haart1)

##### Persons to include those whose date of initiation is known at least up to
##### the month. I'm also including those who started HAART (excluding those
##### who start a non-HAART regimen). This is consistent with what Kitahata et
##### al. did, but perhaps not correct -- they should probably be censored. In
##### the latest version (2013/11/7), I include those who start a non-HAART
##### regimen. I will censor these patients. Those with an unknown date of ART
##### start will be excluded.

# include<-ifelse(haart==1&init.date.approx!="y",1,0) ### old way
##### A missing approximation code (no usable start date) is excluded rather
##### than carried along as NA.
include <- ifelse(!is.na(init.date.approx) & init.date.approx != "y", 1, 0)

#### For purposes of computing loss to follow-up, we'll also include the last
#### visit date from this table. WORRY ABOUT THIS LATER.
locate3 <- rep(NA_integer_, length(unique.id))
locate4 <- rep(NA_integer_, length(unique.id))
for (i in seq_along(unique.id)) {
  mine <- id == unique.id[i]
  locate3[i] <- which(mine & start.date == max(start.date[mine], na.rm = TRUE))[1]
  # Unknown stop dates are flagged in stop.date.approx ("n"), not in stop.date
  # itself; filtering on stop.date != "n" kept them all, so the year-3000
  # sentinel always won the max. stop.date is still a "YYYY-MM-DD" string
  # here, so max() compares text, which orders the same way as the dates.
  known <- which(mine & stop.date.approx != "n")
  if (length(known) > 0) {
    locate4[i] <- known[stop.date[known] == max(stop.date[known], na.rm = TRUE)][1]
  }
}
max.start.date <- as.Date(start.date[locate3])
max.start.date.approx <- start.date.approx[locate3]
max.stop.date <- as.Date(stop.date[locate4])
max.stop.date.approx <- stop.date.approx[locate4]

#### Creating the antiretroviral table
d <- data.frame(id = unique.id[include == 1],
                init.date = init.date[include == 1],
                init.date.approx = init.date.approx[include == 1],
                haart = haart[include == 1])

#### Merging with demographics table
#### First I'm going to simplify the dems table only taking what I'm
#### interested in.
dems$black <- ifelse(dems$Race == "Black", 1, 0)
dems$white <- ifelse(dems$Race == "White", 1, 0)
dems$Hispanic <- NULL
dems$Race <- NULL
dems$male <- ifelse(dems$PresentSex == "Female", 0, 1)
dems$Transgendered <- NULL
dems$BirthSex <- NULL
dems$PresentSex <- NULL

##### Creating an artificial date of birth as BirthYear-07-15.
##### Artificial date of birth: 15 July of the birth year. A missing birth year
##### stays missing instead of becoming the string "NA-07-15".
dob <- ifelse(is.na(dems$BirthYear), NA, paste0(dems$BirthYear, "-07-15"))
dems$dob <- as.Date(dob)
dems$BirthYear <- NULL

#### Reading in risk factors; I'm really only interested in IDU
head(rf)
length(unique(rf$StudyId)) ### A single ID can have more than one row in this dataset
idu.risks <- c("Injection drug use",
               "Men who have sex with men and are an injection drug user")
rf$idu <- ifelse(rf$Risk %in% idu.risks, 1, 0)
#### One 0/1 indicator per patient: 1 if any of their rows reports IDU.
#### Summing the rows gave 2 for patients listed under both categories, and
#### gave 0 whenever one of a patient's Risk values was missing.
idu.id <- unique(rf$StudyId)
idu <- as.numeric(idu.id %in% rf$StudyId[rf$idu == 1])
drf <- data.frame(id = idu.id, idu)

dems1 <- merge(dems, drf, by.x = "StudyId", by.y = "id", all.x = TRUE)
dems1$id <- dems1$StudyId
dems1$StudyId <- NULL

##### CD4 counts
lab.cd4 <- lab[lab$TestName == "CD4 cell absolute" & lab$StudyId %in% dems1$id, ]
#### ResultDate is split at the space; only the first part (the date) is used.
trash <- matrix(unlist(strsplit(as.character(lab.cd4$ResultDate), " ")),
                ncol = 2, byrow = TRUE)
cd4.d <- trash[, 1]
#### cd4.d is not passed to as.Date() directly: it may still contain "00"
#### month/day placeholders, which are replaced below first.
stuff <- matrix(unlist(strsplit(as.character(cd4.d), "-")),
                ncol = 3, byrow = TRUE)
cd4.year <- stuff[, 1]
cd4.month <- stuff[, 2]
cd4.day <- stuff[, 3]
cd4.day1 <- ifelse(cd4.day == "00", "15", cd4.day)
cd4.month1 <- ifelse(cd4.month == "00", "07", cd4.month)
cd4.date.approx <- ifelse(cd4.month == "00", "y",
                   ifelse(cd4.month != "00" & cd4.day == "00", "m", "d"))
cd4.date <- as.Date(paste(cd4.year, cd4.month1, cd4.day1, sep = "-"))

head(lab.cd4)
table(lab.cd4$Units) ## all units are the same
table(lab.cd4$Interpretation) ### I'm ignoring Interpretation, DataSource, and Historical for now.
table(lab.cd4$DataSource)
table(lab.cd4$Historical)

### The following CD4 measurements are not exact.
unique(lab.cd4$Result[which(is.na(as.numeric(as.character(lab.cd4$Result))))])

### We'll replace the inexact CD4 measurements with the following:
cd4 <- lab.cd4$Result
cd4.fix <- c("330 CELLS" = "330", "150 (?)" = "150", "~500" = "500",
             "<200" = "190", "<10" = "5", "<1" = "0", "<150" = "140",
             "<20" = "10", "<500" = "490", ">500" = "510", ">700" = "710",
             "~550" = "550", "~71" = "71")
cd4.chr <- as.character(cd4)
cd4.sub <- unname(cd4.fix[cd4.chr])
cd4.new <- ifelse(is.na(cd4.sub), cd4.chr, cd4.sub)
### Any other non-numeric result becomes NA here (with a coercion warning).
cd4.new <- as.numeric(cd4.new)

d.cd4 <- data.frame(id = lab.cd4$StudyId, cd4.date, cd4.date.approx, cd4 = cd4.new)

##### Saving stuff so I don't have to re-run all of the above each time;
save(d, dems1, d.cd4, file = "cnics-hernan-dems-art-cd4.Rda")

##############################################################################
##### This file is different from the similarly named file for the Cole analysis.

####### Starting with a clean slate
rm(list = ls())
load("cnics-hernan-dems-art-cd4.Rda")

##### Now let's look at the clinical endpoint data
mort <- read.csv("Mortality2.csv")

##### Date of death is only known up to the month; therefore I will set all
##### death dates as on the 15th.
##### Death dates: unknown day ("00") -> 15th, unknown month ("00") -> July.
##### death.date is left as a "YYYY-MM-DD" string.
death.date1 <- mort$DeathDate
stuff <- matrix(unlist(strsplit(as.character(death.date1), "-")),
                ncol = 3, byrow = TRUE)
death.year <- stuff[, 1]
death.month <- stuff[, 2]
death.day <- stuff[, 3]
death.day1 <- ifelse(death.day == "00", "15", death.day)
death.month1 <- ifelse(death.month == "00", "07", death.month)
death.date.approx <- ifelse(death.month == "00", "y",
                     ifelse(death.month != "00" & death.day == "00", "m", "d"))
death.date <- paste(death.year, death.month1, death.day1, sep = "-")

mort1 <- data.frame(id = mort$StudyId, death = 1, death.date, death.date.approx)

##### Now clinical diagnoses
dia <- read.csv("Diagnosis2.csv")

clin.date1 <- dia$DiagnosisDate
stuff <- matrix(unlist(strsplit(as.character(clin.date1), "-")),
                ncol = 3, byrow = TRUE)
clin.year <- stuff[, 1]
clin.month <- stuff[, 2]
clin.day <- stuff[, 3]
clin.day1 <- ifelse(clin.day == "00", "15", clin.day)
clin.month1 <- ifelse(clin.month == "00", "07", clin.month)
clin.year1 <- ifelse(clin.year == "0000", "3000", clin.year)
clin.date.approx <- ifelse(clin.year == "0000", "n",
                    ifelse(clin.year != "0000" & clin.month == "00", "y",
                    ifelse(clin.year != "0000" & clin.month != "00" & clin.day == "00", "m", "d")))
##### Build the date from clin.year1 so that an unknown year ("0000") gets the
##### 3000 sentinel. The date used to be built from the raw clin.year, which
##### left clin.year1 unused and unknown dates in year 0000.
clin.date <- paste(clin.year1, clin.month1, clin.day1, sep = "-")

clin <- data.frame(id = dia$StudyId, ade = 1, ade.date = clin.date,
                   ade.date.approx = clin.date.approx)

######### Now I'm ready to merge everything
######### First, I'll only use those records which met inclusion criteria based on ART.
##### Merging step. The statements on the line below, in order:
#####   - restrict mort1 and clin to ids present in dems1, and drop exact
#####     duplicate rows from clin1 and d.cd4;
#####   - d1 = d merged with mort1 (all rows); d2 = d1 merged with dems1 keeping
#####     every dems1 row; missing death/haart are set to 0;
#####   - d3 = d.cd4 merged with clin1 on id and date (all rows), with vis.date
#####     and vis.date.approx derived from the CD4 / ADE date columns;
#####   - d4 = d2 merged with d3 by id; haart is copied to haart.not.art and
#####     rows without a vis.date are dropped.
##### NOTE(review): this line is damaged. The statement starting
##### "d4$haart<-with(d4,ifelse(is.na(init.date)|vis.date" breaks off and is
##### followed directly by "0) { year.haart[i]<-cl$...", the tail of a loop
##### whose header is missing. Everything between the two is absent from this
##### file, including whatever defines cl, select, age.last, age.haart1, trt,
##### oi and writes "cnics-hernan-stuff5a.Rda", all of which later code
##### relies on. Restore this span from the original script before running.
##### NOTE(review): the statements are also run together on one physical
##### line, so everything after the first "#" is currently read as a comment.
mort1<-mort1[mort1$id %in% dems1$id,] clin1<-clin[clin$id %in% dems1$id,] clin1<-clin1[!duplicated(clin1),] ## Only counting one ADE per day per person d.cd4<-d.cd4[!duplicated(d.cd4),] ## Removing duplicate CD4 measurements d1<-merge(d,mort1,by="id",all=TRUE) d2<-merge(d1,dems1,by="id",all.y=TRUE) d2$death <- ifelse(is.na(d2$death), 0, d2$death) d2$haart <- ifelse(is.na(d2$haart), 0, d2$haart) #### now merge time-dependent variables (cd4 and ade) d3 <- merge(d.cd4, clin1, by.x=c("id", "cd4.date"), by.y=c("id", "ade.date"), all=TRUE) d3$vis.date <- d3$cd4.date d3$vis.date.approx<-with(d3,as.character(ifelse(is.na(cd4.date.approx),as.character(ade.date.approx), ifelse(is.na(ade.date.approx),as.character(cd4.date.approx), ifelse(ade.date.approx=="m"|cd4.date.approx=="m","m",as.character(cd4.date.approx)))))) d3$cd4.date <- NULL #### now merge all together d4 <- merge(d2, d3, by="id", all=TRUE) d4$haart.not.art<-d4$haart ###### Added 2013-11-7 d4$haart<-NULL d4<-d4[!is.na(d4$vis.date),] ###### Added 2013-11-14 ### This removes patients who never had a CD4 count (or an ADE) d4$haart<-with(d4,ifelse(is.na(init.date)|vis.date0) { year.haart[i]<-cl$year_of_first_haart[cl$cfar_pid==id[i]&cl$indicator_first_haart_0_1==1] age.haart1[i]<-cl$age_at_first_haart[cl$cfar_pid==id[i]&cl$indicator_first_haart_0_1==1] } } #### Warning messages are OK; I remove these 7 patients later. 
##### Vanderbilt demographics: one row per selected record of cl.
demo <- data.frame(
  id = cl$cfar_pid[select],
  male = cl$sex[select] - 1,
  black = cl$black[select],
  white = cl$white[select],
  deceased = cl$deceased[select],
  age.death = cl$age_at_death[select],
  idu = cl$ivdu[select],
  route = cl$probable_route_of_infection[select],
  ltfu = cl$lost_to_fu[select],
  age.first.visit = cl$age_at_first_visit[select],
  age.last,
  age.haart1
)

## Lab values
choose <- !is.na(cl$age_at_lab)
cd4 <- cl$cd4_count[choose]
age.lab <- cl$age_at_lab[choose]
#vl<-cl$vl[choose]
id <- cl$cfar_pid[choose]
cd4.lab <- ifelse(!is.na(cd4), 1, 0)
#vl.lab<-ifelse(!is.na(vl),1,0)

has.cd4 <- cd4.lab == 1
labs <- data.frame(id = id[has.cd4], age.lab = age.lab[has.cd4], cd4 = cd4[has.cd4])

d1 <- merge(demo, labs, by = "id", all.y = TRUE) #### Removing those with no CD4 count (n=7)
d2 <- merge(d1, trt, by = "id", all.x = TRUE) #### Removing those with no Demographic info (n=0)
not.naive <- d2$FIRST_HAART == 0 & d2$REGIMEN_NBR == 1
d3 <- d2[!not.naive, ] #### Removing those who were not HAART naive (n=164)
d3 <- d3[!duplicated(d3), ] #### Removing duplicate records (3 records)
d3$REGIMEN_NBR <- NULL
d3$FIRST_HAART <- NULL

##### Lab rows: one record per CD4 measurement, ordered by patient and age.
d3 <- d3[order(d3$id, d3$age.lab), ]
d3$age <- d3$age.lab
d3$fhaart <- 0
d3$death <- 0
d3$oi <- 0

##### One template row per patient with the lab fields blanked; it is reused
##### below for the death, first-HAART and OI event rows.
d4 <- d3[!duplicated(d3$id), ]
d4$age <- NA
d4$age.lab <- NA
d4$cd4 <- NA

d3.death <- d4[d4$deceased == 1, ]
d3.death$age <- d3.death$age.death
d3.death$death <- 1

d3.haart <- d4[!is.na(d4$age.haart), ]
d3.haart$age <- d3.haart$age.haart
d3.haart$fhaart <- 1

###### OIs
d.oi <- oi[oi$INDICATOR_OI_0_1 == 1, ]
d.oi$id <- d.oi$CFAR_PID
d.oi$age.ade <- d.oi$AGE_AT_OI
d.oi$ade <- d.oi$INDICATOR_OI_0_1
#d.oi$year.first.visit<-d.oi$YEAR_OF_FIRST_VISIT
d.oi$YEAR_OF_FIRST_VISIT <- NULL
d.oi$INDICATOR_OI_0_1 <- NULL
d.oi$AGE_AT_OI <- NULL
d.oi$CFAR_PID <- NULL
d.oi$OI_4 <- NULL
d.oi$OI_3 <- NULL
d.oi$OI_2 <- NULL
d.oi$OI_1 <- NULL

d3.oi <- merge(d4, d.oi, by = "id") ### Only including OIs among those who met inclusion criteria
d3.oi$cd4 <- NA
d3.oi$age.lab <- NA
d3.oi$age <- d3.oi$age.ade
d3.oi$age.ade <- NULL
d3.oi$oi <- d3.oi$ade
d3.oi$ade <- NULL

##### Stack lab, OI, first-HAART and death rows into one long table.
d5 <- rbind(d3, d3.oi, d3.haart, d3.death)
d5 <- d5[order(d5$id, d5$age), ]
d5$age.haart1 <- NULL

##### Attach the year of first visit (first oi row per patient).
oi1 <- oi[!duplicated(oi$CFAR_PID), ]
d.year <- data.frame(id = oi1$CFAR_PID, year.first.visit = oi1$YEAR_OF_FIRST_VISIT)
d5 <- merge(d5, d.year, by = "id", all.x = TRUE)

which(duplicated(cbind(d5$id, d5$age))) #### Could cause problems at some point, but I'm assuming for now that OI and CD4 on same day came before FHAART

save(d5, file = "cnics-hernan-stuff5-vandy.Rda")

##################################################

rm(list = ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

load("cnics-hernan-stuff5-vandy.Rda")
load("cnics-hernan-stuff5a.Rda")

##### Vanderbilt rows laid out column-for-column so they can be stacked under
##### the d2 table loaded from cnics-hernan-stuff5a.Rda.
d6 <- data.frame(
  pid = d5$id,
  site = "VU",
  age = d5$age,
  age.rx.start = d5$age.haart,
  fhaart = d5$fhaart,
  haart = 0,
  non.haart.art = 0,
  haart_before_first_visit = 0,
  black = d5$black,
  white = d5$white,
  deceased = d5$deceased,
  age_at_death = d5$age.death,
  idu = d5$idu,
  oi = d5$oi,
  age.oi = NA,
  cd4 = d5$cd4,
  male = d5$male,
  first.age = d5$age.first.visit,
  first.year = d5$year.first.visit,
  last.age = d5$age.last,
  ltfu = d5$ltfu,
  age.approx = "d",
  age.fhaart = d5$age.haart
)
d6$age.oi <- ifelse(d6$oi == 1, d6$age, d6$age.oi)
##### On HAART from the age of first HAART onward; never when that age is missing.
d6$haart <- with(d6, ifelse(is.na(age.fhaart), 0,
                     ifelse(age >= age.fhaart, 1, 0)))

d2.new <- rbind(d2, d6)
d2 <- d2.new

###### Perhaps there are some rounding errors.
d2$age <- round(d2$age, 3)
d2$age.rx.start <- round(d2$age.rx.start, 3)
d2$age_at_death <- round(d2$age_at_death, 3)
d2$first.age <- round(d2$first.age, 3)
d2$last.age <- round(d2$last.age, 3)
d2$age.fhaart <- round(d2$age.fhaart, 3)

##### This patient has an error in date of death (1976, but started ART much
##### later); They didn't have a CD4 before ART initiation anyway, so would
##### have been excluded, but we just exclude here.
drop.pid <- d2$pid == "1407150328"
d2 <- d2[!drop.pid, ]

save(d2, file = "cnics-hernan-stuff5c.Rda")

###############################################################################################################################
###############################################################################################################################
#### I use everything above here in my final analyses, but everything below here is run in a separate file and is now removed.
###############################################################################################################################