#####  This file sets up the CNICS and Vanderbilt datasets in preparation for dynamic marginal structural model analyses.


#####  Reading in the raw CNICS extracts.
#####  NOTE: this clears the workspace and switches to the analysis directory.
rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

#####  One data frame per CSV extract; the object name is the name of the vector element.
csv.tables<-c(rf="RiskFactor2.csv",
              mort="Mortality2.csv",
              med="Medication2.csv",
              lab="Lab2.csv",
              dia="Diagnosis2.csv",
              dems="Demographic2.csv")
for (tbl in names(csv.tables)) {
  assign(tbl,read.csv(csv.tables[[tbl]]))
}

#####  Quick look at the demographics table: size and number of distinct patients.
dim(dems)
length(unique(dems$StudyId))

#####  Quick look at the medication table, then drop the ADEFOVIR rows.
head(med)
table(med$MedicationName)
dim(med)
not.adefovir<-med$MedicationName!="ADEFOVIR"
med<-med[not.adefovir,]
dim(med)
length(unique(med$StudyId))

#####  Recode every medication name into a comma-separated list of component drug abbreviations.
#####  Co-formulated products expand to their components; names that are not in the table are
#####  passed through unchanged.
#####  Fixed 2 recodes: EPZICOM is abacavir+lamivudine ("3TC,ABC"; it was coded "3TC,AZT", which
#####  is Combivir) and TRUVADA is emtricitabine+tenofovir ("FTC,TDF"; it was coded "FTC,3TC").
#####  NOTE(review): ENFUVIRTIDE is coded "INN", which is not a usual abbreviation for it -- confirm.
art<-med$MedicationName
art.codes<-c(
  "ABACAVIR"="ABC",
  "ADEFOVIR"="hep B drug",
  "AMPRENAVIR"="APV",
  "ATAZANAVIR"="ATZ",
  "ATRIPLA"="EFV,FTC,TDF",
  "COMBIVIR"="3TC,AZT",
  "COMPLERA"="FTC,RPV,TDF",
  "DARUNAVIR"="DRV",
  "DELAVIRDINE"="DLV",
  "DIDANOSINE"="DDI",
  "EFAVIRENZ"="EFV",
  "EMTRICITABINE"="FTC",
  "ENFUVIRTIDE"="INN",
  "EPZICOM"="3TC,ABC",
  "ETRAVIRINE"="ETR",
  "FOSAMPRENAVIR"="FPV",
  "Hx ARV Treatment, Med unknown"="unknown ARV",
  "Hx PI Treatment, Med unknown"="unknown PI",
  "INDINAVIR"="IDV",
  "KALETRA"="LPV,RTV",
  "LAMIVUDINE"="3TC",
  "LOPINAVIR"="LPV",
  "MARAVIROC"="MRV",
  "NELFINAVIR"="NFV",
  "NEVIRAPINE"="NVP",
  "RALTEGRAVIR"="RAL",
  "RILPIVIRINE"="RPV",
  "RITONAVIR"="RTV",
  "RITONAVIR BOOSTED"="RTV",
  "SAQUINAVIR HARD CAP"="SQV",
  "SAQUINAVIR SOFT GEL"="SQV",
  "SAQUINAVIR UNSPECIFIED"="SQV",
  "STAVUDINE"="D4T",
  "STRIBILD"="EVG,FTC,TDF",
  "TENOFOVIR"="TDF",
  "TIPRANAVIR"="TPV",
  "TRIZIVIR"="3TC,ABC,AZT",
  "TRUVADA"="FTC,TDF",
  "ZALCITABINE"="DDC",
  "ZIDOVUDINE"="AZT"
)
art.key<-as.character(art)
art1<-unname(art.codes[art.key])
art1<-ifelse(is.na(art1),art.key,art1)      ## unknown names fall through unchanged


####  Many people only have data up to the month-level in accuracy, so unknown parts of a date are imputed:
####  day "00" becomes the 15th and month "00" becomes July.

start.date1<-med$StartDate

stuff<-matrix(unlist(strsplit(as.character(start.date1),'-')),ncol=3,byrow=TRUE)
start.year<-stuff[,1]
start.month<-stuff[,2]
start.day<-stuff[,3]

start.day1<-ifelse(start.day=="00","15",start.day)
start.month1<-ifelse(start.month=="00","07",start.month)
#######  Approximation flag kept next to the date
#######  (y=known up to year, m=known up to month, d=known up to day/no approximation)
start.date.approx<-ifelse(start.month=="00","y",
                          ifelse(start.day=="00","m","d"))

id<-med$StudyId
unique.id<-unique(id)
start.date<-as.Date(paste(start.year,start.month1,start.day1,sep="-"))

####### Finding date of first ARV regimen: the first medication row carrying the patient's earliest start date.
locate1<-vapply(seq_along(unique.id),function(i) {
  own<-id==unique.id[i]
  which(own&start.date==min(start.date[own],na.rm=TRUE))[1]
},integer(1))
init.date<-as.Date(start.date[locate1])
init.date.approx<-start.date.approx[locate1]

####  I should exclude those with init.date.approx=="y" because I only know start date up to the year.  This is done later.

####   Finding initial regimens: every drug code recorded with the patient's first start date,
####   de-duplicated and sorted so that the same combination always gives the same string.
reg<-vapply(seq_along(unique.id),function(i) {
  paste(art1[id==unique.id[i]&start.date==init.date[i]],collapse=",")
},character(1))
init.reg<-vapply(strsplit(reg,','),function(parts) paste(sort(unique(parts)),collapse=","),character(1))

####  Checking to see if they are HAART.  This function excludes those that are obviously not HAART.
####  Argument: init.reg, a vector of regimen strings (sorted, comma-separated drug codes).
####  Value:    0 for a regimen in the not-HAART list (including the empty string), 1 for any
####            other regimen, NA where init.reg is NA.
####  "FTC,TDF" (the components of Truvada, two NRTIs) is in the list alongside the other
####  two-NRTI pairs, so that Truvada on its own is not counted as HAART.
reg.haart<-function(init.reg) {
  not.haart<-c("3TC","3TC,ABC","3TC,AZT","3TC,D4T","3TC,EFV","3TC,FTC","3TC,TDF",
               "ATZ","ATZ,RTV","AZT,IDV","D4T,DDI","D4T,FTC","DDI,EFV","DDI,IDV",
               "DDI,TDF","DLV,EFV","DLV,NFV","DRV","DRV,RTV","EFV","EFV,FTC",
               "EFV,TDF","FPV,RTV","FTC","FTC,TDF","IDV","LPV,RTV","MRV","NVP",
               "RAL","RTV","unknown ARV","3TC,DDI","3TC,NVP","3TC,NFV","3TC,RAL",
               "ABC","ATZ,EFV","AZT","D4T,EFV","D4T,NFV","DDI","ETR","ETR,RAL",
               "NFV","RTV,SQV","TDF","D4T,NVP","D4T","ABC,NFV","")
  ## %in% never returns NA, so missing regimens are mapped to NA explicitly
  haart<-ifelse(is.na(init.reg),NA,
                ifelse(init.reg %in% not.haart,0,1))
  haart
}

haart1<-reg.haart(init.reg)


#####  Some people who started a non-HAART regimen may have started a HAART regimen shortly thereafter.  For example,
#####  sometimes people are put on a single drug at first and then receive the full regimen after one week.  The
#####  following code is capturing these people, using a <32 day cutoff.  I end up including roughly 20 more patients.
#####  locate2 holds, for patients whose first regimen is not HAART and who have more than one distinct
#####  start date, the first medication row carrying their earliest start date other than init.date.
locate2<-rep(NA,length(unique.id))
for (i in seq_along(unique.id)) {
  own<-id==unique.id[i]
  if (!(haart1[i]==0 & length(unique(start.date[own]))>1)) {
    next
  }
  later<-own&start.date!=init.date[i]
  next.start<-min(start.date[later],na.rm=TRUE)
  locate2[i]<-which(own&start.date==next.start)[1]
}
second.date<-as.Date(start.date[locate2])
second.date.approx<-start.date.approx[locate2]
#####  Days between the first and the second start date (NA when there is no second date).
diff.dates<-as.numeric(second.date-init.date)

#####  Stop dates get the same imputation as start dates (day "00" -> 15, month "00" -> 07);
#####  in addition an unknown year ("0000") is replaced by 3000.
stop.date1<-med$EndDate

ymd<-matrix(unlist(strsplit(as.character(stop.date1),'-')),ncol=3,byrow=TRUE)
stop.year<-ymd[,1]
stop.month<-ymd[,2]
stop.day<-ymd[,3]

stop.year1<-ifelse(stop.year=="0000","3000",stop.year)
stop.month1<-ifelse(stop.month=="00","07",stop.month)
stop.day1<-ifelse(stop.day=="00","15",stop.day)
#####  n=unknown date, y=known up to year, m=known up to month, d=known up to day
stop.date.approx<-ifelse(stop.year=="0000","n",
                         ifelse(stop.month=="00","y",
                                ifelse(stop.day=="00","m","d")))

#####  stop.date is kept as a character vector ("YYYY-MM-DD"); it is only converted where needed.
stop.date<-paste(stop.year1,stop.month1,stop.day1,sep="-")

#####  Stop date of the medication row that defined the first regimen.
init.stop.date<-as.Date(stop.date[locate1])
init.stop.date.approx<-stop.date.approx[locate1]

#####  Second regimen: all drug codes recorded with the second start date (only where one exists).
reg2<-rep(NA,length(unique.id))
for (i in seq_along(unique.id)) {
  if (is.na(second.date[i])) {
    next
  }
  on.second<-id==unique.id[i]&start.date==second.date[i]
  reg2[i]<-paste(art1[on.second],collapse=",")
}
#####  Same normalisation as for the initial regimen: unique drug codes, sorted, comma-separated.
second.reg<-sapply(strsplit(as.character(reg2),','),function(parts) paste(sort(unique(parts)),collapse=","))

#####  Second regimens started within 32 days of the first (start date known to at least the month):
#####  all of them, then split by whether the first regimen stopped after or before the second began.
soon<-diff.dates<32&second.date.approx!="y"
table(second.reg[soon])
table(second.reg[soon&init.stop.date>second.date])
table(second.reg[soon&init.stop.date<second.date])


#####  junk flags patients whose second regimen began within 32 days of the first, while the first
#####  regimen was still running, with both the second start and the first stop known beyond the year.
overlapping<-!is.na(second.reg)&!is.na(diff.dates)&diff.dates<32&second.date.approx!="y"&
  init.stop.date>second.date&init.stop.date.approx!="y"
junk<-ifelse(overlapping,1,0)

#####  For flagged patients, pool the first and second regimen into one drug list.
junk1<-rep(NA,length(unique.id))
for (i in seq_along(unique.id)) {
  if (junk[i]!=1) {
    next
  }
  junk1[i]<-paste(init.reg[i],second.reg[i],sep=",")
}
reg1and2<-sapply(strsplit(as.character(junk1),','),function(parts) paste(sort(unique(parts)),collapse=","))


#####  A non-HAART first regimen is upgraded to HAART when the pooled regimen qualifies.
haart<-ifelse(haart1==0&reg.haart(reg1and2)==1,1,haart1)


##### Persons to include are those whose date of initiation is known at least up to the month.  Earlier versions
##### also required a HAART start (excluding those who start a non-HAART regimen).  This is consistent with what
##### Kitahata et al. did, but perhaps not correct -- they should probably be censored.  In the latest version
##### (2013/11/7), I include those who start a non-HAART regimen.  I will censor these patients.  Those with an
##### unknown date of ART start will be excluded.

# include<-ifelse(haart==1&init.date.approx!="y",1,0)   ### old way
include<-ifelse(init.date.approx=="y",0,1)


#### For purposes of computing loss to follow-up, we'll also include the last visit date from this table.  WORRY ABOUT THIS LATER.
#### locate3: first medication row carrying the patient's latest start date.
#### locate4: first medication row carrying the patient's latest KNOWN stop date.
#### Fixed: unknown stop dates are now excluded through stop.date.approx!="n".  The old filter
#### (stop.date!="n") compared the date string itself with "n", was always TRUE, and so let the
#### year-3000 placeholder dates win the max.  Patients with no known stop date get NA.
locate3<-locate4<-rep(NA_integer_,length(unique.id))
for (i in seq_along(unique.id)) {
  own<-id==unique.id[i]
  locate3[i]<-which(own&start.date==max(start.date[own],na.rm=TRUE))[1]
  ## stop.date is character in YYYY-MM-DD form, so max() on the strings gives the latest date
  known.stop<-stop.date[which(own&stop.date.approx!="n")]
  if (length(known.stop)>0) {
    locate4[i]<-which(own&stop.date==max(known.stop))[1]
  }
}
max.start.date<-as.Date(start.date[locate3])
max.start.date.approx<-start.date.approx[locate3]
max.stop.date<-as.Date(stop.date[locate4])
max.stop.date.approx<-stop.date.approx[locate4]


#### Creating the antiretroviral table: one row per included patient, with the date of the
#### first regimen, how precisely that date is known, and the HAART flag.
keep<-include==1
d<-data.frame(id=unique.id[keep],
              init.date=init.date[keep],
              init.date.approx=init.date.approx[keep],
              haart=haart[keep])


#### Preparing the demographics table for merging.
####  First I'm going to simplify the dems table, keeping only what I'm interested in.

#### Race indicators; the raw Race and Hispanic columns are then dropped.
dems[["black"]]<-ifelse(dems[["Race"]]=="Black",1,0)
dems[["white"]]<-ifelse(dems[["Race"]]=="White",1,0)
dems$Hispanic<-NULL
dems$Race<-NULL

#### male is 0 for PresentSex "Female" and 1 for every other recorded value.
dems[["male"]]<-ifelse(dems[["PresentSex"]]=="Female",0,1)
dems$Transgendered<-NULL
dems$BirthSex<-NULL
dems$PresentSex<-NULL

##### Creating an artificial date of birth as BirthYear-07-15.
dob<-paste0(dems$BirthYear,"-07-15")
dems$dob<-as.Date(dob)
dems$BirthYear<-NULL

#### Risk factors; I'm really only interested in injection drug use (IDU).

head(rf)
length(unique(rf$StudyId)) ### A single ID can have more than one row in this dataset

#### Row-level flag: 1 when the recorded risk involves injection drug use.
rf$idu<-with(rf,ifelse(Risk=="Injection drug use"|Risk=="Men who have sex with men and are an injection drug user",1,0))

#### Patient-level flag: 1 if ANY of the patient's rows is an IDU row, otherwise 0.
#### Fixed: the flags used to be summed per patient, which gave values above 1 for patients with
#### several IDU rows and turned the whole patient into 0 as soon as one row had a missing Risk.
rf.id<-unique(rf$StudyId)
idu<-ifelse(rf.id %in% rf$StudyId[rf$idu %in% 1],1,0)


drf<-data.frame(id=rf.id,idu)

#### Left join on the demographics table: patients without a risk-factor row keep idu=NA.
dems1<-merge(dems,drf,by.x="StudyId",by.y="id",all.x=TRUE)
dems1$id<-dems1$StudyId
dems1$StudyId<-NULL


#####  CD4 counts: absolute CD4 results for patients present in the demographics table.

lab.cd4<-lab[lab$TestName=="CD4 cell absolute" & lab$StudyId %in% dems1$id,]

#####  ResultDate is split on the blank into two pieces; only the first (date) piece is used.
stamp<-matrix(unlist(strsplit(as.character(lab.cd4$ResultDate),' ')),ncol=2,byrow=TRUE)
cd4.d<-stamp[,1]
cd4.date1<-as.Date(cd4.d)

ymd<-matrix(unlist(strsplit(as.character(cd4.d),'-')),ncol=3,byrow=TRUE)
cd4.year<-ymd[,1]
cd4.month<-ymd[,2]
cd4.day<-ymd[,3]

#####  Same imputation as for the medication dates: day "00" -> 15, month "00" -> 07,
#####  with y/m/d recording how precisely the date is known.
cd4.date.approx<-ifelse(cd4.month=="00","y",
                        ifelse(cd4.day=="00","m","d"))
cd4.month1<-ifelse(cd4.month=="00","07",cd4.month)
cd4.day1<-ifelse(cd4.day=="00","15",cd4.day)

cd4.date<-as.Date(paste(cd4.year,cd4.month1,cd4.day1,sep="-"))


head(lab.cd4)
table(lab.cd4$Units)   ## all units are the same
table(lab.cd4$Interpretation)  ### I'm ignoring Interpretation, DataSource, and Historical for now.
table(lab.cd4$DataSource)
table(lab.cd4$Historical)


### The following CD4 measurements are not exact (they do not convert to a number).
unique(lab.cd4$Result[which(is.na(as.numeric(as.character(lab.cd4$Result))))])

### We'll replace the inexact CD4 measurements with the values below; every other result is
### converted as it stands.
cd4<-lab.cd4$Result

cd4.fix<-c("330 CELLS"="330",
           "150 (?)"="150",
           "~500"="500",
           "<200"="190",
           "<10"="5",
           "<1"="0",
           "<150"="140",
           "<20"="10",
           "<500"="490",
           ">500"="510",
           ">700"="710",
           "~550"="550",
           "~71"="71")
cd4.key<-as.character(cd4)
cd4.new<-unname(cd4.fix[cd4.key])
cd4.new<-ifelse(is.na(cd4.new),cd4.key,cd4.new)
cd4.new<-as.numeric(cd4.new)

d.cd4<-data.frame(id=lab.cd4$StudyId,cd4.date,cd4.date.approx,cd4=cd4.new)

##### Saving the ART, demographics and CD4 tables so I don't have to re-run all of the above each time.

save(d,dems1,d.cd4,file="cnics-hernan-dems-art-cd4.Rda")

##############################################################################
#####  This file is different from the similarly named file for the Cole analysis.

#######  Starting with a clean slate: only the saved ART/demographics/CD4 tables are reloaded.
rm(list=ls())
load("cnics-hernan-dems-art-cd4.Rda")


#####  Now let's look at the clinical endpoint data

mort<-read.csv("Mortality2.csv")

##### Date of death is only known up to the month; therefore I will set all death dates as on the 15th.
##### (A month of "00" is likewise replaced by July.)

death.date1<-mort$DeathDate

ymd<-matrix(unlist(strsplit(as.character(death.date1),'-')),ncol=3,byrow=TRUE)
death.year<-ymd[,1]
death.month<-ymd[,2]
death.day<-ymd[,3]


death.date.approx<-ifelse(death.month=="00","y",
                          ifelse(death.day=="00","m","d"))
death.month1<-ifelse(death.month=="00","07",death.month)
death.day1<-ifelse(death.day=="00","15",death.day)

##### death.date stays a "YYYY-MM-DD" string here.
death.date<-paste(death.year,death.month1,death.day1,sep="-")

##### One row per mortality record; death is 1 on every row.
mort1<-data.frame(id=mort$StudyId,death=1,death.date,death.date.approx)


##### Now clinical diagnoses (each row is recorded below as an ADE with ade=1)

dia<-read.csv("Diagnosis2.csv")


clin.date1<-dia$DiagnosisDate

ymd<-matrix(unlist(strsplit(as.character(clin.date1),'-')),ncol=3,byrow=TRUE)
clin.year<-ymd[,1]
clin.month<-ymd[,2]
clin.day<-ymd[,3]

##### n=unknown date, y=known up to year, m=known up to month, d=known up to day
clin.date.approx<-ifelse(clin.year=="0000","n",
                         ifelse(clin.month=="00","y",
                                ifelse(clin.day=="00","m","d")))
##### NOTE(review): clin.year1 is computed but not used -- the date below is built from the raw
##### clin.year, so an unknown year stays "0000" (and gives "0000-07-15" when month and day are
##### unknown too).  Confirm this is intended before switching to clin.year1.
clin.year1<-ifelse(clin.year=="0000","3000",clin.year)
clin.month1<-ifelse(clin.month=="00","07",clin.month)
clin.day1<-ifelse(clin.day=="00","15",clin.day)

clin.date<-paste(clin.year,clin.month1,clin.day1,sep="-")

clin<-data.frame(id=dia$StudyId,ade=1,ade.date=clin.date,ade.date.approx=clin.date.approx)


#########  Now I'm ready to merge everything
#########  First, mortality and diagnosis records are restricted to patients present in the demographics table (dems1).

mort1<-mort1[mort1$id  %in% dems1$id,]
clin1<-clin[clin$id %in% dems1$id,]

clin1<-clin1[!duplicated(clin1),]   ## Only counting one ADE per day per person
d.cd4<-d.cd4[!duplicated(d.cd4),]   ## Removing duplicate CD4 measurements

#### Patient-level table: ART table + mortality (full outer join), then joined onto demographics
#### keeping every demographics row (all.y=TRUE).
d1<-merge(d,mort1,by="id",all=TRUE)
d2<-merge(d1,dems1,by="id",all.y=TRUE)
#### Patients without a mortality row get death=0; patients without an ART row get haart=0.
d2$death <- ifelse(is.na(d2$death), 0, d2$death)
d2$haart <- ifelse(is.na(d2$haart), 0, d2$haart)
#### now merge time-dependent variables (cd4 and ade)
#### Full outer join on (id, date): a CD4 result and an ADE with the same id and date share one row.
d3 <- merge(d.cd4, clin1, by.x=c("id", "cd4.date"), by.y=c("id", "ade.date"), all=TRUE)
d3$vis.date <- d3$cd4.date
#### Precision of the visit date: whichever flag is present; if both are present, "m" when
#### either one is "m", otherwise the CD4 flag.
d3$vis.date.approx<-with(d3,as.character(ifelse(is.na(cd4.date.approx),as.character(ade.date.approx),
                                                ifelse(is.na(ade.date.approx),as.character(cd4.date.approx),
                                                       ifelse(ade.date.approx=="m"|cd4.date.approx=="m","m",as.character(cd4.date.approx))))))
d3$cd4.date <- NULL

#### now merge all together
d4 <- merge(d2, d3, by="id", all=TRUE)


#### The regimen-level HAART flag is kept as haart.not.art; haart is then rebuilt below as a
#### visit-level indicator.
d4$haart.not.art<-d4$haart       ######  Added 2013-11-7
d4$haart<-NULL

d4<-d4[!is.na(d4$vis.date),]     ######  Added 2013-11-14         ###  This removes patients who never had a CD4 count (or an ADE)
#### haart is 0 when there is no ART start date or the visit precedes it, and 1 otherwise.
d4$haart<-with(d4,ifelse(is.na(init.date)|vis.date<init.date,0,1))


########  This appears to have been never used in the most recent analysis (2014-03-05) and it takes a while, so I'm commenting it out.
# locate.ade<-which(d4$ade==1)                          
# id.ade<-d4$id[locate.ade]
# d4$cum.ade<-rep(0,length(d4$ade))
# for (i in 1:length(id.ade)){
#   cum.ade<-cumsum(d4$ade[d4$id==id.ade[i]])
#   d4$cum.ade[d4$id==id.ade[i]]<-cum.ade
# }


#### Saving the merged table for the next stage.
save(d4, file="cnics-hernan-stuff4.Rda")


####################################################

rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")
load("cnics-hernan-stuff4.Rda")
d<-d4
d<-d[d$vis.date!="1900-01-01"&d$vis.date!="0000-07-15",]  #Should have excluded CD4 with missing dates earlier in code, but am doing here (6/25/2013)
rm(d4)


#######  New stuff to get my dataset looking like Vanderbilt dataset.

head(d)

d1<-d
d4<-d[!duplicated(d$id),]      ## one row per patient, used as the template for the extra rows

#######  One extra row per deceased patient, dated at the death date, with no CD4 or ADE information.
d3.death<-d4[d4$death==1,]
d3.death$vis.date<-d3.death$death.date
#######  Fixed: the precision flag of a death row now comes from the death date.  It used to be
#######  left at whatever the patient's first row carried, unlike the ART-start rows added
#######  further down, which take init.date.approx.
d3.death$vis.date.approx<-as.character(d3.death$death.date.approx)
d3.death$cd4.date.approx<-d3.death$cd4<-d3.death$ade<-d3.death$ade.date.approx<-NA
d3.death$haart<-with(d3.death,ifelse(is.na(init.date),0,1))

d<-rbind(d1,d3.death)
d<-d[order(d$id,d$vis.date),]


#######  Adding one row dated at ART start (init.date) for every patient who has a start date
#######  but no existing row on exactly that date.  The new row copies the patient's first row,
#######  carries no CD4 or ADE information and has haart=1.
#######  This used to be a loop that rbind()-ed one patient at a time (quadratic in the number of
#######  patients); it is now done in one pass and builds the same rows in the same order.
first.rows<-d[!duplicated(d$id),]
ids.with.start.row<-unique(d$id[!is.na(d$init.date)&d$init.date==d$vis.date])
add.row<-first.rows[!is.na(first.rows$init.date)&!(first.rows$id %in% ids.with.start.row),]
if (nrow(add.row)>0) {
  add.row$vis.date<-add.row$init.date
  add.row$vis.date.approx<-add.row$init.date.approx
  add.row$cd4.date.approx<-add.row$cd4<-add.row$ade<-add.row$ade.date.approx<-NA
  add.row$haart<-1
}

d<-rbind(d,add.row)
d<-d[order(d$id,d$vis.date),]
d$ade<-ifelse(is.na(d$ade),0,d$ade)      ## rows without an ADE record count as ade=0


#######  first.vis: date of each patient's first row (d is sorted by id and visit date at this point).
d$first.vis<-NULL
first.vis.df<-d[!duplicated(d$id),c("id","vis.date")]
names(first.vis.df)[names(first.vis.df)=="vis.date"]<-"first.vis"
d<-merge(d,first.vis.df,by="id",all.x=TRUE)
d<-d[order(d$id,d$vis.date),]

#######  last.visit: latest visit date of each patient.
unique.id<-unique(d$id)
last.visit<-vapply(seq_along(unique.id),function(i) {
  as.character(max(d$vis.date[d$id==unique.id[i]]))
},character(1))
last.vis.df<-data.frame(id=unique.id,last.visit=as.Date(last.visit))
d<-merge(d,last.vis.df,by="id",all.x=TRUE)
d<-d[order(d$id,d$vis.date),]


#####  Adding for LTFU

table(d$Site)

#####  Closing date of each site = latest last.visit seen at that site, held as a plain number of days.
site.names<-c("CWRU","FENWAY","JH","UAB","UCSD","UCSF","UNC","UW")
site.close<-vapply(site.names,function(s) {
  as.numeric(max(d$last.visit[d$Site==s]))
},numeric(1))

#####  Row-level closing date; NA for any site other than the eight listed.
close.date<-unname(site.close[as.character(d$Site)])

#####  ltfu=1 for patients with no death date whose last visit is more than 365.25 days before
#####  their site's closing date.
d$ltfu<-with(d,ifelse(is.na(death.date)&last.visit<close.date-365.25,1,0))

save(d, file="cnics-hernan-stuff5.Rda")

#####  Building the CNICS analysis data frame (ages in years instead of dates).

rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")
load("cnics-hernan-stuff5.Rda")

#####  Calendar year of the first visit.
stuff<-matrix(unlist(strsplit(as.character(d$first.vis),"-")),ncol=3,byrow=TRUE)
year.first.visit<-stuff[,1]


#####  non.haart.art is 1 only for patients who started ART with a regimen not classified as HAART.
#####  Fixed: patients with no ART start date are now 0.  haart.not.art was already recoded from
#####  NA to 0 for them before this file was saved, so the is.na(haart.not.art) guard alone no
#####  longer caught them and they were coded 1.
non.haart.art<-with(d,ifelse(is.na(init.date)|is.na(haart.not.art),0,
                             ifelse(haart.not.art==1,0,1)))

#####  Ages are (date - date of birth)/365.25, rounded to 4 decimals.
#####  Fixed: last.age now reads d$last.visit by its full name (it used to rely on partial
#####  matching of d$last.vis).
d2<-data.frame(pid=d$id,
               site=d$Site,
               age=round(as.numeric((d$vis.date-d$dob)/365.25),4),
               age.rx.start=round(as.numeric((d$init.date-d$dob)/365.25),4),
               fhaart=ifelse(d$init.date==d$vis.date,1,0),
               haart=d$haart,
               non.haart.art,
               haart_before_first_visit=0,
               black=d$black,
               white=d$white,
               deceased=d$death,
               age_at_death=round(as.numeric((as.Date(d$death.date)-d$dob)/365.25),4),
               idu=d$idu,
               oi=d$ade,
               age.oi=NA,
               cd4=d$cd4,
               male=d$male,
               first.age=round(as.numeric((d$first.vis-d$dob)/365.25),4),
               first.year=as.numeric(year.first.visit),
               last.age=round(as.numeric((d$last.visit-d$dob)/365.25),4),
               ltfu=d$ltfu,
               age.approx=d$vis.date.approx
               )

d2$age.fhaart<-d2$age.rx.start
#d2$age.rx.start<-with(d2, ifelse(haart==0,NA, age.rx.start))
d2$age.oi<-with(d2, ifelse(oi==1,age,NA))      ## age at the row's ADE, NA on rows without one


head(d2)

d2[d2$pid==154952391,"age.approx"][2]<-"m"    #### Hard coding, see below.
d2<-d2[!duplicated(d2),]         #### There is one duplicate (id 154952391) who has CD4 written twice in the record, once with exact date known on the 15th of the month and other with unknown month.  I'll assume that it's the former case.
                                  

save(d2, file="cnics-hernan-stuff5a.Rda")


############################  Reading in Vandy data.


rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

library(foreign)

vandy.dir<-"../old-vandy-analysis/"
cl<-read.dta(paste0(vandy.dir,"when_to_start_19nov2009.dta"))
nade<-read.csv(paste0(vandy.dir,"w2s_non_ades_dx_18nov2009.csv"))
tx<-read.csv(paste0(vandy.dir,"when_to_start_art_18nov2009.csv"))
oi<-read.csv(paste0(vandy.dir,"when_to_start_year1_oi_18nov2009.csv"))

## Treatment data: keep, for each patient, the row with the smallest age at regimen start.
tx<-tx[order(tx$CFAR_PID,tx$AGE_AT_RX_START),]
first.rx<-!duplicated(tx$CFAR_PID)
#select<-tx$FIRST_HAART==1&tx$REGIMEN_NBR==1
trt<-data.frame(id=tx$CFAR_PID[first.rx],
                age.haart=tx$AGE_AT_RX_START[first.rx],
                FIRST_HAART=tx$FIRST_HAART[first.rx],
                REGIMEN_NBR=tx$REGIMEN_NBR[first.rx])
select<-NULL

## Baseline demographics and death/ltfu: one row per patient (the patient's first row in cl).
select<-!duplicated(cl$cfar_pid)
id<-cl$cfar_pid[select]
## Fixed: the per-patient vectors are created at full length.  age.haart1 used to grow from NULL
## and ended up shorter than id whenever the last patient had no first-HAART row, which makes
## data.frame() below fail.
age.last<-year.haart<-age.haart1<-rep(NA,length(id))
for (i in seq_along(id)){
  own<-cl$cfar_pid==id[i]
  ## age at the last row with a CD4 count (-Inf, with a warning, when the patient has none)
  age.last[i]<-max(cl$age[own&!is.na(cl$cd4_count)])
  ## rows flagged as first HAART; which() drops NA flags, and only the first row is used
  first.haart<-which(own&cl$indicator_first_haart_0_1==1)
  if (length(first.haart)>0) {
    year.haart[i]<-cl$year_of_first_haart[first.haart[1]]
    age.haart1[i]<-cl$age_at_first_haart[first.haart[1]]
  }
}       ####  Warning messages are OK; I remove these 7 patients later.
demo<-data.frame(id=cl$cfar_pid[select],
                 male=cl$sex[select]-1,
                 black=cl$black[select],
                 white=cl$white[select],
                 deceased=cl$deceased[select],
                 age.death=cl$age_at_death[select],
                 idu=cl$ivdu[select],
                 route=cl$probable_route_of_infection[select],
                 ltfu=cl$lost_to_fu[select],
                 age.first.visit=cl$age_at_first_visit[select],
                 age.last,
                 age.haart1)

## Lab values: rows of cl that have a lab age and a CD4 count.
choose<-!is.na(cl$age_at_lab)
cd4<-cl$cd4_count[choose]
age.lab<-cl$age_at_lab[choose]
#vl<-cl$vl[choose]
id<-cl$cfar_pid[choose]
cd4.lab<-ifelse(!is.na(cd4),1,0)
#vl.lab<-ifelse(!is.na(vl),1,0)
labs<-data.frame(id=id[cd4.lab==1],age.lab=age.lab[cd4.lab==1],cd4=cd4[cd4.lab==1])

d1<-merge(demo,labs,by.x="id",by.y="id",all.y=TRUE)      ####  Removing those with no CD4 count (n=7)
d2<-merge(d1,trt,by.x="id",by.y="id",all.x=TRUE)         ####  Left join: patients without a treatment row keep NA in the treatment columns
## Fixed: rows are dropped only where the condition is TRUE.  For patients without a treatment
## row the condition is NA, and indexing a data frame with NA used to turn those rows into
## all-NA rows instead of keeping them.
not.naive<-d2$FIRST_HAART==0 & d2$REGIMEN_NBR==1
d3<-d2[!(not.naive %in% TRUE),]                          ####  Removing those who were not HAART naive (n=164)
d3<-d3[!duplicated(d3),]                                 ####  Removing duplicate records (3 records)
d3$FIRST_HAART<-d3$REGIMEN_NBR<-NULL
d3<-d3[order(d3$id,d3$age.lab),]

## CD4 rows: age is the lab age; the event indicators start at 0.
d3[["age"]]<-d3[["age.lab"]]
for (flag in c("fhaart","death","oi")) {
  d3[[flag]]<-0
}

## Template with one row per patient and no lab information, used for the event rows below.
d4<-d3[!duplicated(d3$id),]
for (blank in c("age","age.lab","cd4")) {
  d4[[blank]]<-NA
}

## Death rows: one per deceased patient, at the age at death.
d3.death<-d4[d4$deceased==1,]
d3.death[["age"]]<-d3.death[["age.death"]]
d3.death[["death"]]<-1

## HAART-start rows: one per patient with a known age at regimen start.
d3.haart<-d4[!is.na(d4$age.haart),]
d3.haart[["age"]]<-d3.haart[["age.haart"]]
d3.haart[["fhaart"]]<-1


###### OIs: rows of the OI table whose indicator is 1, reduced to id / age.ade / ade.

d.oi<-oi[oi$INDICATOR_OI_0_1==1,]

d.oi[["id"]]<-d.oi[["CFAR_PID"]]
d.oi[["age.ade"]]<-d.oi[["AGE_AT_OI"]]
d.oi[["ade"]]<-d.oi[["INDICATOR_OI_0_1"]]
#d.oi$year.first.visit<-d.oi$YEAR_OF_FIRST_VISIT
oi.drop<-c("CFAR_PID","AGE_AT_OI","INDICATOR_OI_0_1","YEAR_OF_FIRST_VISIT",
           "OI_1","OI_2","OI_3","OI_4")
d.oi<-d.oi[,!(names(d.oi) %in% oi.drop),drop=FALSE]

d3.oi<-merge(d4,d.oi,by="id")      ### Only including OIs among those who met inclusion criteria
for (blank in c("cd4","age.lab")) {
  d3.oi[[blank]]<-NA
}
## The OI row sits at the age of the OI and carries the OI indicator.
d3.oi[["age"]]<-d3.oi[["age.ade"]]
d3.oi[["oi"]]<-d3.oi[["ade"]]
d3.oi<-d3.oi[,!(names(d3.oi) %in% c("age.ade","ade")),drop=FALSE]


## Stack the CD4, OI, HAART-start and death rows, then sort by patient and age.
d5<-rbind(d3,d3.oi,d3.haart,d3.death)
d5<-d5[order(d5$id,d5$age),]
d5$age.haart1<-NULL

## Year of first visit, taken from each patient's first row in the OI table.
oi1<-oi[!duplicated(oi$CFAR_PID),]
d.year<-data.frame(id=oi1$CFAR_PID,year.first.visit=oi1$YEAR_OF_FIRST_VISIT)
d5<-merge(d5,d.year,by="id",all.x=TRUE)


which(duplicated(cbind(d5$id,d5$age)))   ####  Could cause problems at some point, but I'm assuming for now that OI and CD4 on same day came before FHAART

save(d5,file="cnics-hernan-stuff5-vandy.Rda")

##################################################
#####  Stacking the Vanderbilt rows (d5) under the CNICS rows (d2).

rm(list=ls())
setwd("/Volumes/encrypted/cnics/analyses-2013-Nov")

load("cnics-hernan-stuff5-vandy.Rda")
load("cnics-hernan-stuff5a.Rda")

#####  Vanderbilt rows, with the same columns as the CNICS data frame d2.
d6<-data.frame(pid=d5$id,
               site="VU",
               age=d5$age,
               age.rx.start=d5$age.haart,
               fhaart=d5$fhaart,
               haart=0,
               non.haart.art=0,
               haart_before_first_visit=0,
               black=d5$black,
               white=d5$white,
               deceased=d5$deceased,
               age_at_death=d5$age.death,
               idu=d5$idu,
               oi=d5$oi,
               age.oi=NA,
               cd4=d5$cd4,
               male=d5$male,
               first.age=d5$age.first.visit,
               first.year=d5$year.first.visit,
               last.age=d5$age.last,
               ltfu=d5$ltfu,
               age.approx="d",
               age.fhaart=d5$age.haart
              )

#####  age.oi is the row's age on OI rows.
d6$age.oi<-ifelse(d6$oi==1,d6$age,d6$age.oi)
#####  haart is 1 from the age at first HAART onwards, and 0 before it or when that age is missing.
d6$haart<-with(d6, ifelse(is.na(age.fhaart),0,
                          ifelse(age>=age.fhaart,1,0)))

d2<-rbind(d2,d6)

######  Perhaps there are some rounding errors, so every age column is rounded to 3 decimals.
age.cols<-c("age","age.rx.start","age_at_death","first.age","last.age","age.fhaart")
for (col in age.cols) {
  d2[[col]]<-round(d2[[col]],3)
}


d2<-d2[!(d2$pid=="1407150328"),]     ##### This patient has an error in date of death (1976, but started ART much later); They didn't have a CD4 before ART initiation anyway, so would have been excluded, but we just exclude here.

save(d2,file="cnics-hernan-stuff5c.Rda")


###############################################################################################################################
###############################################################################################################################
####  I use everything above here in my final analyses, but everything below here is run in a separate file and is now removed.
###############################################################################################################################