# ---------------------------------------------------------------------------
# Cohort construction from basic.csv.
#
# Reads the patient-level table, flags and removes excluded patients, then
# derives year of diagnosis, age at diagnosis / enrollment, an MSM indicator
# and a collapsed transmission-mode variable.  Finally reads visit.csv so the
# staging code that follows can use it.
#
# NOTE: this script used to begin with rm(list=ls()).  That call was removed
# because it wipes the caller's whole global environment when the file is
# sourced; every object used below is assigned before it is read, so the
# script does not depend on starting from an empty workspace.
# ---------------------------------------------------------------------------
d <- read.csv("../../../inf_dis/hiv/ccasanet/second-round-data/20131031_changing_demo/basic.csv")

# Quick look at the raw data.
with(d, table(mode, site))
with(d, table(mode))
with(d, table(site))
head(d)

## I used to be removing those without date of HIV diagnosis, but not anymore.
# exclude1 <- with(d, ifelse((is.na(hivdiagnosis_d) | as.Date(hivdiagnosis_d) == as.Date("1900-01-01")), 1, 0))
exclude1 <- 0

# exclude2: birth date missing or equal to the 1900-01-01 placeholder.
exclude2 <- with(d, ifelse(is.na(birth_d) | birth_d == "1900-01-01", 1, 0))

# exclude3: younger than 18 years at the baseline date.
exclude3 <- with(d, ifelse(
  !is.na(birth_d) & !is.na(baseline_d) &
    as.numeric(as.Date(baseline_d) - as.Date(birth_d)) / 365.25 < 18,
  1, 0
))

# A patient is excluded when any single flag is 1; an undetermined (NA)
# combination is treated as excluded.
exclude <- 1 - (1 - exclude1) * (1 - exclude2) * (1 - exclude3)
exclude <- ifelse(is.na(exclude), 1, exclude)

with(d, table(site, exclude1))
with(d, table(site, exclude2)) ##### Slightly different
with(d, table(site, exclude3)) ##### Slightly different
with(d, table(site, exclude))  ##### Exactly same except for 1 patient in Argentina

d <- d[exclude == 0, ]

with(d, table(mode, site))

# junk<-with(d, strsplit(as.character(hivdiagnosis_d),split="-"))
# d$year<-NULL
# for (i in 1:length(d$site)){
#   d$year[i]<-junk[[i]][1]
# }

# Calendar year of HIV diagnosis (POSIXlt stores years as an offset from 1900).
d$year <- as.POSIXlt(as.Date(d$hivdiagnosis_d))$year + 1900

with(d, summary(as.numeric(year)))

# A diagnosis year of 1900 is treated as "date unknown": blank the date so the
# age at diagnosis below comes out NA for those patients.
d$hivdiagnosis_d <- with(d, ifelse(d$year == 1900, NA, as.character(hivdiagnosis_d)))

# Age (in years) at diagnosis, plus an under-25 indicator.
d$age.dx <- with(d, as.numeric(as.Date(hivdiagnosis_d) - as.Date(birth_d)) / 365.25)
d$age.lt25.dx <- with(d, ifelse(age.dx < 25, 1, 0))

# MSM: male patients whose recorded mode is one of the three categories below.
d$msm <- ifelse(
  (d$mode == "Bisexual" |
     d$mode == "Homo/Bisexual and Injecting drug user" |
     d$mode == "Homosexual contact") & d$male == 1,
  1, 0
)

# Age (in years) at enrollment (baseline date), plus an under-25 indicator.
d$age.enrol <- with(d, as.numeric(as.Date(baseline_d) - as.Date(birth_d)) / 365.25)
d$age.lt25.enrol <- with(d, ifelse(age.enrol < 25, 1, 0))

# Collapsed transmission mode: MSM / Heterosexual / Unknown / Other.
d$mode1 <- ifelse(d$msm == 1, "MSM",
           ifelse(d$mode == "Heterosexual contact", "Heterosexual",
           ifelse(d$mode == "Unknown", "Unknown", "Other")))

#### Now defining who has AIDS at enrollment

d.visit <- read.csv("../../../inf_dis/hiv/ccasanet/second-round-data/20131031_changing_demo/visit.csv")
d.visit$date <- as.Date(d.visit$visit_d, "%Y-%m-%d")
# ---------------------------------------------------------------------------
# Clinical stage at the first visit, AIDS status at enrollment, CD4 count
# closest to enrollment, and export of the analysis data set.
# ---------------------------------------------------------------------------

# One stage label per visit record built from the WHO and CDC stage columns:
#   neither recorded -> ""            CDC only -> the CDC value
#   WHO only -> "WHO <stage>"         both     -> "<who>;<cdc>"
# NOTE(review): the "<who>;<cdc>" form is not in either list used by
# classify.stage(), so visits with both stages recorded are classified as
# "SOMETHING ELSE" -- confirm that this is intended.
d.visit$clinical.stage <- with(d.visit,
  ifelse(whostage == "" & cdcstage == "", "",
  ifelse(whostage == "" & cdcstage != "", as.character(cdcstage),
  ifelse(whostage != "" & cdcstage == "", paste0("WHO ", as.character(whostage)),
         paste(whostage, cdcstage, sep = ";"))))
)

# Map a vector of stage labels to "Unknown" / "not AIDS" / "AIDS" /
# "SOMETHING ELSE".
#
# clinical.stage: vector of stage labels (CDC codes such as "A1" or "C3", or
#                 WHO stages written as "WHO <n>").
# Returns a character vector of the same length.  NA and "" give "Unknown";
# a label found in neither list below gives "SOMETHING ELSE".
classify.stage <- function(clinical.stage) {
  not.aids.labels <- c("A or B", "A", "A1", "A2", "A3",
                       "B", "B1", "B2", "B3",
                       "WHO 1", "WHO 2", "WHO 3", "WHO 2 or 3")
  aids.labels <- c("C", "C1", "C2", "C3", "WHO 4")
  # NA is caught by the first test, so using %in% below gives the same
  # result as the equivalent chain of == comparisons.
  ifelse(is.na(clinical.stage) | clinical.stage == "", "Unknown",
  ifelse(clinical.stage %in% not.aids.labels, "not AIDS",
  ifelse(clinical.stage %in% aids.labels, "AIDS", "SOMETHING ELSE")))
}

d.visit$site <- as.character(d.visit$site)

d.baseline.date <- d[, c("patient", "site", "baseline_d")]

# Left join: every cohort patient is kept, including those with no visit
# record (their visit columns are NA).
d.visit1 <- merge(d.baseline.date, d.visit, by = c("patient", "site"), all.x = TRUE)

#### Removing those records with missing visit date, or that happened more than 30 days after baseline date
drop.visit <- with(d.visit1,
  visit_d == "" | visit_d == "1900-01-01" |
    as.Date(visit_d) > as.Date(baseline_d) + 30
)
# Rows are dropped only where the condition is definitely TRUE; rows where it
# is NA are kept, which is what which() did.  A logical mask is used instead
# of -which(...) because -integer(0) selects no rows at all, so the old form
# emptied the table whenever no record needed to be removed.
d.visit1a <- d.visit1[!(drop.visit %in% TRUE), ]

# Earliest remaining visit per patient: sort by site, patient and visit date,
# then keep the first row of each site-patient pair.
ord <- order(d.visit1a$site, d.visit1a$patient, d.visit1a$date)
d.visit1 <- d.visit1a[ord, ]
dup <- with(d.visit1, duplicated(paste(site, patient, sep = "-")))
d.visit2 <- d.visit1[!dup, ]
d.first.visit <- data.frame(
  patient = d.visit2$patient,
  site = d.visit2$site,
  visit_d = d.visit2$visit_d,
  clinical.stage = d.visit2$clinical.stage
)

d1 <- merge(d, d.first.visit, by = c("patient", "site"), all.x = TRUE)

# Stage category from the first-visit record.
d1$clinical.stage.cat <- with(d1, classify.stage(clinical.stage))

# Stage category from the patient-level whostage / cdcstage columns;
# whostage values "." and "9" are treated as not recorded.
d1$whostage <- with(d1, ifelse(whostage == "." | whostage == "9", "", as.character(whostage)))
# When both stages are present the CDC stage is used.  The final paste()
# branch is kept from the original but cannot be reached for non-missing input.
d1$stage.enroll <- with(d1, classify.stage(
  ifelse(cdcstage == "" & whostage == "", "",
  ifelse(cdcstage != "", as.character(cdcstage),
  ifelse(whostage != "", paste0("WHO ", as.character(whostage)),
         paste(whostage, cdcstage, sep = ";"))))
))

# Category from the aids_y flag: NA or 9 -> Unknown, 1 -> AIDS, 0 -> not AIDS.
d1$aids.enroll <- with(d1,
  ifelse(is.na(aids_y) | aids_y == 9, "Unknown",
  ifelse(aids_y == 1, "AIDS",
  ifelse(aids_y == 0, "not AIDS", "SOMETHING ELSE")))
)

d1$aids.enrollment <- with(d1,
  ifelse(aids.enroll == "AIDS" | stage.enroll == "AIDS" | clinical.stage.cat == "AIDS", "AIDS",
  ifelse(aids.enroll == "Unknown" & stage.enroll == "Unknown" & clinical.stage.cat == "Unknown", "Unknown",
  ifelse(aids.enroll == "not AIDS" | stage.enroll == "not AIDS" | clinical.stage.cat == "not AIDS", "not AIDS",
         "SOMETHING ELSE")))
)
#### aids.enrollment says someone has AIDS if there is any evidence of them having AIDS either using tbl_BASIC variables aids_y, whostage, cdcstage,
#### or tbl_VISIT clinical stage variables at the first visit.

table(d1$aids.enrollment, d1$site)

# Year of diagnosis, with the 1900 placeholder year turned into NA.
d1$year.dx <- with(d1, as.numeric(ifelse(year == 1900, NA, as.character(year))))

# junk<-with(d1, strsplit(as.character(baseline_d),split="-"))
# d1$year.enrol<-NULL
# for (i in 1:length(d1$site)){
#   d1$year.enrol[i]<-junk[[i]][1]
# }

d1$year.enrol <- as.POSIXlt(d1$baseline_d)$year + 1900

d.cd4 <- read.csv("../../../inf_dis/hiv/ccasanet/second-round-data/20131031_changing_demo/lab_cd4.csv")
head(d.cd4)

d.cd4$date <- as.Date(d.cd4$cd4_d, "%Y-%m-%d")

d.cd4.1 <- merge(d.baseline.date, d.cd4, by = c("patient", "site"))

# Days from baseline to each CD4 measurement; keep measurements strictly
# within 180 days on either side of baseline.
d.cd4.1$time.diff <- with(d.cd4.1, as.numeric(as.Date(cd4_d) - as.Date(baseline_d)))
d.cd4.1$keep <- with(d.cd4.1, ifelse(time.diff > -180 & time.diff < 180, 1, 0))
d.cd4.2 <- d.cd4.1[d.cd4.1$keep == 1 & !is.na(d.cd4.1$keep), ]

d.cd4.2$id.site <- with(d.cd4.2, paste(patient, site, sep = "-"))

# For every patient pick the measurement closest in time to baseline.
# order() leaves ties in their original row order, so among equally close
# measurements the first one in d.cd4.2 is chosen, exactly as the previous
# per-patient loop did.  Unlike that loop (for (i in 1:length(...)) with a
# growing result vector) this also works when d.cd4.2 has no rows.
closest.first <- order(d.cd4.2$id.site, abs(d.cd4.2$time.diff))
closest <- closest.first[!duplicated(d.cd4.2$id.site[closest.first])]
d.cd4.3 <- data.frame(
  id.site = d.cd4.2$id.site[closest],
  cd4.enrol = d.cd4.2$cd4_v[closest]
)

d1$id.site <- with(d1, paste(patient, site, sep = "-"))

# Left join on the shared id.site column; patients without a CD4 value in the
# window get cd4.enrol = NA.
d <- merge(d1, d.cd4.3, all.x = TRUE)

with(d, table(!is.na(cd4.enrol), site))

#d<-d[d$site!="chile",]   ### Excluding Chile because of a low percentage with date of diagnosis.

d.ccasanet <- data.frame(
  site = d$site, male = d$male, age.enrol = d$age.enrol, age.dx = d$age.dx,
  year.enrol = d$year.enrol, year.dx = d$year.dx, msm = d$msm, mode = d$mode1,
  aids.enrol = d$aids.enrollment, cd4.enrol = d$cd4.enrol
)

save(d.ccasanet, file = "ccasanet-anna.Rda")