--- title: 'HIV: NA-ACCORD Data Analysis (365 days lag)' author: "MM" date: "`r Sys.time()`" output: html_document: toc: yes toc_depth: 2 toc_float: collapsed: yes number_sections: true code_folding: hide css: "http://biostatdata.app.vumc.org/tgs/misc/rmarkdown.css" csl: american-medical-association.csl --- ```{r introduction, include=FALSE} #Clear workspace #unlink('alfresco/Sites/HIVProjectAprilPettit/documentLibrary/code/04-ReadingData_cache', recursive = TRUE) rm(list = ls()) options(knitr.duplicate.label = 'allow') require(openxlsx) source('~/Dropbox/aihuafunction12182017.r', echo=TRUE) #Set options knitr::opts_chunk$set(message = FALSE) knitr::opts_chunk$set(warning = FALSE) knitr::opts_chunk$set(cache = TRUE) knitr::opts_chunk$set(cache.lazy = FALSE) options(digits = 3) options(scipen = 999) source("http://biostatdata.app.vumc.org/tgs/misc/r_functions.txt") #Packages Packages <- c("readr","purrr","tidyr","lubridate","stringr","docxtractr","kableExtra","knitr","data.table", "ggplot2","openxlsx","xtable","Hmisc","boot", "irr","utils","lme4","TeachingDemos","Hotelling","dplyr","epiR") lapply(Packages, require, character.only = TRUE) library(riskRegression) library(survival) library(lubridate) library(rms) library(broom) library(cmprsk) library(rms) options(prType='html') library(dplyr) ``` `r hidingTOC(buttonLabel="Outline")` ```{r filedirectory, include=FALSE} LOCATION1 <- "~/Aihua/WFH/" LOCATION1 <- "/home/biana/Aihua/WFH/" LOCATION2 <- "/home/biana/alfresco/Sites/" file_path_rawdata <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/CD4_CD_ratio/Data") file_path_cleandata <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/cleaned_data") file_path_fitted <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/modelsfitted") PATH <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/codewith2020data") ``` ```{r} cleandatalist <- readRDS(paste0(file_path_cleandata, "/","cleandatalist.rds")) rawdatalist <- readRDS(paste0(file_path_cleandata, "/","rawdatalist.rds")) ``` ```{r filedirectory1, eval=FALSE, include=FALSE} SYSDATE <- "2021-02-08" file_path_fitted <- paste0("/home/biana/alfresco/Sites/hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/modelsfitted", SYSDATE,"/") ``` ```{r eval=FALSE, include=FALSE} dir.create(file.path(dirname(file_path_fitted), SYSDATE)) ``` ```{r} cleandatalist[["smoking"]] <- cleandatalist[["smoking"]] %>% dplyr::filter(!is.na(smoking)) %>% unique() saveRDS(cleandatalist, paste0(file_path_cleandata, "/","cleandatalist.rds")) ``` # Data for analysis{.tabset .tabset-fade .tabset-pills} ## Outcome variables Study outcome: * Death * Cancer * Alive ### Study entry date * Rules of followed for study entry date: MAX (Janusary 1,1998, cohort_opendate, enrolldate + 180 days, cancerstart_date) ```{r} ## Getting patients study entry date modeldata <- cleandatalist[["patient"]] %>% mutate(closed_fixeddate = mdy(paste(12,31,2016, sep="-")) , enrollment_date180 = enrollment_date + 180 , studyentry_date = pmax(cohort_opendate, enrollment_date180, enrollment_fixeddate, cancer_start_date, na.rm = TRUE) , study_entry_year = year(studyentry_date) ) ``` ### Study exit date * Rules of followed for study entry date: + study exit date: min(December 31, 2016, death, loss to followup, cohort_closedate, cancerstop_date) * Rules of followed for loss to follow up: + We consider a period of >= 1.5 years with no CD4 or VL measurements to be an indicator of loss to follow up Based on discussion the definition of study exit date were modified: * For the patients with events (cancer or death): + study exit date: min(December 31, 2016, death, event_date(death or cancer), cohort_closedate, cancerstop_date) * For the patients without event (cancer or death): + Once they lost to followup, they were censored at the beginning of the gap (at last CD4 or VL measurements dates). ```{r include=FALSE} ##Getting all the unique dates that patients who had cd4/cd8 ratio tcell <- cleandatalist[["tcellsum"]] %>% dplyr::filter(!is.na(ratio)) %>% dplyr::select(naid, cd48_date) %>% distinct(naid, cd48_date) %>% dplyr::rename(date = cd48_date) ##Getting all the unique dates that patients who had virus load count vl <- cleandatalist[["vlsum"]] %>% dplyr::select(naid, date) %>% distinct(naid, date) ##Combining both VL and CD48 data to get the unique dates that patients who had CD4/8 ratio or VL tcell_vl <- bind_rows(tcell, vl, .id = "id") %>% distinct(naid, date) ## Getting total number of unique ratio or VL measurements dates tcell_vl_summary <- tcell_vl %>% dplyr::filter(naid %in% modeldata$naid) %>% dplyr::distinct(naid, date, .keep_all = TRUE) %>% group_by(naid) %>% dplyr::summarize(nobs = n()) ## How many times a person who had more than 547 days (1.5 years) without any CD4/8 or VL measurements tcell_vl_loss <- tcell_vl %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs > 1) %>% arrange(naid, date) %>% mutate(next_date = as.Date(c(date[-1], NA), origin = "1970-01-01")) %>% mutate(time_to_next = as.numeric(next_date - date) , loss = time_to_next > 547 ) %>% dplyr::filter(loss == 1) %>% mutate(loss_date = date) %>% dplyr::select(naid, loss_date) temp <- modeldata %>% dplyr::select(naid, studyentry_date) %>% right_join(tcell_vl_loss, by ="naid") %>% mutate(diff = loss_date - studyentry_date) %>% dplyr::filter(diff > 0) ``` ```{r} #cleandatalist <- readRDS(paste0(file_path_cleandata, "/","cleandatalist.rds")) death_data <- cleandatalist[["patient"]] %>% dplyr::filter(!is.na(death_date)) %>% mutate(event = "Death" , dates= death_date) %>% dplyr::select(naid, dates, event) cancer_data <- cleandatalist[["malignancy"]] %>% mutate(event = cancerdx_brief , dates = cancer_dx_date) %>% dplyr::select(naid, dates, event) event_data <- bind_rows(death_data, cancer_data) %>% arrange(naid, dates) %>% group_by(naid) %>% dplyr::slice(1) %>% ungroup %>% mutate(death_cancer_date = dates) %>% dplyr::select(-dates) ``` ```{r} modeldata <- modeldata %>% left_join(event_data, by ="naid") %>% mutate(studyexit_date = pmin(closed_fixeddate, death_cancer_date, cohort_closedate, cancer_stop_date, na.rm = TRUE) , age_at_entry = as.numeric(studyentry_date - mdy(paste(6,15, yob, sep="-")))/365.25 , study_entry_year = year(studyentry_date) ) %>% dplyr::select(naid, subsiteid_n, sex, yob,age_at_entry, race, risk1, risk2, risk, first2_digits_id, cohort, studyentry_date, studyexit_date, death_cancer_date, event, study_entry_year) ``` * Patients with previous cancer ```{r} prevcancerdx_ids <- cleandatalist[["malignancy"]] %>% dplyr::filter(prevcancerdx == 1) %>% dplyr::select(naid) %>% unique() ``` ```{r} last_cd48date <- cleandatalist[["tcellsum"]] %>% dplyr::select(naid, ratio, cd48_date) %>% dplyr::filter(!is.na(cd48_date)) %>% dplyr::filter(!is.na(ratio)) %>% group_by(naid) %>% arrange(naid, cd48_date) %>% dplyr::slice(n()) %>% ungroup %>% mutate(last_cd48date = cd48_date) %>% dplyr::select(naid, last_cd48date) ``` ```{r} temp <- modeldata %>% # left_join(event_data, by ="naid") %>% mutate(diff = studyexit_date- studyentry_date) %>% mutate(time_to_death_cancer = as.numeric(death_cancer_date - studyentry_date)) %>% left_join(last_cd48date, by ="naid") %>% mutate(time_to_last_cd48count = as.numeric(last_cd48date - studyentry_date)) %>% mutate(excluded = "No") %>% mutate(excluded = if_else(age_at_entry < 18 & !is.na(age_at_entry), "Yes", excluded) , excluded_reason_age = if_else(age_at_entry < 18, "Age < 18", NULL) ) %>% mutate(excluded = if_else(time_to_death_cancer <= 0 & !is.na(time_to_death_cancer), "Yes", excluded) , excluded_reason_event= if_else(time_to_death_cancer <= 0 & !is.na(time_to_death_cancer), "Death/Cancer before/on study entry", NULL) ) %>% mutate(excluded = if_else(is.na(last_cd48date) | time_to_last_cd48count < -730, "Yes", excluded) , excluded_reason_cd8 = if_else(is.na(last_cd48date) | time_to_last_cd48count < -730, "No CD8 or last CD8 count were collected before study entry", NULL) ) %>% mutate(excluded = if_else(naid %in% prevcancerdx_ids$naid, "Yes", excluded)) %>% mutate(excluded = if_else(diff<=0, "Yes", excluded)) %>% mutate(excluded= if_else(sex == "Intersex" , "Yes", excluded)) ``` ```{r} ttt <- rawdatalist[["malignancy_diagnosis.csv"]] names(ttt) <- tolower(names(ttt)) cancermissing <- subset(ttt, is.na(dxyear) | dxyear %in% c(0, 1900)) ``` ```{r} excluded_ids <- temp %>% dplyr::filter(excluded =="Yes") %>% dplyr::select(naid, excluded, excluded_reason_cd8, excluded_reason_event, excluded_reason_age, time_to_last_cd48count, time_to_death_cancer, age_at_entry) excluded_idonly <- unique(c(excluded_ids$naid, cancermissing$naid)) ``` ```{r} data4model <- temp %>% dplyr::filter(naid %nin% excluded_idonly ) %>% dplyr::select(-contains("excluded"))%>% mutate(event_date = pmin(studyexit_date, death_cancer_date, na.rm = TRUE)) %>% mutate(time_to_event = as.numeric(event_date - studyentry_date)/365.25) %>% mutate(event = if_else(death_cancer_date > studyexit_date | is.na(event), "censored" , event)) ``` ```{r} ratio_data <- cleandatalist[["tcellsum"]] %>% dplyr::select(naid,ratio, cd4n, cd8n, cd48_date) %>% dplyr::filter(!is.na(cd48_date)) %>% dplyr::filter(!is.na(ratio)) %>% dplyr::filter(naid %in% unique(data4model$naid)) %>% left_join(data4model, by = "naid") %>% mutate(studyentry_date_2years_lag =studyentry_date - 730) %>% dplyr::filter(cd48_date < studyexit_date & cd48_date >= studyentry_date_2years_lag ) %>% dplyr::select(naid, cd48_date, ratio, studyexit_date, studyentry_date, studyentry_date_2years_lag) %>% mutate(cd48_collected_period = if_else(cd48_date >= studyentry_date & cd48_date < studyexit_date, "during followup", "before followup")) %>% distinct(naid, cd48_collected_period) %>% group_by(naid) %>% dplyr::summarize(ratio_period = paste(cd48_collected_period, collapse = "+"))%>% ungroup() data4model<- data4model %>% dplyr::filter(naid %in% ratio_data$naid) %>% left_join(ratio_data, by ="naid") %>% mutate(race_brief = ifelse(race %in% c("Other", "Unknown"), "Other+Unknown", race)) ``` ## Time to first event ### Excluded patients Excluded patients: `r nrow(cleandatalist[["patient"]]) - length(unique(data4model$naid))` 1. Age at entry less than 18 years old 2. Patients who did not have ratio of cd8/cd4 between 2 years before study entry date and study exit date. 3. Died or had cancer before/on study entry 4. Patients cancer records dxyear were missing or recorded as 0 or 1900 5. Intersex patients ### Summary tables ```{r} table1data <- data4model %>% left_join(cleandatalist[["hbv_summary"]], by ="naid") %>% left_join(cleandatalist[["hcv_summary"]], by ="naid") %>% left_join((cleandatalist[["alcohol"]]), by ="naid") %>% left_join((cleandatalist[["smoking"]]), by ="naid") table1vars <- c("time_to_event", "event","sex", "age_at_entry", "race", "race_brief", "risk1","risk2", "risk","cohort","ratio_period", "hcv","hbv", "alc_abuse", "alc_atrisk", "alc_combo","smoking","study_entry_year" ) groupvar = "subsiteid_n" Hmisc::label(data4model$time_to_event) <- "Time to first event" Hmisc::label(data4model$event) <- "First event" tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar)) ``` #### Patients Characteristics by Site ```{r summaryMtable1} tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar)) tbl1 <- summaryM( tbl1_formula , data = table1data , quant = c(0.25,0.5,0.75) , continuous = 5 , overall = TRUE , na.include = TRUE # , test = TRUE ) tbl1 <- summaryM_to_df( tbl1 , long = TRUE , exclude1 = FALSE , prob = c(0.25,0.5,0.75) , what = "%" , digits = 2 , prn = TRUE # , vnames = "vnames" ) for (i in 3: (dim(tbl1)[2]-0)){ tbl1[,i] <- slice_to_box(tbl1[,i]) tbl1[,i] <- slice_to_box2(tbl1[,i]) } names(tbl1)[1] <- "Variables" kable(tbl1[, c(1:2,dim(tbl1)[2], 3:(dim(tbl1)[2]-1))], "pandoc") %>% column_spec(1, width = "10em") ``` ### Liver cancers and HBV/HCV ```{r} table1vars <- c( "hbv", "hcv") groupvar = "event" tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar)) ``` ```{r, ref.label= "summaryMtable1"} ``` #### Smoking by Alcohol ```{r} table1vars <- c( "alc_abuse", "alc_atrisk", "alc_combo") groupvar = "smoking" tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar)) ``` ```{r, ref.label= "summaryMtable1"} ``` ```{r} table1data <- data4model table1data <- table1data %>% left_join(cleandatalist[["hbv_summary"]], by ="naid") %>% left_join(cleandatalist[["hcv_summary"]], by ="naid") %>% left_join((cleandatalist[["alcohol"]]), by ="naid") %>% left_join((cleandatalist[["smoking"]]), by ="naid") table1data$event_plot <- as.factor(ifelse(table1data$event %nin% c("Death","censored", "Lung Cancer"), "Other Cancer", table1data$event)) table1data$event_plot_num <- as.numeric(( table1data$event_plot)) -1 table1data$age_at_entry_group <- ifelse(table1data$age_at_entry < 43, "<43", ">=43") table1data$subsiteid_n <- as.factor(table1data$subsiteid_n) ``` ```{r include=FALSE} tcell <- cleandatalist[["tcellsum"]] %>% dplyr::filter(!is.na(ratio)) %>% dplyr::select(naid, cd48_date, cd4n, cd8n, ratio) tcell_loss_gap <- tcell %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs > 1) %>% arrange(naid, cd48_date) %>% mutate(next_date = as.Date(c(cd48_date[-1], NA), origin = "1970-01-01")) %>% mutate(time_to_next = as.numeric(next_date - cd48_date) , loss = time_to_next > 365 ) %>% dplyr::filter(loss == 1) %>% dplyr::select(naid, cd48_date)%>% ungroup() tcell_loss_only_1 <- tcell %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs==1) %>% dplyr::select(naid, cd48_date)%>% ungroup() tcell_loss_last_obs <- tcell %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs>1) %>% dplyr::slice(n()) %>% dplyr::select(naid, cd48_date)%>% ungroup() tcell_loss_all <- rbind(tcell_loss_gap, tcell_loss_only_1, tcell_loss_last_obs) %>% mutate(cd48_date = cd48_date + 365 , ratio = 9999999 , cd4n=9999999 , cd8n = 9999999) tcell_tdc <- rbind(tcell_loss_all, tcell)%>% arrange(naid, cd48_date) %>% mutate(date = cd48_date) ``` ```{r include=FALSE} vl <- cleandatalist[["vlsum"]] %>% dplyr::filter(!is.na(vload)) %>% dplyr::select(naid, date, vload) vl_loss_gap <- vl %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs > 1) %>% arrange(naid, date) %>% mutate(next_date = as.Date(c(date[-1], NA), origin = "1970-01-01")) %>% mutate(time_to_next = as.numeric(next_date - date) , loss = time_to_next > 365 ) %>% dplyr::filter(loss == 1) %>% dplyr::select(naid, date)%>% ungroup() vl_loss_only_1 <- vl %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs==1) %>% dplyr::select(naid, date)%>% ungroup() vl_loss_last_obs <- vl %>% group_by(naid) %>% dplyr::mutate(nobs = n()) %>% dplyr::filter(nobs>1) %>% slice(n()) %>% dplyr::select(naid, date)%>% ungroup() vl_loss_all <- rbind(vl_loss_gap, vl_loss_only_1, vl_loss_last_obs) %>% mutate(date = date + 365 , vload = 9999999 ) vl_tdc <- bind_rows( vl, vl_loss_all) %>% arrange(naid, date) ``` ```{r include=FALSE} vl_tcell_tdc <- tcell_tdc %>% full_join( vl_tdc, by = c("naid", "date")) ``` ```{r include=FALSE} cirrhosisdata <- cleandatalist[["diagnosis"]] %>% dplyr::filter(diagnosis %in% c(4000, 4110, 9903)) %>% arrange(naid, diagnosis_date) %>% group_by(naid) %>% slice(1) %>% dplyr::select(naid, diagnosis_date) %>% ungroup() %>% mutate(cirrhosis = 1) %>% mutate(date = diagnosis_date) %>% dplyr::filter(naid %in% vl_tcell_tdc$naid) ``` ```{r} cirrhosisdata <- cirrhosisdata %>% mutate(diagnosis_date = if_else(year(diagnosis_date) == 1900, ymd("1998-01-01"), diagnosis_date)) %>% mutate(date = if_else(year(date) == 1900, ymd("1997-05-31"), date)) ``` ```{r} non_aidsd <- unique(grep("cancer|cirr|lym|sarcoma", cleandatalist[["diagnosis"]]$diagnosis_factor, value = TRUE, ignore.case = TRUE)) adi_disease <- setdiff(unique(cleandatalist[["diagnosis"]]$diagnosis_factor), non_aidsd) aidsdata <- cleandatalist[["diagnosis"]] %>% dplyr::filter(diagnosis_factor %in% adi_disease) %>% arrange(naid, diagnosis_date) %>% group_by(naid) %>% slice(1) %>% mutate(adi_diagnosis_date = diagnosis_date) %>% dplyr::select(naid, adi_diagnosis_date) %>% ungroup() %>% mutate(adi = 1) %>% mutate(date = adi_diagnosis_date) %>% dplyr::filter(naid %in% vl_tcell_tdc$naid) aidsdata <- aidsdata %>% mutate(adi_diagnosis_date = if_else(year(adi_diagnosis_date) == 1900, ymd("1998-01-01"), adi_diagnosis_date)) %>% mutate(date = if_else(year(date) == 1900, ymd("1997-05-31"), date)) ``` ```{r include=FALSE} vl_tcell_cir_tdc <- vl_tcell_tdc %>% left_join (subset(cirrhosisdata, select = c(naid, diagnosis_date)), by = c("naid")) %>% mutate(cirrhosis = ifelse(date < diagnosis_date | is.na(diagnosis_date), 0, 1)) %>% full_join(subset(cirrhosisdata, select = c(naid, date, diagnosis_date, cirrhosis), by = c("naid", "date","diagnosis_date"))) %>% left_join (subset(aidsdata, select = c(naid, adi_diagnosis_date)), by = c("naid")) %>% mutate(adi = ifelse(date < adi_diagnosis_date | is.na(adi_diagnosis_date), 0, 1)) %>% full_join(subset(aidsdata, select = c(naid, date, adi_diagnosis_date, adi), by = c("naid", "date","adi_diagnosis_date"))) %>% arrange(naid, date) describe(vl_tcell_cir_tdc$cirrhosis) vl_tcell_cir_tdc %>% dplyr::filter( cirrhosis== 1 & is.na(cd4n) & is.na(vload)) tdcdata <- vl_tcell_cir_tdc ``` ```{r , include=FALSE} tmp <- subset(data4model, select = c(naid, studyentry_date, studyexit_date, event)) tmp <- data4model %>% left_join(cleandatalist[["hbv_summary"]], by ="naid") %>% left_join(cleandatalist[["hcv_summary"]], by ="naid") tmp$status <- as.numeric(as.factor(tmp$event)) tmp$status <- ifelse(tmp$event == "censored", 0, tmp$status) tmp$time <- tmp$studyexit_date - tmp$studyentry_date tmp2 <- tmerge(tmp, tmp, id=naid, endpt = event(time,status )) options(scipen = 999) LAG = 365 tdcdata_entry <- tmp %>% dplyr::select(naid, studyentry_date, studyexit_date) %>% left_join(tdcdata, by ="naid") %>% mutate(riskdate = date + LAG , days_measure_entry = riskdate - studyentry_date ) %>% dplyr::filter(days_measure_entry > (-LAG*2)) %>% dplyr::filter(studyexit_date > riskdate) tmp3 <- tmerge(tmp2, tdcdata_entry, id=naid , ratio_lag = tdc(days_measure_entry, ratio) , vload_lag = tdc(days_measure_entry, vload) , cd4n_lag = tdc(days_measure_entry, cd4n) , cirrhosis_lag = tdc(days_measure_entry, cirrhosis) , adi_lag = tdc(days_measure_entry, adi) ) tmp3_inter <- tmp3 %>% dplyr::filter(ratio_lag !=9999999 & vload_lag != 9999999 & cd4n_lag != 9999999) %>% dplyr::filter(!is.na(ratio_lag) & !is.na(vload_lag) & !is.na(cd4n_lag)) tmp3_inter %>% dplyr::filter(is.na(cd4n_lag)) tmp3 <- tmp3_inter %>% arrange(naid, tstart) %>% group_by(naid) %>% slice(1) %>% dplyr::select(naid, tstart)%>% mutate(mini_tstart = tstart) %>% ungroup() %>% dplyr::select(-tstart) %>% right_join(tmp3_inter, by ="naid") %>% mutate(tstop = as.numeric(tstop) , tstart = as.numeric(tstart) , mini_tstart = as.numeric(mini_tstart) , tstart_new = tstart - mini_tstart , tstop_new = tstop- mini_tstart , age_at_entry = age_at_entry + mini_tstart/365.25 ) tmp3$sex <- relevel(tmp3$sex, ref ="Female") tmp3$log10_vload_lag <- log10(tmp3$vload_lag) tmp3$cd4n_lag_square_root <- sqrt(tmp3$cd4n_lag) ``` * Patient's records included in the time-varying cox model ```{r eval=FALSE, include=FALSE} cphdatalist <- list( "longdata" = tmp3 ) ``` ```{r include=FALSE} #33320014 tmp3 <- tmp3 %>% dplyr::filter(ratio_lag != 0 ) %>% mutate(log_ratio_lag = log(ratio_lag)) %>% left_join((cleandatalist[["alcohol"]]), by ="naid") tmp3$idu <- ifelse(tmp3$risk2 != "Injection drug use" | is.na(tmp3$risk2), "No", "Yes") describe(tmp3$idu) tmp3$idu <- ifelse(tmp3$risk1 == "Injection drug use", "Yes",tmp3$idu) describe(tmp3$idu) tmp3$white <- ifelse(tmp3$race %in% c("White"), "White", "Non-White") cphdatalist <- list( "longdata" = tmp3 ) ``` ```{r} tt <- cbind(levels(as.factor(tmp$event)), 1:length(levels(as.factor(tmp$event)))) ``` ## Distribution of cd4n, cd8n, virus load and cd4/cd8 ratio ```{r} PROBS = c((0:99)/100, ((1:10)/10+99)/100) tmp3$cd8n_lag = tmp3$cd4n_lag/tmp3$ratio_lag data.frame(ratio_lag = quantile(tmp3$ratio_lag, probs = PROBS) , vload_lag = quantile(tmp3$vload_lag, probs = PROBS) , cd4n_lag = quantile(tmp3$cd4n_lag, probs =PROBS) , cd8n_lag = quantile(tmp3$cd8n_lag, probs = PROBS) ) ``` ## Picking knots location for continuous variables Note : rcs knots were fixed for all the analysis. Randomly picked one record per patient, then got the knots postions. ```{r} set.seed(1) tt <- tmp3 %>% group_by(naid) %>% sample_n(1) cphdatalist[["knotdata"]] <- tt ``` ### Ratio knots and distribution of ratio_lag in the randomly picked data set ```{r echo=TRUE} #quantile(tt$ratio_lag, na.rm = TRUE) KNOTS_ratio_lag <- attributes(rcs(tt$ratio_lag, 4))$parms KNOTS_ratio_lag3 <- attributes(rcs(tt$ratio_lag, 3))$parms KNOTS_ratio_lag html(describe(tt$ratio_lag)) ``` ### log_ratio_lag knots and distribution of log_ratio_lag in the randomly picked data set ```{r echo=TRUE} tt$log_ratio_lag <- log(tt$ratio_lag) #quantile(tt$ratio_lag, na.rm = TRUE) KNOTS_log_ratio_lag3 <- attributes(rcs(tt$log_ratio_lag, 3))$parms KNOTS_log_ratio_lag <- attributes(rcs(tt$log_ratio_lag, 4))$parms KNOTS_log_ratio_lag html(describe(tt$log_ratio_lag)) ``` ### cd4n_lag_square_root knots and distribution of cd4n_lag in the randomly picked data set ```{r echo=TRUE} #quantile(tt$cd4n_lag_square_root, na.rm = TRUE) KNOTS_cd4n_lag_square_root <- attributes(rcs(tt$cd4n_lag_square_root, 4))$parms KNOTS_cd4n_lag_square_root3 <- attributes(rcs(tt$cd4n_lag_square_root, 3))$parms KNOTS_cd4n_lag_square_root html(describe(tt$cd4n_lag)) ``` ### Virus load knots and distribution of Virus load in the randomly picked data set ```{r echo=TRUE} KNOTS_log10_vload_lag3 <- attributes(rcs(tt$log10_vload_lag, 3))$parms KNOTS_log10_vload_lag <- attributes(rcs(tt$log10_vload_lag, 4))$parms KNOTS_log10_vload_lag html(describe(tt$vload_lag)) ``` ### Age at entry knots and distribution of age at entry in the randomly picked data set ```{r echo=TRUE} KNOTS_age_at_entry3 <- attributes(rcs(tt$age_at_entry, 3))$parms KNOTS_age_at_entry <- attributes(rcs(tt$age_at_entry, 4))$parms KNOTS_age_at_entry html(describe(tt$age_at_entry)) ``` ### Study_entry_year ```{r echo=TRUE} KNOTS_study_entry_year3 <- attributes(rcs(tt$study_entry_year, 3))$parms KNOTS_study_entry_year <- attributes(rcs(tt$study_entry_year, 3))$parms KNOTS_study_entry_year html(describe(tt$study_entry_year)) ``` # Multivariable analysis{.tabset .tabset-fade .tabset-pills} Note: Cox regression with time-updated 6 month lag cd4/cd8 ratio, adjusting for * age at study entry * race * sex * hcv * time-updated 6 month lag virus load (log 10 transformed and included in the model as nonlinear term with 4 knots) * time- udpated 6 month lag cd4 count(square root transformed and included in the model as nonlinear term with 4 knots) * site was included in the model as a stratification factor. ```{r eval=TRUE, include=FALSE} tt <- cphdatalist[["knotdata"]] Q1Q2 <- as.numeric(format(round(as.vector(quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.25, 0.50))), 2), nsmall = 2)) Q3 <- quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.75)) HRstart <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.05)))+1 HRstop <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.95))) ``` ```{r} CATVARS_ALL <- c("race_brief", "sex", "hcv", "idu", "alc_combo","hbv") CONTVARS_ALL <- c("log_ratio_lag", "cd4n_lag_square_root", "age_at_entry" ,"log10_vload_lag", "study_entry_year") CONTVARS_FMLA_ALL <- c( paste("rcs(", "log_ratio_lag", ",", "c(", paste(KNOTS_log_ratio_lag, collapse = ","), ")", ")") , paste("rcs(", "cd4n_lag_square_root", ",", "c(", paste(KNOTS_cd4n_lag_square_root, collapse = ","), ")", ")") , paste("rcs(", "age_at_entry", ",", "c(", paste(KNOTS_age_at_entry, collapse = ","), ")", ")") , paste("rcs(", "log10_vload_lag", ",", "c(", paste(KNOTS_log10_vload_lag, collapse = ","), ")", ")") , paste("rcs(", "study_entry_year", ",", "c(", paste(KNOTS_study_entry_year, collapse = ","), ")", ")") ) continuous_all <- data.frame( "CONTVARS_ALL" = CONTVARS_ALL , "CONTVARS_FMLA_ALL" = CONTVARS_FMLA_ALL ) continuous_all$CONTVARS_FMLA_ALL <- as.character(continuous_all$CONTVARS_FMLA_ALL) continuous_all$CONTVARS_ALL <- as.character(continuous_all$CONTVARS_ALL) ``` ```{r} event_endpt <- tmp2 %>% distinct(event, endpt) %>% as.data.frame() cphdatalist[["event_endpt"]] <- event_endpt #saveRDS(cphdatalist, paste0(file_path_cleandata, "/","cphdatalistQ1Q3.rds")) #saveRDS(cphdatalist, paste0(file_path_cleandata, "/","cphdatalist.rds")) ``` ```{r} event_endpt <- cphdatalist[["event_endpt"]] MTEXT2 <- "Lung Cancer" N_Cancer <- subset(event_endpt, event %in% MTEXT2)$endpt cphdata <- cphdatalist[["longdata"]] cphdata$endpt <- ifelse(cphdata$endpt == N_Cancer, 1, 0) CATVARS <- c("race_brief", "sex", "hcv", "adi_lag") CONVARS <- c("log_ratio_lag", "cd4n_lag_square_root", "age_at_entry" ,"log10_vload_lag", "study_entry_year") CONTVARS_FMLA <- subset(continuous_all, CONVARS %in% CONTVARS_ALL)$CONTVARS_FMLA_ALL ``` # Fitting model ```{r} tt <- cphdatalistQ1Q3[["knotdata"]] Q1Q2 <- as.numeric(format(round(as.vector(quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.25, 0.50))), 2), nsmall = 2)) Q3 <- quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.75)) Q1Q2 = log(c(0.3,0.5)) Q3 <- log(0.8) HRstart <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.05)))+1 HRstop <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.95))) ``` ```{r } ddist = cphdatalistQ1Q3[["ddist"]] options(datadist='ddist') fmla <- as.formula(paste("Surv(tstart_new, tstop_new, endpt) ~ ", paste(c(CONTVARS_FMLA, CATVARS), collapse = "+"), "+ strat(subsiteid_n)")) fit <- cph(fmla , data = modeldata , x = TRUE , y = TRUE) ``` Model specifics: * Model: Stratified cox proportional hazards models. * Exposure: month lag cd4/cd8 ratio with `r MTEXT2`. * Exposure variables: + categorical variables: `r CATVARS` + continuous variables: `r setdiff(fit$Design$name, c(CATVARS, "subsiteid_n"))` + strata: subsiteid_n + continuous variables with nonlinear terms: `r setdiff(fit$Design$name, c(CATVARS, "subsiteid_n"))` ```{r ratiofig, fig.height=11} COMP <- sort(c(c(HRstart:HRstop)/100, Q1Q2)) hr_ratio_lag <- vector('list', length(COMP)) for (i in 1:length(COMP)) { hr_ratio_lag[[i]] <- summary(fit, log_ratio_lag = c(Q3, COMP[i]))["log_ratio_lag",] } HR_data <- as.data.frame(do.call(rbind, hr_ratio_lag))[, c("Low", "High", "Effect" ,"S.E.", "Lower 0.95", "Upper 0.95")] HR_data$HIGH <- HR_data$High HR_data$High <- exp(HR_data$High) HR_data$Low <- exp(HR_data$Low) HR_data$HR <- exp(HR_data[,"Effect"]) HR_data$lwr <- exp(HR_data[,"Lower 0.95"]) HR_data$upr <- exp(HR_data[,"Upper 0.95"]) HR_data$est95ci <- paste( format(round(HR_data$HR, 2), nsmall =2) , " (" , format(round(HR_data$lwr, 2), nsmall =2) , "-" , format(round(HR_data$upr, 2), nsmall =2) , ")" , sep ="" ) HR_data$Ratio_comparison <- paste( format(round(HR_data$High, 2), nsmall =2) , " vs. " , format(round(HR_data$Low, 2), nsmall =2) , sep ="" ) plot(HR~ High,type="n" , data = HR_data , log ="y" # , ylim = range(c(HR_data$upr, HR_data$lwr)) , ylim = c(0.1, 6) , xlab = "CD4/CD8 ratio (ref = 0.79)" , ylab = "" ) mtext(paste(MTEXT2, "Hazard Ratio"), line =2, side = 2) polygon(c(HR_data$High , rev(HR_data$High)) , c(HR_data[,"lwr"] , rev(HR_data[,"upr"])) , border=NA , col=blues9[3] ) lines(HR_data$High , HR_data[,"HR"] , lwd=2 , col=blues9[8] ) abline(h=1, lty = 3, lwd =1) segments(Q3, 0.01, Q3, 1, col="gray", lty=1) text(0.3 , 5 , ifelse(anova(fit)["log_ratio_lag", "P"]> 0.001 , paste("Overall P =", round(anova(fit)["log_ratio_lag", "P" ], 3)) , paste("Overall P", "< 0.001") ) ) box() ``` ```{r coxSummary} print(fit, latex=TRUE) print(anova(fit), latex = TRUE) print(summary(fit), latex = TRUE) HR_data$outcome = MTEXT2 fittedlist[[MTEXT2]] <- fit samplesize[[MTEXT2]] <- c("patientN"= length(unique(modeldata$naid)) , "eventN" = (sum(modeldata$endpt))) anovaall[[MTEXT2]] <- tibble::rownames_to_column( as.data.frame(anova(fit)[row.names(anova(fit)) %nin% c(" Nonlinear", "TOTAL NONLINEAR", "TOTAL"),]) , "Variables") %>% mutate(outcome = MTEXT2) ratiolag_hr[[MTEXT2]] <- HR_data ``` ```{r HRrequested} tt <- subset(HR_data,HIGH %in% Q1Q2, select = c("Ratio_comparison", "est95ci")) row.names(tt) <- NULL TableCaption = paste(MTEXT2, ":", "Adjusted hazard ratios of 6 month lag of CD4/CD8 ratio") ``` ```{r ttkablestypling} kable(tt, format = "html", caption = TableCaption) %>% kable_styling(bootstrap_options = c("striped","bordered"), full_width = F, position = "left") ``` ### Cox proportionality assumption checking ```{r} cox.zph(fit) %>% show() ``` ```{r} plot(cox.zph(fit)) ```