---
title: 'HIV: NA-ACCORD Data Analysis (365 days lag)'
author: "MM"
date: "`r Sys.time()`"
output:
html_document:
toc: yes
toc_depth: 2
toc_float:
collapsed: yes
number_sections: true
code_folding: hide
css: "http://biostatdata.app.vumc.org/tgs/misc/rmarkdown.css"
csl: american-medical-association.csl
---
```{r introduction, include=FALSE}
#Clear workspace
#unlink('alfresco/Sites/HIVProjectAprilPettit/documentLibrary/code/04-ReadingData_cache', recursive = TRUE)
rm(list = ls())
options(knitr.duplicate.label = 'allow')
require(openxlsx)
source('~/Dropbox/aihuafunction12182017.r', echo=TRUE)
#Set options
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(cache.lazy = FALSE)
options(digits = 3)
options(scipen = 999)
source("http://biostatdata.app.vumc.org/tgs/misc/r_functions.txt")
#Packages
Packages <- c("readr","purrr","tidyr","lubridate","stringr","docxtractr","kableExtra","knitr","data.table", "ggplot2","openxlsx","xtable","Hmisc","boot", "irr","utils","lme4","TeachingDemos","Hotelling","dplyr","epiR")
lapply(Packages, require, character.only = TRUE)
library(riskRegression)
library(survival)
library(lubridate)
library(rms)
library(broom)
library(cmprsk)
library(rms)
options(prType='html')
library(dplyr)
```
`r hidingTOC(buttonLabel="Outline")`
```{r filedirectory, include=FALSE}
LOCATION1 <- "~/Aihua/WFH/"
LOCATION1 <- "/home/biana/Aihua/WFH/"
LOCATION2 <- "/home/biana/alfresco/Sites/"
file_path_rawdata <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/CD4_CD_ratio/Data")
file_path_cleandata <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/cleaned_data")
file_path_fitted <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/modelsfitted")
PATH <- paste0(LOCATION1, "hiv-naaccord-cancer-castilho/documentLibrary/codewith2020data")
```
```{r}
cleandatalist <- readRDS(paste0(file_path_cleandata, "/","cleandatalist.rds"))
rawdatalist <- readRDS(paste0(file_path_cleandata, "/","rawdatalist.rds"))
```
```{r filedirectory1, eval=FALSE, include=FALSE}
SYSDATE <- "2021-02-08"
file_path_fitted <- paste0("/home/biana/alfresco/Sites/hiv-naaccord-cancer-castilho/documentLibrary/datawith2020data/modelsfitted", SYSDATE,"/")
```
```{r eval=FALSE, include=FALSE}
dir.create(file.path(dirname(file_path_fitted), SYSDATE))
```
```{r}
cleandatalist[["smoking"]] <- cleandatalist[["smoking"]] %>%
dplyr::filter(!is.na(smoking)) %>%
unique()
saveRDS(cleandatalist, paste0(file_path_cleandata, "/","cleandatalist.rds"))
```
# Data for analysis{.tabset .tabset-fade .tabset-pills}
## Outcome variables
Study outcome:
* Death
* Cancer
* Alive
### Study entry date
* Rules of followed for study entry date: MAX (Janusary 1,1998, cohort_opendate, enrolldate + 180 days, cancerstart_date)
```{r}
## Getting patients study entry date
modeldata <- cleandatalist[["patient"]] %>%
mutate(closed_fixeddate = mdy(paste(12,31,2016, sep="-"))
, enrollment_date180 = enrollment_date + 180
, studyentry_date = pmax(cohort_opendate, enrollment_date180, enrollment_fixeddate, cancer_start_date, na.rm = TRUE)
, study_entry_year = year(studyentry_date)
)
```
### Study exit date
* Rules of followed for study entry date:
+ study exit date: min(December 31, 2016, death, loss to followup, cohort_closedate, cancerstop_date)
* Rules of followed for loss to follow up:
+ We consider a period of >= 1.5 years with no CD4 or VL measurements to be an indicator of loss to follow up
Based on discussion the definition of study exit date were modified:
* For the patients with events (cancer or death):
+ study exit date: min(December 31, 2016, death, event_date(death or cancer), cohort_closedate, cancerstop_date)
* For the patients without event (cancer or death):
+ Once they lost to followup, they were censored at the beginning of the gap (at last CD4 or VL measurements dates).
```{r include=FALSE}
##Getting all the unique dates that patients who had cd4/cd8 ratio
tcell <- cleandatalist[["tcellsum"]] %>%
dplyr::filter(!is.na(ratio)) %>%
dplyr::select(naid, cd48_date) %>%
distinct(naid, cd48_date) %>%
dplyr::rename(date = cd48_date)
##Getting all the unique dates that patients who had virus load count
vl <- cleandatalist[["vlsum"]] %>%
dplyr::select(naid, date) %>%
distinct(naid, date)
##Combining both VL and CD48 data to get the unique dates that patients who had CD4/8 ratio or VL
tcell_vl <- bind_rows(tcell, vl, .id = "id") %>%
distinct(naid, date)
## Getting total number of unique ratio or VL measurements dates
tcell_vl_summary <- tcell_vl %>%
dplyr::filter(naid %in% modeldata$naid) %>%
dplyr::distinct(naid, date, .keep_all = TRUE) %>%
group_by(naid) %>%
dplyr::summarize(nobs = n())
## How many times a person who had more than 547 days (1.5 years) without any CD4/8 or VL measurements
tcell_vl_loss <- tcell_vl %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs > 1) %>%
arrange(naid, date) %>%
mutate(next_date = as.Date(c(date[-1], NA), origin = "1970-01-01")) %>%
mutate(time_to_next = as.numeric(next_date - date)
, loss = time_to_next > 547
) %>%
dplyr::filter(loss == 1) %>%
mutate(loss_date = date) %>%
dplyr::select(naid, loss_date)
temp <- modeldata %>%
dplyr::select(naid, studyentry_date) %>%
right_join(tcell_vl_loss, by ="naid") %>%
mutate(diff = loss_date - studyentry_date) %>%
dplyr::filter(diff > 0)
```
```{r}
#cleandatalist <- readRDS(paste0(file_path_cleandata, "/","cleandatalist.rds"))
death_data <- cleandatalist[["patient"]] %>%
dplyr::filter(!is.na(death_date)) %>%
mutate(event = "Death"
, dates= death_date) %>%
dplyr::select(naid, dates, event)
cancer_data <- cleandatalist[["malignancy"]] %>%
mutate(event = cancerdx_brief
, dates = cancer_dx_date) %>%
dplyr::select(naid, dates, event)
event_data <- bind_rows(death_data, cancer_data) %>%
arrange(naid, dates) %>%
group_by(naid) %>%
dplyr::slice(1) %>%
ungroup %>%
mutate(death_cancer_date = dates) %>%
dplyr::select(-dates)
```
```{r}
modeldata <- modeldata %>%
left_join(event_data, by ="naid") %>%
mutate(studyexit_date = pmin(closed_fixeddate, death_cancer_date, cohort_closedate, cancer_stop_date, na.rm = TRUE)
, age_at_entry = as.numeric(studyentry_date - mdy(paste(6,15, yob, sep="-")))/365.25
, study_entry_year = year(studyentry_date)
) %>%
dplyr::select(naid, subsiteid_n, sex, yob,age_at_entry, race, risk1, risk2, risk, first2_digits_id, cohort, studyentry_date, studyexit_date, death_cancer_date, event, study_entry_year)
```
* Patients with previous cancer
```{r}
prevcancerdx_ids <- cleandatalist[["malignancy"]] %>%
dplyr::filter(prevcancerdx == 1) %>%
dplyr::select(naid) %>%
unique()
```
```{r}
last_cd48date <- cleandatalist[["tcellsum"]] %>%
dplyr::select(naid, ratio, cd48_date) %>%
dplyr::filter(!is.na(cd48_date)) %>%
dplyr::filter(!is.na(ratio)) %>%
group_by(naid) %>%
arrange(naid, cd48_date) %>%
dplyr::slice(n()) %>%
ungroup %>%
mutate(last_cd48date = cd48_date) %>%
dplyr::select(naid, last_cd48date)
```
```{r}
temp <- modeldata %>%
# left_join(event_data, by ="naid") %>%
mutate(diff = studyexit_date- studyentry_date) %>%
mutate(time_to_death_cancer = as.numeric(death_cancer_date - studyentry_date)) %>%
left_join(last_cd48date, by ="naid") %>%
mutate(time_to_last_cd48count = as.numeric(last_cd48date - studyentry_date)) %>%
mutate(excluded = "No") %>%
mutate(excluded = if_else(age_at_entry < 18 & !is.na(age_at_entry), "Yes", excluded)
, excluded_reason_age = if_else(age_at_entry < 18, "Age < 18", NULL)
) %>%
mutate(excluded = if_else(time_to_death_cancer <= 0 & !is.na(time_to_death_cancer), "Yes", excluded)
, excluded_reason_event= if_else(time_to_death_cancer <= 0 & !is.na(time_to_death_cancer), "Death/Cancer before/on study entry", NULL)
) %>%
mutate(excluded = if_else(is.na(last_cd48date) | time_to_last_cd48count < -730, "Yes", excluded)
, excluded_reason_cd8 = if_else(is.na(last_cd48date) | time_to_last_cd48count < -730, "No CD8 or last CD8 count were collected before study entry", NULL)
) %>%
mutate(excluded = if_else(naid %in% prevcancerdx_ids$naid, "Yes", excluded)) %>%
mutate(excluded = if_else(diff<=0, "Yes", excluded)) %>%
mutate(excluded= if_else(sex == "Intersex" , "Yes", excluded))
```
```{r}
ttt <- rawdatalist[["malignancy_diagnosis.csv"]]
names(ttt) <- tolower(names(ttt))
cancermissing <- subset(ttt, is.na(dxyear) | dxyear %in% c(0, 1900))
```
```{r}
excluded_ids <- temp %>%
dplyr::filter(excluded =="Yes") %>%
dplyr::select(naid, excluded, excluded_reason_cd8, excluded_reason_event, excluded_reason_age, time_to_last_cd48count, time_to_death_cancer, age_at_entry)
excluded_idonly <- unique(c(excluded_ids$naid, cancermissing$naid))
```
```{r}
data4model <- temp %>%
dplyr::filter(naid %nin% excluded_idonly ) %>%
dplyr::select(-contains("excluded"))%>%
mutate(event_date = pmin(studyexit_date, death_cancer_date, na.rm = TRUE)) %>%
mutate(time_to_event = as.numeric(event_date - studyentry_date)/365.25) %>%
mutate(event = if_else(death_cancer_date > studyexit_date | is.na(event), "censored" , event))
```
```{r}
ratio_data <- cleandatalist[["tcellsum"]] %>%
dplyr::select(naid,ratio, cd4n, cd8n, cd48_date) %>%
dplyr::filter(!is.na(cd48_date)) %>%
dplyr::filter(!is.na(ratio)) %>%
dplyr::filter(naid %in% unique(data4model$naid)) %>%
left_join(data4model, by = "naid") %>%
mutate(studyentry_date_2years_lag =studyentry_date - 730) %>%
dplyr::filter(cd48_date < studyexit_date & cd48_date >= studyentry_date_2years_lag ) %>%
dplyr::select(naid, cd48_date, ratio, studyexit_date, studyentry_date, studyentry_date_2years_lag) %>%
mutate(cd48_collected_period = if_else(cd48_date >= studyentry_date & cd48_date < studyexit_date, "during followup", "before followup")) %>%
distinct(naid, cd48_collected_period) %>%
group_by(naid) %>%
dplyr::summarize(ratio_period = paste(cd48_collected_period, collapse = "+"))%>%
ungroup()
data4model<- data4model %>%
dplyr::filter(naid %in% ratio_data$naid) %>%
left_join(ratio_data, by ="naid") %>%
mutate(race_brief = ifelse(race %in% c("Other", "Unknown"), "Other+Unknown", race))
```
## Time to first event
### Excluded patients
Excluded patients: `r nrow(cleandatalist[["patient"]]) - length(unique(data4model$naid))`
1. Age at entry less than 18 years old
2. Patients who did not have ratio of cd8/cd4 between 2 years before study entry date and study exit date.
3. Died or had cancer before/on study entry
4. Patients cancer records dxyear were missing or recorded as 0 or 1900
5. Intersex patients
### Summary tables
```{r}
table1data <- data4model %>%
left_join(cleandatalist[["hbv_summary"]], by ="naid") %>%
left_join(cleandatalist[["hcv_summary"]], by ="naid") %>%
left_join((cleandatalist[["alcohol"]]), by ="naid") %>%
left_join((cleandatalist[["smoking"]]), by ="naid")
table1vars <- c("time_to_event", "event","sex", "age_at_entry", "race", "race_brief", "risk1","risk2", "risk","cohort","ratio_period", "hcv","hbv", "alc_abuse", "alc_atrisk", "alc_combo","smoking","study_entry_year" )
groupvar = "subsiteid_n"
Hmisc::label(data4model$time_to_event) <- "Time to first event"
Hmisc::label(data4model$event) <- "First event"
tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar))
```
#### Patients Characteristics by Site
```{r summaryMtable1}
tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar))
tbl1 <- summaryM(
tbl1_formula
, data = table1data
, quant = c(0.25,0.5,0.75)
, continuous = 5
, overall = TRUE
, na.include = TRUE
# , test = TRUE
)
tbl1 <- summaryM_to_df(
tbl1
, long = TRUE
, exclude1 = FALSE
, prob = c(0.25,0.5,0.75)
, what = "%"
, digits = 2
, prn = TRUE
# , vnames = "vnames"
)
for (i in 3: (dim(tbl1)[2]-0)){
tbl1[,i] <- slice_to_box(tbl1[,i])
tbl1[,i] <- slice_to_box2(tbl1[,i])
}
names(tbl1)[1] <- "Variables"
kable(tbl1[, c(1:2,dim(tbl1)[2], 3:(dim(tbl1)[2]-1))], "pandoc") %>%
column_spec(1, width = "10em")
```
### Liver cancers and HBV/HCV
```{r}
table1vars <- c( "hbv", "hcv")
groupvar = "event"
tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar))
```
```{r, ref.label= "summaryMtable1"}
```
#### Smoking by Alcohol
```{r}
table1vars <- c( "alc_abuse", "alc_atrisk", "alc_combo")
groupvar = "smoking"
tbl1_formula <- as.formula(paste(paste(table1vars, collapse = "+") , "~", groupvar))
```
```{r, ref.label= "summaryMtable1"}
```
```{r}
table1data <- data4model
table1data <- table1data %>%
left_join(cleandatalist[["hbv_summary"]], by ="naid") %>%
left_join(cleandatalist[["hcv_summary"]], by ="naid") %>%
left_join((cleandatalist[["alcohol"]]), by ="naid") %>%
left_join((cleandatalist[["smoking"]]), by ="naid")
table1data$event_plot <- as.factor(ifelse(table1data$event %nin% c("Death","censored", "Lung Cancer"), "Other Cancer", table1data$event))
table1data$event_plot_num <- as.numeric(( table1data$event_plot)) -1
table1data$age_at_entry_group <- ifelse(table1data$age_at_entry < 43, "<43", ">=43")
table1data$subsiteid_n <- as.factor(table1data$subsiteid_n)
```
```{r include=FALSE}
tcell <- cleandatalist[["tcellsum"]] %>%
dplyr::filter(!is.na(ratio)) %>%
dplyr::select(naid, cd48_date, cd4n, cd8n, ratio)
tcell_loss_gap <- tcell %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs > 1) %>%
arrange(naid, cd48_date) %>%
mutate(next_date = as.Date(c(cd48_date[-1], NA), origin = "1970-01-01")) %>%
mutate(time_to_next = as.numeric(next_date - cd48_date)
, loss = time_to_next > 365
) %>%
dplyr::filter(loss == 1) %>%
dplyr::select(naid, cd48_date)%>%
ungroup()
tcell_loss_only_1 <- tcell %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs==1) %>%
dplyr::select(naid, cd48_date)%>%
ungroup()
tcell_loss_last_obs <- tcell %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs>1) %>%
dplyr::slice(n()) %>%
dplyr::select(naid, cd48_date)%>%
ungroup()
tcell_loss_all <- rbind(tcell_loss_gap, tcell_loss_only_1, tcell_loss_last_obs) %>%
mutate(cd48_date = cd48_date + 365
, ratio = 9999999
, cd4n=9999999
, cd8n = 9999999)
tcell_tdc <- rbind(tcell_loss_all, tcell)%>%
arrange(naid, cd48_date) %>%
mutate(date = cd48_date)
```
```{r include=FALSE}
vl <- cleandatalist[["vlsum"]] %>%
dplyr::filter(!is.na(vload)) %>%
dplyr::select(naid, date, vload)
vl_loss_gap <- vl %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs > 1) %>%
arrange(naid, date) %>%
mutate(next_date = as.Date(c(date[-1], NA), origin = "1970-01-01")) %>%
mutate(time_to_next = as.numeric(next_date - date)
, loss = time_to_next > 365
) %>%
dplyr::filter(loss == 1) %>%
dplyr::select(naid, date)%>%
ungroup()
vl_loss_only_1 <- vl %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs==1) %>%
dplyr::select(naid, date)%>%
ungroup()
vl_loss_last_obs <- vl %>%
group_by(naid) %>%
dplyr::mutate(nobs = n()) %>%
dplyr::filter(nobs>1) %>%
slice(n()) %>%
dplyr::select(naid, date)%>%
ungroup()
vl_loss_all <- rbind(vl_loss_gap, vl_loss_only_1, vl_loss_last_obs) %>%
mutate(date = date + 365
, vload = 9999999
)
vl_tdc <- bind_rows( vl, vl_loss_all) %>%
arrange(naid, date)
```
```{r include=FALSE}
vl_tcell_tdc <- tcell_tdc %>%
full_join( vl_tdc, by = c("naid", "date"))
```
```{r include=FALSE}
cirrhosisdata <- cleandatalist[["diagnosis"]] %>%
dplyr::filter(diagnosis %in% c(4000, 4110, 9903)) %>%
arrange(naid, diagnosis_date) %>%
group_by(naid) %>%
slice(1) %>%
dplyr::select(naid, diagnosis_date) %>%
ungroup() %>%
mutate(cirrhosis = 1) %>%
mutate(date = diagnosis_date) %>%
dplyr::filter(naid %in% vl_tcell_tdc$naid)
```
```{r}
cirrhosisdata <- cirrhosisdata %>%
mutate(diagnosis_date = if_else(year(diagnosis_date) == 1900, ymd("1998-01-01"), diagnosis_date)) %>%
mutate(date = if_else(year(date) == 1900, ymd("1997-05-31"), date))
```
```{r}
non_aidsd <- unique(grep("cancer|cirr|lym|sarcoma", cleandatalist[["diagnosis"]]$diagnosis_factor, value = TRUE, ignore.case = TRUE))
adi_disease <- setdiff(unique(cleandatalist[["diagnosis"]]$diagnosis_factor), non_aidsd)
aidsdata <- cleandatalist[["diagnosis"]] %>%
dplyr::filter(diagnosis_factor %in% adi_disease) %>%
arrange(naid, diagnosis_date) %>%
group_by(naid) %>%
slice(1) %>%
mutate(adi_diagnosis_date = diagnosis_date) %>%
dplyr::select(naid, adi_diagnosis_date) %>%
ungroup() %>%
mutate(adi = 1) %>%
mutate(date = adi_diagnosis_date) %>%
dplyr::filter(naid %in% vl_tcell_tdc$naid)
aidsdata <- aidsdata %>%
mutate(adi_diagnosis_date = if_else(year(adi_diagnosis_date) == 1900, ymd("1998-01-01"), adi_diagnosis_date)) %>%
mutate(date = if_else(year(date) == 1900, ymd("1997-05-31"), date))
```
```{r include=FALSE}
vl_tcell_cir_tdc <- vl_tcell_tdc %>%
left_join (subset(cirrhosisdata, select = c(naid, diagnosis_date)), by = c("naid")) %>%
mutate(cirrhosis = ifelse(date < diagnosis_date | is.na(diagnosis_date), 0, 1)) %>%
full_join(subset(cirrhosisdata, select = c(naid, date, diagnosis_date, cirrhosis), by = c("naid", "date","diagnosis_date"))) %>%
left_join (subset(aidsdata, select = c(naid, adi_diagnosis_date)), by = c("naid")) %>%
mutate(adi = ifelse(date < adi_diagnosis_date | is.na(adi_diagnosis_date), 0, 1)) %>%
full_join(subset(aidsdata, select = c(naid, date, adi_diagnosis_date, adi), by = c("naid", "date","adi_diagnosis_date"))) %>%
arrange(naid, date)
describe(vl_tcell_cir_tdc$cirrhosis)
vl_tcell_cir_tdc %>%
dplyr::filter( cirrhosis== 1 & is.na(cd4n) & is.na(vload))
tdcdata <- vl_tcell_cir_tdc
```
```{r , include=FALSE}
tmp <- subset(data4model, select = c(naid, studyentry_date, studyexit_date, event))
tmp <- data4model %>%
left_join(cleandatalist[["hbv_summary"]], by ="naid") %>%
left_join(cleandatalist[["hcv_summary"]], by ="naid")
tmp$status <- as.numeric(as.factor(tmp$event))
tmp$status <- ifelse(tmp$event == "censored", 0, tmp$status)
tmp$time <- tmp$studyexit_date - tmp$studyentry_date
tmp2 <- tmerge(tmp, tmp, id=naid, endpt = event(time,status ))
options(scipen = 999)
LAG = 365
tdcdata_entry <- tmp %>%
dplyr::select(naid, studyentry_date, studyexit_date) %>%
left_join(tdcdata, by ="naid") %>%
mutate(riskdate = date + LAG
, days_measure_entry = riskdate - studyentry_date
) %>%
dplyr::filter(days_measure_entry > (-LAG*2)) %>%
dplyr::filter(studyexit_date > riskdate)
tmp3 <- tmerge(tmp2, tdcdata_entry, id=naid
, ratio_lag = tdc(days_measure_entry, ratio)
, vload_lag = tdc(days_measure_entry, vload)
, cd4n_lag = tdc(days_measure_entry, cd4n)
, cirrhosis_lag = tdc(days_measure_entry, cirrhosis)
, adi_lag = tdc(days_measure_entry, adi)
)
tmp3_inter <- tmp3 %>%
dplyr::filter(ratio_lag !=9999999 & vload_lag != 9999999 & cd4n_lag != 9999999) %>%
dplyr::filter(!is.na(ratio_lag) & !is.na(vload_lag) & !is.na(cd4n_lag))
tmp3_inter %>%
dplyr::filter(is.na(cd4n_lag))
tmp3 <- tmp3_inter %>%
arrange(naid, tstart) %>%
group_by(naid) %>%
slice(1) %>%
dplyr::select(naid, tstart)%>%
mutate(mini_tstart = tstart) %>%
ungroup() %>%
dplyr::select(-tstart) %>%
right_join(tmp3_inter, by ="naid") %>%
mutate(tstop = as.numeric(tstop)
, tstart = as.numeric(tstart)
, mini_tstart = as.numeric(mini_tstart)
, tstart_new = tstart - mini_tstart
, tstop_new = tstop- mini_tstart
, age_at_entry = age_at_entry + mini_tstart/365.25
)
tmp3$sex <- relevel(tmp3$sex, ref ="Female")
tmp3$log10_vload_lag <- log10(tmp3$vload_lag)
tmp3$cd4n_lag_square_root <- sqrt(tmp3$cd4n_lag)
```
* Patient's records included in the time-varying cox model
```{r eval=FALSE, include=FALSE}
cphdatalist <- list(
"longdata" = tmp3
)
```
```{r include=FALSE}
#33320014
tmp3 <- tmp3 %>%
dplyr::filter(ratio_lag != 0 ) %>%
mutate(log_ratio_lag = log(ratio_lag)) %>%
left_join((cleandatalist[["alcohol"]]), by ="naid")
tmp3$idu <- ifelse(tmp3$risk2 != "Injection drug use" | is.na(tmp3$risk2), "No", "Yes")
describe(tmp3$idu)
tmp3$idu <- ifelse(tmp3$risk1 == "Injection drug use", "Yes",tmp3$idu)
describe(tmp3$idu)
tmp3$white <- ifelse(tmp3$race %in% c("White"), "White", "Non-White")
cphdatalist <- list(
"longdata" = tmp3
)
```
```{r}
tt <- cbind(levels(as.factor(tmp$event)), 1:length(levels(as.factor(tmp$event))))
```
## Distribution of cd4n, cd8n, virus load and cd4/cd8 ratio
```{r}
PROBS = c((0:99)/100, ((1:10)/10+99)/100)
tmp3$cd8n_lag = tmp3$cd4n_lag/tmp3$ratio_lag
data.frame(ratio_lag = quantile(tmp3$ratio_lag, probs = PROBS)
, vload_lag = quantile(tmp3$vload_lag, probs = PROBS)
, cd4n_lag = quantile(tmp3$cd4n_lag, probs =PROBS)
, cd8n_lag = quantile(tmp3$cd8n_lag, probs = PROBS)
)
```
## Picking knots location for continuous variables
Note : rcs knots were fixed for all the analysis.
Randomly picked one record per patient, then got the knots postions.
```{r}
set.seed(1)
tt <- tmp3 %>%
group_by(naid) %>%
sample_n(1)
cphdatalist[["knotdata"]] <- tt
```
### Ratio knots and distribution of ratio_lag in the randomly picked data set
```{r echo=TRUE}
#quantile(tt$ratio_lag, na.rm = TRUE)
KNOTS_ratio_lag <- attributes(rcs(tt$ratio_lag, 4))$parms
KNOTS_ratio_lag3 <- attributes(rcs(tt$ratio_lag, 3))$parms
KNOTS_ratio_lag
html(describe(tt$ratio_lag))
```
### log_ratio_lag knots and distribution of log_ratio_lag in the randomly picked data set
```{r echo=TRUE}
tt$log_ratio_lag <- log(tt$ratio_lag)
#quantile(tt$ratio_lag, na.rm = TRUE)
KNOTS_log_ratio_lag3 <- attributes(rcs(tt$log_ratio_lag, 3))$parms
KNOTS_log_ratio_lag <- attributes(rcs(tt$log_ratio_lag, 4))$parms
KNOTS_log_ratio_lag
html(describe(tt$log_ratio_lag))
```
### cd4n_lag_square_root knots and distribution of cd4n_lag in the randomly picked data set
```{r echo=TRUE}
#quantile(tt$cd4n_lag_square_root, na.rm = TRUE)
KNOTS_cd4n_lag_square_root <- attributes(rcs(tt$cd4n_lag_square_root, 4))$parms
KNOTS_cd4n_lag_square_root3 <- attributes(rcs(tt$cd4n_lag_square_root, 3))$parms
KNOTS_cd4n_lag_square_root
html(describe(tt$cd4n_lag))
```
### Virus load knots and distribution of Virus load in the randomly picked data set
```{r echo=TRUE}
KNOTS_log10_vload_lag3 <- attributes(rcs(tt$log10_vload_lag, 3))$parms
KNOTS_log10_vload_lag <- attributes(rcs(tt$log10_vload_lag, 4))$parms
KNOTS_log10_vload_lag
html(describe(tt$vload_lag))
```
### Age at entry knots and distribution of age at entry in the randomly picked data set
```{r echo=TRUE}
KNOTS_age_at_entry3 <- attributes(rcs(tt$age_at_entry, 3))$parms
KNOTS_age_at_entry <- attributes(rcs(tt$age_at_entry, 4))$parms
KNOTS_age_at_entry
html(describe(tt$age_at_entry))
```
### Study_entry_year
```{r echo=TRUE}
KNOTS_study_entry_year3 <- attributes(rcs(tt$study_entry_year, 3))$parms
KNOTS_study_entry_year <- attributes(rcs(tt$study_entry_year, 3))$parms
KNOTS_study_entry_year
html(describe(tt$study_entry_year))
```
# Multivariable analysis{.tabset .tabset-fade .tabset-pills}
Note:
Cox regression with time-updated 6 month lag cd4/cd8 ratio, adjusting for
* age at study entry
* race
* sex
* hcv
* time-updated 6 month lag virus load (log 10 transformed and included in the model as nonlinear term with 4 knots)
* time- udpated 6 month lag cd4 count(square root transformed and included in the model as nonlinear term with 4 knots)
* site was included in the model as a stratification factor.
```{r eval=TRUE, include=FALSE}
tt <- cphdatalist[["knotdata"]]
Q1Q2 <- as.numeric(format(round(as.vector(quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.25, 0.50))), 2), nsmall = 2))
Q3 <- quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.75))
HRstart <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.05)))+1
HRstop <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.95)))
```
```{r}
CATVARS_ALL <- c("race_brief", "sex", "hcv", "idu", "alc_combo","hbv")
CONTVARS_ALL <- c("log_ratio_lag", "cd4n_lag_square_root", "age_at_entry" ,"log10_vload_lag", "study_entry_year")
CONTVARS_FMLA_ALL <- c( paste("rcs(", "log_ratio_lag", ",", "c(", paste(KNOTS_log_ratio_lag, collapse = ","), ")", ")")
, paste("rcs(", "cd4n_lag_square_root", ",", "c(", paste(KNOTS_cd4n_lag_square_root, collapse = ","), ")", ")")
, paste("rcs(", "age_at_entry", ",", "c(", paste(KNOTS_age_at_entry, collapse = ","), ")", ")")
, paste("rcs(", "log10_vload_lag", ",", "c(", paste(KNOTS_log10_vload_lag, collapse = ","), ")", ")")
, paste("rcs(", "study_entry_year", ",", "c(", paste(KNOTS_study_entry_year, collapse = ","), ")", ")")
)
continuous_all <- data.frame(
"CONTVARS_ALL" = CONTVARS_ALL
, "CONTVARS_FMLA_ALL" = CONTVARS_FMLA_ALL
)
continuous_all$CONTVARS_FMLA_ALL <- as.character(continuous_all$CONTVARS_FMLA_ALL)
continuous_all$CONTVARS_ALL <- as.character(continuous_all$CONTVARS_ALL)
```
```{r}
event_endpt <- tmp2 %>%
distinct(event, endpt) %>%
as.data.frame()
cphdatalist[["event_endpt"]] <- event_endpt
#saveRDS(cphdatalist, paste0(file_path_cleandata, "/","cphdatalistQ1Q3.rds"))
#saveRDS(cphdatalist, paste0(file_path_cleandata, "/","cphdatalist.rds"))
```
```{r}
event_endpt <- cphdatalist[["event_endpt"]]
MTEXT2 <- "Lung Cancer"
N_Cancer <- subset(event_endpt, event %in% MTEXT2)$endpt
cphdata <- cphdatalist[["longdata"]]
cphdata$endpt <- ifelse(cphdata$endpt == N_Cancer, 1, 0)
CATVARS <- c("race_brief", "sex", "hcv", "adi_lag")
CONVARS <- c("log_ratio_lag", "cd4n_lag_square_root", "age_at_entry" ,"log10_vload_lag", "study_entry_year")
CONTVARS_FMLA <- subset(continuous_all, CONVARS %in% CONTVARS_ALL)$CONTVARS_FMLA_ALL
```
# Fitting model
```{r}
tt <- cphdatalistQ1Q3[["knotdata"]]
Q1Q2 <- as.numeric(format(round(as.vector(quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.25, 0.50))), 2), nsmall = 2))
Q3 <- quantile(tt$log_ratio_lag, na.rm = TRUE, prob = c(0.75))
Q1Q2 = log(c(0.3,0.5))
Q3 <- log(0.8)
HRstart <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.05)))+1
HRstop <- as.integer(100*(quantile(tt$log_ratio_lag, na.rm =TRUE, probs = 0.95)))
```
```{r }
ddist = cphdatalistQ1Q3[["ddist"]]
options(datadist='ddist')
fmla <- as.formula(paste("Surv(tstart_new, tstop_new, endpt) ~ ", paste(c(CONTVARS_FMLA, CATVARS), collapse = "+"), "+ strat(subsiteid_n)"))
fit <- cph(fmla
, data = modeldata
, x = TRUE
, y = TRUE)
```
Model specifics:
* Model: Stratified cox proportional hazards models.
* Exposure: month lag cd4/cd8 ratio with `r MTEXT2`.
* Exposure variables:
+ categorical variables: `r CATVARS`
+ continuous variables: `r setdiff(fit$Design$name, c(CATVARS, "subsiteid_n"))`
+ strata: subsiteid_n
+ continuous variables with nonlinear terms: `r setdiff(fit$Design$name, c(CATVARS, "subsiteid_n"))`
```{r ratiofig, fig.height=11}
COMP <- sort(c(c(HRstart:HRstop)/100, Q1Q2))
hr_ratio_lag <- vector('list', length(COMP))
for (i in 1:length(COMP)) {
hr_ratio_lag[[i]] <- summary(fit, log_ratio_lag = c(Q3, COMP[i]))["log_ratio_lag",]
}
HR_data <- as.data.frame(do.call(rbind, hr_ratio_lag))[, c("Low", "High", "Effect" ,"S.E.", "Lower 0.95", "Upper 0.95")]
HR_data$HIGH <- HR_data$High
HR_data$High <- exp(HR_data$High)
HR_data$Low <- exp(HR_data$Low)
HR_data$HR <- exp(HR_data[,"Effect"])
HR_data$lwr <- exp(HR_data[,"Lower 0.95"])
HR_data$upr <- exp(HR_data[,"Upper 0.95"])
HR_data$est95ci <- paste(
format(round(HR_data$HR, 2), nsmall =2)
, " ("
, format(round(HR_data$lwr, 2), nsmall =2)
, "-"
, format(round(HR_data$upr, 2), nsmall =2)
, ")"
, sep =""
)
HR_data$Ratio_comparison <- paste(
format(round(HR_data$High, 2), nsmall =2)
, " vs. "
, format(round(HR_data$Low, 2), nsmall =2)
, sep =""
)
plot(HR~ High,type="n"
, data = HR_data
, log ="y"
# , ylim = range(c(HR_data$upr, HR_data$lwr))
, ylim = c(0.1, 6)
, xlab = "CD4/CD8 ratio (ref = 0.79)"
, ylab = ""
)
mtext(paste(MTEXT2, "Hazard Ratio"), line =2, side = 2)
polygon(c(HR_data$High
, rev(HR_data$High))
, c(HR_data[,"lwr"]
, rev(HR_data[,"upr"]))
, border=NA
, col=blues9[3]
)
lines(HR_data$High
, HR_data[,"HR"]
, lwd=2
, col=blues9[8]
)
abline(h=1, lty = 3, lwd =1)
segments(Q3, 0.01, Q3, 1, col="gray", lty=1)
text(0.3
, 5
,
ifelse(anova(fit)["log_ratio_lag", "P"]> 0.001
, paste("Overall P =", round(anova(fit)["log_ratio_lag", "P" ], 3))
, paste("Overall P", "< 0.001")
)
)
box()
```
```{r coxSummary}
print(fit, latex=TRUE)
print(anova(fit), latex = TRUE)
print(summary(fit), latex = TRUE)
HR_data$outcome = MTEXT2
fittedlist[[MTEXT2]] <- fit
samplesize[[MTEXT2]] <- c("patientN"= length(unique(modeldata$naid))
, "eventN" = (sum(modeldata$endpt)))
anovaall[[MTEXT2]] <- tibble::rownames_to_column(
as.data.frame(anova(fit)[row.names(anova(fit)) %nin% c(" Nonlinear", "TOTAL NONLINEAR", "TOTAL"),])
, "Variables") %>%
mutate(outcome = MTEXT2)
ratiolag_hr[[MTEXT2]] <- HR_data
```
```{r HRrequested}
tt <- subset(HR_data,HIGH %in% Q1Q2, select = c("Ratio_comparison", "est95ci"))
row.names(tt) <- NULL
TableCaption = paste(MTEXT2, ":", "Adjusted hazard ratios of 6 month lag of CD4/CD8 ratio")
```
```{r ttkablestypling}
kable(tt, format = "html", caption = TableCaption) %>%
kable_styling(bootstrap_options = c("striped","bordered"),
full_width = F, position = "left")
```
### Cox proportionality assumption checking
```{r}
cox.zph(fit) %>%
show()
```
```{r}
plot(cox.zph(fit))
```