### Clearing all of the data so that I am starting with a clean set of data. rm(list=ls()) ### Setting my working directory to where I want to save results setwd("~/Dropbox/vigh/d43-nigeria/workshops/workshop2023/analyses") ### Reading in the AKTH dataset d<-read.csv("Nigeria-R3-Data-Simple.csv") head(d) ######### Let's do some basic descriptives of the data with(d, summary(age)) with(d, hist(age)) mean(d$male) with(d, summary(bmi)) with(d, hist(bmi)) mean(d$smoke) with(d, summary(cd4)) with(d, hist(cd4)) mean(d$tdf) mean(d$dtg) table(d$risk.alleles) mean(d$htn) d$jnc_bp<-factor(d$jnc_bp) table(d$jnc_bp) summary(d$years.ART) summary(d$uACR) with(d, hist(uACR)) with(d, boxplot(uACR)) d$log10uACR<-log10(d$uACR) with(d, hist(log10uACR)) with(d, boxplot(log10uACR)) summary(d$eGFR) with(d, hist(eGFR)) summary(d$potassium) with(d, hist(potassium)) summary(d$sodium) with(d, hist(sodium)) summary(d$bicarbonate) with(d, hist(bicarbonate)) summary(d$chloride) with(d, hist(chloride)) summary(d$urea) with(d, hist(urea)) summary(d$SBP) with(d, hist(SBP)) summary(d$DBP) with(d, hist(DBP)) hist(d$bmi) summary(d$bmi) boxplot(d$bmi) head(d) summary(d$cd4) mean(d$tdf) table(d$male,d$smoke) plot(d$age,d$bmi) cor(d$age,d$bmi) boxplot(d$bmi ~ d$male) sum(is.na(d$cd4)) for (i in names(d)){ print(paste(i,sum(is.na(d[i])))) } mean(d$age) mean(d$cd4,na.rm=TRUE) summary(d$cd4) quantile(d$cd4, c(.10,.25,.99), na.rm=TRUE) quantile(d$age, c(.10,.25,.99)) hist(d$cd4) hist(log(d$uACR)) with(d, plot(log(uACR),eGFR)) with(d, cor(log(uACR),eGFR)) mean(d$htn) ##### Univariate Associations with htn (self-reported hypertension) #### Logistic regression model looking at association between age and hypertension ## We obtain the model coefficients and p-values, and then we compute odds ratios ## and 95% confidence intervals for the odds ratios. mod.age<-glm(htn ~ age, data=d, family=binomial) summary(mod.age) exp(mod.age$coefficients) exp(confint.default(mod.age)) #### Logistic regression model looking at the association between cd4 and hypertension mod.cd4<-glm(htn ~ cd4, data=d, family=binomial) summary(mod.cd4) exp(mod.cd4$coefficients*100) exp(confint.default(mod.cd4)*100) mod.dm<-glm(htn ~ DM, data=d, family=binomial) summary(mod.dm) mod.tdf<-glm(htn ~ tdf, data=d, family=binomial) summary(mod.tdf) mod.ra<-glm(htn ~ risk.alleles, data=d, family=binomial) summary(mod.ra) exp(mod.ra$coefficients) exp(confint.default(mod.ra)) mod.raf<-glm(htn ~ factor(risk.alleles), data=d, family=binomial) summary(mod.raf) exp(mod.raf$coefficients) exp(confint.default(mod.raf)) d$hr.risk.alleles<-ifelse(d$risk.alleles==2,1,0) table(d$hr.risk.alleles,d$risk.alleles) mod.hra<-glm(htn ~ hr.risk.alleles, data=d, family=binomial) summary(mod.hra) exp(mod.hra$coefficients) exp(confint.default(mod.hra)) #### Chisquare test looking at association between sex and hypertension ## The first uses a continuity correction, the second does not. ## We compare this with logistic regression, which results in a p-value which is ## the same as the one that does not use the continuity correction. with(d, chisq.test(htn,male)) with(d, chisq.test(htn,male, correct=FALSE)) mod.male<-glm(htn~male, data=d, family=binomial) summary(mod.male) #### I created this function that takes the fitted mmodel and extracts the odds ratio, ## 95% confidence interval for the odds ratio, and the p-value. It also allows me to ## quickly multiply the coefficient by a number, "multiplier", in case I want to ## interpret an association on a different scale (e.g., instead of a 1-unit increase ## in CD4, maybe a 100-unit increase in CD4.) or.function<-function(coeff.info, multiplier=1){ or<-exp(coeff.info[1]*multiplier) lower<-exp((coeff.info[1]-1.96*coeff.info[2])*multiplier) upper<-exp((coeff.info[1]+1.96*coeff.info[2])*multiplier) pval<-coeff.info[4] names(or)<-"Odds Ratio" names(lower)<-"95% Lower CI" names(upper)<-"95% Upper CI" names(pval)<-"P-Value" print(c(or, lower, upper, pval)) } or.function(summary(mod.male)$coefficients[2,]) ### Now I am fitting logistic regression using the lrm function of the rms package. ## Everything is the same, but the functions are slightly different. library(rms) dd<-datadist(d) ## This line and the next are used to help with summarizing lrm(.) results options(datadist=dd) fit.male<-lrm(htn ~ male, data=d) summary(fit.male) ## This gives estimates for contrasts anova(fit.male) ## This computes p-values for the model mod.age<-glm(htn~age, data=d, family=binomial) summary(mod.age) or.function(summary(mod.age)$coefficients[2,]) mod.bmi<-glm(htn~bmi, data=d, family=binomial) summary(mod.bmi) or.function(summary(mod.bmi)$coefficients[2,]) mod.smoke<-glm(htn~smoke, data=d, family=binomial) summary(mod.smoke) or.function(summary(mod.smoke)$coefficients[2,]) mod.cd4<-glm(htn~cd4, data=d, family=binomial) summary(mod.cd4) or.function(summary(mod.cd4)$coefficients[2,], multiplier=100) mod.tdf<-glm(htn~tdf, data=d, family=binomial) summary(mod.tdf) or.function(summary(mod.tdf)$coefficients[2,]) mod.dtg<-glm(htn~dtg, data=d, family=binomial) summary(mod.dtg) or.function(summary(mod.dtg)$coefficients[2,]) ### jnc_bp is a categorical variable, so there are multiple odds ratios comparing ## to the reference category mod.jnc<-glm(htn~jnc_bp, data=d, family=binomial) summary(mod.jnc) or.function(summary(mod.jnc)$coefficients[2,]) or.function(summary(mod.jnc)$coefficients[3,]) or.function(summary(mod.jnc)$coefficients[4,]) mod.art<-glm(htn~years.ART, data=d, family=binomial) summary(mod.art) or.function(summary(mod.art)$coefficients[2,]) mod.uacr<-glm(htn~uACR, data=d, family=binomial) summary(mod.uacr) or.function(summary(mod.uacr)$coefficients[2,]) ### Analyzing log-transformed uACR, because it is so skewed. mod.luacr<-glm(htn~log10uACR, data=d, family=binomial) summary(mod.luacr) or.function(summary(mod.luacr)$coefficients[2,])