### Clearing all of the data so that I am starting with a clean set of data.
rm(list=ls())

### Setting my working directory to where I want to save results
setwd("~/Dropbox/vigh/d43-nigeria/workshops/workshop2023/analyses")

### Reading in the AKTH dataset
d<-read.csv("Nigeria-R3-Data-Simple.csv")

head(d)


######### Let's do some basic descriptives of the data

with(d, summary(age))
with(d, hist(age))

mean(d$male)

with(d, summary(bmi))
with(d, hist(bmi))

mean(d$smoke)

with(d, summary(cd4))
with(d, hist(cd4))

mean(d$tdf)

mean(d$dtg)

table(d$risk.alleles)

mean(d$htn)

d$jnc_bp<-factor(d$jnc_bp)
table(d$jnc_bp)

summary(d$years.ART)

summary(d$uACR)
with(d, hist(uACR))
with(d, boxplot(uACR))

d$log10uACR<-log10(d$uACR)
with(d, hist(log10uACR))
with(d, boxplot(log10uACR))

summary(d$eGFR)
with(d, hist(eGFR))

summary(d$potassium)
with(d, hist(potassium))

summary(d$sodium)
with(d, hist(sodium))

summary(d$bicarbonate)
with(d, hist(bicarbonate))

summary(d$chloride)
with(d, hist(chloride))

summary(d$urea)
with(d, hist(urea))

summary(d$SBP)
with(d, hist(SBP))

summary(d$DBP)
with(d, hist(DBP))


hist(d$bmi)
summary(d$bmi)
boxplot(d$bmi)
head(d)

summary(d$cd4)
mean(d$tdf)
table(d$male,d$smoke)
plot(d$age,d$bmi)
cor(d$age,d$bmi)
boxplot(d$bmi ~ d$male)
sum(is.na(d$cd4))
for (i in names(d)){
  print(paste(i,sum(is.na(d[i]))))
}
mean(d$age)
mean(d$cd4,na.rm=TRUE)
summary(d$cd4)
quantile(d$cd4, c(.10,.25,.99), na.rm=TRUE)

quantile(d$age, c(.10,.25,.99))
hist(d$cd4)
hist(log(d$uACR))
with(d, plot(log(uACR),eGFR))
with(d, cor(log(uACR),eGFR))

mean(d$htn)


#####  Univariate Associations with htn (self-reported hypertension)

#### Logistic regression model looking at association between age and hypertension
##    We obtain the model coefficients and p-values, and then we compute odds ratios
##    and 95% confidence intervals for the odds ratios.
mod.age<-glm(htn ~ age, data=d, family=binomial)
summary(mod.age)     
exp(mod.age$coefficients)
exp(confint.default(mod.age))

#### Logistic regression model looking at the association between cd4 and hypertension
mod.cd4<-glm(htn ~ cd4, data=d, family=binomial)
summary(mod.cd4)
exp(mod.cd4$coefficients*100)
exp(confint.default(mod.cd4)*100)

mod.dm<-glm(htn ~ DM, data=d, family=binomial)
summary(mod.dm)

mod.tdf<-glm(htn ~ tdf, data=d, family=binomial)
summary(mod.tdf)

mod.ra<-glm(htn ~ risk.alleles, data=d, family=binomial)
summary(mod.ra)
exp(mod.ra$coefficients)
exp(confint.default(mod.ra))

mod.raf<-glm(htn ~ factor(risk.alleles), data=d, family=binomial)
summary(mod.raf)
exp(mod.raf$coefficients)
exp(confint.default(mod.raf))

d$hr.risk.alleles<-ifelse(d$risk.alleles==2,1,0)

table(d$hr.risk.alleles,d$risk.alleles)

mod.hra<-glm(htn ~ hr.risk.alleles, data=d, family=binomial)
summary(mod.hra)
exp(mod.hra$coefficients)
exp(confint.default(mod.hra))


#### Chisquare test looking at association between sex and hypertension
##    The first uses a continuity correction, the second does not.
##    We compare this with logistic regression, which results in a p-value which is
##    the same as the one that does not use the continuity correction.
with(d, chisq.test(htn,male))
with(d, chisq.test(htn,male, correct=FALSE))
mod.male<-glm(htn~male, data=d, family=binomial)
summary(mod.male)


#### I created this function that takes the fitted mmodel and extracts the odds ratio, 
##    95% confidence interval for the odds ratio, and the p-value. It also allows me to 
##    quickly multiply the coefficient by a number, "multiplier", in case I want to 
##    interpret an association on a different scale (e.g., instead of a 1-unit increase
##    in CD4, maybe a 100-unit increase in CD4.)
or.function<-function(coeff.info, multiplier=1){
  or<-exp(coeff.info[1]*multiplier)
  lower<-exp((coeff.info[1]-1.96*coeff.info[2])*multiplier)
  upper<-exp((coeff.info[1]+1.96*coeff.info[2])*multiplier)
  pval<-coeff.info[4]
  names(or)<-"Odds Ratio"
  names(lower)<-"95% Lower CI"
  names(upper)<-"95% Upper CI"
  names(pval)<-"P-Value"
  print(c(or, lower, upper, pval))
}

or.function(summary(mod.male)$coefficients[2,])

### Now I am fitting logistic regression using the lrm function of the rms package.
##   Everything is the same, but the functions are slightly different.
library(rms)
dd<-datadist(d)     ## This line and the next are used to help with summarizing lrm(.) results
options(datadist=dd)
fit.male<-lrm(htn ~ male, data=d)
summary(fit.male)   ## This gives estimates for contrasts
anova(fit.male)     ## This computes p-values for the model


mod.age<-glm(htn~age, data=d, family=binomial)
summary(mod.age)
or.function(summary(mod.age)$coefficients[2,])


mod.bmi<-glm(htn~bmi, data=d, family=binomial)
summary(mod.bmi)
or.function(summary(mod.bmi)$coefficients[2,])


mod.smoke<-glm(htn~smoke, data=d, family=binomial)
summary(mod.smoke)
or.function(summary(mod.smoke)$coefficients[2,])


mod.cd4<-glm(htn~cd4, data=d, family=binomial)
summary(mod.cd4)
or.function(summary(mod.cd4)$coefficients[2,], multiplier=100)


mod.tdf<-glm(htn~tdf, data=d, family=binomial)
summary(mod.tdf)
or.function(summary(mod.tdf)$coefficients[2,])


mod.dtg<-glm(htn~dtg, data=d, family=binomial)
summary(mod.dtg)
or.function(summary(mod.dtg)$coefficients[2,])


### jnc_bp is a categorical variable, so there are multiple odds ratios comparing
##   to the reference category
mod.jnc<-glm(htn~jnc_bp, data=d, family=binomial)
summary(mod.jnc)
or.function(summary(mod.jnc)$coefficients[2,])
or.function(summary(mod.jnc)$coefficients[3,])
or.function(summary(mod.jnc)$coefficients[4,])


mod.art<-glm(htn~years.ART, data=d, family=binomial)
summary(mod.art)
or.function(summary(mod.art)$coefficients[2,])


mod.uacr<-glm(htn~uACR, data=d, family=binomial)
summary(mod.uacr)
or.function(summary(mod.uacr)$coefficients[2,])

### Analyzing log-transformed uACR, because it is so skewed.
mod.luacr<-glm(htn~log10uACR, data=d, family=binomial)
summary(mod.luacr)
or.function(summary(mod.luacr)$coefficients[2,])