################################################### ### Using R as a calculator: ################################################### 2*10 10 + 13 - 21; 2^3 1 - 2*3 (1 - 2)*3 ################################################### ### Assignment: ################################################### x <- 5 x (x <- 5) 10*x + 2 x x <- 2 x x < -5 ################################################### ### Functions: ################################################### date date() ourFUN <- function(x) { x + 5 } ourFUN(x = 2) anotherFUN <- function(x, y = 5) { x + y } anotherFUN(x = 2) anotherFUN(x = 2, y = 7) lastFUN <- function(z, ...) { ourFUN(z) - anotherFUN(z, ...) } lastFUN(z = 2) lastFUN(z = 2, y = 10) ################################################### ### Vectors: ################################################### c(2, 3, 5, 2, 7, 1) c(TRUE, FALSE, FALSE, TRUE) c('cat', 'dog', 'bird', 'horse') x<-c(2, NA, 5, NA, NA, 7) y<-c(10,15,12) c(y, x) c(1:3, 'cat') seq(5) seq(-5) seq(from = 2, to = 10) 2:10 seq(from = 10, to = 3) 10:3 5.7:20.2 10.5:3.25 seq(from= -1, to = 1, by = 0.2) seq(from = 9, to = 1, by = -2) seq(from = 5.7, to = 11.2, by = 0.4) seq(from = 2, to = 15, length = 7) ################################################### ### Data frames: ################################################### ourdf <- data.frame(id = 101:110, sex = sample(c("M", "F"), size = 10, replace=TRUE), age = sample(20:50, size=10, replace=TRUE), tx = sample(c("Drug", "Placebo"), size=10, replace=TRUE), diabetes = sample(c(TRUE, FALSE))) ourdf ################################################### ### Read in motivating data set: ################################################### pbc <- read.table("pbc.csv", header = TRUE, sep = ",", na.strings = "") ################################################### ### Attributes of the read-in data set: ################################################### dim(pbc) names(pbc) library(Hmisc) contents(pbc) head(pbc) ################################################### ### Factors: ################################################### 5:1 factor(5:1) c("cat", "horse", "dog", "cat", "dog", "dog") factor(c("cat", "horse", "dog", "cat", "dog", "dog")) factor(c(TRUE, NA, FALSE, TRUE, FALSE, FALSE, NA)) mean(factor(1:5)) factor(c(3, 3, 1, 2, 2), levels = 1:4) factor(c("White", "Black", "Other", "White", "Other", "Black")) factor(c("White", "Black", "Other", "White", "Other", "Black"), levels = c("White", "Black", "Other")) factor(c(1, 1, 2, NA, 1, 2), labels = c("Case", "Control")) ################################################### ### Factor practice exercise 1: ################################################### x <- c("A", "SA", "D", "D", "SA", "SA", "SA", "A", "N", "SD") y <- factor(x, levels = c("SD", "D", "N", "A", "SA"), labels = c("Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree")) y ################################################### ### Ordered factors: ################################################### x <- c("A", "SA", "D", "D", "SA", "SA", "SA", "A", "N", "SD") y <- factor(x, levels = c("SD", "D", "N", "A", "SA"), labels = c("Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree"), ordered = TRUE) y y < "Neutral" ################################################### ### More on factors: ################################################### test <- factor(c("positive", "negative", "negative")) levels(test) levels(test) <- list("positive" = "positive", "negative" = "negative") levels(test) levels(test) <- list(Positive = "positive", Negative = "negative") test levels(test) <- list(Undetermined = "Undetermined", Positive = "Positive", Negative = "Negative") test levels(test) <- list(Combined = c("Undetermined", "Positive"), Negative = "Negative") test ################################################### ### Practice exercise 2: ################################################### z <- y levels(z) <- list("Disagree" = c("Strongly disagree", "Disagree"), "Neutral" = "Neutral", "Agree" = c("Agree", "Strongly agree")) z ################################################### ### Hmisc package upData() function: ################################################### pbc <- upData(pbc, lowernames = TRUE, stage = factor(stage, levels = 1:4), ageyrs = age/365.25, fuyrs = fudays/365.25, censored = factor(status), labels = c(ageyrs="Age", fuyrs = "Follow Up", censored = "Collapsed Survival Status", fudays = "Follow Up", status="Original Survival Status", drug = "Treatment", sex = "Gender", age = "Age", ascites = "Presence of Ascites", bili = "Serum Bilirubin", chol = "Serum Cholesterol", album = "Serum Albumin", stage = "Histological stage of disease"), units = c(fudays = "days", fuyrs = "years", age = "days", ageyrs = "years", bili = "mg/dL", chol = "mg/dL", album = "mg/dL"), levels = list(status = c("Censored", "Censored due to liver treatment", "Dead"), censored = list("Censored" = c(0,1), "Dead" = 2), drug = c("D-penicillamine", "Placebo"))) contents(pbc) ################################################### ### Alternative upData() invocation: ################################################### pbc <- read.table("pbc.csv", header = TRUE, sep = ",", na.strings = "", stringsAsFactors = FALSE) contents(pbc) pbc <- upData(pbc, lowernames = TRUE, status = factor(status, levels = 0:2, labels = c("Censored", "Censored due to liver treatment", "Dead")), censored = status, drug = factor(drug, levels = 1:2, labels = c("D-penicillamine", "Placebo")), sex = factor(sex, levels = c("Female", "Male")), ascites = factor(ascites, levels = c("No", "Yes")), stage = factor(stage, levels = 1:4), ageyrs = age/365.25, fuyrs = fudays/365.25, labels = c(ageyrs="Age", fuyrs = "Follow Up", censored = "Collapsed Survival Status", fudays = "Follow Up", status="Original Survival Status", drug = "Treatment", sex = "Gender", age = "Age", ascites = "Presence of Ascites", bili = "Serum Bilirubin", chol = "Serum Cholesterol", album = "Serum Albumin", stage = "Histological stage of disease"), units = c(fudays = "days", fuyrs = "years", age = "days", ageyrs = "years", bili = "mg/dL", chol = "mg/dL", album = "mg/dL"), levels = list(censored = list("Censored" = c("Censored", "Censored due to liver treatment"), "Dead" = "Dead"))) contents(pbc) ################################################### ### Tables of categorical variables: ################################################### with(pbc, table(sex)) with(pbc, table(sex, drug)) with(pbc, table(sex, drug, censored)) with(pbc, ftable(sex, drug, censored)) with(pbc, ftable(sex, censored, drug)) with(pbc, table(signif(chol, digits = 1), exclude = NULL)) with(pbc, table(as.character(drug), exclude = NULL)) with(pbc, table(stage, exclude = 4)) with(pbc, table(as.character(stage), exclude = 4)) ################################################### ### Table practice exercise: ################################################### with(pbc, table(fuyrs > 50, status)) ################################################### ### Univariate summary statistics: ################################################### with(pbc, mean(ageyrs)) with(pbc, median(ageyrs)) with(pbc, min(ageyrs)) with(pbc, max(ageyrs)) with(pbc, range(ageyrs)) with(pbc, sd(ageyrs)) with(pbc, quantile(ageyrs)) with(pbc, quantile(ageyrs, probs = seq(0, 1, by = 0.1))) with(pbc, mean(chol)) with(pbc, mean(chol, na.rm = TRUE)) ################################################### ### Univariate summary stats Practice Exercise: ################################################### with(pbc, mean(ageyrs, na.rm = TRUE) - 1.96*sd(ageyrs, na.rm = TRUE)) with(pbc, mean(ageyrs, na.rm = TRUE) + 1.96*sd(ageyrs, na.rm = TRUE)) ################################################### ### More on missing values: ################################################### x <- c(9, 5, 12, NA, 2, NA, NA, 1) x + 5 x > 2 x == NA is.na(x) x > 2 & !is.na(x) ################################################### ### More on missing values Practice Exercise: ################################################### with(pbc, table(is.na(ascites))) ################################################### ### Group-wise summary stats of continuous variables: ################################################### summary.formula(bili ~ censored + sex, data = pbc, method = "response") summary.formula(bili ~ censored + sex, data = pbc, method = "response", fun = range) summary.formula(bili ~ censored + sex, data = pbc, method = "response", fun = function(x) { c(Median = median(x), Min = min(x), Max = max(x)) }) summary.formula(chol ~ drug + ascites, data = pbc, method = "response", fun = median) summary.formula(fuyrs ~ drug + ascites, data = pbc, method = "response", fun = median) ################################################### ### Group-wise summary stats of continous variables Practice Exercise: ################################################### summary.formula(ageyrs ~ sex + censored + stage, data = pbc, method = "response", fun = function(x) {round(quantile(x), 2)}) ################################################### ### Group-wise summary stats of continuous variables, cont'd: ################################################### summary.formula(bili ~ ascites, data = pbc, method = "cross", fun = median) summary.formula(bili ~ drug + censored + sex, data = pbc, method = "cross", fun = function(x) { c(Mean = mean(x), SD = sd(x), Median = median(x), Min = min(x), Max = max(x)) }) summary.formula(bili ~ censored + sex, data = pbc, method = "cross", fun = function(x) { c(Mean = mean(x), SD = sd(x), Median = median(x), Min = min(x), Max = max(x)) }) ################################################### ### Automatic summaries of both continuous and categorical variables: ################################################### summary.formula(sex ~ ageyrs + chol + drug + stage, data=pbc, method="reverse") summary.formula(sex ~ ageyrs + chol + drug + stage, data=pbc, method="reverse", overall = TRUE) summary.formula(~ ageyrs + chol + drug + stage + sex, data=pbc, method="reverse") summary.formula(sex ~ ageyrs + chol + drug + stage, test = TRUE, data=pbc, method="reverse") print(summary.formula(sex ~ ageyrs + chol + drug + stage, overall = TRUE, method="reverse", data=pbc), digits = 3, npct = "both", pctdig = 2, exclude1 = FALSE, long = TRUE) ################################################### ### Histogram: ################################################### with(pbc, hist(ageyrs, xlab = label(ageyrs), main = paste("Histogram of", label(ageyrs)))) ################################################### ### Boxplot: ################################################### with(pbc, boxplot(album, ylab = label(album), main = paste("Boxplot of", label(album)))) ################################################### ### Boxplot with stripchart: ################################################### with(pbc, { boxplot(album, outline = FALSE, ylab = label(album), main = paste("Boxplot of", label(album))) stripchart(album, vertical = TRUE, method = "jitter", pch = 1, add = TRUE) }) ################################################### ### Side-by-side boxplot with stripchart: ################################################### with(pbc, { boxplot(album ~ stage, outline = FALSE, varwidth = TRUE, xlab = label(stage), ylab = label(album), main = paste("Boxplot of", label(album), "by\n", label(stage))) stripchart(album ~ stage, method = "jitter", pch = 1, vertical = TRUE, add = TRUE) }) ################################################### ### Pairs plot: ################################################### pairs(subset(pbc, select = c(ageyrs, bili, chol, album))) ################################################### ### Scatterplot: ################################################### with(pbc, plot(album ~ bili, xlab = label(bili), ylab = label(album), main = paste("Plot of", label(album), "vs.",label(bili)))) ################################################### ### Scatterplot with plsmo() curve: ################################################### with(pbc, { plot(album ~ bili, xlab = label(bili), ylab = label(album), main = paste("Plot of", label(album), "vs.", label(bili))) plsmo(x = bili, y = album, add = TRUE) }) ################################################### ### Barplot: ################################################### with(pbc, barplot(prop.table(table(censored)), ylim = c(0, 1.0), xlab = label(censored), ylab = "Proportion", main = paste("Barplot of", label(censored)))) ################################################### ### Dotplot: ################################################### with(pbc, dotchart(prop.table(table(censored)), lcolor = "black", xlim = c(0, 1.0), xlab = "Proportion", main = paste("Dotplot of", label(censored)))) ################################################### ### Two-way barplot: ################################################### with(pbc, barplot(prop.table(table(censored, drug), margin = 2), beside = TRUE, legend.text = levels(pbc$censored), ylim = c(0, 1.0), xlab = label(censored), ylab = "Proportion", main = paste("Barplot of", label(censored), "\n across", label(drug)))) ################################################### ### Two-way dotplot: ################################################### with(pbc, dotchart(prop.table(table(censored, drug), margin = 2), lcolor = "black", xlim = c(0, 1.0), xlab = "Proportion", main = paste("Dotplot of", label(censored), "\n across", label(drug)))) ################################################### ### Alternative two-way dotplot: ################################################### with(pbc, dotchart(prop.table(table(censored, drug), margin = 2)[, "D-penicillamine"], lcolor = "black", xlim = c(0, 1.0), xlab = "Proportion", main = paste("Dotplot of", label(censored), "\n across", label(drug)))) par(new = TRUE) with(pbc, dotchart(prop.table(table(censored, drug), margin = 2)[, "Placebo"], lcolor = "black", xlim = c(0, 1.0), pch = 4)) legend(x = 1, y = 0.25, xjust = 1, yjust = 0, legend = levels(pbc$drug), pch = c(1, 4)) ################################################### ### Scatterplot with different points and lines: ################################################### with(pbc, plot(album ~ chol, type = "n", xlab = label(chol), ylab = label(album), main = paste("Plot of", label(album), "vs. \n", label(chol), "across", label(drug)))) with(subset(pbc, drug == "D-penicillamine"), points(album ~ chol, pch = 4)) with(subset(pbc, drug == "Placebo"), points(album ~ chol)) with(pbc, { plsmo(chol, album, group = drug, add = TRUE, lty = c(1,2)) legend(x = 1100, y = 2, xjust = 1, yjust = 0, legend = levels(drug), lty = c(1,2), pch = c(4, 1)) }) ################################################### ### Multiple plots: ################################################### xrange <- with(pbc, range(chol, na.rm = TRUE)) yrange <- with(pbc, range(album, na.rm = TRUE)) par(mfrow = c(1, 3), oma = c(0, 0, 5, 0)) with(subset(pbc, drug == "D-penicillamine"), { plot(album ~ chol, pch = 16, xlim = xrange, ylim = yrange, xlab = label(chol), ylab = label(album), main = levels(drug)[1]) plsmo(chol, album, add = TRUE) }) with(subset(pbc, drug == "Placebo"), { plot(album ~ chol, pch = 16, xlim = xrange, ylim = yrange, xlab = label(chol), ylab = label(album), main = levels(drug)[2]) plsmo(chol, album, add = TRUE) }) with(pbc, { plot(album ~ chol, pch = 16, xlim = xrange, ylim = yrange, xlab = label(chol), ylab = label(album), main = "Overall") plsmo(chol, album, add = TRUE) mtext(text = paste("Plot of", label(album), "vs. \n", label(chol), "across", label(drug)), side = 3, outer = TRUE) }) par(mfrow = c(1, 1), oma = c(0,0,0,0))