# This is an example of an R script to illustrate: # Basic variable assignment and some data structures # Reading in a text file # Data manipulation: moaking subsets, re-categorizing variables # Calculating non-parametric survival estimates # Examining the stucture of R objects # Extracting individual items from R objects # Make a vector x <- c(2, 5, 3, 7, 9) # Access different elements of x using position indexing x[1] x[4] x[20] # Set the working directory #setwd("/home/ruddjm/Dropbox/miscWork/introR") setwd("/Users/ruddjm/Dropbox/miscWork/introR") # Read in your data from the file tonguecancer.csv. cancer <- read.csv("tonguecancer.csv") # Examine the data dim(cancer) names(cancer) str(cancer) head(cancer) ## Make some changes to the data # The variable ``tumor" has the following coding: 1=aneuploid tumor; 2=diploid tumor. # Make this a factor with informative levels. # The variable "studytime" is the observed time in weeks. Give this variable an informative label. # Make race into a factor. Suppose the current coding is as follows: 1=Black, 2=White, 3=Hispanic, 4=Asian # load a package called Hmisc, because the function upData, that I want to use, is in this package # The Hmsic package must first be installed on the machine I'm using before I can load it # If you haven't installed it, run install.packages("Hmisc") #install.packages("Hmisc") # This loads the package library(Hmisc) cancer <- upData(cancer, tumorNew = factor(tumor), race = factor(race), labels = list(tumor = "Tumor type", studytime = "Time in weeks"), levels = list(tumorNew = list("Aneuploid" = "1", "Diploid" = "2"), race = list("Black" = "1", "White" = "2", "Hispanic" = "3", "Asian" = "4"))) # Make a new variable that combines Hispanics and Asians into one category cancer <- upData(cancer, race3cat = factor(race), levels = list(race3cat = list("Black" = "Black", "White" = "White", "Other" = c("Hispanic", "Asian")))) # Look at the changes head(cancer) str(cancer) # Make a simple table table(cancer$race) # Get percents # Also note referring to a variable by indexing prop.table(table(cancer[ , "race"])) # Make these percents 100*prop.table(table(cancer[ , "race"])) # Make a 2-way table table(cancer$tumor, cancer$tumorNew) # Same thing using the with() function with(cancer, table(tumor, tumorNew)) # Take a subset of the data. Can also select specific columns using the select argument diploidOnly <- subset(cancer, tumorNew == "Diploid") nrow(diploidOnly) # Get summary statistics of all variables. (This function is in the Hmisc package.) describe(cancer) ## Analysis: survival analysis # Need to first load the survival package to use the survival functions. # You don't have to install this package because it comes with the basic R installation. library(survival) # Make a survival object. (This is for use in survival objects.) survObj <- with(cancer, Surv(time = studytime, event = death)) survObj # Nonparametric survival estimators # Get Kaplan-Meier estimates of probability of survival kmEst <- survfit(Surv(studytime, death) ~ 1, type = "kaplan-meier", conf.type = "log-log", data = cancer) kmEst summary(kmEst) # Plot Kaplan-Meier curve plot(kmEst, las = 1, bty = "l", ylab = "Probability of survival", xlab = "Weeks") # Calculate survival estimates separately for each group. kmEstTumorType <- survfit(Surv(studytime, death) ~ tumor, type = "kaplan-meier", conf.type = "log-log", data = cancer) summary(kmEstTumorType) # Plot separate curves for each of the tumor types. plot(kmEstTumorType, lty = 1:2, ylab = "Probability of survival", xlab = "Weeks", las = 1) legend("topright", lty = 1:2, legend = levels(cancer$tumorNew)) # Use help() to look at help file for this function help(survfit) help(plot.survfit) plot(kmEstTumorType, lty = 1:2, mark.time = FALSE, ylab = "Probability of survival", xlab = "Weeks", las = 1) # Plot estimates of the cumulative hazard. The cumulative hazard can be # calculated from the survival by taking the negative log. # This illustrates how you can access different parts of R objects str(kmEst) plot(kmEst$time, -log(kmEst$surv), col = "green", las = 1, type = "s", xlab = "Weeks", ylab = "Cumulative hazard")