# This is an example of an R script to illustrate:
# Basic variable assignment and some data structures
# Reading in a text file
# Data manipulation: moaking subsets, re-categorizing variables
# Calculating non-parametric survival estimates
# Examining the stucture of R objects
# Extracting individual items from R objects
# Make a vector
x <- c(2, 5, 3, 7, 9)
# Access different elements of x using position indexing
x[1]
x[4]
x[20]
# Set the working directory
#setwd("/home/ruddjm/Dropbox/miscWork/introR")
setwd("/Users/ruddjm/Dropbox/miscWork/introR")
# Read in your data from the file tonguecancer.csv.
cancer <- read.csv("tonguecancer.csv")
# Examine the data
dim(cancer)
names(cancer)
str(cancer)
head(cancer)
## Make some changes to the data
# The variable ``tumor" has the following coding: 1=aneuploid tumor; 2=diploid tumor.
# Make this a factor with informative levels.
# The variable "studytime" is the observed time in weeks. Give this variable an informative label.
# Make race into a factor. Suppose the current coding is as follows: 1=Black, 2=White, 3=Hispanic, 4=Asian
# load a package called Hmisc, because the function upData, that I want to use, is in this package
# The Hmsic package must first be installed on the machine I'm using before I can load it
# If you haven't installed it, run install.packages("Hmisc")
#install.packages("Hmisc")
# This loads the package
library(Hmisc)
cancer <- upData(cancer,
tumorNew = factor(tumor),
race = factor(race),
labels = list(tumor = "Tumor type",
studytime = "Time in weeks"),
levels = list(tumorNew = list("Aneuploid" = "1", "Diploid" = "2"),
race = list("Black" = "1", "White" = "2", "Hispanic" = "3", "Asian" = "4")))
# Make a new variable that combines Hispanics and Asians into one category
cancer <- upData(cancer,
race3cat = factor(race),
levels = list(race3cat = list("Black" = "Black", "White" = "White", "Other" = c("Hispanic", "Asian"))))
# Look at the changes
head(cancer)
str(cancer)
# Make a simple table
table(cancer$race)
# Get percents
# Also note referring to a variable by indexing
prop.table(table(cancer[ , "race"]))
# Make these percents
100*prop.table(table(cancer[ , "race"]))
# Make a 2-way table
table(cancer$tumor, cancer$tumorNew)
# Same thing using the with() function
with(cancer, table(tumor, tumorNew))
# Take a subset of the data. Can also select specific columns using the select argument
diploidOnly <- subset(cancer, tumorNew == "Diploid")
nrow(diploidOnly)
# Get summary statistics of all variables. (This function is in the Hmisc package.)
describe(cancer)
## Analysis: survival analysis
# Need to first load the survival package to use the survival functions.
# You don't have to install this package because it comes with the basic R installation.
library(survival)
# Make a survival object. (This is for use in survival objects.)
survObj <- with(cancer, Surv(time = studytime, event = death))
survObj
# Nonparametric survival estimators
# Get Kaplan-Meier estimates of probability of survival
kmEst <- survfit(Surv(studytime, death) ~ 1,
type = "kaplan-meier",
conf.type = "log-log",
data = cancer)
kmEst
summary(kmEst)
# Plot Kaplan-Meier curve
plot(kmEst,
las = 1,
bty = "l",
ylab = "Probability of survival",
xlab = "Weeks")
# Calculate survival estimates separately for each group.
kmEstTumorType <- survfit(Surv(studytime, death) ~ tumor,
type = "kaplan-meier",
conf.type = "log-log",
data = cancer)
summary(kmEstTumorType)
# Plot separate curves for each of the tumor types.
plot(kmEstTumorType,
lty = 1:2,
ylab = "Probability of survival",
xlab = "Weeks",
las = 1)
legend("topright",
lty = 1:2,
legend = levels(cancer$tumorNew))
# Use help() to look at help file for this function
help(survfit)
help(plot.survfit)
plot(kmEstTumorType,
lty = 1:2,
mark.time = FALSE,
ylab = "Probability of survival",
xlab = "Weeks",
las = 1)
# Plot estimates of the cumulative hazard. The cumulative hazard can be
# calculated from the survival by taking the negative log.
# This illustrates how you can access different parts of R objects
str(kmEst)
plot(kmEst$time, -log(kmEst$surv),
col = "green",
las = 1,
type = "s",
xlab = "Weeks",
ylab = "Cumulative hazard")