#######################
## Introduction to R ##
#######################
## CQS Summer Institute 2016
## Biostatistics 1
## 8/1/16 ~ 8/5/16
## Tatsuki Koyama
##
## This script is meant to be run one line at a time in an interactive
## session; each statement sits on its own line so that the trailing
## '##' comments do not swallow the code that follows them.

## Very basic ##

## Getting help
help('mean')            ## mean is a function in R.
?median                 ## so is median.
help.search('average')
example('t.test')

## R is case-sensitive.
citation()              # Citation() doesn't work.

ls()
getwd()
save.image()            # or specify the file name (.RData)
save.image('practice.RData')

## R as a calculator.
3 + 5 * 4
12 * (5 - 2)            ## 12(5-2) doesn't work.
pi

## Simple functions.
log(2)                  ## log() is a function that takes 2 arguments.
args(log)
log(2, base=10)         ## log of 2, base 10
log(2, base=2)          ## log of 2, base 2
log(2, base=exp(1))     ## natural log of 2
round(pi)
args(round)
round(pi, 3)            ## short for round(x=pi, digits=3)
round(2.5)              ## not what you expect.
ceiling(pi*100)
floor(pi*100)

## Vector and assignment
c(3, 4, 5, 8)           ## A vector of length 4
age <- c(44, 52, 39)    ## The variable 'age' gets c(44, 52, 39)
age
age + 3                 ## Same as age + c(3,3,3); short vectors get recycled.
x <- c(1:4)
length(x)
y <- c(12:5)
length(y)
x + y                   ## Is this really what you wanted to do?
x + c(4, 6, 3)          ## gets a warning.

## Let's compute the average age.
age                     ## We assigned c(44, 52, 39) to 'age' before.
sum(age)                # sum of age.
length(age)             # length. ie how many data.
sum(age) / length(age)
mean(age)

## Logical vector
## Is mean(age) greater than 45?
mean(age) > 45
## Is it greater than or equal to 45?
mean(age) >= 45
## Is it equal to 45?
mean(age) == 45
## Is age greater than 40?
age > 40                ## This is done by element.
Over40 <- age > 40
age == 40               ## Is age equal to 40?
age = 40                ## Oops! A single '=' assigns; 'age' is now 40.

Midterm <- c(79, 88, 69, 75, 88, 39)
Final <- c(72, 90, 59, 68, 72, NA)  # NA is a missing value.
Midterm[3]              ## The third element of the vector.
Final[2:5]              ## 2nd to 5th elements of the vector.
Midterm[3] <- 59        ## The third element of Midterm gets 59.

## Average for Final
mean(Final)             ## is NA!
?mean                   ## There is an argument called 'na.rm' (na remove).
is.na(Final)            ## Very useful function to see if there is any missing value.
any(is.na(Final))       ## Is there any missing? Combination of two functions.
all(is.na(Final))
mean(Final, na.rm=TRUE)
mean(Final[1:5])
mean(Final[-6])

## Some useful functions.
length(Final)
sum(Final, na.rm=TRUE)  ## Sum
prod(Final, na.rm=TRUE) ## Product
max(Final, na.rm=TRUE)  ## Maximum
min(Final, na.rm=TRUE)  ## Minimum
sort(Final)             ## NA is ignored.
order(Final)            ## NA is put at the end.
rank(Final)             ## NA keeps rank NA (the default is na.last='keep').
unique(Final)
duplicated(Final)

## Average of Midterm and Final for every one.
## mean()'s second argument is 'trim', so this call is an error;
## try() shows the error without stopping a sourced script.
try( mean( Midterm, Final) )  ## Nope.
(Midterm + Final) / 2
TestScore <- cbind(Midterm, Final)  ## cbind ... column bind.
## Now TestScore is a "matrix", not a vector.
TestScore[3,]           ## The third row.
TestScore[,2]           ## The second column.
mean(TestScore[3,])     ## Average of midterm and final for the third person.
apply(TestScore, 1, mean)  ## Apply the function "mean" to each row (margin=1) in matrix TestScore.
apply(TestScore, 1, mean, na.rm=TRUE)  ## Additional argument(s) goes with the function.

## Read in data ##
getwd()                 ## Know where you are (file location).
setwd('/Users/tatsukikoyama/Desktop/IntroToR')  ## Change this to your own course folder.
list.files()
d <- read.csv(file='Data/practiceCeasarData.csv', header=TRUE, as.is=FALSE)  ## d is a data.frame.
nrow(d)                 ## number of rows.
ncol(d)                 ## number of columns.
dim(d)                  ## number of rows and columns.
head(d)                 ## First few lines of d.
tail(d, 20)             ## Final 20 lines of d.
names(d)                ## Column names of d.
d$Risk                  ## Call the column named 'Risk'
summary(d)              ## Summary for each column of d.

## d$Risk is a special vector called 'factor'. It is a character vector with the factor levels.
## Can't change the value if it is not in the factor level.
## d$Risk[1] <- 'high' will change the first entry of d$Risk to NA because 'high' is not one of the factor levels.
levels(d$Risk)

## Reorder the levels of some columns.
## NOTE(review): 'levels(x) <- ...' relabels the existing levels by position;
## it does not reorder them. Check the printed levels first and confirm that
## the new labels are listed in the same order. To truly reorder, use
## factor(x, levels=c(...)) instead.
levels(d$Gleason)
levels(d$Gleason) <- c('6 or less', "3 + 4", '4 + 3', "8,9,10")  ## Single and double quotation marks both work.
levels(d$Education)
levels(d$Education) <- c('High school', 'Some college', 'College graduate', 'Graduate school')
levels(d$Income)
levels(d$Income) <- c('- 30K', '30K - 50K', '50K - 100K', '100K -')
summary(d)
table( d$Risk, d$Race )

## Subset
Surgery <- subset(d, Treatment=='Surgery')
## or simply
Radiation <- d[ d$Treatment == 'Radiation', ]
which( d$Treatment == 'Radiation' )  ## gives index.

## Summary of Age in Surgery group.
summary( d$Age[ d$Treatment == 'Surgery' ] )
## Summary of Age for different Risk groups.
tapply(X=d$Age, INDEX=d$Risk, FUN='summary')

## PLOT ##
plot( d$Age, d$PSA )
## Set parameters to make it look better with par() function.
par( las=1, family='serif', bty='L', tcl=-0.2)
plot( d$Age, d$PSA)
## Add other information and modify it a little.
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85))
## Specify colors.
## as.integer() gives the integer codes behind a factor (1 for the first
## level, 2 for the second, ...). The older trick c(factor) returns a factor
## again since R 4.1.0, so it no longer exposes the codes.
as.integer(d$Treatment)  ## Trick...
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85), col=as.integer(d$Treatment) )  ## 1 is black and 2 is red.
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85),
      col=c('slateblue2','deeppink')[ as.integer(d$Treatment) ], pch=19, cex=0.8)
legend('topleft', pch=19, cex=0.8, col=c('slateblue2','deeppink'), legend=c('Radiation','Surgery'))

boxplot( d$PSA ~ d$Risk, ylab='PSA' )
boxplot( d$PSA ~ d$Risk, ylab='PSA', border='red', col='pink')

## Writing a new function ##

## Write a function to display standard deviation, variance, sample size, and number of missing values.
## v is a vector of data.
## Returns a one-row data.frame with columns:
##   n     ... length of v (missing values included),
##   nMiss ... number of missing values,
##   SD    ... standard deviation of the non-missing values,
##   VAR   ... variance of the non-missing values.
SdAndVar <- function(v){
    missing <- is.na(v)
    observed <- v[!missing]     ## v such that is.na(v) is NOT true.
    data.frame(n=length(v),
               nMiss=sum(missing),  ## Counting TRUE on is.na(v)
               SD=sd(observed),
               VAR=var(observed))
}

## Write a function to multiply Time... i.e. what is '8 min 47 sec' times 6?
## lap is in 'Min.Sec' format. "8.47" means 8 minutes 47 seconds.
## howMany is the multiplier (number of laps).
## Returns the total time as a character string 'M:SS'.
MultTime <- function(lap, howMany){
    min <- floor(lap)                   ## integer portion = minutes.
    ## Round the seconds right away: (8.10 - 8) * 100 is 9.99999..., not 10,
    ## and that tiny error used to produce results such as '48:60'.
    sec <- round((lap - min) * 100)
    ## Do the arithmetic in whole seconds so the carry into minutes is exact.
    total <- round((min * 60 + sec) * howMany)
    totalMIN <- total %/% 60            # How many 60s in the total?
    totalSEC <- total %% 60             # Remainder of total / 60.
    sprintf('%d:%02d', totalMIN, totalSEC)  ## Formatting the output; seconds are zero-padded.
}
MultTime(8.47, 6)

## Packages ##
install.packages('clinfun')  ## Run this once (per each version of R).
library(clinfun)             ## Run this every time you want to use it.
library(Hmisc)