####################### ## Introduction to R ## ####################### ## CQS Summer Institute 2017 ## Biostatistics 1 ## 8/7/17 ~ 8/11/17 ## Tatsuki Koyama ## Very basic ## ## Getting help help('mean') ## mean is a function in R. ?median ## so is median. help.search('average') example('t.test') ## R is case-sensitive. citation() # Citation() doesn't work. ls() getwd() save.image() # or specify the file name (.RData) save.image('practice.RData') ## R as a calculator. 3 + 5 * 4 12 * (5 - 2) ## 12(5-2) doesn't work. pi ## Simple functions. log(2) ## log() is a function that takes 2 arguments. args(log) log(2, base=10) ## log2 base 10 log(2, base=2) ## log2 base 2 log(2, base=exp(1)) ## natural log of 2 round(pi) args(round) round(pi, 3) ## short for round(x=pi, digits=3) round(2.5) ## not what you expect. ceiling(pi*100) floor(pi*100) ## Vector and assignment c(3, 4, 5, 8) ## A vector of length 4 age <- c(44, 52, 39) ## The variable 'age' gets c(44, 52, 39) age age + 3 ## Same as age + c(3,3,3); short vectors get recycled. x <- c(1:4) length(x) y <- c(12:5) length(y) x + y ## Is this really what you wanted to do? x + c(4, 6, 3) ## gets a warning. ## Let's compute the average age. age ## We assigned c(45, 52, 39) to 'age' before. sum(age) # sum of age. length(age) # length. ie how many data. sum(age) / length(age) mean(age) ## Logical vector ## Is mean(age) greater than 45? mean(age) > 45 ## Is it greater than or equal to 45? mean(age) >= 45 ## Is it equal to 45? mean(age) == 45 ## Is age greater than 40? age > 40 ## This is done by element. Over40 <- age > 40 age == 40 ## Is age equal to 40? age = 40 ## Oops! Midterm <- c(79, 88, 69, 75, 88, 39) Final <- c(72, 90, 59, 68, 72, NA) # NA is a missing value. Midterm[3] ## The third element of the vector. Final[2:5] ## 2nd to 5th elements of the vector. Midterm[3] <- 59 ## The third element of Midterm gets 59. ## Average for Final mean(Final) ## is NA! ?mean ## There is an argument called 'na.rm' (na remove). is.na(Final) ## Very useful function to see if there is any missing value. any(is.na(Final)) ## Is there any missing? Combination of two funcitons. all(is.na(Final)) mean(Final, na.rm=TRUE) mean(Final[1:5]) mean(Final[-6]) ## Some useful functions. length(Final) sum(Final, na.rm=TRUE) ## Sum prod(Final, na.rm=TRUE) ## Product max(Final, na.rm=TRUE) ## Maximum min(Final, na.rm=TRUE) ## Minimm sort(Final) ## NA is ignored. order(Final) ## NA is put at the end. rank(Final) ## NA is put at the end. unique(Final) duplicated(Final) ## Average of Midterm and Final for every one. mean(Midterm, Final) ## Nope. (Midterm + Final) / 2 TestScore <- cbind(Midterm, Final) ## cbind ... column bind. ## Now TestScore is a "matrix", not a vector. TestScore[3,] ## The third row. TestScore[,2] ## The second column. mean(TestScore[3,]) ## Average of midterm and final for the third person. apply(TestScore, 1, mean) ## Apply the function "mean" to each row (margin=1) in matrix TestScore. apply(TestScore, 1, mean, na.rm=TRUE) ## Additional argument(s) goes with the function. ## Read in data ## getwd() ## Know where you are (file location). #################################################### ## setwd('/Users/tatsukikoyama/Desktop/IntroToR') ## #################################################### list.files() d <- read.csv(file='practiceCeasarData.csv', header=TRUE, as.is=FALSE) ## d is a data.frame. nrow(d) ## number of rows. ncol(d) ## number of columns. dim(d) ## number of rows and columns. head(d) ## First few lines of d. tail(d, 20) ## Final 20 lines of d. names(d) ## Column names of d. d$Risk ## Call the column named 'Risk' summary(d) ## Summary for each column of d. ## d$Risk is a special vector called 'factor'. It is a character vector with the factor levels. ## Can't change the value if it is not in the factor level. ## d$Risk[1] <- 'high' will change the first entry of d$Risk to NA becaue 'high' is not one of the factor levels. levels(d$Risk) ## Reorder the levels of some columns. levels(d$Gleason) levels(d$Gleason) <- c('6 or less', "3 + 4", '4 + 3', "8,9,10") ## Single and double quotation marks both work. levels(d$Education) levels(d$Education) <- c('High school', 'Some college', 'College graduate', 'Graduate school') levels(d$Income) levels(d$Income) <- c('- 30K', '30K - 50K', '50K - 100K', '100K -') levels(d$Race) d$Race <- factor(d$Race, levels=c('White','Black','Other')) summary(d) table( d$Risk, d$Race ) ## Subset Surgery <- subset(d, Treatment=='Surgery') ## or simply Radiation <- d[ d$Treatment == 'Radiation', ] which( d$Treatment == 'Radiation' ) ## gives index. ## Summary of Age in Surgery group. summary( d$Age[ d$Treatment == 'Surgery' ] ) ## Summary of Age for different Risk groups. tapply(X=d$Age, INDEX=d$Risk, FUN='summary') ## PLOT ## plot( d$Age, d$PSA ) ## Set parameters to make it look better with par() function. par( las=1, family='serif', bty='L', tcl=-0.2) plot( d$Age, d$PSA) ## Add other information and modify it a little.) plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85)) ## Specify colors. c(d$Treatment) ## Trick... plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85), col=c(d$Treatment) ) ## 1 is black and 2 is red. plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85), col=c('slateblue2','deeppink')[ c(d$Treatment) ], pch=19, cex=0.8) legend('topleft', pch=19, cex=0.8, col=c('slateblue2','deeppink'), legend=c('Radiation','Surgery')) boxplot( d$PSA ~ d$Risk, ylab='PSA' ) boxplot( d$PSA ~ d$Risk, ylab='PSA', border='red', col='pink') ############################ ## Writing a new function ## ############################ ## Write a function to display standard deviation, variance, sample size, and number of missing values. SdAndVar <- function(v){ # v is a vector of data. L <- length(v) nMissing <- sum( is.na(v) ) ## Counting TRUE on is.na(v) SD <- sd( v[!is.na(v)] ) ## v[!is.na(v)] ... v such that is.na(v) is NOT true. VA <- var( v[!is.na(v)] ) data.frame(n=L, nMiss=nMissing, SD, VAR=VA) } ## Write a function to multiply Time... i.e. what is '8 min 47 sec' times 6? multTime <- function(lap, multiplier, fmt=TRUE){ # lap is in 'Min.Sec' format. "8.47" means 8 minutes 47 seconds. Min <- floor(lap) ## interger portion. Sec <- (lap - Min) * 100 m <- Min + Sec/60 ## in decimal MINUTES. MM <- m * multiplier MIN <- floor(round(MM*100)/100) SEC <- round( 60*(MM - MIN)) out <- MIN+round(SEC/100,2) if(fmt) out <- paste(MIN, formatC(SEC, format='f', digit=0, width=2, flag=0), sep=':') out } addTime <- function(laps, fmt=TRUE){ # laps is in 'Min.Sec' format. "8.47" means 8 minutes 47 seconds. min <- floor(laps) sec <- (laps-min) * 100 MIN <- sum(min) SEC <- sum(sec) secM <- SEC %/% 60 secS <- round(SEC %% 60) multTime(MIN+secM+secS/100, 1, fmt) } multTime(200, 1/26.2) ## Even pace for 200 minutes. addTime(c(47,47,46,45, 15)) ## 10k pace of 47min, 47min, 46min, 45min, and 15min will give us 200 minutes. ## Packages ## install.packages('clinfun') ## Run this once (per each version of R). library(clinfun) ## Run this every time you want to use it. library(Hmisc)