#######################
## Introduction to R ##
#######################

## CQS Summer Institute 2017
## Biostatistics 1
## 8/7/17 ~ 8/11/17
## Tatsuki Koyama

## Very basic ##
## Getting help
help('mean') ## mean is a function in R.
?median ## so is median.

help.search('average')
example('t.test')

## R is case-sensitive.
citation() # Citation() doesn't work.
ls()
getwd()

save.image() # or specify the file name (.RData)
save.image('practice.RData')

## R as a calculator.
3 + 5 * 4
12 * (5 - 2) ## 12(5-2) doesn't work.

pi

## Simple functions. 
log(2) ## log() is a function that takes 2 arguments.
    args(log)
log(2, base=10) ## log2 base 10
log(2, base=2) ## log2 base 2
log(2, base=exp(1)) ## natural log of 2

round(pi)
    args(round)
round(pi, 3) ## short for round(x=pi, digits=3)
    round(2.5) ## not what you expect.

ceiling(pi*100)
floor(pi*100)

## Vector and assignment
c(3, 4, 5, 8) ## A vector of length 4
age <- c(44, 52, 39) ## The variable 'age' gets c(44, 52, 39)
    age
age + 3 ## Same as age + c(3,3,3); short vectors get recycled.

x <- c(1:4)
    length(x)
y <- c(12:5)
    length(y)

x + y ## Is this really what you wanted to do?
x + c(4, 6, 3) ## gets a warning.

## Let's compute the average age. 
age ## We assigned c(45, 52, 39) to 'age' before.
    sum(age) # sum of age.
    length(age) # length. ie how many data.
    sum(age) / length(age)

    mean(age)

## Logical vector
## Is mean(age) greater than 45?
    mean(age) > 45
## Is it greater than or equal to 45?
    mean(age) >= 45
## Is it equal to 45?
    mean(age) == 45

## Is age greater than 40?
    age > 40 ## This is done by element.
Over40 <- age > 40
    age == 40 ## Is age equal to 40?
    age = 40  ## Oops!

Midterm <- c(79, 88, 69, 75, 88, 39)
Final <- c(72, 90, 59, 68, 72, NA) # NA is a missing value.
    Midterm[3] ## The third element of the vector.
    Final[2:5] ## 2nd to 5th elements of the vector.
Midterm[3] <- 59 ## The third element of Midterm gets 59.

## Average for Final
mean(Final) ## is NA!
    ?mean ## There is an argument called 'na.rm' (na remove).

is.na(Final)  ## Very useful function to see if there is any missing value.
    any(is.na(Final)) ## Is there any missing? Combination of two funcitons.
    all(is.na(Final)) 

mean(Final, na.rm=TRUE)
    mean(Final[1:5])
    mean(Final[-6])

## Some useful functions.
    length(Final)
    sum(Final, na.rm=TRUE) ## Sum
    prod(Final, na.rm=TRUE) ## Product
    max(Final, na.rm=TRUE) ## Maximum
    min(Final, na.rm=TRUE) ## Minimm
    sort(Final) ## NA is ignored.
    order(Final) ## NA is put at the end.
    rank(Final) ## NA is put at the end.
    unique(Final)
    duplicated(Final)

## Average of Midterm and Final for every one. 
mean(Midterm, Final) ## Nope.
(Midterm + Final) / 2

TestScore <- cbind(Midterm, Final) ## cbind ... column bind.
    ## Now TestScore is a "matrix", not a vector. 
TestScore[3,] ## The third row.
TestScore[,2] ## The second column.

mean(TestScore[3,]) ## Average of midterm and final for the third person.
    apply(TestScore, 1, mean)
        ## Apply the function "mean" to each row (margin=1) in matrix TestScore.
    apply(TestScore, 1, mean, na.rm=TRUE) 
        ## Additional argument(s) goes with the function.

## Read in data ##
getwd() ## Know where you are (file location).
####################################################
## setwd('/Users/tatsukikoyama/Desktop/IntroToR') ##
####################################################
list.files()

d <- read.csv(file='practiceCeasarData.csv', header=TRUE, as.is=FALSE)
    ## d is a data.frame. 
nrow(d) ## number of rows.
ncol(d) ## number of columns.

dim(d) ## number of rows and columns.
head(d) ## First few lines of d.
tail(d, 20) ## Final 20 lines of d.

names(d) ## Column names of d.
    d$Risk ## Call the column named 'Risk'

summary(d) ## Summary for each column of d.
    ## d$Risk is a special vector called 'factor'. It is a character vector with the factor levels.
    ## Can't change the value if it is not in the factor level.
        ## d$Risk[1] <- 'high' will change the first entry of d$Risk to NA becaue 'high' is not one of the factor levels.
    levels(d$Risk)

## Reorder the levels of some columns.
levels(d$Gleason)
    levels(d$Gleason) <- c('6 or less', "3 + 4", '4 + 3', "8,9,10") ## Single and double quotation marks both work.
levels(d$Education)
    levels(d$Education) <- c('High school', 'Some college', 'College graduate', 'Graduate school')
levels(d$Income)
    levels(d$Income) <- c('- 30K', '30K - 50K', '50K - 100K', '100K -')
levels(d$Race)
    d$Race <- factor(d$Race, levels=c('White','Black','Other'))

summary(d)

table( d$Risk, d$Race )

## Subset 
Surgery <- subset(d, Treatment=='Surgery')
    ## or simply
Radiation <- d[ d$Treatment == 'Radiation', ]

which( d$Treatment == 'Radiation' )  ## gives index.

## Summary of Age in Surgery group.
summary( d$Age[ d$Treatment == 'Surgery' ] )
## Summary of Age for different Risk groups.
tapply(X=d$Age, INDEX=d$Risk, FUN='summary')

## PLOT ##
plot( d$Age, d$PSA )
    ## Set parameters to make it look better with par() function.
par( las=1, family='serif', bty='L', tcl=-0.2)
plot( d$Age, d$PSA)
    ## Add other information and modify it a little.)
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85))
    ## Specify colors.
    c(d$Treatment) ## Trick...
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85), col=c(d$Treatment) ) ## 1 is black and 2 is red.
plot( d$Age, d$PSA, xlab='Age', ylab='PSA', xlim=c(35,85), col=c('slateblue2','deeppink')[ c(d$Treatment) ], pch=19, cex=0.8)
    legend('topleft', pch=19, cex=0.8, col=c('slateblue2','deeppink'), legend=c('Radiation','Surgery'))

boxplot( d$PSA ~ d$Risk, ylab='PSA' )
    boxplot( d$PSA ~ d$Risk, ylab='PSA', border='red', col='pink')

############################
## Writing a new function ##
############################

## Write a function to display standard deviation, variance, sample size, and number of missing values.
SdAndVar <- function(v){
    # v is a vector of data. 
    L <- length(v)
        nMissing <- sum( is.na(v) ) ## Counting TRUE on is.na(v)
    SD <- sd( v[!is.na(v)] ) ## v[!is.na(v)] ... v such that is.na(v) is NOT true.
    VA <- var( v[!is.na(v)] )

    data.frame(n=L, nMiss=nMissing, SD, VAR=VA)
}

## Write a function to multiply Time... i.e. what is '8 min 47 sec' times 6?
multTime <- function(lap, multiplier, fmt=TRUE){
    # lap is in 'Min.Sec' format. "8.47" means 8 minutes 47 seconds.
    Min <- floor(lap) ## interger portion.
    Sec <- (lap - Min) * 100
        m <- Min + Sec/60 ## in decimal MINUTES.
    MM <- m * multiplier
        MIN <- floor(round(MM*100)/100)
        SEC <- round( 60*(MM - MIN))
        out <- MIN+round(SEC/100,2)
        if(fmt) out <- paste(MIN, formatC(SEC, format='f', digit=0, width=2, flag=0), sep=':')
    out       
}

addTime <- function(laps, fmt=TRUE){
    # laps is in 'Min.Sec' format. "8.47" means 8 minutes 47 seconds.
    min <- floor(laps)
    sec <- (laps-min) * 100
        MIN <- sum(min)
        SEC <- sum(sec)
        secM <- SEC %/% 60
        secS <- round(SEC %% 60)
    multTime(MIN+secM+secS/100, 1, fmt)
}

multTime(200, 1/26.2) ## Even pace for 200 minutes.
addTime(c(47,47,46,45, 15)) ## 10k pace of 47min, 47min, 46min, 45min, and 15min will give us 200 minutes.


## Packages ##
install.packages('clinfun') ## Run this once (per each version of R).
    library(clinfun) ## Run this every time you want to use it.
library(Hmisc)