######################################################################################## ## This code is for learning sampling variation using simulation ## [EMS] Chapter 4 ## created on Jan. 05, 2007 by Leena Choi ## modified on Jan. 13, 2007 by Leena Choi ######################################################################################## ######################################################################################## ## Underlying distribution is normal: [EMS] Example 4.4 on page 39 ######################################################################################## ## population mean and s.d. set.seed(20) mu <- 78.2 # the mean of population dbp sigma <- 9.4 # the s.d. of population dbp N <- 250 Y <- round(rnorm(N, mean=mu, sd=sigma), 2) Y[1:10] # 10 values of the population ## distribution of population hist(Y, xlim=c(40, 120)) ## sampling from the population n <- 10 # the number of observation in each sample y.sam <- sample(Y, size=n, replace=FALSE) y.sam # this is what you got ## sample mean and s.d mean(Y) mean(y.sam) sd(Y) sd(y.sam) ## comparison of distributions between population and one sample par(mfrow=c(1,2)) hist(Y, xlim=c(40, 120)) hist(y.sam, xlim=c(40, 120)) ## sampling variation sampling.var <- function(n, n.trial, sigma, mu, N){ Y <- rnorm(N, mean=mu, sd=sigma) sam.mean <- NULL for(i in 1:n.trial){ y.sam <- sample(Y, size=n, replace=FALSE) sam.mean <- c(sam.mean, mean(y.sam) ) } return(sam.mean) } n <- 10 n.trial <- 100 # the number of trials y.sam <- sample(Y, size=n, replace=FALSE) y.sam # observe what you got; try another sample set.seed(20) sam.mean <- sampling.var(n = n, n.trial=n.trial, sigma=sigma, mu=mu, N=N) ## standard error (s.e.) se <- sigma/sqrt(n) #theoretical s.e. se sd(sam.mean) y.sam <- sample(Y, size=n, replace=FALSE) # you only have one sample s <- sd(y.sam) s/sqrt(n) par(mfrow=c(1,1)) plot(50.5, mean(Y), xlab="Identification number of trial", ylab="Sample mean", col=2, pch=15, cex=1.3, xlim=c(1,100), ylim=c(65, 95)) abline(h=mu, col=3, lty=2) points(1:n.trial, sam.mean) abline(h=mu+1.96*se, col=4, lty=3) abline(h=mu-1.96*se, col=4, lty=3) ## sampling distribution as function of n and sigma bin <- 10 hist(sam.mean, bin, xlim=c(40, 120)) sam.mean.10.s10 <- sampling.var(n = 10, n.trial=n.trial, sigma=10, mu=mu, N=N) # n=10, sigma=10 sam.mean.30.s10 <- sampling.var(n = 30, n.trial=n.trial, sigma=10, mu=mu, N=N) # n=30, sigma=10 sam.mean.10.s20 <- sampling.var(n = 10, n.trial=n.trial, sigma=20, mu=mu, N=N) # n=10, sigma=20 sam.mean.30.s20<- sampling.var(n = 30, n.trial=n.trial, sigma=20, mu=mu, N=N) # n=30, sigma=20 par(mfrow=c(2,2)) hist(sam.mean.10.s10, bin, xlim=c(40, 120)) hist(sam.mean.30.s10, bin, xlim=c(40, 120)) hist(sam.mean.10.s20, bin, xlim=c(40, 120)) hist(sam.mean.30.s20, bin, xlim=c(40, 120)) ######################################################################################## ## Underlying distribution is non-normal ######################################################################################## ## skewed distribution ## population mean and s.d. set.seed(20) mu <- 80 # the mean of population dbp sigma <- 10 # the s.d. of population dbp N <- 20000 Y.skew <- rlnorm(N, meanlog=log(mu), sdlog=0.3) Y <- rnorm(N, mean=mu, sd=sigma) ## distribution of population par(mfrow=c(1,2)) hist(Y.skew, xlim=c(0, 200)) hist(Y, xlim=c(0, 200)) sampling.var.skew <- function(n, n.trial, sigma, mu, N, dist, cv, lambda, prob){ if(dist=="lognorm"){ Y <- rlnorm(N, meanlog=log(mu), sdlog=cv) } else if(dist=="poisson"){ Y <- rpois(N, lambda=lambda) } else if(dist=="geometric"){ Y <- rgeom(N, prob=prob) } else{Y <- rnorm(N, mean=mu, sd=sigma) } sam.mean <- NULL for(i in 1:n.trial){ y.sam <- sample(Y, size=n, replace=FALSE) sam.mean <- c(sam.mean, mean(y.sam) ) } return(sam.mean) } n <- 10 sigma <- 40 sam.mean.norm <- sampling.var.skew(n = n, n.trial=n.trial, sigma=sigma, mu=mu, N=N, dist="normal") sam.mean.log <- sampling.var.skew(n = n, n.trial=n.trial, sigma=sigma, mu=mu, N=N, dist="lognorm", cv=sigma/mu) sam.mean.pois <- sampling.var.skew(n = n, n.trial=n.trial, sigma=sigma, mu=mu, N=N, dist="poisson", lambda=mu) sam.mean.geom <- sampling.var.skew(n = n, n.trial=n.trial, sigma=sigma, mu=mu, N=N, dist="geometric", prob=1/mu) par(mfrow=c(2,2)) hist(sam.mean.norm, xlim=c(40, 150), bin, main=paste("underlying distribution: normal","\n", "n =", n)) hist(sam.mean.pois, xlim=c(40, 150), bin, main=paste("underlying distribution: Poisson","\n", "n =", n)) hist(sam.mean.log, xlim=c(40, 150), bin, main=paste("underlying distribution: log normal","\n", "n =", n)) hist(sam.mean.geom, xlim=c(40, 150), bin, main=paste("underlying distribution: geometric","\n", "n =", n))