# Lego example, continued

## True population and sampling distributions

Read true data values. Estimate sampling distribution by drawing 10000 samples of size 5 from the population and calculating the mean of each sample.

I recommend reading the comments in the code below to understand whatâ€™s being done.

fn.data <- "http://statacumen.com/teach/ADA1/worksheet/ADA1_WS_13_Inference_LegoAssemblages_Data.csv"

true.mean.cells <- mean(pop.lego$Cells) true.mean.cells ## [1] 11.86 ## Calculate sampling distribution of the mean R <- 10000 # repetitions, number of samples to draw n <- 5 # size of each sample to draw from population # initialize a place to save the mean of each sample samp.dist <- data.frame(m = rep(NA, R)) # loop over all repetitions, drawing a sample and saving the mean each time # All of this can be done in one line, # but I'm showing the detailed logic of the procedure. for (i in 1:R) { # draw sample of indicies ind <- sample(x = 1:nrow(pop.lego), size = n, replace = FALSE) # retrieve the cell values for the sampled indicies val <- pop.lego$Cells[ind]
# calculate the mean and save the result
samp.dist$m[i] <- mean(val) } Plot the population and sampling distributions # plot population distribution library(ggplot2) p1 <- ggplot(pop.lego, aes(x = Cells)) p1 <- p1 + geom_histogram(binwidth = 1) p1 <- p1 + geom_rug(alpha = 1/2) # true mean cells p1 <- p1 + geom_vline(aes(xintercept = true.mean.cells) , colour = "red", linetype = "dotted", size = 1) p1 <- p1 + labs(title = "Population distribution of cells of 50 assemblages\nred dotted = true mean") p1 <- p1 + scale_x_continuous(limits = c(0,100)) #print(p1) # plot sampling distribution library(ggplot2) p2 <- ggplot(samp.dist, aes(x = m)) p2 <- p2 + geom_histogram(binwidth = 1) #p2 <- p2 + geom_rug(alpha = 1/20) # est mean cells p2 <- p2 + geom_vline(aes(xintercept = mean(samp.dist$m))
, colour = "blue", size = 1)
# true mean cells
p2 <- p2 + geom_vline(aes(xintercept = true.mean.cells)
, colour = "red", linetype = "dotted", size = 1)
p2 <- p2 + labs(title = "Sampling distribution of cells of 50 assemblages\nn=5, red dotted = true, blue = mean of sampling distribution")
p2 <- p2 + scale_x_continuous(limits = c(0,100))
#print(p2)

library(gridExtra)
grid.arrange(p1, p2, ncol=1)

## Numerical summaries

What am I comparing here? Do the values make sense?

# Some summaries of interest

mean(pop.lego$Cells) ## [1] 11.86 mean(samp.dist$m)
## [1] 11.80482
sd(pop.lego$Cells) ## [1] 22.73091 sd(pop.lego$Cells) / sqrt(n)
## [1] 10.16557
sd(samp.dist\$m)
## [1] 9.701353