Lego example, continued

True population and sampling distributions

Read true data values. Estimate sampling distribution by drawing 10000 samples of size 5 from the population and calculating the mean of each sample.

I recommend reading the comments in the code below to understand what’s being done.

fn.data <- "http://statacumen.com/teach/ADA1/worksheet/ADA1_WS_13_Inference_LegoAssemblages_Data.csv"
pop.lego <- read.csv(fn.data)

true.mean.cells <- mean(pop.lego$Cells)
true.mean.cells
## [1] 11.86
## Calculate sampling distribution of the mean
R <- 10000 # repetitions, number of samples to draw
n <- 5     # size of each sample to draw from population

# initialize a place to save the mean of each sample
samp.dist <- data.frame(m = rep(NA, R))

# loop over all repetitions, drawing a sample and saving the mean each time
# All of this can be done in one line,
#   but I'm showing the detailed logic of the procedure.
for (i in 1:R) {
  # draw sample of indicies
  ind <- sample(x = 1:nrow(pop.lego), size = n, replace = FALSE)
  # retrieve the cell values for the sampled indicies
  val <- pop.lego$Cells[ind]
  # calculate the mean and save the result
  samp.dist$m[i] <- mean(val)
}

Plot the population and sampling distributions

# plot population distribution
library(ggplot2)
p1 <- ggplot(pop.lego, aes(x = Cells))
p1 <- p1 + geom_histogram(binwidth = 1)
p1 <- p1 + geom_rug(alpha = 1/2)
# true mean cells
p1 <- p1 + geom_vline(aes(xintercept = true.mean.cells)
                  , colour = "red", linetype = "dotted", size = 1)
p1 <- p1 + labs(title = "Population distribution of cells of 50 assemblages\nred dotted = true mean")
p1 <- p1 + scale_x_continuous(limits = c(0,100))
#print(p1)


# plot sampling distribution
library(ggplot2)
p2 <- ggplot(samp.dist, aes(x = m))
p2 <- p2 + geom_histogram(binwidth = 1)
#p2 <- p2 + geom_rug(alpha = 1/20)
# est mean cells
p2 <- p2 + geom_vline(aes(xintercept = mean(samp.dist$m))
                  , colour = "blue", size = 1)
# true mean cells
p2 <- p2 + geom_vline(aes(xintercept = true.mean.cells)
                  , colour = "red", linetype = "dotted", size = 1)
p2 <- p2 + labs(title = "Sampling distribution of cells of 50 assemblages\nn=5, red dotted = true, blue = mean of sampling distribution")
p2 <- p2 + scale_x_continuous(limits = c(0,100))
#print(p2)

library(gridExtra)
grid.arrange(p1, p2, ncol=1)

Numerical summaries

What am I comparing here? Do the values make sense?

# Some summaries of interest

mean(pop.lego$Cells)
## [1] 11.86
mean(samp.dist$m)
## [1] 11.80482
sd(pop.lego$Cells)
## [1] 22.73091
sd(pop.lego$Cells) / sqrt(n)
## [1] 10.16557
sd(samp.dist$m)
## [1] 9.701353