---
title: "ADA1: Class 13, Inference, Part 2"
author: anonymous
date: "10/04/2016"
output:
html_document:
toc: true
---
# Lego example, continued
## True population and sampling distributions
Read true data values.
Estimate sampling distribution by
drawing 10000 samples of size 5 from the population and
calculating the mean of each sample.
I recommend reading the comments in the code below to understand what's being done.
```{R}
fn.data <- "http://statacumen.com/teach/ADA1/worksheet/ADA1_WS_13_Inference_LegoAssemblages_Data.csv"
pop.lego <- read.csv(fn.data)
true.mean.cells <- mean(pop.lego$Cells)
true.mean.cells
## Calculate sampling distribution of the mean
R <- 10000 # repetitions, number of samples to draw
n <- 5 # size of each sample to draw from population
# initialize a place to save the mean of each sample
samp.dist <- data.frame(m = rep(NA, R))
# loop over all repetitions, drawing a sample and saving the mean each time
# All of this can be done in one line,
# but I'm showing the detailed logic of the procedure.
for (i in 1:R) {
# draw sample of indicies
ind <- sample(x = 1:nrow(pop.lego), size = n, replace = FALSE)
# retrieve the cell values for the sampled indicies
val <- pop.lego$Cells[ind]
# calculate the mean and save the result
samp.dist$m[i] <- mean(val)
}
```
Plot the population and sampling distributions
```{R}
# plot population distribution
library(ggplot2)
p1 <- ggplot(pop.lego, aes(x = Cells))
p1 <- p1 + geom_histogram(binwidth = 1)
p1 <- p1 + geom_rug(alpha = 1/2)
# true mean cells
p1 <- p1 + geom_vline(aes(xintercept = true.mean.cells)
, colour = "red", linetype = "dotted", size = 1)
p1 <- p1 + labs(title = "Population distribution of cells of 50 assemblages\nred dotted = true mean")
p1 <- p1 + scale_x_continuous(limits = c(0,100))
#print(p1)
# plot sampling distribution
library(ggplot2)
p2 <- ggplot(samp.dist, aes(x = m))
p2 <- p2 + geom_histogram(binwidth = 1)
#p2 <- p2 + geom_rug(alpha = 1/20)
# est mean cells
p2 <- p2 + geom_vline(aes(xintercept = mean(samp.dist$m))
, colour = "blue", size = 1)
# true mean cells
p2 <- p2 + geom_vline(aes(xintercept = true.mean.cells)
, colour = "red", linetype = "dotted", size = 1)
p2 <- p2 + labs(title = "Sampling distribution of cells of 50 assemblages\nn=5, red dotted = true, blue = mean of sampling distribution")
p2 <- p2 + scale_x_continuous(limits = c(0,100))
#print(p2)
library(gridExtra)
grid.arrange(p1, p2, ncol=1)
```
## Numerical summaries
What am I comparing here?
Do the values make sense?
```{R}
# Some summaries of interest
mean(pop.lego$Cells)
mean(samp.dist$m)
sd(pop.lego$Cells)
sd(pop.lego$Cells) / sqrt(n)
sd(samp.dist$m)
```