---
title: "ADA1: Class 18, Nonparametric methods"
author: "Your Name Here"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
html_document:
toc: true
---
Include your answers in this document in the sections below the rubric where I have point values indicated (1 p).
# Rubric
Answer the questions with the data examples.
---
```{R}
library(tidyverse)
```
```{R, echo = FALSE}
#### Visual comparison of whether sampling distribution is close to Normal via Bootstrap
# a function to compare the bootstrap sampling distribution with
# a normal distribution with mean and SEM estimated from the data
bs.one.samp.dist <- function(dat, N = 1e4) {
n <- length(dat);
# resample from data
sam <- matrix(sample(dat, size = N * n, replace = TRUE), ncol=N);
# draw a histogram of the means
sam.mean <- colMeans(sam);
# save par() settings
old.par <- par(no.readonly = TRUE)
# make smaller margins
par(mfrow=c(2,1), mar=c(3,2,2,1), oma=c(1,1,1,1))
# Histogram overlaid with kernel density curve
hist(dat, freq = FALSE, breaks = 6
, main = "Plot of data with smoothed density curve")
points(density(dat), type = "l")
rug(dat)
hist(sam.mean, freq = FALSE, breaks = 25
, main = "Bootstrap sampling distribution of the mean"
, xlab = paste("Data: n =", n
, ", mean =", signif(mean(dat), digits = 5)
, ", se =", signif(sd(dat)/sqrt(n)), digits = 5))
# overlay a density curve for the sample means
points(density(sam.mean), type = "l")
# overlay a normal distribution, bold and red
x <- seq(min(sam.mean), max(sam.mean), length = 1000)
points(x, dnorm(x, mean = mean(dat), sd = sd(dat)/sqrt(n))
, type = "l", lwd = 2, col = "red")
# place a rug of points under the plot
rug(sam.mean)
# restore par() settings
par(old.par)
}
#### Visual comparison of whether sampling distribution is close to Normal via Bootstrap
# a function to compare the bootstrap sampling distribution
# of the difference of means from two samples with
# a normal distribution with mean and SEM estimated from the data
bs.two.samp.diff.dist <- function(dat1, dat2, N = 1e4) {
n1 <- length(dat1);
n2 <- length(dat2);
# resample from data
sam1 <- matrix(sample(dat1, size = N * n1, replace = TRUE), ncol=N);
sam2 <- matrix(sample(dat2, size = N * n2, replace = TRUE), ncol=N);
# calculate the means and take difference between populations
sam1.mean <- colMeans(sam1);
sam2.mean <- colMeans(sam2);
diff.mean <- sam1.mean - sam2.mean;
# save par() settings
old.par <- par(no.readonly = TRUE)
# make smaller margins
par(mfrow=c(3,1), mar=c(3,2,2,1), oma=c(1,1,1,1))
# Histogram overlaid with kernel density curve
hist(dat1, freq = FALSE, breaks = 6
, main = paste("Sample 1", "\n"
, "n =", n1
, ", mean =", signif(mean(dat1), digits = 5)
, ", sd =", signif(sd(dat1), digits = 5))
, xlim = range(c(dat1, dat2)))
points(density(dat1), type = "l")
rug(dat1)
hist(dat2, freq = FALSE, breaks = 6
, main = paste("Sample 2", "\n"
, "n =", n2
, ", mean =", signif(mean(dat2), digits = 5)
, ", sd =", signif(sd(dat2), digits = 5))
, xlim = range(c(dat1, dat2)))
points(density(dat2), type = "l")
rug(dat2)
hist(diff.mean, freq = FALSE, breaks = 25
, main = paste("Bootstrap sampling distribution of the difference in means", "\n"
, "mean =", signif(mean(diff.mean), digits = 5)
, ", se =", signif(sd(diff.mean), digits = 5)))
# overlay a density curve for the sample means
points(density(diff.mean), type = "l")
# overlay a normal distribution, bold and red
x <- seq(min(diff.mean), max(diff.mean), length = 1000)
points(x, dnorm(x, mean = mean(diff.mean), sd = sd(diff.mean))
, type = "l", lwd = 2, col = "red")
# place a rug of points under the plot
rug(diff.mean)
# restore par() settings
par(old.par)
}
```
# Paired Comparisons of Two Sleep Remedies, revisited
We revisit the sleep remedy dataset (from WS16) because the normality assumption
of the sampling distribution of the mean did not seem to be reasonable.
The plots below remind us of the data and bootstrap estimate of the sampling distribution.
```{R, echo=FALSE, fig.width = 4, fig.height = 4}
# Data
sleep <- read.table(text="
A B
0.7 1.9
-1.6 0.8
-0.2 1.1
-1.2 0.1
0.1 -0.1
3.4 4.4
3.7 5.5
0.8 1.6
0.0 4.6
2.0 3.0
", header = TRUE)
# calculate paired difference bewteen two remedies, D = B - A
sleep$D <- sleep$B - sleep$A
#summary(sleep)
# determine range of each axis to use most extreme of each for square plot below
axis.lim <- range(c(sleep$A, sleep$B))
library(ggplot2)
# scatterplot of A and B sleep times, with 1:1 line
p <- ggplot(sleep, aes(x = A, y = B))
# draw a 1:1 line, dots above line indicate "B > A"
p <- p + geom_abline(intercept = 0, slope = 1, alpha = 0.4)
p <- p + geom_vline(xintercept = 0, alpha = 0.2)
p <- p + geom_hline(yintercept = 0, alpha = 0.2)
p <- p + geom_point()
p <- p + geom_rug()
# make the axes square so it's a fair visual comparison
p <- p + coord_equal()
p <- p + scale_x_continuous(limits=axis.lim)
p <- p + scale_y_continuous(limits=axis.lim)
p <- p + labs(title = "Sleep hours gained on two sleep remedies: A vs B")
p <- p + theme_bw()
print(p)
library(ggplot2)
p1 <- ggplot(sleep, aes(x = D))
p1 <- p1 + scale_x_continuous(limits=c(-6,+6))
p1 <- p1 + geom_histogram(aes(y=..density..), binwidth = 1)
p1 <- p1 + geom_density(alpha=0.1, fill="white", adjust = 2)
# vertical reference line at 0
p1 <- p1 + geom_vline(xintercept = 0, colour="red", linetype="dashed")
p1 <- p1 + geom_vline(xintercept = mean(sleep$D), colour="blue", alpha = 0.5)
p1 <- p1 + geom_rug()
p1 <- p1 + labs(title = "Difference of sleep hours gained: D = B - A")
p1 <- p1 + theme_bw()
print(p1)
```
```{R, echo=FALSE, fig.width = 6, fig.height = 4}
bs.one.samp.dist(sleep$D)
```
Use the example in ADA1 Chapter 6 on Nonparametric Methods to state the hypothesis and results of the tests:
## Sign test
(1 p) Hypotheses in words and notation. Be sure to specify which population parameter is being tested.
(1 p) R code and results for test (use the `BSDA` package and `SIGN.test()`). Use $\alpha = 0.10$.
(1 p) Conclusion based on test.
## 90% CI for the Median
(0.5 p) R code and CI (use the `wilcox.test()`).
(0.5 p) State the assumptions for the Wilcoxon CI.
(1 p) Determine the appropriate method for a CI (sign or Wilcoxon), then present and interpret the 90% CI with reference to the population parameter.
---
# The Number of Breaks in Yarn during Weaving
This data set gives the number of warp breaks per loom,
where a loom corresponds to a fixed length of yarn.
We will analyze wool type B and see whether the tension
affects the number of breaks.
```
A data frame with 54 observations on 3 variables.
[,1] breaks numeric The number of breaks
[,2] wool factor The type of wool (A or B)
[,3] tension factor The level of tension (L, M, H)
```
```{R}
library(datasets)
str(warpbreaks)
# keep only wool type B, then drop wool type from data frame
warp.sub <- subset(warpbreaks, wool == "B", select = -wool)
str(warp.sub)
```
Notice that in the plot below I include the points, median, and IQR (the center 50%).
Because there's only 9 observations per group the boxplot and violin plot that I would typically also include
becomes distracting and less informative.
With very small sample sizes,
the salience of individual values should increase relative to summaries.
```{R, fig.width = 8, fig.height = 4}
# A set of useful summary functions is provided from the Hmisc package:
library(Hmisc)
stat_sum_df <- function(fun, geom="crossbar", ...) {
stat_summary(fun.data=fun, colour="gray60", geom=geom, width=0.2, ...)
}
# Plot the data using ggplot
library(ggplot2)
p <- ggplot(warp.sub, aes(x = tension, y = breaks))
# plot a reference line for the global mean (assuming no groups)
p <- p + geom_hline(yintercept = mean(warp.sub$breaks),
colour = "black", linetype = "dashed", size = 0.3, alpha = 0.5)
# box for IQR and median
p <- p + stat_sum_df("median_hilow", fun.args = list(conf.int=0.5), show.legend=FALSE)
# diamond at mean for each group
p <- p + stat_summary(fun.y = median, geom = "point", shape = 18, size = 4,
colour = "red", alpha = 0.8)
# points for observed data
p <- p + geom_point(alpha = 0.5) #position = position_jitter(w = 0.05, h = 0), alpha = 1)
p <- p + labs(title = "Number of breaks by loom tension for wool B")
p <- p + expand_limits(y = 10)
p <- p + theme_bw()
#print(p)
# log scale
p2 <- p
p2 <- p2 + scale_y_log10(breaks = seq(0, 50, by=10))
p2 <- p2 + labs(y = "log10(breaks)")
p2 <- p2 + expand_limits(y = 10)
#print(p2)
library(gridExtra)
grid.arrange(p, p2, ncol=2)
```
Some numerical summaries may be helpful later.
```{R}
# summary of each
by(warp.sub$breaks, warp.sub$tension, summary)
# IQR, sd, and n
by(warp.sub$breaks, warp.sub$tension, function(X) { c(IQR(X), sd(X), length(X)) })
```
Nonparametric methods are available to use even when assumptions for other tests are met.
Use the Kruskal-Wallis rank sum test to assess whether there are differences in medians between these three groups.
If there are, determine which pairs differ using the Wilcox-Mann-Whitney test.
(1 p) State the hypothesis.
Perform the KW test.
```{R}
# KW ANOVA
fit.bt <- kruskal.test(breaks ~ tension, data = warp.sub)
fit.bt
```
(1 p) State the assumptions of the KW test and provide the support that the assumptions are either met or not.
(1 p) Interpret the test result above. Use $\alpha = 0.10$.
Follow-up pairwise comparisons.
```{R}
wilcox.test(breaks ~ tension, data = warp.sub, subset = (tension %in% c("L", "M")))
wilcox.test(breaks ~ tension, data = warp.sub, subset = (tension %in% c("L", "H")))
wilcox.test(breaks ~ tension, data = warp.sub, subset = (tension %in% c("M", "H")))
```
(1 p) Interpret the pairwise comparisons above (which pairs differ) at a
Bonferroni-corrected significance level of $\alpha/3 = `r signif(0.1/3, 3)`$.
(1 p) Summarize results by ordering the means and grouping pairs that do not differ (see notes for examples).
```
REPLACE THIS EXAMPLE WITH YOUR RESULTS.
Example: Groups A and C differ, but B is not different from either.
(These groups are ordered by mean, so A has the lowest median and C has the highest.)
Group A Group B Group C
-----------------
-----------------
```