Creating A Data Frame

ws <- c(67, 64, 66, 67, 68, 61, 64, 65, 69, 71)
hi <- c(70, 69, 72, 70, 70, 62, 68, 65, 66, 77)
name <- c("Dustin", "Cescily", "Averia", "John", "Carter", "Jaehee",
          "Paige", "Sierra", "Austin", "Lee")
sex <- c("M", "F", "F", "M", "M", "F", "F", "F", "M", "M")
DF <- data.frame(Name = name, WingSpan = ws, HeightIn = hi, Sex = sex)
# Remove individual vectors
rm(ws, hi, sex)
str(DF)

'data.frame':   10 obs. of  4 variables:
 $ Name    : Factor w/ 10 levels "Austin","Averia",..: 5 4 2 7 3 6 9 10 1 8
 $ WingSpan: num  67 64 66 67 68 61 64 65 69 71
 $ HeightIn: num  70 69 72 70 70 62 68 65 66 77
 $ Sex     : Factor w/ 2 levels "F","M": 2 1 1 2 2 1 1 1 2 2

DF$Name <- as.character(DF$Name)
str(DF)

'data.frame':   10 obs. of  4 variables:
 $ Name    : chr  "Dustin" "Cescily" "Averia" "John" ...
 $ WingSpan: num  67 64 66 67 68 61 64 65 69 71
 $ HeightIn: num  70 69 72 70 70 62 68 65 66 77
 $ Sex     : Factor w/ 2 levels "F","M": 2 1 1 2 2 1 1 1 2 2

Creating a Scatter Plot

library(ggplot2)
ggplot(data = DF, aes(x = HeightIn, y = WingSpan, color = Sex)) +
  geom_point() + 
  geom_smooth(method = "lm") + 
  theme_bw()

Computing Descriptive Statistics

library(dplyr)
Res <- DF %>%
  group_by(Sex) %>%
  summarize(MH = mean(HeightIn), SH = sd(HeightIn), n = n())
Res

# A tibble: 2 x 4
  Sex       MH    SH     n
  <fctr> <dbl> <dbl> <int>
1 F       67.2  3.83     5
2 M       70.6  3.97     5

Showing Tables

knitr::kable(Res)

Sex	MH	SH	n
F	67.2	3.834058	5
M	70.6	3.974921	5

DT::datatable(Res)

Testing Hypotheses

\(H_0: \textrm{The average height of males is the no different than the average height of females.}\)

\(H_A: \textrm{The average height of males is greater than the average height of females.}\)

Written mathematically,

\(H_0: \mu_{\textrm{males}} - \mu_{\textrm{females}} = 0\)

\(H_A: \mu_{\textrm{males}} - \mu_{\textrm{females}} > 0\)

Question - How should be test our hypotheses?

\(t\)-test?
Permutation Test?
Bootstrap CI?

What are the assumptions for a \(t\)-test?

# Using ggplot2
ggplot(data = DF, aes(sample = HeightIn, color = Sex)) +
  geom_qq() + 
  theme_bw()

xbar1 <- Res[2, 2]
xbar2 <- Res[1, 2]
s1 <- Res[2, 3]
s2 <- Res[1, 3]
n1 <- Res[2, 4]
n2 <- Res[1, 4]
TS <- (xbar1 - xbar2)/(s1^2/n1 + s2^2/n2)^0.5
names(TS) <- "TestStatistic"
TS

  TestStatistic
1       1.37662

str(TS$TestStatistic)

 num 1.38

nu <- (s1^2/n1 + s2^2/n2)^2 / ((s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1))
names(nu) <- "DegreesOfFreedom"
nu

  DegreesOfFreedom
1         7.989608

During class, the question was asked “What are degrees of freedom?” A good explanation is provided on Wikipedia.

Base R

curve(dt(x, nu$DegreesOfFreedom), from = -4, to = 4, n = 500, col = "purple", ylab = "")

Curve with `ggplot2`

ggplot(data.frame(x = c(-4, 4)), aes(x=x)) +
  stat_function(fun = dt, args = list(df = nu$DegreesOfFreedom)) + 
  theme_bw() + 
  geom_vline(xintercept = TS$TestStatistic, color = "purple") + 
  geom_hline(yintercept = 0)

\(p\)-value

ts <- TS$TestStatistic
ts

[1] 1.37662

nu <- nu$DegreesOfFreedom
nu

[1] 7.989608

pvalue <- pt(ts, nu, lower = FALSE)
pvalue

[1] 0.1029859

The \(P(t_{\nu} = t_{7.9896077} \ge 1.3766198) = 0.1029859.\)

t.test(HeightIn ~ Sex, data = DF, alternative = "less")


    Welch Two Sample t-test

data:  HeightIn by Sex
t = -1.3766, df = 7.9896, p-value = 0.103
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
     -Inf 1.193519
sample estimates:
mean in group F mean in group M 
           67.2            70.6

Permutation/Randomization Testing

N <- 10^4 - 1
PTS <- numeric(N)
for(i in 1:N){
  PTS[i] <- t.test(HeightIn ~ sample(Sex), data = DF)$stat
}
epv <- (sum(PTS >= ts) + 1)/(N + 1)
epv

[1] 0.1232

ggplot(data = data.frame(x = PTS), aes(x = x)) +
  geom_density(fill = "purple") + 
  theme_bw()

Bootstrapping Percentile CI

Key difference is that sampling is done with replacement!

HeightInM <- DF$HeightIn[DF$Sex =="M"]
HeightInM

[1] 70 70 70 66 77

HeightInF <- subset(DF, Sex == "F", select = HeightIn, drop = TRUE)
HeightInF

[1] 69 72 62 68 65

B <- 10^4 - 1
BSD <- numeric(B)
for(i in 1:B){
  bs1 <- sample(HeightInM, size = 5, replace = TRUE)
  bs2 <- sample(HeightInF, size = 5, replace = TRUE)
  BSD[i] <- mean(bs1) - mean(bs2)
}
ggplot(data = data.frame(x = BSD), aes(x = x)) +
  geom_density(fill = "lightblue") +
  theme_bw()

quantile(BSD, probs = c(0.05, 0.95))

  5%  95% 
-0.2  7.2

Day One

Alan Arnholt

Jan 17, 2017

Creating A Data Frame

Creating a Scatter Plot

Computing Descriptive Statistics

Showing Tables

Testing Hypotheses

Base R

Curve with `ggplot2`

\(p\)-value

Permutation/Randomization Testing

Bootstrapping Percentile CI

Day One

Alan Arnholt

Jan 17, 2017

Creating A Data Frame

Creating a Scatter Plot

Computing Descriptive Statistics

Showing Tables

Testing Hypotheses

Base R

Curve with ggplot2

\(p\)-value

Permutation/Randomization Testing

Bootstrapping Percentile CI

Curve with `ggplot2`