Create the data set discussed on page 248 of OpenIntro Statistics.
library(dplyr)
library(openintro)
bat10 <- mlbBat10 %>%
filter(AB > 200)
# Note unused factor levels
table(bat10$position)
1B 2B 3B OF SS DH C P -
33 39 40 120 42 14 39 0 0
bat10$position <- droplevels(bat10$position)
table(bat10$position)
1B 2B 3B OF SS DH C
33 39 40 120 42 14 39
# Combine 1B, 2B, 3B, SS into infield (IF)
#
bat10$pos <- plyr::revalue(bat10$position,
replace = c("1B" = "IF", "2B" = "IF", "3B" = "IF", "SS" = "IF"))
table(bat10$pos)
IF OF DH C
154 120 14 39
# Note must detach plyr for dplyr to work properly
# detach("package:plyr", unload = TRUE)
Create the summary statistics presented in Table 5.24 of OpenIntro Statistics.
T524 <- bat10 %>%
group_by(pos) %>%
summarise(n = n(), Mean = round(mean(OBP), 4), SD = round(sd(OBP), 4))
T524
# A tibble: 4 × 4
pos n Mean SD
<fctr> <int> <dbl> <dbl>
1 IF 154 0.3315 0.0371
2 OF 120 0.3342 0.0294
3 DH 14 0.3478 0.0360
4 C 39 0.3226 0.0451
DT::datatable(T524)
Your Turn:
\(H_0:\)
\(H_A:\)
library(ggplot2)
ggplot(data = bat10, aes(x = pos, y = OBP, fill = pos)) +
geom_boxplot() +
theme_bw() +
labs(y = "On base percentage", x = "Position") +
guides(fill = FALSE)
ggplot(data = bat10, aes(sample = OBP, color = pos)) +
stat_qq() +
theme_bw()
ggplot(data = bat10, aes(sample = OBP, color = pos)) +
stat_qq() +
facet_grid(.~pos) +
theme_bw() +
guides(color = FALSE)
mod.aov <- aov(OBP ~ pos, data = bat10)
summary(mod.aov)
Df Sum Sq Mean Sq F value Pr(>F)
pos 3 0.0076 0.002519 1.994 0.115
Residuals 323 0.4080 0.001263
Your turn: Compute the \(p-\)value using pf
. Does your answer agree with the output from the ANOVA table?
# Your code here
library(ggplot2)
ggplot(data = data.frame(x = seq(0, 6, length= 200)), aes(x = x)) +
stat_function(fun = df, args = list(3, 233), geom = "area", fill = "purple", alpha = 0.5) +
theme_bw() +
labs(x = "", y = "")
df_limit <- function(x){
y <- df(x, 3, 323)
y[x < 1.994] <- NA
return(y)
}
p <- ggplot(data.frame(x = c(0, 6)), aes(x = x))
p + stat_function(fun = df_limit, geom = "area", fill = "purple", alpha = 0.4, n = 500) +
stat_function(fun = df, args = list(3, 323)) +
theme_bw() +
labs(x = "", y = "")
Finished - Now consider a different problem.
Use bat10
to test whether the on base percentage is related to position.
T5N <- bat10 %>%
group_by(position) %>%
summarise(n = n(), Mean = round(mean(OBP), 4), SD = round(sd(OBP), 4))
T5N
# A tibble: 7 × 4
position n Mean SD
<fctr> <int> <dbl> <dbl>
1 1B 33 0.3554 0.0419
2 2B 39 0.3348 0.0278
3 3B 40 0.3224 0.0402
4 OF 120 0.3342 0.0294
5 SS 42 0.3184 0.0279
6 DH 14 0.3478 0.0360
7 C 39 0.3226 0.0451
Your turn:
\(H_O:\)
\(H_A:\)
ggplot(data = bat10, aes(x = position, y = OBP, fill = position)) +
geom_boxplot() +
theme_bw() +
guides(fill = FALSE)
ggplot(data = bat10, aes(sample = OBP, color = position)) +
stat_qq() +
theme_bw()
ggplot(data = bat10, aes(sample = OBP, color = position)) +
stat_qq() +
facet_grid(.~position) +
theme_bw() +
guides(color = FALSE)
mod2.aov <- aov(OBP ~ position, data = bat10)
summary(mod2.aov)
Df Sum Sq Mean Sq F value Pr(>F)
position 6 0.0374 0.006233 5.275 3.35e-05 ***
Residuals 320 0.3781 0.001182
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Your turn:
\[CI_{1 - \alpha_e}(\mu_i - \mu_j) = (\bar{Y}_{i\bullet} - \bar{Y}_{j\bullet}) \pm t_{1 - \tfrac{\alpha_c}{2K};df_{Error}}\sqrt{MS_{Error}}\sqrt{\frac{1}{n_i} + \frac{1}{n_j}}\]
T5N
# A tibble: 7 × 4
position n Mean SD
<fctr> <int> <dbl> <dbl>
1 1B 33 0.3554 0.0419
2 2B 39 0.3348 0.0278
3 3B 40 0.3224 0.0402
4 OF 120 0.3342 0.0294
5 SS 42 0.3184 0.0279
6 DH 14 0.3478 0.0360
7 C 39 0.3226 0.0451
(0.3554 - 0.3348) +c(-1, 1)*qt(1 - 0.05/(2*choose(7,2)), 320)*sqrt(0.001182)*sqrt(1/33 + 1/39)
[1] -0.004303859 0.045503859
Your Turn:
Find a 95% Bonferroni CI for the \(\mu_{3B} - \mu_{1B}\)
# Your code here
Another approach is to use pairwise.t.test()
pairwise.t.test(bat10$OBP, bat10$position, p.adj = "bonferroni")
Pairwise comparisons using t tests with pooled SD
data: bat10$OBP and bat10$position
1B 2B 3B OF SS DH
2B 0.24462 - - - - -
3B 0.00120 1.00000 - - - -
OF 0.03961 1.00000 1.00000 - - -
SS 0.00011 0.67785 1.00000 0.21761 - -
DH 1.00000 1.00000 0.38248 1.00000 0.12310 -
C 0.00143 1.00000 1.00000 1.00000 1.00000 0.40683
P value adjustment method: bonferroni
\[CI_{1 - \alpha_e}(\mu_i - \mu_j) = (\bar{Y}_{i\bullet} - \bar{Y}_{j\bullet}) \pm \frac{q_{1 - \alpha_e:a, \nu}}{\sqrt{2}}\sqrt{MS_{Error}}\sqrt{\frac{1}{n_i} + \frac{1}{n_j}}\] Note: \(a=\) the number of treatments, \(\nu=\) the degrees of freedom for \(MS_{Error}\). The notation \(q_{1 - \alpha;a,\nu}\) denotes the studentized range value with \(1 - \alpha\) area to the left with \(a\) and \(\nu\) degrees of freedom, respectively. For example, \(q_{0.95;4,20} = 3.9582935\) is obtained by entering:
qtukey(0.95, 4, 20)
[1] 3.958293
Your Turn:
Verify using the formula that \(CI_{0.95}(\mu_{3B} - \mu_{1B}) = [-0.05696, -0.00899]\).
# Your code here
TukeyHSD(mod2.aov)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = OBP ~ position, data = bat10)
$position
diff lwr upr p adj
2B-1B -0.0206293706 -0.044754902 0.003496161 0.1496557
3B-1B -0.0329742424 -0.056961156 -0.008987328 0.0011118
OF-1B -0.0211742424 -0.041223488 -0.001124997 0.0307855
SS-1B -0.0370670996 -0.060794433 -0.013339766 0.0001053
DH-1B -0.0076385281 -0.040171808 0.024894752 0.9927262
C-1B -0.0328088578 -0.056934389 -0.008683326 0.0013225
3B-2B -0.0123448718 -0.035298490 0.010608747 0.6851503
OF-2B -0.0005448718 -0.019345638 0.018255894 1.0000000
SS-2B -0.0164377289 -0.039119944 0.006244487 0.3259316
DH-2B 0.0129908425 -0.018788253 0.044769938 0.8888576
C-2B -0.0121794872 -0.035277925 0.010918951 0.7050406
OF-3B 0.0118000000 -0.006822556 0.030422556 0.4950415
SS-3B -0.0040928571 -0.026627579 0.018441864 0.9982356
DH-3B 0.0253357143 -0.006338276 0.057009705 0.2133531
C-3B 0.0001653846 -0.022788234 0.023119003 1.0000000
SS-OF -0.0158928571 -0.034179844 0.002394130 0.1360106
DH-OF 0.0135357143 -0.015271262 0.042342690 0.8047266
C-OF -0.0116346154 -0.030435382 0.007166151 0.5245490
DH-SS 0.0294285714 -0.002049293 0.060906436 0.0842020
C-SS 0.0042582418 -0.018423974 0.026940457 0.9978753
C-DH -0.0251703297 -0.056949425 0.006608765 0.2237267
par(las = 1)
plot(TukeyHSD(mod2.aov))
par(las = 0)
NDF <- bat10 %>%
filter(team == "BAL" | team == "BOS") %>%
select(OBP, team)
NDF %>%
group_by(team) %>%
summarize(Mean = mean(OBP), SD = sd(OBP), n = n())
# A tibble: 2 × 4
team Mean SD n
<fctr> <dbl> <dbl> <int>
1 BAL 0.3228182 0.02947819 11
2 BOS 0.3497000 0.03019584 10
ggplot(data = NDF, aes(x = team, y = OBP, fill = team)) +
geom_boxplot() +
theme_bw() +
labs(y = "On base percentage") +
guides(fill = FALSE)
ggplot(data = NDF, aes(sample = OBP, color = team)) +
stat_qq() +
theme_bw()
# Or
ggplot(data = NDF, aes(sample = OBP, color = team)) +
stat_qq() +
theme_bw() +
facet_grid(. ~ team)
# Standard 90% CI first
t.test(OBP ~ team, data = NDF, conf.level = 0.90)
Welch Two Sample t-test
data: OBP by team
t = -2.0607, df = 18.71, p-value = 0.05351
alternative hypothesis: true difference in means is not equal to 0
90 percent confidence interval:
-0.049456471 -0.004307166
sample estimates:
mean in group BAL mean in group BOS
0.3228182 0.3497000
df <- t.test(OBP ~ team, data = NDF, conf.level = 0.90)$parameter
df
df
18.71049
For more on the bootstrap method see http://stat-ata-asu.github.io/STT3851ClassRepo/Rmarkdown/TheBootstrap.html.
BALOBP <- bat10$OBP[bat10$team == "BAL"]
BOSOBP <- bat10$OBP[bat10$team == "BOS"]
obsdiff <- mean(BALOBP) - mean(BOSOBP)
obsdiff
[1] -0.02688182
SIMS <- 10^4 - 1
diffmean <- numeric(SIMS)
for(i in 1:SIMS){
sampBAL <- sample(BALOBP, size = sum(!is.na(BALOBP)), replace = TRUE)
sampBOS <- sample(BOSOBP, size = sum(!is.na(BOSOBP)), replace = TRUE)
diffmean[i] <- mean(sampBAL) - mean(sampBOS)
}
hist(diffmean)
# OR
ggplot(data = data.frame(x = diffmean), aes(x = x)) +
geom_density(fill = "pink") +
theme_bw() +
labs(x = substitute(paste(bar(x)[Bal],"*", - bar(x)[Bos],"*")))
BSCI <- quantile(diffmean, probs = c(0.05, 0.95))
BSCI
5% 95%
-0.047100909 -0.006836364
SBS <- obsdiff + c(-1, 1)*qt(0.95, df)*sd(diffmean)
SBS
[1] -0.04816129 -0.00560235