Baseball

Create the data set discussed on page 248 of OpenIntro Statistics.

Note the book is now linked.

library(dplyr)
library(openintro)
bat10 <- mlbBat10 %>%
  filter(AB > 200)
# Note unused factor levels
table(bat10$position)


 1B  2B  3B  OF  SS  DH   C   P   - 
 33  39  40 120  42  14  39   0   0

bat10$position <- droplevels(bat10$position)
table(bat10$position)


 1B  2B  3B  OF  SS  DH   C 
 33  39  40 120  42  14  39

# Combine 1B, 2B, 3B, SS into infield (IF)
#
bat10$pos <- plyr::revalue(bat10$position, 
                       replace = c("1B" = "IF", "2B" = "IF", "3B" = "IF", "SS" = "IF"))
table(bat10$pos)


 IF  OF  DH   C 
154 120  14  39

# Note must detach plyr for dplyr to work properly
# detach("package:plyr", unload = TRUE)

Create the summary statistics presented in Table 5.24 of OpenIntro Statistics.

T524 <- bat10 %>%
  group_by(pos) %>%
  summarise(n = n(), Mean = round(mean(OBP), 4), SD = round(sd(OBP), 4))
T524

# A tibble: 4 × 4
     pos     n   Mean     SD
  <fctr> <int>  <dbl>  <dbl>
1     IF   154 0.3315 0.0371
2     OF   120 0.3342 0.0294
3     DH    14 0.3478 0.0360
4      C    39 0.3226 0.0451

DT::datatable(T524)

Hypotheses

Your Turn:

Write the null and alternative hypotheses to test whether the on base percentage is related to position.

\(H_0:\)

\(H_A:\)

List the conditions necessary for performing ANOVA:

library(ggplot2)
ggplot(data = bat10, aes(x = pos, y = OBP, fill = pos)) + 
  geom_boxplot() + 
  theme_bw() + 
  labs(y = "On base percentage", x = "Position") + 
  guides(fill = FALSE)

Figure 1: Side-by-side boxplots of on base percentage according to position

ggplot(data = bat10, aes(sample = OBP, color = pos)) + 
  stat_qq() + 
  theme_bw()

ggplot(data = bat10, aes(sample = OBP, color = pos)) + 
  stat_qq() +
  facet_grid(.~pos) +
  theme_bw() + 
  guides(color = FALSE)

mod.aov <- aov(OBP ~ pos, data = bat10)
summary(mod.aov)

             Df Sum Sq  Mean Sq F value Pr(>F)
pos           3 0.0076 0.002519   1.994  0.115
Residuals   323 0.4080 0.001263

Your turn: Compute the \(p-\)value using pf. Does your answer agree with the output from the ANOVA table?

# Your code here

library(ggplot2)
ggplot(data = data.frame(x = seq(0, 6, length= 200)), aes(x = x)) +
  stat_function(fun = df, args = list(3, 233), geom = "area", fill = "purple", alpha = 0.5) +
  theme_bw() + 
  labs(x = "", y = "")

df_limit <- function(x){
  y <- df(x, 3, 323)
  y[x < 1.994] <- NA
  return(y)
}
p <- ggplot(data.frame(x = c(0, 6)), aes(x = x))
p + stat_function(fun = df_limit, geom = "area", fill = "purple", alpha = 0.4, n = 500) + 
  stat_function(fun = df, args = list(3, 323)) +
  theme_bw() +
  labs(x = "", y = "")

Finished - Now consider a different problem.

Use bat10 to test whether the on base percentage is related to position.

T5N <- bat10 %>%
  group_by(position) %>%
  summarise(n = n(), Mean = round(mean(OBP), 4), SD = round(sd(OBP), 4))
T5N

# A tibble: 7 × 4
  position     n   Mean     SD
    <fctr> <int>  <dbl>  <dbl>
1       1B    33 0.3554 0.0419
2       2B    39 0.3348 0.0278
3       3B    40 0.3224 0.0402
4       OF   120 0.3342 0.0294
5       SS    42 0.3184 0.0279
6       DH    14 0.3478 0.0360
7        C    39 0.3226 0.0451

Your turn:

Write the null and alternative hypotheses to test whether the on base percentage is related to position.

\(H_O:\)

\(H_A:\)

List the conditions necessary for performing ANOVA:

ggplot(data = bat10, aes(x = position, y = OBP, fill = position)) + 
  geom_boxplot() + 
  theme_bw() + 
  guides(fill = FALSE)

ggplot(data = bat10, aes(sample = OBP, color = position)) + 
  stat_qq() + 
  theme_bw()

ggplot(data = bat10, aes(sample = OBP, color = position)) + 
  stat_qq() +
  facet_grid(.~position) +
  theme_bw() + 
  guides(color = FALSE)

mod2.aov <- aov(OBP ~ position, data = bat10)
summary(mod2.aov)

             Df Sum Sq  Mean Sq F value   Pr(>F)    
position      6 0.0374 0.006233   5.275 3.35e-05 ***
Residuals   320 0.3781 0.001182                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Your turn:

Do you reject the null hypothesis? If so, what does this mean in plain English?
Which positions have a higher on base percentage?

Bonferroni Method

\[CI_{1 - \alpha_e}(\mu_i - \mu_j) = (\bar{Y}_{i\bullet} - \bar{Y}_{j\bullet}) \pm t_{1 - \tfrac{\alpha_c}{2K};df_{Error}}\sqrt{MS_{Error}}\sqrt{\frac{1}{n_i} + \frac{1}{n_j}}\]

T5N

# A tibble: 7 × 4
  position     n   Mean     SD
    <fctr> <int>  <dbl>  <dbl>
1       1B    33 0.3554 0.0419
2       2B    39 0.3348 0.0278
3       3B    40 0.3224 0.0402
4       OF   120 0.3342 0.0294
5       SS    42 0.3184 0.0279
6       DH    14 0.3478 0.0360
7        C    39 0.3226 0.0451

(0.3554 - 0.3348) +c(-1, 1)*qt(1 - 0.05/(2*choose(7,2)), 320)*sqrt(0.001182)*sqrt(1/33 + 1/39)

[1] -0.004303859  0.045503859

Your Turn:

Find a 95% Bonferroni CI for the \(\mu_{3B} - \mu_{1B}\)

# Your code here

Another approach is to use pairwise.t.test()

pairwise.t.test(bat10$OBP, bat10$position, p.adj = "bonferroni")


    Pairwise comparisons using t tests with pooled SD 

data:  bat10$OBP and bat10$position 

   1B      2B      3B      OF      SS      DH     
2B 0.24462 -       -       -       -       -      
3B 0.00120 1.00000 -       -       -       -      
OF 0.03961 1.00000 1.00000 -       -       -      
SS 0.00011 0.67785 1.00000 0.21761 -       -      
DH 1.00000 1.00000 0.38248 1.00000 0.12310 -      
C  0.00143 1.00000 1.00000 1.00000 1.00000 0.40683

P value adjustment method: bonferroni

Tukey’s Honestly Significant Difference (HSD)

\[CI_{1 - \alpha_e}(\mu_i - \mu_j) = (\bar{Y}_{i\bullet} - \bar{Y}_{j\bullet}) \pm \frac{q_{1 - \alpha_e:a, \nu}}{\sqrt{2}}\sqrt{MS_{Error}}\sqrt{\frac{1}{n_i} + \frac{1}{n_j}}\] Note: \(a=\) the number of treatments, \(\nu=\) the degrees of freedom for \(MS_{Error}\). The notation \(q_{1 - \alpha;a,\nu}\) denotes the studentized range value with \(1 - \alpha\) area to the left with \(a\) and \(\nu\) degrees of freedom, respectively. For example, \(q_{0.95;4,20} = 3.9582935\) is obtained by entering:

qtukey(0.95, 4, 20)

[1] 3.958293

Your Turn:

Verify using the formula that \(CI_{0.95}(\mu_{3B} - \mu_{1B}) = [-0.05696, -0.00899]\).

# Your code here

TukeyHSD(mod2.aov)

  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = OBP ~ position, data = bat10)

$position
               diff          lwr          upr     p adj
2B-1B -0.0206293706 -0.044754902  0.003496161 0.1496557
3B-1B -0.0329742424 -0.056961156 -0.008987328 0.0011118
OF-1B -0.0211742424 -0.041223488 -0.001124997 0.0307855
SS-1B -0.0370670996 -0.060794433 -0.013339766 0.0001053
DH-1B -0.0076385281 -0.040171808  0.024894752 0.9927262
C-1B  -0.0328088578 -0.056934389 -0.008683326 0.0013225
3B-2B -0.0123448718 -0.035298490  0.010608747 0.6851503
OF-2B -0.0005448718 -0.019345638  0.018255894 1.0000000
SS-2B -0.0164377289 -0.039119944  0.006244487 0.3259316
DH-2B  0.0129908425 -0.018788253  0.044769938 0.8888576
C-2B  -0.0121794872 -0.035277925  0.010918951 0.7050406
OF-3B  0.0118000000 -0.006822556  0.030422556 0.4950415
SS-3B -0.0040928571 -0.026627579  0.018441864 0.9982356
DH-3B  0.0253357143 -0.006338276  0.057009705 0.2133531
C-3B   0.0001653846 -0.022788234  0.023119003 1.0000000
SS-OF -0.0158928571 -0.034179844  0.002394130 0.1360106
DH-OF  0.0135357143 -0.015271262  0.042342690 0.8047266
C-OF  -0.0116346154 -0.030435382  0.007166151 0.5245490
DH-SS  0.0294285714 -0.002049293  0.060906436 0.0842020
C-SS   0.0042582418 -0.018423974  0.026940457 0.9978753
C-DH  -0.0251703297 -0.056949425  0.006608765 0.2237267

par(las = 1)
plot(TukeyHSD(mod2.aov))

par(las = 0)

Confidence Intervals (2 Groups)

NDF <- bat10 %>%
  filter(team == "BAL" | team == "BOS") %>%
  select(OBP, team) 
NDF %>%
  group_by(team) %>%
  summarize(Mean = mean(OBP), SD = sd(OBP), n = n())

# A tibble: 2 × 4
    team      Mean         SD     n
  <fctr>     <dbl>      <dbl> <int>
1    BAL 0.3228182 0.02947819    11
2    BOS 0.3497000 0.03019584    10

ggplot(data = NDF, aes(x = team, y = OBP, fill = team)) + 
  geom_boxplot() + 
  theme_bw() + 
  labs(y = "On base percentage") + 
  guides(fill = FALSE)

ggplot(data = NDF, aes(sample = OBP, color = team)) + 
  stat_qq() + 
  theme_bw()

# Or
ggplot(data = NDF, aes(sample = OBP, color = team)) + 
  stat_qq() + 
  theme_bw() + 
  facet_grid(. ~ team)

# Standard 90% CI first
t.test(OBP ~ team, data = NDF, conf.level = 0.90)


    Welch Two Sample t-test

data:  OBP by team
t = -2.0607, df = 18.71, p-value = 0.05351
alternative hypothesis: true difference in means is not equal to 0
90 percent confidence interval:
 -0.049456471 -0.004307166
sample estimates:
mean in group BAL mean in group BOS 
        0.3228182         0.3497000

df <- t.test(OBP ~ team, data = NDF, conf.level = 0.90)$parameter
df

      df 
18.71049

Bootstrap

For more on the bootstrap method see http://stat-ata-asu.github.io/STT3851ClassRepo/Rmarkdown/TheBootstrap.html.

BALOBP <- bat10$OBP[bat10$team == "BAL"]
BOSOBP <- bat10$OBP[bat10$team == "BOS"]
obsdiff <- mean(BALOBP) - mean(BOSOBP)
obsdiff

[1] -0.02688182

SIMS <- 10^4 - 1
diffmean <- numeric(SIMS)
for(i in 1:SIMS){
  sampBAL <- sample(BALOBP, size = sum(!is.na(BALOBP)), replace = TRUE)
  sampBOS <- sample(BOSOBP, size = sum(!is.na(BOSOBP)), replace = TRUE)
  diffmean[i] <- mean(sampBAL) - mean(sampBOS)
}
hist(diffmean)

# OR
ggplot(data = data.frame(x = diffmean), aes(x = x)) + 
  geom_density(fill = "pink") + 
  theme_bw() + 
  labs(x = substitute(paste(bar(x)[Bal],"*", - bar(x)[Bos],"*")))

Percentile Approach

BSCI <- quantile(diffmean, probs = c(0.05, 0.95))
BSCI

          5%          95% 
-0.047100909 -0.006836364

Standard Bootstrap

SBS <- obsdiff + c(-1, 1)*qt(0.95, df)*sd(diffmean)
SBS

[1] -0.04816129 -0.00560235

Baseball

Alan T. Arnholt

Hypotheses

Bonferroni Method

Tukey’s Honestly Significant Difference (HSD)

Confidence Intervals (2 Groups)

Bootstrap

Percentile Approach

Standard Bootstrap