1 Notation

Definition 1.1 (General Notation) \(X \sim N(\mu_X, \sigma_X)\) and \(Y \sim N(\mu_Y, \sigma_Y)\)
# Standard Normal
curve(dnorm(x), -3.5, 3.5)

1.1 Z-score

Definition 1.2 (Z-score) \(Z = \frac{\text{stat} - \mu_{\text{stat}}}{\sigma_{\text{stat}}} \sim N(0, 1)\)
Example 1.1 Suppose \(n = 1\) and \(X \sim N(100, 10)\). Since \(\bar{X} \sim N\left(\mu_X, \sigma_X/\sqrt{n}\right)\), taking \(n = 1\) gives \(\bar{X} \sim N(100, 10)\).
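A quick simulation sketch (the seed and number of replicates are arbitrary choices) confirms that with \(n = 1\) the sampling distribution of \(\bar{X}\) is just the distribution of \(X\):

set.seed(13)  # arbitrary seed
xbars <- replicate(10^4, mean(rnorm(n = 1, mean = 100, sd = 10)))
c(mean = mean(xbars), sd = sd(xbars))  # close to 100 and 10, respectively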

1.2 Normal distribution

\(f(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{\frac{-(x - \mu)^2}{2\sigma^2}}\)
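As a sanity check, this formula can be compared with R's built-in dnorm() at a few arbitrary points (a sketch):

f <- function(x, mu = 0, sigma = 1){(1/sqrt(2*pi*sigma^2))*exp(-(x - mu)^2/(2*sigma^2))}
all.equal(f(c(-1, 0, 2)), dnorm(c(-1, 0, 2)))  # TRUE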

Example 1.2 Given \(X \sim N(100, 10)\), find \(P(X \geq 120)\).
MU <- 100
SIG <- 10
# Normal pdf written out by hand
f <- function(x){(1/(sqrt(2*pi*SIG^2)))*exp(-(x - MU)^2/(2*SIG^2))}
# Numerically integrate the upper tail
integrate(f, 120, Inf)
0.02275013 with absolute error < 4.6e-05
ans <- integrate(f, 120, Inf)$value

Thus, \(P(X \geq 120) = \int_{120}^{\infty}f(x)\,dx = 0.0227501.\)

# Using pnorm
pnorm(120, 100, 10, lower.tail = FALSE)
[1] 0.02275013
library(PASWR2)
summary(CALCULUS)
     score       calculus
 Min.   :39.00   No :18  
 1st Qu.:63.50   Yes:18  
 Median :80.50           
 Mean   :74.78           
 3rd Qu.:87.00           
 Max.   :95.00           
library(ggplot2)
ggplot(data = CALCULUS, aes(sample = score, color = calculus)) + 
  geom_qq() + 
  theme_bw()
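If the installed ggplot2 is version 3.0.0 or later, geom_qq_line() adds reference lines that make departures from normality easier to judge (a sketch):

ggplot(data = CALCULUS, aes(sample = score, color = calculus)) + 
  geom_qq() + 
  geom_qq_line() + 
  theme_bw()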

For the two-sample z-test that follows, assume \(\sigma_X = 5\) and \(\sigma_Y = 12\), where \(X\) denotes the scores of students who took calculus (Yes) and \(Y\) the scores of those who did not (No).

tapply(CALCULUS$score, CALCULUS$calculus, mean)
      No      Yes 
62.61111 86.94444 
# tidyverse
library(dplyr)
CALCULUS %>%
  group_by(calculus) %>%
  summarize(MeanScore = mean(score))
# A tibble: 2 x 2
  calculus MeanScore
  <fctr>       <dbl>
1 No            62.6
2 Yes           86.9
# sigX = 5, sigY = 12
X <- CALCULUS$score[CALCULUS$calculus == "Yes"]
Y <- CALCULUS$score[CALCULUS$calculus == "No"]
Zobs <- (mean(X) - mean(Y) - 0) / sqrt(5^2/18 + 12^2/18)  # each group has n = 18
Zobs
[1] 7.941353
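The corresponding one-sided p-value can be computed directly from the standard normal (a sketch):

pnorm(Zobs, lower.tail = FALSE)  # essentially zero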

1.3 Consider using z.test() from PASWR2

z.test(x = X, sigma.x = 5, y = Y, sigma.y = 12, mu = 0, alternative = "greater")

    Two Sample z-test

data:  X and Y
z = 7.9414, p-value = 9.999e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 19.29329      Inf
sample estimates:
mean of x mean of y 
 86.94444  62.61111 
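The reported lower confidence bound can be verified by hand as \(\bar{X} - \bar{Y} - z_{0.95}\sqrt{\sigma_X^2/n_X + \sigma_Y^2/n_Y}\) (a sketch):

(mean(X) - mean(Y)) - qnorm(0.95) * sqrt(5^2/18 + 12^2/18)  # matches 19.29329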

1.4 The t-test

t.test(X, Y, alternative = "greater")

    Welch Two Sample t-test

data:  X and Y
t = 7.4219, df = 20.585, p-value = 1.52e-07
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 18.68649      Inf
sample estimates:
mean of x mean of y 
 86.94444  62.61111 
# Discuss: Why is the alternative "less" now? (The formula interface treats the
# first factor level, No, as group 1, so the difference tested is mean(No) - mean(Yes).)
t.test(score ~ calculus, data = CALCULUS, alternative = "less")

    Welch Two Sample t-test

data:  score by calculus
t = -7.4219, df = 20.585, p-value = 1.52e-07
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
      -Inf -18.68649
sample estimates:
 mean in group No mean in group Yes 
         62.61111          86.94444 
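The fractional degrees of freedom come from the Welch-Satterthwaite approximation, \(\nu = \frac{(s_X^2/n_X + s_Y^2/n_Y)^2}{(s_X^2/n_X)^2/(n_X - 1) + (s_Y^2/n_Y)^2/(n_Y - 1)}\), which can be checked by hand (a sketch):

vX <- var(X)/length(X)
vY <- var(Y)/length(Y)
(vX + vY)^2 / (vX^2/(length(X) - 1) + vY^2/(length(Y) - 1))  # approximately 20.585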

1.5 Draw \(t_{20.585}\)

curve(dt(x, 20.585), from = -4, to = 4, n = 500, col = "blue", ylab = "")
curve(dt(x, 2), from = -4, to = 4, n = 500, col = "purple", add = TRUE)
curve(dnorm(x), from = -4, to = 4, n = 500, col = "pink", add = TRUE)
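A legend helps identify the three overlaid curves (a sketch; colors match the curve() calls above):

legend("topright", legend = c("t, 20.585 df", "t, 2 df", "N(0, 1)"),
       col = c("blue", "purple", "pink"), lty = 1)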

1.6 Randomization test

Distribute cards and go through the example before running this code.

obsDIFF <- diff(tapply(CALCULUS$score, CALCULUS$calculus, mean))
obsDIFF
     Yes 
24.33333 
R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
  # Randomly assign 18 of the 36 scores to one group; the rest form the other group
  index <- sample(length(CALCULUS$score), 18, replace = FALSE)
  TS[i] <- mean(CALCULUS$score[index]) - mean(CALCULUS$score[-index])
}
pvalue <- (sum(TS >= obsDIFF) + 1)/(R + 1)
pvalue
[1] 0.000999001
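To see where the observed difference falls, plot the randomization distribution (a sketch):

hist(TS, breaks = "Scott", xlab = "Difference in means", main = "")
abline(v = obsDIFF, col = "red", lwd = 2)  # observed difference of 24.33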

1.7 Another approach using the standardized t

obsDIFF <- t.test(score ~ calculus, data = CALCULUS, alternative = "less")$statistic
obsDIFF
       t 
-7.42191 
R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
  # Permute the group labels and recompute the t statistic
  TS[i] <- t.test(score ~ sample(calculus), data = CALCULUS, alternative = "less")$statistic
}
pvalue <- (sum(TS <= obsDIFF) + 1)/(R + 1)
pvalue
[1] 0.000999001
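The randomization distribution of the standardized statistic should resemble a \(t\) distribution; overlaying the \(t_{20.585}\) density gives a visual check (a sketch):

hist(TS, breaks = "Scott", freq = FALSE, xlab = "t", main = "")
curve(dt(x, 20.585), add = TRUE, col = "blue", n = 500)
abline(v = obsDIFF, col = "red", lwd = 2)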

1.8 What about a Confidence Interval? The Bootstrap

Key concept: sample with replacement!

R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
  # Resample each group (n = 18) with replacement and record the difference in means
  xbar1 <- mean(sample(X, 18, replace = TRUE))
  xbar2 <- mean(sample(Y, 18, replace = TRUE))
  TS[i] <- xbar1 - xbar2
}
hist(TS, breaks = "Scott")

# With ggplot2 (note that ggplot() requires a data frame)
DF <- data.frame(x = TS)
library(ggplot2)
ggplot(data = DF, aes(x = x)) + 
  geom_density(fill = "purple", alpha = 0.2) + 
  theme_bw()

CI <- quantile(TS, probs = c(0.025, 0.975))  # 95% percentile bootstrap CI
CI 
    2.5%    97.5% 
18.44028 30.44444 
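For comparison, the two-sided Welch \(t\) interval is available from t.test() (a sketch):

t.test(X, Y)$conf.int  # Welch interval for muX - muY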
p <- ggplot(data = DF, aes(x = x, y = ..density..)) + 
  geom_density(fill = "red", alpha = 0.8) + 
  theme_bw()
x.dens <- density(TS)
df.dens <- data.frame(x = x.dens$x, y = x.dens$y)
p + geom_area(data = subset(df.dens, x >= CI[1] & x <= CI[2]), 
              aes(x = x, y = y), fill = "blue", alpha = 0.3)

1.9 Simple Linear Regression

library(ggvis)
ggvis(data = GRADES, x = ~sat, y = ~gpa) %>% 
  layer_points() %>% 
  layer_model_predictions(model = "lm", formula = gpa ~ sat)
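Since ggvis is no longer under active development, the same plot can be drawn with ggplot2 (a sketch):

ggplot(data = GRADES, aes(x = sat, y = gpa)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_bw()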
slr.mod <- lm(gpa ~ sat, data = GRADES)
summary(slr.mod)

Call:
lm(formula = gpa ~ sat, data = GRADES)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.04954 -0.25960 -0.00655  0.26044  1.09328 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.1920638  0.2224502  -5.359 2.32e-07 ***
sat          0.0030943  0.0001945  15.912  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3994 on 198 degrees of freedom
Multiple R-squared:  0.5612,    Adjusted R-squared:  0.5589 
F-statistic: 253.2 on 1 and 198 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(slr.mod)

par(mfrow = c(1, 1))
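With the fitted model in hand, predictions for new SAT scores follow from predict(); a sketch, where sat = 1100 is an arbitrary illustrative value:

predict(slr.mod, newdata = data.frame(sat = 1100), interval = "prediction")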