# Standard Nomal
curve(dnorm(x), -3.5, 3.5)
\(f(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{\frac{-(x - \mu)^2}{2\sigma^2}}\)
MU <- 100
SIG <- 10
f <- function(x){(1/(sqrt(2*pi*SIG^2)))*exp(-(x - MU)^2/(2*SIG^2))}
integrate(f, 120, Inf)
0.02275013 with absolute error < 4.6e-05
ans <- integrate(f, 120, Inf)$value
The \(P(X \geq 120) = \int_{120}^{\infty}f(x)\,dx = 0.0227501.\)
# Using pnorm
pnorm(120, 100, 10, lower = FALSE)
[1] 0.02275013
library(PASWR2)
summary(CALCULUS)
score calculus
Min. :39.00 No :18
1st Qu.:63.50 Yes:18
Median :80.50
Mean :74.78
3rd Qu.:87.00
Max. :95.00
library(ggplot2)
ggplot(data = CALCULUS, aes(sample = score, color = calculus)) +
geom_qq() +
theme_bw()
Assume \(\sigma_X = 5\) and \(\sigma_Y=12\).
tapply(CALCULUS$score, CALCULUS$calculus, mean)
No Yes
62.61111 86.94444
# TidyVerse
library(dplyr)
CALCULUS %>%
group_by(calculus) %>%
summarize(MeanScore = mean(score))
# A tibble: 2 x 2
calculus MeanScore
<fctr> <dbl>
1 No 62.6
2 Yes 86.9
# sigX = 5, sigY = 12
X <- CALCULUS$score[CALCULUS$calculus == "Yes"]
Y <- CALCULUS$score[CALCULUS$calculus == "No"]
Zobs <- (mean(X) - mean(Y) -(0)) / (sqrt(25/18 + 144/18))
Zobs
[1] 7.941353
z.test()
from PASWR2
z.test(x = X, sigma.x = 5, y = Y, sigma.y = 12, mu = 0, alternative = "greater")
Two Sample z-test
data: X and Y
z = 7.9414, p-value = 9.999e-16
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
19.29329 Inf
sample estimates:
mean of x mean of y
86.94444 62.61111
t.test(X, Y, alternative = "greater")
Welch Two Sample t-test
data: X and Y
t = 7.4219, df = 20.585, p-value = 1.52e-07
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
18.68649 Inf
sample estimates:
mean of x mean of y
86.94444 62.61111
# Discuss: Why is the alternative "less" now...
t.test(score ~ calculus, data = CALCULUS, alternative = "less")
Welch Two Sample t-test
data: score by calculus
t = -7.4219, df = 20.585, p-value = 1.52e-07
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
-Inf -18.68649
sample estimates:
mean in group No mean in group Yes
62.61111 86.94444
curve(dt(x, 20.585), from = -4, to = 4, n = 500, col = "blue", ylab = "")
curve(dt(x, 2), from = -4, to = 4, n = 500, col = "purple", add = TRUE)
curve(dnorm(x), from = -4, to = 4, n = 500, col = "pink", add = TRUE)
obsDIFF <- diff(tapply(CALCULUS$score, CALCULUS$calculus, mean))
obsDIFF
Yes
24.33333
R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
index <- sample(length(CALCULUS$score), 18, replace = FALSE)
TS[i] <- mean(CALCULUS$score[index]) - mean(CALCULUS$score[-index])
}
pvalue <- (sum(TS >= obsDIFF) + 1)/(R + 1)
pvalue
[1] 0.000999001
obsDIFF <- t.test(score ~ calculus, data = CALCULUS, alternative = "less")$stat
obsDIFF
t
-7.42191
R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
TS[i] <- t.test(score ~ sample(calculus), data = CALCULUS, alternative = "less")$stat
}
pvalue <- (sum(TS <= obsDIFF) + 1)/(R + 1)
pvalue
[1] 0.000999001
Key concept sample with replacement!
R <- 1000
TS <- numeric(R)
set.seed(123)
for(i in 1:R){
xbar1 <- mean(sample(X, 18, replace = TRUE))
xbar2 <- mean(sample(Y, 18, replace = TRUE))
TS[i] <- xbar1 - xbar2
}
hist(TS, breaks = "Scott")
# With ggplot2 (note have to pass Data Frame)
DF <- data.frame(x = TS)
library(ggplot2)
ggplot(data = DF, aes(x = x)) +
geom_density(fill = "purple", alpha = 0.2) +
theme_bw()
#
CI <- quantile(TS, probs = c(0.025, 0.975)) # 95% Percentile Boostrap CI
CI
2.5% 97.5%
18.44028 30.44444
#
p <- ggplot(data = DF, aes(x = x, y = ..density..)) +
geom_density(fill = "red", alpha = 0.8) +
theme_bw()
x.dens <- density(TS)
df.dens <- data.frame(x = x.dens$x, y = x.dens$y)
p + geom_area(data = subset(df.dens, x >= CI[1] & x <= CI[2]), aes(x = x, y = y), fill = "blue", alpha = .3)
library(ggvis)
ggvis(data = GRADES, x = ~sat, y = ~gpa) %>%
layer_points() %>%
layer_model_predictions(model = "lm", formula = gpa ~ sat)
slr.mod <- lm(gpa ~ sat, data = GRADES)
summary(slr.mod)
Call:
lm(formula = gpa ~ sat, data = GRADES)
Residuals:
Min 1Q Median 3Q Max
-1.04954 -0.25960 -0.00655 0.26044 1.09328
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.1920638 0.2224502 -5.359 2.32e-07 ***
sat 0.0030943 0.0001945 15.912 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3994 on 198 degrees of freedom
Multiple R-squared: 0.5612, Adjusted R-squared: 0.5589
F-statistic: 253.2 on 1 and 198 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(slr.mod)
par(mfrow = c(1, 1))