Chapter Two Graphs

site <- "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
AD <- read.csv(site)
head(AD)

  X    TV Radio Newspaper Sales
1 1 230.1  37.8      69.2  22.1
2 2  44.5  39.3      45.1  10.4
3 3  17.2  45.9      69.3   9.3
4 4 151.5  41.3      58.5  18.5
5 5 180.8  10.8      58.4  12.9
6 6   8.7  48.9      75.0   7.2

dim(AD)

[1] 200   5

library(DT)
datatable(AD[, -1], rownames = FALSE,
          caption = 'Table 1: This is a simple caption for the table.')

Base R Graph

plot(Sales ~ TV, data = AD, col = "red", pch = 19)
mod1 <- lm(Sales ~ TV, data = AD)
abline(mod1, col = "blue")

Using `ggplot2`

library(ggplot2)
library(MASS)
p <- ggplot(data = AD, aes(x = TV, y = Sales)) +
  geom_point(color = "lightblue") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_smooth(method = "loess", color = "red", se = FALSE) + 
  geom_smooth(method = "rlm", color = "purple", se = FALSE) +
  theme_bw()
p

Using `ggvis`

library(ggvis)
AD %>% 
  ggvis(x = ~TV, y = ~Sales) %>% 
  layer_points() %>% 
  layer_model_predictions(model = "lm", se = FALSE) %>% 
  layer_model_predictions(model = "MASS::rlm", se = FALSE, stroke := "blue") %>%
  layer_smooths(stroke:="red", se = FALSE)

Using `plotly`

library(plotly)
p2 <- ggplotly(p)
p2

Scatterplot Matrices

library(car)
scatterplotMatrix(~ Sales + TV + Radio + Newspaper, data = AD)

Chapter 3

Recall mod1

mod1 <- lm(Sales ~ TV, data = AD)
summary(mod1)


Call:
lm(formula = Sales ~ TV, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3860 -1.9545 -0.1913  2.0671  7.2124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.032594   0.457843   15.36   <2e-16 ***
TV          0.047537   0.002691   17.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared:  0.6119,    Adjusted R-squared:  0.6099 
F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

Residual: $e_i = y_i - \hat{y_i}$

To obtain the residuals for mod1 use the function resid on a linear model object.

eis <- resid(mod1)
RSS <- sum(eis^2)
RSS

[1] 2102.531

RSE <- sqrt(RSS/(dim(AD)[1]-2))
RSE

[1] 3.258656

# Or
summary(mod1)$sigma

[1] 3.258656

The least squares estimators of $\beta_0$ and $\beta_1$ are

\[b_0 = \hat{\beta_0} = \bar{y} - b_1\bar{x}\] \[b_1 = \hat{\beta_1} = \frac{\sum_{i = 1}^n(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n(x_i-\bar{x})^2}\]

y <- AD$Sales
x <- AD$TV
b1 <- sum( (x - mean(x))*(y - mean(y)) ) / sum((x - mean(x))^2)
b0 <- mean(y) - b1*mean(x)
c(b0, b1)

[1] 7.03259355 0.04753664

# Or using
coef(mod1)

(Intercept)          TV 
 7.03259355  0.04753664

summary(mod1)


Call:
lm(formula = Sales ~ TV, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3860 -1.9545 -0.1913  2.0671  7.2124 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.032594   0.457843   15.36   <2e-16 ***
TV          0.047537   0.002691   17.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.259 on 198 degrees of freedom
Multiple R-squared:  0.6119,    Adjusted R-squared:  0.6099 
F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b

             (Intercept)            TV
(Intercept)  0.209620158 -1.064495e-03
TV          -0.001064495  7.239367e-06

seb0 <- sqrt(var.cov.b[1, 1])
seb1 <- sqrt(var.cov.b[2, 2])
c(seb0, seb1)

[1] 0.457842940 0.002690607

coef(summary(mod1))

              Estimate  Std. Error  t value    Pr(>|t|)
(Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
TV          0.04753664 0.002690607 17.66763 1.46739e-42

coef(summary(mod1))[1, 2]

[1] 0.4578429

coef(summary(mod1))[2, 2]

[1] 0.002690607

tb0 <- b0/seb0
tb1 <- b1/seb1
c(tb0, tb1)

[1] 15.36028 17.66763

pvalues <- c(pt(tb0, 198, lower = FALSE)*2, pt(tb1, 198, lower = FALSE)*2)
pvalues

[1] 1.40630e-35 1.46739e-42

coef(summary(mod1))

              Estimate  Std. Error  t value    Pr(>|t|)
(Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
TV          0.04753664 0.002690607 17.66763 1.46739e-42

TSS <- sum((y - mean(y))^2)
c(RSS, TSS)

[1] 2102.531 5417.149

R2 <- (TSS - RSS)/TSS
R2

[1] 0.6118751

# Or
summary(mod1)$r.squared

[1] 0.6118751

Confidence Interval for $\beta_1$

\[\text{CI}_{1 - \alpha}(\beta_1) = \left[b_1 - t_{1- \alpha/2, n - p + 1}SE(b1), b_1 + t_{1- \alpha/2, n - p + 1}SE(b1) \right]\]

Example: Construct a 90% confidence interval for $\beta_1$.

alpha <- 0.10
ct <- qt(1 - alpha/2, 198)
ct

[1] 1.652586

b1 +c(-1, 1)*ct*seb1

[1] 0.04309018 0.05198310

# Or
confint(mod1, parm = "TV", level = 0.90)

          5 %      95 %
TV 0.04309018 0.0519831

confint(mod1)

                 2.5 %     97.5 %
(Intercept) 6.12971927 7.93546783
TV          0.04223072 0.05284256

Linear Algebra

Solution of linear systems Find the solution(s) if any to the following linear equations.

\[2x + y - z = 8\] \[-3x - y + 2z = -11\] \[-2x + y + 2z = -3\]

A <- matrix(c(2, -3, -2, 1, -1, 1, -1, 2, 2), nrow = 3)
b <- matrix(c(8, -11, -3), nrow = 3)
x <- solve(A)%*%b
x

     [,1]
[1,]    2
[2,]    3
[3,]   -1

# Or
solve(A, b)

     [,1]
[1,]    2
[2,]    3
[3,]   -1

See wikipedia for a review of matrix multiplication rules and properties.

Consider the 2 $\times$ 2 matrix $A$.

\[A = \begin{bmatrix} 2 & 4 \\ 9 & 5 \\ \end{bmatrix} \]

A <- matrix(c(2, 9, 4, 5), nrow = 2)
A

     [,1] [,2]
[1,]    2    4
[2,]    9    5

t(A)          # Transpose of A

     [,1] [,2]
[1,]    2    9
[2,]    4    5

t(A)%*%A      # A'A

     [,1] [,2]
[1,]   85   53
[2,]   53   41

solve(A)%*%A  # I_2

             [,1] [,2]
[1,] 1.000000e+00    0
[2,] 1.110223e-16    1

zapsmall(solve(A)%*%A)  # What you expect I_2

     [,1] [,2]
[1,]    1    0
[2,]    0    1

X <- model.matrix(mod1)
XTX <- t(X)%*%X
dim(XTX)

[1] 2 2

XTXI <- solve(XTX)
XTXI

              (Intercept)            TV
(Intercept)  0.0197403984 -1.002458e-04
TV          -0.0001002458  6.817474e-07

# But it is best to compute this quantity using
summary(mod1)$cov.unscaled

              (Intercept)            TV
(Intercept)  0.0197403984 -1.002458e-04
TV          -0.0001002458  6.817474e-07

betahat <- XTXI%*%t(X)%*%y
betahat

                  [,1]
(Intercept) 7.03259355
TV          0.04753664

coef(mod1)

(Intercept)          TV 
 7.03259355  0.04753664

XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b

             (Intercept)            TV
(Intercept)  0.209620158 -1.064495e-03
TV          -0.001064495  7.239367e-06

Multiple Linear Regression

mod2 <- lm(Sales ~ TV + Radio, data = AD)
summary(mod2)


Call:
lm(formula = Sales ~ TV + Radio, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.7977 -0.8752  0.2422  1.1708  2.8328 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.92110    0.29449   9.919   <2e-16 ***
TV           0.04575    0.00139  32.909   <2e-16 ***
Radio        0.18799    0.00804  23.382   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.681 on 197 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8962 
F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

Graphing the plane

Is There a Relationship Between the Response and Predictors?

mod3 <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
summary(mod3)


Call:
lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8277 -0.8908  0.2418  1.1893  2.8292 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.938889   0.311908   9.422   <2e-16 ***
TV           0.045765   0.001395  32.809   <2e-16 ***
Radio        0.188530   0.008611  21.893   <2e-16 ***
Newspaper   -0.001037   0.005871  -0.177     0.86    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8956 
F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

\[H_0: \beta_1 = \beta_2 = \beta_3 = 0\] versus the alternative \[H_1: \text{at least one } \beta_j \neq 0\]

The test statistic is $F = \frac{(\text{TSS} - \text{RSS})/p}{\text{RSS}/(n-p-1)}$

anova(mod3)

Analysis of Variance Table

Response: Sales
           Df Sum Sq Mean Sq   F value Pr(>F)    
TV          1 3314.6  3314.6 1166.7308 <2e-16 ***
Radio       1 1545.6  1545.6  544.0501 <2e-16 ***
Newspaper   1    0.1     0.1    0.0312 0.8599    
Residuals 196  556.8     2.8                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

SSR <- sum(anova(mod3)[1:3, 2])
MSR <- SSR/3
SSE <- anova(mod3)[4, 2]
MSE <- SSE/(200-3-1)
Fobs <- MSR/MSE
Fobs

[1] 570.2707

pvalue <- pf(Fobs, 3, 196, lower = FALSE)
pvalue

[1] 1.575227e-96

# Or
summary(mod3)


Call:
lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8277 -0.8908  0.2418  1.1893  2.8292 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.938889   0.311908   9.422   <2e-16 ***
TV           0.045765   0.001395  32.809   <2e-16 ***
Radio        0.188530   0.008611  21.893   <2e-16 ***
Newspaper   -0.001037   0.005871  -0.177     0.86    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8956 
F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

summary(mod3)$fstatistic

   value    numdf    dendf 
570.2707   3.0000 196.0000

Suppose we would like to test whether $\beta_2 = \beta_3 = 0$. The reduced model with $\beta_2 = \beta_3 = 0$ is mod1 while the full model is mod3.

summary(mod3)


Call:
lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.8277 -0.8908  0.2418  1.1893  2.8292 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.938889   0.311908   9.422   <2e-16 ***
TV           0.045765   0.001395  32.809   <2e-16 ***
Radio        0.188530   0.008611  21.893   <2e-16 ***
Newspaper   -0.001037   0.005871  -0.177     0.86    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.686 on 196 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8956 
F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

anova(mod1, mod3)

Analysis of Variance Table

Model 1: Sales ~ TV
Model 2: Sales ~ TV + Radio + Newspaper
  Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
1    198 2102.53                                  
2    196  556.83  2    1545.7 272.04 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variable Selection

Forward selection

mod.fs <- lm(Sales ~ 1, data = AD)
SCOPE <- (~. + TV + Radio + Newspaper)
add1(mod.fs, scope = SCOPE, test = "F")

Single term additions

Model:
Sales ~ 1
          Df Sum of Sq    RSS    AIC F value    Pr(>F)    
<none>                 5417.1 661.80                      
TV         1    3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
Radio      1    1798.7 3618.5 583.10  98.422 < 2.2e-16 ***
Newspaper  1     282.3 5134.8 653.10  10.887  0.001148 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod.fs <- update(mod.fs, .~. + TV)
add1(mod.fs, scope = SCOPE, test = "F")

Single term additions

Model:
Sales ~ TV
          Df Sum of Sq     RSS    AIC F value    Pr(>F)    
<none>                 2102.53 474.52                      
Radio      1   1545.62  556.91 210.82  546.74 < 2.2e-16 ***
Newspaper  1    183.97 1918.56 458.20   18.89 2.217e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod.fs <- update(mod.fs, .~. + Radio)
add1(mod.fs, scope = SCOPE, test = "F")

Single term additions

Model:
Sales ~ TV + Radio
          Df Sum of Sq    RSS    AIC F value Pr(>F)
<none>                 556.91 210.82               
Newspaper  1  0.088717 556.83 212.79  0.0312 0.8599

summary(mod.fs)


Call:
lm(formula = Sales ~ TV + Radio, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.7977 -0.8752  0.2422  1.1708  2.8328 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.92110    0.29449   9.919   <2e-16 ***
TV           0.04575    0.00139  32.909   <2e-16 ***
Radio        0.18799    0.00804  23.382   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.681 on 197 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8962 
F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

Using stepAIC

stepAIC(lm(Sales ~ 1, data = AD), scope = .~TV + Radio + Newspaper, direction = "forward", test = "F")

Start:  AIC=661.8
Sales ~ 1

            Df Sum of Sq    RSS    AIC F Value     Pr(F)    
+ TV         1    3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
+ Radio      1    1798.7 3618.5 583.10  98.422 < 2.2e-16 ***
+ Newspaper  1     282.3 5134.8 653.10  10.887  0.001148 ** 
<none>                   5417.1 661.80                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=474.52
Sales ~ TV

            Df Sum of Sq     RSS    AIC F Value     Pr(F)    
+ Radio      1   1545.62  556.91 210.82  546.74 < 2.2e-16 ***
+ Newspaper  1    183.97 1918.56 458.20   18.89 2.217e-05 ***
<none>                   2102.53 474.52                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=210.82
Sales ~ TV + Radio

            Df Sum of Sq    RSS    AIC  F Value  Pr(F)
<none>                   556.91 210.82                
+ Newspaper  1  0.088717 556.83 212.79 0.031228 0.8599


Call:
lm(formula = Sales ~ TV + Radio, data = AD)

Coefficients:
(Intercept)           TV        Radio  
    2.92110      0.04575      0.18799

Backward elimination

mod.be <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
drop1(mod.be, test = "F")

Single term deletions

Model:
Sales ~ TV + Radio + Newspaper
          Df Sum of Sq    RSS    AIC   F value Pr(>F)    
<none>                  556.8 212.79                     
TV         1   3058.01 3614.8 584.90 1076.4058 <2e-16 ***
Radio      1   1361.74 1918.6 458.20  479.3252 <2e-16 ***
Newspaper  1      0.09  556.9 210.82    0.0312 0.8599    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod.be <- update(mod.be, .~. - Newspaper)
drop1(mod.be, test = "F")

Single term deletions

Model:
Sales ~ TV + Radio
       Df Sum of Sq    RSS    AIC F value    Pr(>F)    
<none>               556.9 210.82                      
TV      1    3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
Radio   1    1545.6 2102.5 474.52  546.74 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(mod.be)


Call:
lm(formula = Sales ~ TV + Radio, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.7977 -0.8752  0.2422  1.1708  2.8328 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.92110    0.29449   9.919   <2e-16 ***
TV           0.04575    0.00139  32.909   <2e-16 ***
Radio        0.18799    0.00804  23.382   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.681 on 197 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8962 
F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

Using stepAIC

stepAIC(lm(Sales ~ TV + Radio + Newspaper, data = AD), scope = .~TV + Radio + Newspaper, direction = "backward", test = "F")

Start:  AIC=212.79
Sales ~ TV + Radio + Newspaper

            Df Sum of Sq    RSS    AIC F Value  Pr(F)    
- Newspaper  1      0.09  556.9 210.82    0.03 0.8599    
<none>                    556.8 212.79                   
- Radio      1   1361.74 1918.6 458.20  479.33 <2e-16 ***
- TV         1   3058.01 3614.8 584.90 1076.41 <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=210.82
Sales ~ TV + Radio

        Df Sum of Sq    RSS    AIC F Value     Pr(F)    
<none>                556.9 210.82                      
- Radio  1    1545.6 2102.5 474.52  546.74 < 2.2e-16 ***
- TV     1    3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Call:
lm(formula = Sales ~ TV + Radio, data = AD)

Coefficients:
(Intercept)           TV        Radio  
    2.92110      0.04575      0.18799

Diagnostic Plots

residualPlots(mod2)

           Test stat Pr(>|t|)
TV            -6.775    0.000
Radio          1.054    0.293
Tukey test     7.635    0.000

qqPlot(mod2)

influenceIndexPlot(mod2)

We use a confidence interval to quantify the uncertainty surrounding the average Sales over a large number of cities. For example, given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in each city, the 95% confidence interval is [10.9852544, 11.5276775]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales.

predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "conf")

       fit      lwr      upr
1 11.25647 10.98525 11.52768

On the other hand, a prediction interval can be used to quantify the uncertainty surrounding Sales for a particular city. Given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in a particular city, the 95% prediction interval is [7.9296161, 14.5833158]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales for this city.

predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "pred")

       fit      lwr      upr
1 11.25647 7.929616 14.58332

Note that both the intervals are centered at 11.256466, but that the prediction interval is substantially wider than the confidence interval, reflecting the increased uncertainty about Sales for a given city in comparison to the average Sales over many locations.

Non-Additive Models

nam1 <- lm(Sales ~ TV*Radio, data = AD)
# Same as 
nam2 <- lm(Sales ~ TV + Radio + TV:Radio, data = AD)
summary(nam1)


Call:
lm(formula = Sales ~ TV * Radio, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.3366 -0.4028  0.1831  0.5948  1.5246 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 6.750e+00  2.479e-01  27.233   <2e-16 ***
TV          1.910e-02  1.504e-03  12.699   <2e-16 ***
Radio       2.886e-02  8.905e-03   3.241   0.0014 ** 
TV:Radio    1.086e-03  5.242e-05  20.727   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9435 on 196 degrees of freedom
Multiple R-squared:  0.9678,    Adjusted R-squared:  0.9673 
F-statistic:  1963 on 3 and 196 DF,  p-value: < 2.2e-16

summary(nam2)


Call:
lm(formula = Sales ~ TV + Radio + TV:Radio, data = AD)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.3366 -0.4028  0.1831  0.5948  1.5246 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 6.750e+00  2.479e-01  27.233   <2e-16 ***
TV          1.910e-02  1.504e-03  12.699   <2e-16 ***
Radio       2.886e-02  8.905e-03   3.241   0.0014 ** 
TV:Radio    1.086e-03  5.242e-05  20.727   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9435 on 196 degrees of freedom
Multiple R-squared:  0.9678,    Adjusted R-squared:  0.9673 
F-statistic:  1963 on 3 and 196 DF,  p-value: < 2.2e-16

Hierarchical Principle: If an interaction term is included in a model, one should also include the main effects, even if the p-values associated with their coefficients are not significant.

Qualitative Predictors

In the Credit data frame there are four qualitative features/variables Gender, Student, Married, and Ethnicity.

Credit <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv")
datatable(Credit[, -1], rownames = FALSE)

modP <- lm(Balance ~ Income*Student, data = Credit)
summary(modP)


Call:
lm(formula = Balance ~ Income * Student, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-773.39 -325.70  -41.13  321.65  814.04 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       200.6232    33.6984   5.953 5.79e-09 ***
Income              6.2182     0.5921  10.502  < 2e-16 ***
StudentYes        476.6758   104.3512   4.568 6.59e-06 ***
Income:StudentYes  -1.9992     1.7313  -1.155    0.249    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 391.6 on 396 degrees of freedom
Multiple R-squared:  0.2799,    Adjusted R-squared:  0.2744 
F-statistic:  51.3 on 3 and 396 DF,  p-value: < 2.2e-16

Fitted Model: $\widehat{\text{Balance}} = 200.6231529 + 6.2181687\cdot \text{Income} + 476.6758432\cdot \text{Student} + -1.9991509\cdot\text{Income}\times\text{Student}$

Predictors with Only Two Levels

Suppose we wish to investigate differences in credit card balance between males and females, ignoring the other variables for the moment.

modS <- lm(Balance ~ Gender, data = Credit)
summary(modS)


Call:
lm(formula = Balance ~ Gender, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-529.54 -455.35  -60.17  334.71 1489.20 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   529.54      31.99  16.554   <2e-16 ***
Gender Male   -19.73      46.05  -0.429    0.669    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 460.2 on 398 degrees of freedom
Multiple R-squared:  0.0004611, Adjusted R-squared:  -0.00205 
F-statistic: 0.1836 on 1 and 398 DF,  p-value: 0.6685

coef(modS)

(Intercept) Gender Male 
  529.53623   -19.73312

tapply(Credit$Balance, Credit$Gender, mean)

  Female     Male 
529.5362 509.8031

library(ggplot2)
ggplot(data = Credit, aes(x = Gender, y = Balance)) + 
  geom_point() + 
  theme_bw() + 
  geom_hline(yintercept = coef(modS)[1] + coef(modS)[2], color = "blue") + 
  geom_hline(yintercept = coef(modS)[1], color = "pink")

Do females have a higher ratio of Balance to Income (credit utilization)? Here is an article from the Washington Post with numbers that mirror some of the results in the Credit data set.

Credit$Utilization <- Credit$Balance / (Credit$Income*100)
tapply(Credit$Utilization, Credit$Gender, mean)

   Female      Male 
0.1535206 0.1487092

modU <- lm(Utilization ~ Gender, data = Credit)
summary(modU)


Call:
lm(formula = Utilization ~ Gender, data = Credit)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.15352 -0.13494 -0.05202  0.06069  0.96804 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.153521   0.011962  12.834   <2e-16 ***
Gender Male -0.004811   0.017221  -0.279     0.78    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1721 on 398 degrees of freedom
Multiple R-squared:  0.0001961, Adjusted R-squared:  -0.002316 
F-statistic: 0.07806 on 1 and 398 DF,  p-value: 0.7801

coef(modU)

 (Intercept)  Gender Male 
 0.153520573 -0.004811408

ggplot(data = Credit, aes(x = Gender, y = Utilization)) + 
  geom_point() + 
  theme_bw() + 
  geom_hline(yintercept = coef(modU)[1] + coef(modU)[2], color = "blue") + 
  geom_hline(yintercept = coef(modU)[1], color = "pink")

Moving On Now

modS1 <- lm(Balance ~ Limit + Student, data = Credit)
summary(modS1)


Call:
lm(formula = Balance ~ Limit + Student, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-637.77 -116.90    6.04  130.92  434.24 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.347e+02  2.307e+01  -14.51   <2e-16 ***
Limit        1.720e-01  4.331e-03   39.70   <2e-16 ***
StudentYes   4.044e+02  3.328e+01   12.15   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 199.7 on 397 degrees of freedom
Multiple R-squared:  0.8123,    Adjusted R-squared:  0.8114 
F-statistic: 859.2 on 2 and 397 DF,  p-value: < 2.2e-16

coef(modS1)

 (Intercept)        Limit   StudentYes 
-334.7299372    0.1719538  404.4036438

# Interaction --- Non-additive Model
modS2 <- lm(Balance ~ Limit*Student, data = Credit)
summary(modS2)


Call:
lm(formula = Balance ~ Limit * Student, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-705.84 -116.90    6.91  133.97  435.92 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -3.262e+02  2.392e+01 -13.636  < 2e-16 ***
Limit             1.702e-01  4.533e-03  37.538  < 2e-16 ***
StudentYes        3.091e+02  7.878e+01   3.924 0.000103 ***
Limit:StudentYes  2.028e-02  1.520e-02   1.334 0.183010    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 199.5 on 396 degrees of freedom
Multiple R-squared:  0.8132,    Adjusted R-squared:  0.8118 
F-statistic: 574.5 on 3 and 396 DF,  p-value: < 2.2e-16

What does this look like?

Several points:

Is the interaction significant?
Which model is ggplot2 graphing below?
Is this the correct model?

ggplot(data = Credit, aes(x = Limit, y = Balance, color = Student)) + 
  geom_point() + 
  stat_smooth(method = "lm")

Correct Graph

ggplot(data = Credit, aes(x = Limit, y = Balance, color = Student)) +
  geom_point() + 
  theme_bw() + 
  geom_abline(intercept = coef(modS1)[1], slope = coef(modS1)[2], color = "red") + 
  geom_abline(intercept = coef(modS1)[1] + coef(modS1)[3], slope = coef(modS1)[2], color = "blue")

Qualitative predictors with More than Two Levels

modQ3 <- lm(Balance ~ Limit + Ethnicity, data = Credit)
summary(modQ3)


Call:
lm(formula = Balance ~ Limit + Ethnicity, data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-677.39 -145.75   -8.75  139.56  776.46 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -3.078e+02  3.417e+01  -9.007   <2e-16 ***
Limit               1.718e-01  5.079e-03  33.831   <2e-16 ***
EthnicityAsian      2.835e+01  3.304e+01   0.858    0.391    
EthnicityCaucasian  1.381e+01  2.878e+01   0.480    0.632    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 234 on 396 degrees of freedom
Multiple R-squared:  0.743, Adjusted R-squared:  0.7411 
F-statistic: 381.6 on 3 and 396 DF,  p-value: < 2.2e-16

coef(modQ3)

       (Intercept)              Limit     EthnicityAsian 
      -307.7574777          0.1718203         28.3533975 
EthnicityCaucasian 
        13.8089629

modRM <- lm(Balance ~ Limit, data = Credit)
anova(modRM, modQ3)

Analysis of Variance Table

Model 1: Balance ~ Limit
Model 2: Balance ~ Limit + Ethnicity
  Res.Df      RSS Df Sum of Sq      F Pr(>F)
1    398 21715657                           
2    396 21675307  2     40350 0.3686 0.6919

What follows fits three separate regression lines based on Ethnicity.

AfAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "African American"))
AsAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "Asian"))
CaAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "Caucasian"))
rbind(coef(AfAmer), coef(AsAmer), coef(CaAmer))

     (Intercept)     Limit
[1,]   -301.2245 0.1704820
[2,]   -305.4270 0.1774679
[3,]   -282.4442 0.1693873

ggplot(data = Credit, aes(x = Limit, y = Balance, color = Ethnicity)) +
  geom_point() + 
  theme_bw() +
  stat_smooth(method = "lm", se = FALSE)

Note: Ethnicity is not significant, so we really should have just one line.

ggplot(data = Credit, aes(x = Limit, y = Balance)) +
  geom_point(aes(color = Ethnicity)) + 
  theme_bw() +
  stat_smooth(method = "lm")

Matrix Scatterplots

scatterplotMatrix(~ Balance + Income + Limit + Rating + Cards + Age + Education + Gender + Student + Married + Ethnicity,  data = Credit)

modC <- stepAIC(lm(Balance ~ Income + Limit + Rating + Cards + Age + Education + Gender + Student + Married + Ethnicity,  data = Credit), direction = "backward", test = "F")

Start:  AIC=3686.22
Balance ~ Income + Limit + Rating + Cards + Age + Education + 
    Gender + Student + Married + Ethnicity

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
- Ethnicity  2     14084  3800814 3683.7    0.72   0.48665    
- Education  1      4615  3791345 3684.7    0.47   0.49207    
- Married    1      6619  3793349 3684.9    0.68   0.41073    
- Gender     1     11269  3798000 3685.4    1.15   0.28324    
<none>                    3786730 3686.2                      
- Age        1     42558  3829288 3688.7    4.36   0.03743 *  
- Rating     1     52314  3839044 3689.7    5.36   0.02112 *  
- Cards      1    162702  3949432 3701.0   16.67 5.401e-05 ***
- Limit      1    331050  4117780 3717.7   33.92 1.206e-08 ***
- Student    1   6326012 10112742 4077.1  648.18 < 2.2e-16 ***
- Income     1  10831162 14617892 4224.5 1109.79 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3683.7
Balance ~ Income + Limit + Rating + Cards + Age + Education + 
    Gender + Student + Married

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
- Married    1      4545  3805359 3682.2    0.47   0.49506    
- Education  1      4757  3805572 3682.2    0.49   0.48517    
- Gender     1     10760  3811574 3682.8    1.10   0.29403    
<none>                    3800814 3683.7                      
- Age        1     45650  3846464 3686.5    4.68   0.03105 *  
- Rating     1     49473  3850287 3686.9    5.08   0.02481 *  
- Cards      1    166806  3967621 3698.9   17.12 4.310e-05 ***
- Limit      1    340250  4141064 3716.0   34.91 7.521e-09 ***
- Student    1   6372573 10173387 4075.5  653.89 < 2.2e-16 ***
- Income     1  10838891 14639705 4221.1 1112.17 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3682.18
Balance ~ Income + Limit + Rating + Cards + Age + Education + 
    Gender + Student

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
- Education  1      5399  3810759 3680.7    0.55   0.45682    
- Gender     1     11019  3816378 3681.3    1.13   0.28797    
<none>                    3805359 3682.2                      
- Age        1     43545  3848904 3684.7    4.47   0.03504 *  
- Rating     1     46929  3852289 3685.1    4.82   0.02869 *  
- Cards      1    170729  3976088 3697.7   17.54 3.475e-05 ***
- Limit      1    352112  4157472 3715.6   36.18 4.137e-09 ***
- Student    1   6461978 10267338 4077.2  663.97 < 2.2e-16 ***
- Income     1  10860901 14666261 4219.8 1115.96 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3680.75
Balance ~ Income + Limit + Rating + Cards + Age + Gender + Student

          Df Sum of Sq      RSS    AIC F Value     Pr(F)    
- Gender   1     10861  3821620 3679.9    1.12   0.29117    
<none>                  3810759 3680.7                      
- Age      1     43983  3854741 3683.3    4.52   0.03404 *  
- Rating   1     49520  3860279 3683.9    5.09   0.02456 *  
- Cards    1    170898  3981656 3696.3   17.58 3.409e-05 ***
- Limit    1    347654  4158413 3713.7   35.76 5.023e-09 ***
- Student  1   6472072 10282831 4075.8  665.76 < 2.2e-16 ***
- Income   1  10855780 14666539 4217.8 1116.70 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3679.89
Balance ~ Income + Limit + Rating + Cards + Age + Student

          Df Sum of Sq      RSS    AIC F Value     Pr(F)    
<none>                  3821620 3679.9                      
- Age      1     44472  3866091 3682.5    4.57   0.03309 *  
- Rating   1     49263  3870883 3683.0    5.07   0.02495 *  
- Cards    1    172930  3994549 3695.6   17.78 3.075e-05 ***
- Limit    1    347898  4169518 3712.7   35.78 4.980e-09 ***
- Student  1   6462599 10284218 4073.9  664.59 < 2.2e-16 ***
- Income   1  10845018 14666638 4215.8 1115.26 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

modC


Call:
lm(formula = Balance ~ Income + Limit + Rating + Cards + Age + 
    Student, data = Credit)

Coefficients:
(Intercept)       Income        Limit       Rating        Cards  
  -493.7342      -7.7951       0.1937       1.0912      18.2119  
        Age   StudentYes  
    -0.6241     425.6099

modD <- stepAIC(lm(Balance ~ 1, data = Credit), scope = ~ . + Income + Limit + Cards + Age + Education + Gender + Student + Married + Ethnicity, direction = "forward", test = "F")

Start:  AIC=4905.56
Balance ~ 1

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
+ Limit      1  62624255 21715657 4364.8 1147.76 < 2.2e-16 ***
+ Income     1  18131167 66208745 4810.7  108.99 < 2.2e-16 ***
+ Student    1   5658372 78681540 4879.8   28.62 1.488e-07 ***
+ Cards      1    630416 83709496 4904.6    3.00   0.08418 .  
<none>                   84339912 4905.6                      
+ Gender     1     38892 84301020 4907.4    0.18   0.66852    
+ Education  1      5481 84334431 4907.5    0.03   0.87231    
+ Married    1      2715 84337197 4907.5    0.01   0.90994    
+ Age        1       284 84339628 4907.6    0.00   0.97081    
+ Ethnicity  2     18454 84321458 4909.5    0.04   0.95749    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=4364.83
Balance ~ Limit

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
+ Income     1  10844825 10870832 4090.1  396.05 < 2.2e-16 ***
+ Student    1   5887310 15828347 4240.3  147.66 < 2.2e-16 ***
+ Age        1    617067 21098589 4355.3   11.61 0.0007226 ***
+ Cards      1    508452 21207205 4357.4    9.52 0.0021768 ** 
<none>                   21715657 4364.8                      
+ Married    1     89278 21626379 4365.2    1.64 0.2012253    
+ Gender     1     15093 21700563 4366.6    0.28 0.5995469    
+ Education  1     12622 21703034 4366.6    0.23 0.6311289    
+ Ethnicity  2     40350 21675307 4368.1    0.37 0.6919457    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=4090.05
Balance ~ Limit + Income

            Df Sum of Sq      RSS    AIC F Value     Pr(F)    
+ Student    1   6553835  4316997 3722.6  601.19 < 2.2e-16 ***
+ Cards      1    326363 10544469 4079.9   12.26 0.0005164 ***
+ Age        1     73681 10797151 4089.3    2.70 0.1009938    
+ Married    1     57405 10813427 4089.9    2.10 0.1478749    
<none>                   10870832 4090.1                      
+ Education  1      4042 10866790 4091.9    0.15 0.7013505    
+ Gender     1       614 10870218 4092.0    0.02 0.8811968    
+ Ethnicity  2     46207 10824625 4092.3    0.84 0.4311630    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3722.64
Balance ~ Limit + Income + Student

            Df Sum of Sq     RSS    AIC F Value     Pr(F)    
+ Cards      1    401938 3915058 3685.6  40.553 5.324e-10 ***
+ Age        1     32050 4284947 3721.7   2.954   0.08643 .  
<none>                   4316997 3722.6                      
+ Education  1     15047 4301949 3723.2   1.382   0.24053    
+ Gender     1     14324 4302673 3723.3   1.315   0.25219    
+ Married    1      1682 4315315 3724.5   0.154   0.69500    
+ Ethnicity  2     12831 4304166 3725.5   0.587   0.55633    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3685.55
Balance ~ Limit + Income + Student + Cards

            Df Sum of Sq     RSS    AIC F Value   Pr(F)  
+ Age        1     44176 3870883 3683.0  4.4965 0.03459 *
<none>                   3915058 3685.6                  
+ Gender     1     11086 3903972 3686.4  1.1188 0.29082  
+ Education  1      8304 3906754 3686.7  0.8375 0.36067  
+ Married    1      1151 3913908 3687.4  0.1158 0.73378  
+ Ethnicity  2     12534 3902524 3688.3  0.6311 0.53253  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step:  AIC=3683.01
Balance ~ Limit + Income + Student + Cards + Age

            Df Sum of Sq     RSS    AIC F Value  Pr(F)
<none>                   3870883 3683.0               
+ Gender     1   10603.7 3860279 3683.9 1.07952 0.2994
+ Education  1    7793.4 3863089 3684.2 0.79284 0.3738
+ Married    1    2661.7 3868221 3684.7 0.27042 0.6033
+ Ethnicity  2    9638.6 3861244 3686.0 0.48926 0.6135

modD


Call:
lm(formula = Balance ~ Limit + Income + Student + Cards + Age, 
    data = Credit)

Coefficients:
(Intercept)        Limit       Income   StudentYes        Cards  
  -467.3345       0.2661      -7.7595     428.3786      23.5504  
        Age  
    -0.6220

# Predict
predict(modC, newdata = data.frame(Income = 80, Limit = 5000, Cards = 3, Age = 52, Student = "No", Rating = 800), interval = "pred")

      fit      lwr      upr
1 746.251 295.5691 1196.933

More Diagnostic Plots

residualPlots(modC)

           Test stat Pr(>|t|)
Income         2.442    0.015
Limit         11.709    0.000
Rating        11.214    0.000
Cards         -0.692    0.489
Age           -0.159    0.874
Student           NA       NA
Tukey test    25.379    0.000

qqPlot(modC)

influenceIndexPlot(modC)

Non-linear Relationships

library(ISLR)
car1 <- lm(mpg ~ horsepower, data = Auto)
car2 <- lm(mpg ~ poly(horsepower, 2), data = Auto)
car5 <- lm(mpg ~ poly(horsepower, 5), data = Auto)
xs <- seq(min(Auto$horsepower), max(Auto$horsepower), length = 500)
y1 <- predict(car1, newdata = data.frame(horsepower = xs))
y2 <- predict(car2, newdata = data.frame(horsepower = xs))
y5 <- predict(car5, newdata = data.frame(horsepower = xs))
DF <- data.frame(x = xs, y1 = y1, y2 = y2, y5 = y5)
ggplot(data = Auto, aes(x = horsepower, y = mpg)) + 
  geom_point() + 
  theme_bw() + 
  geom_line(data = DF, aes(x = x, y = y1), color = "red") + 
  geom_line(data = DF, aes(x = x, y = y2), color = "blue") + 
  geom_line(data = DF, aes(x = x, y = y5), color = "green")

ggplot(data = Auto, aes(x = horsepower, y = mpg)) + 
  geom_point(color = "lightblue") + 
  theme_bw() + 
  stat_smooth(method = "lm", data = Auto, color = "red", se = FALSE) + 
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), data = Auto, color = "blue", se = FALSE) + 
  stat_smooth(method = "lm", formula = y ~ poly(x, 5), data = Auto, color = "green", se = FALSE)

newC <- update(modC, .~. - Limit - Income - Rating + poly(Income, 2) + poly(Limit, 4))
summary(newC)


Call:
lm(formula = Balance ~ Cards + Age + Student + poly(Income, 2) + 
    poly(Limit, 4), data = Credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-365.08  -32.62   -4.70   32.36  219.66 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        458.4208    13.2687  34.549  < 2e-16 ***
Cards               22.8068     2.4225   9.415  < 2e-16 ***
Age                 -0.8707     0.1960  -4.441 1.17e-05 ***
StudentYes         426.1013    11.1190  38.322  < 2e-16 ***
poly(Income, 2)1 -6641.4002   136.3325 -48.715  < 2e-16 ***
poly(Income, 2)2  -290.5402    91.9410  -3.160   0.0017 ** 
poly(Limit, 4)1  13214.2696   126.2891 104.635  < 2e-16 ***
poly(Limit, 4)2   1401.5188   100.4033  13.959  < 2e-16 ***
poly(Limit, 4)3   -860.8780    70.5149 -12.208  < 2e-16 ***
poly(Limit, 4)4    488.6575    67.9057   7.196 3.20e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 66.1 on 390 degrees of freedom
Multiple R-squared:  0.9798,    Adjusted R-squared:  0.9793 
F-statistic:  2101 on 9 and 390 DF,  p-value: < 2.2e-16

residualPlots(newC)

                Test stat Pr(>|t|)
Cards               0.177    0.860
Age                -0.469    0.639
Student                NA       NA
poly(Income, 2)        NA       NA
poly(Limit, 4)         NA       NA
Tukey test         13.439    0.000

qqPlot(newC)

influenceIndexPlot(newC)

Variance Inflation Factor (VIF)

The VIF is the ratio of the variance of $\hat{\beta}_j$ when fitting the full model divided by the variance of $\hat{\beta}_j$ if it is fit on its own. The smallest possible value for VIF is 1, which indicates the complete absence of collinearity. The VIF for each variable can be computed using the formula:

\[VIF(\hat{\beta}_j) = \frac{1}{1 - R^2_{X_j|X_{-j}}}\]

where $R^2_{X_j|X_{-j}}$ is the $R^2$ from a regression of $X_j$ onto all of the other predictors. If $R^2_{X_j|X_{-j}}$ is close to one, then collinearity is present, and so the VIF will be large.

Compute the VIF for each $\hat{\beta}_j$ of modC

modC


Call:
lm(formula = Balance ~ Income + Limit + Rating + Cards + Age + 
    Student, data = Credit)

Coefficients:
(Intercept)       Income        Limit       Rating        Cards  
  -493.7342      -7.7951       0.1937       1.0912      18.2119  
        Age   StudentYes  
    -0.6241     425.6099

R2inc <- summary(lm(Income ~ Limit + Rating + Cards + Age + Student, data = Credit))$r.squared
R2inc

[1] 0.639887

VIFinc <- 1/(1 - R2inc)
VIFinc

[1] 2.776906

R2lim <- summary(lm(Limit ~ Income + Rating + Cards + Age + Student, data = Credit))$r.squared
R2lim

[1] 0.9956377

VIFlim <- 1/(1 - R2lim)
VIFlim

[1] 229.2385

This is tedious is there a function to do this? Yes!

car::vif(modC)

    Income      Limit     Rating      Cards        Age    Student 
  2.776906 229.238479 230.869514   1.439007   1.039696   1.009064

Exercise

Create a model that predicts an individuals credit rating (Rating).
Create another model that predicts rating with Limit, Cards, Married, Student, and Education as features.
Use your model to predict the Rating for an individual that has a credit card limit of $6,000, has 4 credit cards, is married, is not a student, and has an undergraduate degree (Education = 16).
Use your model to predict the Rating for an individual that has a credit card limit of $12,000, has 2 credit cards, is married, is not a student, and has an eighth grade education (Education = 8).

Chapter Two Graphs

Alan Arnholt

Last updated: Feb 18, 2016

Base R Graph

Using `ggplot2`

Using `ggvis`

Using `plotly`

Scatterplot Matrices

Chapter 3

Confidence Interval for \(\beta_1\)

Linear Algebra

Multiple Linear Regression

Graphing the plane

Is There a Relationship Between the Response and Predictors?

Variable Selection

Diagnostic Plots

Non-Additive Models

Qualitative Predictors

Predictors with Only Two Levels

Moving On Now

What does this look like?

Correct Graph

Qualitative predictors with More than Two Levels

Matrix Scatterplots

More Diagnostic Plots

Non-linear Relationships

Variance Inflation Factor (VIF)

Exercise

Chapter Two Graphs

Alan Arnholt

Last updated: Feb 18, 2016

Base R Graph

Using ggplot2

Using ggvis

Using plotly

Scatterplot Matrices

Chapter 3

Confidence Interval for \(\beta_1\)

Linear Algebra

Multiple Linear Regression

Graphing the plane

Is There a Relationship Between the Response and Predictors?

Variable Selection

Diagnostic Plots

Non-Additive Models

Qualitative Predictors

Predictors with Only Two Levels

Moving On Now

What does this look like?

Correct Graph

Qualitative predictors with More than Two Levels

Matrix Scatterplots

More Diagnostic Plots

Non-linear Relationships

Variance Inflation Factor (VIF)

Exercise

Using `ggplot2`

Using `ggvis`

Using `plotly`