library(PASWR2)
table(VIT2005$zone)
Z11 Z21 Z31 Z32 Z34 Z35 Z36 Z37 Z38 Z41 Z42 Z43 Z44 Z45 Z46 Z47 Z48 Z49
8 9 9 11 5 4 10 11 4 18 6 5 5 13 10 11 6 5
Z52 Z53 Z56 Z61 Z62
15 16 9 14 14
table(VIT2005$category)
2A 2B 3A 3B 4A 4B 5A
4 14 61 77 36 23 3
table(VIT2005$out)
E100 E25 E50 E75
122 3 87 6
table(VIT2005$conservation)
1A 2A 2B 3A
161 18 36 3
table(VIT2005$streetcategory)
S2 S3 S4 S5
42 107 59 10
table(VIT2005$heating)
1A 3A 3B 4A
8 149 10 51
##### Feature Creation
library(tidyverse)
# Collapse the sparse levels of `category` and `out` into coarser factors.
# Both recodings are built in a single mutate(); a value that matches no
# branch becomes NA because case_when() is given no default.
VIT2005 <- VIT2005 %>%
  mutate(
    new_category = factor(case_when(
      category %in% c("2A", "2B", "3A") ~ "good",
      category == "3B" ~ "fair",
      category %in% c("4A", "4B", "5A") ~ "poor"
    )),
    new_out = factor(case_when(
      out %in% c("E100", "E75") ~ "good",
      out %in% c("E50", "E25") ~ "fair"
    ))
  )
# Check the collapsed level counts for `new_out`.
table(VIT2005$new_out)
fair good
90 128
# Drop `zone` and `conservation`, plus the original `category` and `out`
# columns that new_category / new_out now replace.
VIT2005 <- select(
  VIT2005,
  -all_of(c("zone", "conservation", "category", "out"))
)
names(VIT2005)
[1] "totalprice" "area" "age" "floor"
[5] "rooms" "toilets" "garage" "elevator"
[9] "streetcategory" "heating" "storage" "new_category"
[13] "new_out"
library(caret)
# Reproducible 80/20 split of VIT2005, drawn by createDataPartition() from the
# outcome `totalprice`. `list = FALSE` returns row indices that can be used
# directly for subsetting.
set.seed(48)
trainIndex <- createDataPartition(VIT2005$totalprice, p = 0.8, list = FALSE)
training <- VIT2005[trainIndex, , drop = FALSE]
testing <- VIT2005[-trainIndex, , drop = FALSE]
# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
library(caret)
# The encoder is defined from the training rows only and then applied to both splits.
dummies_model <- dummyVars(totalprice ~ ., data=training)
# Create the dummy variables using predict. The Y variable (totalprice) will not be present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = training)
testData_mat <- predict(dummies_model, newdata = testing)
# Convert the encoded matrices to data frames
trainData <- data.frame(trainData_mat)
testData <- data.frame(testData_mat)
# See the structure of the new dataset
str(trainData)
'data.frame': 176 obs. of 21 variables:
$ area : num 75.3 88.9 62.6 146.1 77.2 ...
$ age : num 33 14 41 22 35 14 36 37 11 36 ...
$ floor : num 3 8 3 6 4 6 3 4 5 6 ...
$ rooms : num 5 5 4 7 5 4 4 4 4 6 ...
$ toilets : num 1 2 1 2 1 1 1 1 1 2 ...
$ garage : num 0 0 0 0 0 0 0 0 0 0 ...
$ elevator : num 1 1 0 1 0 1 0 0 0 1 ...
$ streetcategory.S2: num 0 1 0 0 0 0 0 0 0 1 ...
$ streetcategory.S3: num 1 0 1 0 0 1 1 0 0 0 ...
$ streetcategory.S4: num 0 0 0 1 1 0 0 1 1 0 ...
$ streetcategory.S5: num 0 0 0 0 0 0 0 0 0 0 ...
$ heating.1A : num 0 0 1 0 0 0 0 0 0 0 ...
$ heating.3A : num 1 1 0 0 1 0 1 1 1 0 ...
$ heating.3B : num 0 0 0 0 0 0 0 0 0 0 ...
$ heating.4A : num 0 0 0 1 0 1 0 0 0 1 ...
$ storage : num 0 0 0 1 1 1 1 1 1 1 ...
$ new_category.fair: num 0 0 0 0 0 0 1 0 1 0 ...
$ new_category.good: num 0 1 0 1 0 1 0 0 0 0 ...
$ new_category.poor: num 1 0 1 0 1 0 0 1 0 1 ...
$ new_out.fair : num 0 1 1 0 1 1 0 1 1 1 ...
$ new_out.good : num 1 0 0 1 0 0 1 0 0 0 ...
trainData$totalprice <- training$totalprice
str(trainData)
'data.frame': 176 obs. of 22 variables:
$ area : num 75.3 88.9 62.6 146.1 77.2 ...
$ age : num 33 14 41 22 35 14 36 37 11 36 ...
$ floor : num 3 8 3 6 4 6 3 4 5 6 ...
$ rooms : num 5 5 4 7 5 4 4 4 4 6 ...
$ toilets : num 1 2 1 2 1 1 1 1 1 2 ...
$ garage : num 0 0 0 0 0 0 0 0 0 0 ...
$ elevator : num 1 1 0 1 0 1 0 0 0 1 ...
$ streetcategory.S2: num 0 1 0 0 0 0 0 0 0 1 ...
$ streetcategory.S3: num 1 0 1 0 0 1 1 0 0 0 ...
$ streetcategory.S4: num 0 0 0 1 1 0 0 1 1 0 ...
$ streetcategory.S5: num 0 0 0 0 0 0 0 0 0 0 ...
$ heating.1A : num 0 0 1 0 0 0 0 0 0 0 ...
$ heating.3A : num 1 1 0 0 1 0 1 1 1 0 ...
$ heating.3B : num 0 0 0 0 0 0 0 0 0 0 ...
$ heating.4A : num 0 0 0 1 0 1 0 0 0 1 ...
$ storage : num 0 0 0 1 1 1 1 1 1 1 ...
$ new_category.fair: num 0 0 0 0 0 0 1 0 1 0 ...
$ new_category.good: num 0 1 0 1 0 1 0 0 0 0 ...
$ new_category.poor: num 1 0 1 0 1 0 0 1 0 1 ...
$ new_out.fair : num 0 1 1 0 1 1 0 1 1 1 ...
$ new_out.good : num 1 0 0 1 0 0 1 0 0 0 ...
$ totalprice : num 228000 200000 180000 443600 173000 ...
#### Problem with using the one-hot encoded variables as they are: they are collinear
# Every factor contributes one indicator column per level, so lm() cannot
# estimate a coefficient for each of them alongside the intercept; the summary
# reports the coefficients it could not estimate as NA ("not defined because
# of singularities").
mod_bad <- lm(totalprice ~ . , data = trainData)
summary(mod_bad)
Call:
lm(formula = totalprice ~ ., data = trainData)
Residuals:
Min 1Q Median 3Q Max
-85576 -17011 -2119 15489 93272
Coefficients: (4 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57939.3 28570.5 2.028 0.04425 *
area 1741.0 195.6 8.902 1.22e-15 ***
age -199.0 231.5 -0.860 0.39117
floor -2031.0 1100.4 -1.846 0.06681 .
rooms 6857.2 4909.3 1.397 0.16444
toilets 6025.8 7759.3 0.777 0.43856
garage 25668.3 6086.7 4.217 4.15e-05 ***
elevator 14255.8 7710.2 1.849 0.06633 .
streetcategory.S2 -2754.8 15904.1 -0.173 0.86270
streetcategory.S3 -4091.4 15668.1 -0.261 0.79433
streetcategory.S4 11114.5 15371.3 0.723 0.47071
streetcategory.S5 NA NA NA NA
heating.1A -20808.1 16018.2 -1.299 0.19583
heating.3A -10575.4 6069.4 -1.742 0.08338 .
heating.3B -2349.2 11271.2 -0.208 0.83517
heating.4A NA NA NA NA
storage 18504.6 6768.8 2.734 0.00698 **
new_category.fair 11467.6 6706.0 1.710 0.08922 .
new_category.good 26493.5 8293.2 3.195 0.00169 **
new_category.poor NA NA NA NA
new_out.fair -1239.6 4939.3 -0.251 0.80216
new_out.good NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 30300 on 158 degrees of freedom
Multiple R-squared: 0.8269, Adjusted R-squared: 0.8083
F-statistic: 44.4 on 17 and 158 DF, p-value: < 2.2e-16
#### Use the un-encoded training data and let R encode the factors
#### Inspect the design matrix that lm(totalprice ~ ., data = training) uses;
#### it is built here straight from the formula, without fitting a model first.
X <- model.matrix(totalprice ~ ., data = training)
head(X)
(Intercept) area age floor rooms toilets garage elevator
1 1 75.31 33 3 5 1 0 1
3 1 88.87 14 8 5 2 0 1
4 1 62.61 41 3 4 1 0 0
5 1 146.15 22 6 7 2 0 1
6 1 77.21 35 4 5 1 0 0
7 1 77.04 14 6 4 1 0 1
streetcategoryS3 streetcategoryS4 streetcategoryS5 heating3A heating3B
1 1 0 0 1 0
3 0 0 0 1 0
4 1 0 0 0 0
5 0 1 0 0 0
6 0 1 0 1 0
7 1 0 0 0 0
heating4A storage new_categorygood new_categorypoor new_outgood
1 0 0 0 1 1
3 0 0 1 0 0
4 0 0 0 1 0
5 1 1 1 0 1
6 0 1 0 1 0
7 1 1 1 0 0
#### Fit on the un-encoded training data: lm() dummy-codes the factors itself
#### and leaves out one level per factor, so every coefficient is estimable.
mod_good <- lm(totalprice ~ ., data = training)
summary(mod_good)
Call:
lm(formula = totalprice ~ ., data = training)
Residuals:
Min 1Q Median 3Q Max
-85576 -17011 -2119 15489 93272
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44604.4 27041.0 1.650 0.10103
area 1741.0 195.6 8.902 1.22e-15 ***
age -199.0 231.5 -0.860 0.39117
floor -2031.0 1100.4 -1.846 0.06681 .
rooms 6857.2 4909.3 1.397 0.16444
toilets 6025.8 7759.3 0.777 0.43856
garage 25668.3 6086.7 4.217 4.15e-05 ***
elevator 14255.8 7710.2 1.849 0.06633 .
streetcategoryS3 -1336.6 6655.2 -0.201 0.84109
streetcategoryS4 13869.4 7058.5 1.965 0.05118 .
streetcategoryS5 2754.8 15904.1 0.173 0.86270
heating3A 10232.7 15313.9 0.668 0.50498
heating3B 18458.9 18829.1 0.980 0.32842
heating4A 20808.1 16018.2 1.299 0.19583
storage 18504.6 6768.8 2.734 0.00698 **
new_categorygood 15025.8 6614.1 2.272 0.02445 *
new_categorypoor -11467.6 6706.0 -1.710 0.08922 .
new_outgood 1239.6 4939.3 0.251 0.80216
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 30300 on 158 degrees of freedom
Multiple R-squared: 0.8269, Adjusted R-squared: 0.8083
F-statistic: 44.4 on 17 and 158 DF, p-value: < 2.2e-16
# trainControl---10 fold cv repeated 5 times
# NOTE: this definition is overwritten by the 5-fold version directly below
# before any model is trained, so it never reaches train(). It is kept only as
# an example of the repeated-CV settings.
myControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
savePredictions = "final")
# 5 fold cv
# This second assignment is the one in effect for every later
# train(..., trControl = myControl) call.
myControl <- trainControl(method = "cv",
number = 5,
savePredictions = "final")
# Linear model through caret's formula interface, resampled with myControl.
# The seed is set immediately before train() so the fold assignment is
# reproducible.
set.seed(31)
mod_lm <- train(
  totalprice ~ .,
  data = training,
  method = "lm",
  trControl = myControl
)
mod_lm$results
intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 TRUE 33091.02 0.7672885 25255.23 4359.556 0.05647467 4549.297
summary(mod_lm$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-85576 -17011 -2119 15489 93272
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44604.4 27041.0 1.650 0.10103
area 1741.0 195.6 8.902 1.22e-15 ***
age -199.0 231.5 -0.860 0.39117
floor -2031.0 1100.4 -1.846 0.06681 .
rooms 6857.2 4909.3 1.397 0.16444
toilets 6025.8 7759.3 0.777 0.43856
garage 25668.3 6086.7 4.217 4.15e-05 ***
elevator 14255.8 7710.2 1.849 0.06633 .
streetcategoryS3 -1336.6 6655.2 -0.201 0.84109
streetcategoryS4 13869.4 7058.5 1.965 0.05118 .
streetcategoryS5 2754.8 15904.1 0.173 0.86270
heating3A 10232.7 15313.9 0.668 0.50498
heating3B 18458.9 18829.1 0.980 0.32842
heating4A 20808.1 16018.2 1.299 0.19583
storage 18504.6 6768.8 2.734 0.00698 **
new_categorygood 15025.8 6614.1 2.272 0.02445 *
new_categorypoor -11467.6 6706.0 -1.710 0.08922 .
new_outgood 1239.6 4939.3 0.251 0.80216
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 30300 on 158 degrees of freedom
Multiple R-squared: 0.8269, Adjusted R-squared: 0.8083
F-statistic: 44.4 on 17 and 158 DF, p-value: < 2.2e-16
# Same linear model through caret's x/y interface. Predictors are every column
# of `training` except the outcome, selected by name rather than by position.
set.seed(3)
mod_lm2 <- train(
  x = training[, setdiff(names(training), "totalprice")],
  y = training$totalprice,
  method = "lm",
  trControl = myControl
)
mod_lm2$results
intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 TRUE 32526.6 0.7844173 24956.66 3007.286 0.04715992 2584.782
summary(mod_lm2$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-85576 -17011 -2119 15489 93272
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44604.4 27041.0 1.650 0.10103
area 1741.0 195.6 8.902 1.22e-15 ***
age -199.0 231.5 -0.860 0.39117
floor -2031.0 1100.4 -1.846 0.06681 .
rooms 6857.2 4909.3 1.397 0.16444
toilets 6025.8 7759.3 0.777 0.43856
garage 25668.3 6086.7 4.217 4.15e-05 ***
elevator 14255.8 7710.2 1.849 0.06633 .
streetcategoryS3 -1336.6 6655.2 -0.201 0.84109
streetcategoryS4 13869.4 7058.5 1.965 0.05118 .
streetcategoryS5 2754.8 15904.1 0.173 0.86270
heating3A 10232.7 15313.9 0.668 0.50498
heating3B 18458.9 18829.1 0.980 0.32842
heating4A 20808.1 16018.2 1.299 0.19583
storage 18504.6 6768.8 2.734 0.00698 **
new_categorygood 15025.8 6614.1 2.272 0.02445 *
new_categorypoor -11467.6 6706.0 -1.710 0.08922 .
new_outgood 1239.6 4939.3 0.251 0.80216
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 30300 on 158 degrees of freedom
Multiple R-squared: 0.8269, Adjusted R-squared: 0.8083
F-statistic: 44.4 on 17 and 158 DF, p-value: < 2.2e-16
# Hold-out RMSE for the linear model.
# Predict through the train object instead of mod_lm2$finalModel: predict() on
# a train object is the interface caret documents, and it applies any
# pre-processing attached to the fit. No preProcess step was requested for
# mod_lm2, so the reported RMSE is expected to be unchanged (re-knit to confirm).
RMSE(predict(mod_lm2, newdata = testing), testing$totalprice)
[1] 41901.39
# Random forest via the x/y interface, trying 10 candidate values of mtry.
# No trControl is passed here, so train() falls back to its default
# resampling scheme instead of myControl.
set.seed(3)
mod_rf <- train(
  x = training[, setdiff(names(training), "totalprice")],
  y = training$totalprice,
  method = "rf",
  tuneLength = 10
)
mod_rf$results
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 33959.09 0.7728257 25971.76 4343.822 0.03180525 2890.291
2 3 33474.89 0.7724928 25567.79 3909.747 0.03181650 2713.571
3 4 33358.23 0.7710148 25394.98 3658.900 0.03350988 2631.646
4 5 33568.18 0.7657300 25398.22 3688.559 0.03630463 2701.750
5 6 33712.66 0.7625855 25466.03 3713.832 0.03896434 2796.892
6 7 34079.94 0.7565949 25672.09 3701.871 0.04154037 2833.704
7 8 34351.24 0.7520470 25800.31 3742.916 0.04460034 2898.400
8 9 34519.01 0.7487772 25951.90 3696.100 0.04526042 2915.003
9 10 34829.21 0.7443197 26159.44 3842.793 0.04641695 3020.045
10 12 35269.59 0.7378202 26582.03 3948.585 0.04924283 3114.371
#### Hold-out RMSE for the random forest fitted with default resampling.
# Predict through the train object rather than mod_rf$finalModel so caret's own
# predict method handles `testing`; no manual model.matrix() encoding of the
# test set is needed. No preProcess step was requested, so the RMSE is expected
# to be unchanged (re-knit to confirm).
yhat <- predict(mod_rf, newdata = testing)
RMSE(yhat, testing$totalprice)
[1] 42159.3
# Random forest again, this time resampled with myControl so its results are
# produced under the same scheme as the linear models above. Overwrites the
# previous mod_rf.
set.seed(3)
mod_rf <- train(
  x = training[, setdiff(names(training), "totalprice")],
  y = training$totalprice,
  method = "rf",
  tuneLength = 10,
  trControl = myControl
)
mod_rf$results
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 33318.00 0.7888373 25136.86 3752.484 0.03623065 1353.374
2 3 32674.38 0.7893243 24563.35 3775.252 0.03463207 1897.169
3 4 32894.61 0.7818971 24653.22 4154.356 0.04219018 2289.029
4 5 32717.51 0.7817098 24450.72 4416.466 0.05186453 2835.321
5 6 32733.55 0.7810320 24285.38 4335.748 0.04549921 2675.504
6 7 33083.57 0.7738842 24434.59 4877.632 0.05837481 3013.818
7 8 33153.61 0.7717089 24453.33 5158.883 0.06019273 3411.898
8 9 33316.59 0.7690802 24515.05 5281.864 0.06508762 3410.183
9 10 33477.05 0.7652961 24722.25 5503.229 0.07013493 3336.967
10 12 33808.76 0.7594031 24870.08 6229.157 0.08239102 3688.106
# Hold-out RMSE for the cross-validated random forest. Uses predict() on the
# train object (not mod_rf$finalModel), which is the interface caret documents.
RMSE(predict(mod_rf, newdata = testing), testing$totalprice)
[1] 42792.11
# Random forest on the one-hot encoded training data (trainData). Predictors
# are every column except the outcome, selected by name rather than by the
# positional index 22. Overwrites the previous mod_rf.
set.seed(3)
mod_rf <- train(
  x = trainData[, setdiff(names(trainData), "totalprice")],
  y = trainData$totalprice,
  method = "rf",
  tuneLength = 10,
  trControl = myControl
)
mod_rf$results
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 35726.65 0.7751193 26744.67 3963.913 0.04404517 905.1174
2 4 33211.61 0.7844783 24741.06 3394.930 0.04567664 1247.2310
3 6 32619.36 0.7870544 24323.12 3665.673 0.04054923 2037.1012
4 8 32471.93 0.7871411 24166.60 4016.611 0.04209065 2256.3961
5 10 32941.99 0.7779851 24413.83 4432.608 0.04933864 2680.1171
6 12 32915.60 0.7755008 24219.16 4757.575 0.05724150 3027.5187
7 14 33377.54 0.7678553 24363.44 5057.922 0.06548101 3196.0579
8 16 33492.75 0.7648839 24561.98 5327.788 0.06846601 3252.1438
9 18 33623.30 0.7614994 24548.11 5844.926 0.07890941 3454.6432
10 21 33915.99 0.7578535 24873.19 6404.775 0.08316092 3723.5811
# Hold-out RMSE for the random forest trained on the one-hot encoded data.
# `testData` is the encoded test set; the observed prices still come from
# `testing`. Predict through the train object rather than mod_rf$finalModel,
# which is the interface caret documents.
RMSE(predict(mod_rf, newdata = testData), testing$totalprice)
[1] 41448.96
# Stepwise AIC selection of a linear model, resampled with myControl.
# The step trace for the final fit is printed to the console.
set.seed(3)
mod_aic <- train(
  x = training[, setdiff(names(training), "totalprice")],
  y = training$totalprice,
  method = "lmStepAIC",
  trControl = myControl
)
Start: AIC=3649.29
.outcome ~ area + age + floor + rooms + toilets + garage + elevator +
streetcategory + heating + storage + new_category + new_out
Df Sum of Sq RSS AIC
- new_out 1 5.7837e+07 1.4514e+11 3647.4
- heating 3 3.5540e+09 1.4863e+11 3647.6
- toilets 1 5.5377e+08 1.4563e+11 3648.0
- age 1 6.7890e+08 1.4576e+11 3648.1
<none> 1.4508e+11 3649.3
- rooms 1 1.7915e+09 1.4687e+11 3649.5
- floor 1 3.1281e+09 1.4821e+11 3651.0
- elevator 1 3.1391e+09 1.4822e+11 3651.1
- streetcategory 3 6.9715e+09 1.5205e+11 3651.6
- storage 1 6.8625e+09 1.5194e+11 3655.4
- new_category 2 9.5083e+09 1.5459e+11 3656.5
- garage 1 1.6330e+10 1.6141e+11 3666.1
- area 1 7.2770e+10 2.1785e+11 3718.8
Step: AIC=3647.36
.outcome ~ area + age + floor + rooms + toilets + garage + elevator +
streetcategory + heating + storage + new_category
Df Sum of Sq RSS AIC
- heating 3 3.5481e+09 1.4869e+11 3645.6
- toilets 1 6.1882e+08 1.4576e+11 3646.1
- age 1 7.0108e+08 1.4584e+11 3646.2
<none> 1.4514e+11 3647.4
- rooms 1 1.7503e+09 1.4689e+11 3647.5
- floor 1 3.0951e+09 1.4823e+11 3649.1
- elevator 1 3.1545e+09 1.4829e+11 3649.1
- streetcategory 3 6.9845e+09 1.5212e+11 3649.6
- storage 1 6.8717e+09 1.5201e+11 3653.5
- new_category 2 9.5369e+09 1.5467e+11 3654.6
- garage 1 1.6398e+10 1.6154e+11 3664.2
- area 1 7.2714e+10 2.1785e+11 3716.8
Step: AIC=3645.61
.outcome ~ area + age + floor + rooms + toilets + garage + elevator +
streetcategory + storage + new_category
Df Sum of Sq RSS AIC
- age 1 6.6299e+08 1.4935e+11 3644.4
- toilets 1 7.3540e+08 1.4942e+11 3644.5
<none> 1.4869e+11 3645.6
- rooms 1 1.9956e+09 1.5068e+11 3646.0
- floor 1 2.4577e+09 1.5114e+11 3646.5
- streetcategory 3 6.9637e+09 1.5565e+11 3647.7
- elevator 1 4.4518e+09 1.5314e+11 3648.8
- new_category 2 1.1031e+10 1.5972e+11 3654.2
- storage 1 1.0469e+10 1.5916e+11 3655.6
- garage 1 1.5227e+10 1.6391e+11 3660.8
- area 1 7.6559e+10 2.2524e+11 3716.7
Step: AIC=3644.4
.outcome ~ area + floor + rooms + toilets + garage + elevator +
streetcategory + storage + new_category
Df Sum of Sq RSS AIC
- toilets 1 8.2404e+08 1.5017e+11 3643.4
<none> 1.4935e+11 3644.4
- rooms 1 1.8995e+09 1.5125e+11 3644.6
- floor 1 2.4876e+09 1.5184e+11 3645.3
- streetcategory 3 6.7449e+09 1.5609e+11 3646.2
- elevator 1 6.1461e+09 1.5549e+11 3649.5
- new_category 2 1.3109e+10 1.6246e+11 3655.2
- storage 1 1.2033e+10 1.6138e+11 3656.0
- garage 1 1.5444e+10 1.6479e+11 3659.7
- area 1 7.5926e+10 2.2527e+11 3714.7
Step: AIC=3643.36
.outcome ~ area + floor + rooms + garage + elevator + streetcategory +
storage + new_category
Df Sum of Sq RSS AIC
<none> 1.5017e+11 3643.4
- rooms 1 1.9671e+09 1.5214e+11 3643.7
- floor 1 2.2932e+09 1.5247e+11 3644.0
- streetcategory 3 6.5463e+09 1.5672e+11 3644.9
- elevator 1 6.3181e+09 1.5649e+11 3648.6
- storage 1 1.2936e+10 1.6311e+11 3655.9
- garage 1 1.7025e+10 1.6720e+11 3660.3
- new_category 2 2.1208e+10 1.7138e+11 3662.6
- area 1 9.5579e+10 2.4575e+11 3728.1
mod_aic$results
parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 none 32751.82 0.783739 24651.48 2932.697 0.04225347 2437.257
summary(mod_aic$finalModel)
Call:
lm(formula = .outcome ~ area + floor + rooms + garage + elevator +
streetcategory + storage + new_category, data = dat)
Residuals:
Min 1Q Median 3Q Max
-90662 -16730 -1783 16238 95192
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46071.28 20807.25 2.214 0.028195 *
area 1823.29 178.46 10.217 < 2e-16 ***
floor -1715.17 1083.82 -1.583 0.115457
rooms 7114.78 4854.31 1.466 0.144655
garage 25676.99 5954.93 4.312 2.79e-05 ***
elevator 18775.02 7147.61 2.627 0.009437 **
streetcategoryS3 -1765.86 6531.33 -0.270 0.787218
streetcategoryS4 12772.45 6879.27 1.857 0.065154 .
streetcategoryS5 85.58 15633.10 0.005 0.995639
storage 23430.15 6233.65 3.759 0.000237 ***
new_categorygood 18002.37 5995.89 3.002 0.003098 **
new_categorypoor -15687.31 6270.01 -2.502 0.013331 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 30260 on 164 degrees of freedom
Multiple R-squared: 0.8208, Adjusted R-squared: 0.8088
F-statistic: 68.31 on 11 and 164 DF, p-value: < 2.2e-16
# Hold-out RMSE for the stepwise-AIC model. Uses predict() on the train object
# (not mod_aic$finalModel), which is the interface caret documents.
RMSE(predict(mod_aic, newdata = testing), testing$totalprice)
[1] 43052.24
- Delete rows with missing values — this can lead to problems.
- Impute the missing values — for example, median imputation or kNN imputation.
NOTE: `lm()` automatically drops rows with missing values.
Consider the data set `Hitters` from the `ISLR` package.
library(ISLR)
summary(Hitters)
AtBat Hits HmRun Runs
Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
Median :379.5 Median : 96 Median : 8.00 Median : 48.00
Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
RBI Walks Years CAtBat
Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
CHits CHmRun CRuns CRBI
Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
CWalks League Division PutOuts Assists
Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
Median : 170.50 Median : 212.0 Median : 39.5
Mean : 260.24 Mean : 288.9 Mean :106.9
3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
Max. :1566.00 Max. :1378.0 Max. :492.0
Errors Salary NewLeague
Min. : 0.00 Min. : 67.5 A:176
1st Qu.: 3.00 1st Qu.: 190.0 N:146
Median : 6.00 Median : 425.0
Mean : 8.04 Mean : 535.9
3rd Qu.:11.00 3rd Qu.: 750.0
Max. :32.00 Max. :2460.0
NA's :59
Note that there are 59 salary values that are NA. Are these values missing at random or is there some non-random reason they are missing?
Three data sets will be created: one with the rows containing missing values deleted (`RD`), one with the missing values imputed using `medianImpute` (`IFDM`), and one with the missing values imputed using `knnImpute` (`HittersF`).
set.seed(43)
dim(Hitters)
[1] 322 20
RD <- na.omit(Hitters)
dim(RD)
[1] 263 20
# Median imputation: estimate the imputation from Hitters and apply it to the
# same data in one step, so IFDM only ever holds the imputed data frame.
IFDM <- predict(
  preProcess(Hitters, method = "medianImpute"),
  newdata = Hitters
)
summary(IFDM)
AtBat Hits HmRun Runs
Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
Median :379.5 Median : 96 Median : 8.00 Median : 48.00
Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
RBI Walks Years CAtBat
Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
CHits CHmRun CRuns CRBI
Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
CWalks League Division PutOuts Assists
Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
Median : 170.50 Median : 212.0 Median : 39.5
Mean : 260.24 Mean : 288.9 Mean :106.9
3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
Max. :1566.00 Max. :1378.0 Max. :492.0
Errors Salary NewLeague
Min. : 0.00 Min. : 67.5 A:176
1st Qu.: 3.00 1st Qu.: 226.2 N:146
Median : 6.00 Median : 425.0
Mean : 8.04 Mean : 515.6
3rd Qu.:11.00 3rd Qu.: 700.0
Max. :32.00 Max. :2460.0
# kNN imputation: estimate and apply in one step. The summary below shows the
# numeric columns on a standardized scale (mean 0), so Salary must be
# transformed back before it is used in its original units.
IFDK <- predict(
  preProcess(Hitters, method = "knnImpute"),
  newdata = Hitters
)
summary(IFDK)
AtBat Hits HmRun Runs
Min. :-2.378857 Min. :-2.1532 Min. :-1.2367 Min. :-1.9563
1st Qu.:-0.819260 1st Qu.:-0.7970 1st Qu.:-0.7774 1st Qu.:-0.7939
Median :-0.009312 Median :-0.1082 Median :-0.3181 Median :-0.1118
Mean : 0.000000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.854414 3rd Qu.: 0.7744 3rd Qu.: 0.6005 3rd Qu.: 0.6951
Max. : 1.995186 Max. : 2.9486 Max. : 3.3563 Max. : 3.0391
RBI Walks Years CAtBat
Min. :-1.8354 Min. :-1.7904 Min. :-1.3082 Min. :-1.1314
1st Qu.:-0.7654 1st Qu.:-0.7737 1st Qu.:-0.6992 1st Qu.:-0.7882
Median :-0.1539 Median :-0.1729 Median :-0.2932 Median :-0.3101
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.6391 3rd Qu.: 0.6589 3rd Qu.: 0.7219 3rd Qu.: 0.5488
Max. : 2.7887 Max. : 3.0619 Max. : 3.3609 Max. : 4.9068
CHits CHmRun CRuns CRBI
Min. :-1.0903 Min. :-0.8055 Min. :-1.0709 Min. :-0.9907
1st Qu.:-0.7771 1st Qu.:-0.6432 1st Qu.:-0.7738 1st Qu.:-0.7244
Median :-0.3202 Median :-0.3708 Median :-0.3346 Median :-0.3290
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.5221 3rd Qu.: 0.2377 3rd Qu.: 0.5012 3rd Qu.: 0.2885
Max. : 5.4065 Max. : 5.5469 Max. : 5.4061 Max. : 3.9880
CWalks League Division PutOuts Assists
Min. :-0.9745 A:175 E:157 Min. :-1.0293 Min. :-0.7812
1st Qu.:-0.7226 N:147 W:165 1st Qu.:-0.6401 1st Qu.:-0.7301
Median :-0.3360 Median :-0.2741 Median :-0.4926
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.2959 3rd Qu.: 0.1285 3rd Qu.: 0.4317
Max. : 4.8894 Max. : 3.8797 Max. : 2.8138
Errors Salary NewLeague
Min. :-1.2626 Min. :-1.03836 A:176
1st Qu.:-0.7915 1st Qu.:-0.76626 N:146
Median :-0.3204 Median :-0.28191
Mean : 0.0000 Mean :-0.04422
3rd Qu.: 0.4647 3rd Qu.: 0.44545
Max. : 3.7623 Max. : 4.26512
# Put the kNN-imputed Salary back on its original scale by reversing the
# standardization with the mean and sd of the observed (non-missing) salaries.
# Only Salary is replaced; the other columns are taken from Hitters unchanged.
HittersF <- Hitters
HittersF$Salary <- IFDK$Salary * sd(Hitters$Salary, na.rm = TRUE) +
  mean(Hitters$Salary, na.rm = TRUE)
summary(HittersF)
AtBat Hits HmRun Runs
Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
Median :379.5 Median : 96 Median : 8.00 Median : 48.00
Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
RBI Walks Years CAtBat
Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
CHits CHmRun CRuns CRBI
Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
CWalks League Division PutOuts Assists
Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
Median : 170.50 Median : 212.0 Median : 39.5
Mean : 260.24 Mean : 288.9 Mean :106.9
3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
Max. :1566.00 Max. :1378.0 Max. :492.0
Errors Salary NewLeague
Min. : 0.00 Min. : 67.5 A:176
1st Qu.: 3.00 1st Qu.: 190.2 N:146
Median : 6.00 Median : 408.8
Mean : 8.04 Mean : 516.0
3rd Qu.:11.00 3rd Qu.: 736.9
Max. :32.00 Max. :2460.0
compare <- data.frame(Original = Hitters$Salary, Imputed = HittersF$Salary)
head(compare)
Original Imputed
1 NA 84.4
2 475.0 475.0
3 480.0 480.0
4 500.0 500.0
5 91.5 91.5
6 750.0 750.0
tail(compare)
Original Imputed
317 NA 325.5
318 700 700.0
319 875 875.0
320 385 385.0
321 960 960.0
322 1000 1000.0
# One-Hot Encoding
# Creating dummy variables converts a categorical variable into as many binary variables as there are categories.
dummies_model <- dummyVars(Salary ~ ., data=HittersF)
# Create the dummy variables using predict. The Y variable (Salary) will not be present in Data_mat.
Data_mat <- predict(dummies_model, newdata = HittersF)
# Convert to dataframe
Data <- data.frame(Data_mat)
# Add the (imputed) outcome back as the last column
Data$Salary <- HittersF$Salary
# See the structure of the new dataset
str(Data)
'data.frame': 322 obs. of 23 variables:
$ AtBat : num 293 315 479 496 321 594 185 298 323 401 ...
$ Hits : num 66 81 130 141 87 169 37 73 81 92 ...
$ HmRun : num 1 7 18 20 10 4 1 0 6 17 ...
$ Runs : num 30 24 66 65 39 74 23 24 26 49 ...
$ RBI : num 29 38 72 78 42 51 8 24 32 66 ...
$ Walks : num 14 39 76 37 30 35 21 7 8 65 ...
$ Years : num 1 14 3 11 2 11 2 3 2 13 ...
$ CAtBat : num 293 3449 1624 5628 396 ...
$ CHits : num 66 835 457 1575 101 ...
$ CHmRun : num 1 69 63 225 12 19 1 0 6 253 ...
$ CRuns : num 30 321 224 828 48 501 30 41 32 784 ...
$ CRBI : num 29 414 266 838 46 336 9 37 34 890 ...
$ CWalks : num 14 375 263 354 33 194 24 12 8 866 ...
$ League.A : num 1 0 1 0 0 1 0 1 0 1 ...
$ League.N : num 0 1 0 1 1 0 1 0 1 0 ...
$ Division.E : num 1 0 0 1 1 0 1 0 0 1 ...
$ Division.W : num 0 1 1 0 0 1 0 1 1 0 ...
$ PutOuts : num 446 632 880 200 805 282 76 121 143 0 ...
$ Assists : num 33 43 82 11 40 421 127 283 290 0 ...
$ Errors : num 20 10 14 3 4 25 7 9 19 0 ...
$ NewLeague.A: num 1 0 1 0 0 1 1 1 0 1 ...
$ NewLeague.N: num 0 1 0 1 1 0 0 0 1 0 ...
$ Salary : num 84.4 475 480 500 91.5 750 70 100 75 1100 ...
# Reproducible 80/20 split of the one-hot encoded Hitters data. The indices are
# drawn from HittersF$Salary; Data has the same rows in the same order, so they
# apply to it directly.
set.seed(31)
trainIndex <- createDataPartition(HittersF$Salary, p = 0.8, list = FALSE)
trainingDum <- Data[trainIndex, , drop = FALSE]
testingDum <- Data[-trainIndex, , drop = FALSE]
# Stepwise AIC linear model for Salary on the kNN-imputed data, resampled with
# myControl. Predictors are every column of HittersF except Salary, selected by
# name rather than by the positional index 19.
# NOTE(review): this is fitted on all rows of HittersF, not on the trainIndex
# partition created just above; confirm that evaluating by cross-validation on
# the full data is intended before scoring this model on the held-out rows.
set.seed(3)
mod_regI <- train(
  x = HittersF[, setdiff(names(HittersF), "Salary")],
  y = HittersF$Salary,
  method = "lmStepAIC",
  tuneLength = 10,
  trControl = myControl
)
Start: AIC=3693.83
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
Division + PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- CHits 1 9 27284863 3691.8
- RBI 1 3945 27288799 3691.9
- NewLeague 1 9143 27293998 3691.9
- CHmRun 1 11883 27296737 3692.0
- Runs 1 17611 27302466 3692.0
- League 1 18952 27303806 3692.1
- Years 1 21258 27306112 3692.1
- CRBI 1 37767 27322621 3692.3
- HmRun 1 42438 27327292 3692.3
- CAtBat 1 60517 27345372 3692.5
- Errors 1 73429 27358283 3692.7
<none> 27284854 3693.8
- Assists 1 335055 27619909 3695.8
- CRuns 1 405450 27690305 3696.6
- CWalks 1 631191 27916045 3699.2
- Division 1 695937 27980791 3699.9
- Walks 1 949756 28234610 3702.8
- Hits 1 965435 28250289 3703.0
- AtBat 1 1162945 28447800 3705.3
- PutOuts 1 1588265 28873120 3710.0
Step: AIC=3691.83
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHmRun + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- RBI 1 4145 27289009 3689.9
- NewLeague 1 9193 27294056 3689.9
- League 1 19046 27303909 3690.1
- Years 1 21830 27306693 3690.1
- Runs 1 21835 27306699 3690.1
- CHmRun 1 22052 27306916 3690.1
- HmRun 1 45563 27330426 3690.4
- CRBI 1 68680 27353543 3690.6
- Errors 1 74237 27359101 3690.7
- CAtBat 1 125798 27410662 3691.3
<none> 27284863 3691.8
- Assists 1 340920 27625783 3693.8
- Division 1 696455 27981318 3697.9
- CWalks 1 859173 28144037 3699.8
- CRuns 1 973792 28258655 3701.1
- Walks 1 991666 28276529 3701.3
- AtBat 1 1273339 28558202 3704.5
- Hits 1 1297392 28582255 3704.8
- PutOuts 1 1597454 28882317 3708.1
Step: AIC=3689.88
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + Years + CAtBat +
CHmRun + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- NewLeague 1 8929 27297937 3688.0
- Runs 1 18983 27307992 3688.1
- League 1 19186 27308195 3688.1
- Years 1 22493 27311502 3688.1
- CHmRun 1 30566 27319575 3688.2
- HmRun 1 65462 27354470 3688.6
- CRBI 1 65858 27354867 3688.7
- Errors 1 75639 27364648 3688.8
- CAtBat 1 121720 27410728 3689.3
<none> 27289009 3689.9
- Assists 1 340123 27629132 3691.9
- Division 1 694514 27983523 3696.0
- CWalks 1 855069 28144078 3697.8
- CRuns 1 974983 28263992 3699.2
- Walks 1 1001866 28290874 3699.5
- AtBat 1 1325398 28614407 3703.1
- Hits 1 1353783 28642792 3703.5
- PutOuts 1 1601243 28890252 3706.2
Step: AIC=3687.98
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + Years + CAtBat +
CHmRun + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors
Df Sum of Sq RSS AIC
- League 1 13179 27311117 3686.1
- Runs 1 18036 27315973 3686.2
- Years 1 21263 27319201 3686.2
- CHmRun 1 30743 27328681 3686.3
- CRBI 1 66297 27364235 3686.8
- HmRun 1 67540 27365478 3686.8
- Errors 1 72342 27370280 3686.8
- CAtBat 1 125507 27423444 3687.5
<none> 27297937 3688.0
- Assists 1 341747 27639685 3690.0
- Division 1 690884 27988821 3694.0
- CWalks 1 848012 28145949 3695.8
- CRuns 1 979400 28277338 3697.3
- Walks 1 994547 28292485 3697.5
- AtBat 1 1343379 28641316 3701.5
- Hits 1 1359203 28657141 3701.6
- PutOuts 1 1600564 28898502 3704.3
Step: AIC=3686.14
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + Years + CAtBat +
CHmRun + CRuns + CRBI + CWalks + Division + PutOuts + Assists +
Errors
Df Sum of Sq RSS AIC
- Runs 1 21458 27332575 3684.4
- Years 1 26081 27337197 3684.4
- CHmRun 1 30331 27341448 3684.5
- HmRun 1 64530 27375646 3684.9
- CRBI 1 66264 27377380 3684.9
- Errors 1 68038 27379154 3684.9
- CAtBat 1 118502 27429618 3685.5
<none> 27311117 3686.1
- Assists 1 336386 27647503 3688.1
- Division 1 696627 28007743 3692.2
- CWalks 1 859486 28170603 3694.1
- CRuns 1 968169 28279286 3695.4
- Walks 1 1041550 28352666 3696.2
- AtBat 1 1353305 28664422 3699.7
- Hits 1 1372203 28683320 3699.9
- PutOuts 1 1610367 28921483 3702.6
Step: AIC=3684.39
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CHmRun +
CRuns + CRBI + CWalks + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Years 1 25943 27358518 3682.7
- CHmRun 1 30093 27362668 3682.7
- HmRun 1 46204 27378779 3682.9
- Errors 1 68744 27401319 3683.2
- CRBI 1 75483 27408059 3683.3
- CAtBat 1 105250 27437825 3683.6
<none> 27332575 3684.4
- Assists 1 347896 27680471 3686.5
- Division 1 696521 28029096 3690.5
- CWalks 1 846866 28179441 3692.2
- CRuns 1 980261 28312836 3693.7
- Walks 1 1073708 28406283 3694.8
- AtBat 1 1394455 28727030 3698.4
- Hits 1 1514978 28847553 3699.8
- PutOuts 1 1680734 29013309 3701.6
Step: AIC=3682.7
.outcome ~ AtBat + Hits + HmRun + Walks + CAtBat + CHmRun + CRuns +
CRBI + CWalks + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- CHmRun 1 24847 27383365 3681.0
- HmRun 1 46473 27404992 3681.2
- Errors 1 61680 27420199 3681.4
- CRBI 1 85616 27444134 3681.7
<none> 27358518 3682.7
- CAtBat 1 225708 27584226 3683.3
- Assists 1 356310 27714828 3684.9
- Division 1 681748 28040267 3688.6
- CWalks 1 888710 28247229 3691.0
- Walks 1 1084171 28442689 3693.2
- CRuns 1 1219238 28577756 3694.7
- AtBat 1 1369408 28727927 3696.4
- Hits 1 1498805 28857324 3697.9
- PutOuts 1 1701216 29059734 3700.1
Step: AIC=3680.99
.outcome ~ AtBat + Hits + HmRun + Walks + CAtBat + CRuns + CRBI +
CWalks + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Errors 1 67626 27450991 3679.8
- HmRun 1 85287 27468653 3680.0
<none> 27383365 3681.0
- Assists 1 363546 27746912 3683.2
- CAtBat 1 584068 27967433 3685.8
- Division 1 676337 28059702 3686.8
- CWalks 1 864036 28247401 3689.0
- CRBI 1 927486 28310851 3689.7
- Walks 1 1060473 28443838 3691.2
- AtBat 1 1387545 28770910 3694.9
- CRuns 1 1447821 28831186 3695.6
- Hits 1 1565550 28948915 3696.9
- PutOuts 1 1676388 29059754 3698.1
Step: AIC=3679.78
.outcome ~ AtBat + Hits + HmRun + Walks + CAtBat + CRuns + CRBI +
CWalks + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- HmRun 1 75019 27526011 3678.7
<none> 27450991 3679.8
- Assists 1 321859 27772850 3681.5
- CAtBat 1 580497 28031488 3684.5
- Division 1 665140 28116131 3685.5
- CWalks 1 838580 28289572 3687.5
- CRBI 1 917180 28368172 3688.4
- Walks 1 1082000 28532991 3690.2
- CRuns 1 1444588 28895580 3694.3
- AtBat 1 1473617 28924609 3694.6
- PutOuts 1 1616307 29067299 3696.2
- Hits 1 1645106 29096097 3696.5
Step: AIC=3678.66
.outcome ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
Division + PutOuts + Assists
Df Sum of Sq RSS AIC
<none> 27526011 3678.7
- Assists 1 257389 27783399 3679.7
- Division 1 655026 28181037 3684.2
- CAtBat 1 735290 28261301 3685.1
- CWalks 1 804186 28330197 3685.9
- Walks 1 1065701 28591712 3688.9
- AtBat 1 1404622 28930633 3692.7
- CRBI 1 1407297 28933308 3692.7
- CRuns 1 1447013 28973024 3693.2
- Hits 1 1606492 29132503 3694.9
- PutOuts 1 1634869 29160880 3695.2
# Resampling performance metrics stored in the mod_regI fit (created earlier).
mod_regI[["results"]]
parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 none 308.2208 0.4842391 216.5839 40.56372 0.09441538 15.87248
# Coefficient table and fit statistics of the final model kept in mod_regI.
summary(mod_regI[["finalModel"]])
Call:
lm(formula = .outcome ~ AtBat + Hits + Walks + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists, data = dat)
Residuals:
Min 1Q Median 3Q Max
-905.59 -161.99 -39.12 123.68 1918.91
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 159.50832 53.26097 2.995 0.002966 **
AtBat -1.86151 0.46728 -3.984 8.46e-05 ***
Hits 6.22487 1.46111 4.260 2.71e-05 ***
Walks 4.91723 1.41708 3.470 0.000594 ***
CAtBat -0.13556 0.04703 -2.882 0.004223 **
CRuns 1.37948 0.34117 4.043 6.65e-05 ***
CRBI 0.68200 0.17103 3.988 8.33e-05 ***
CWalks -0.66875 0.22186 -3.014 0.002788 **
DivisionW -91.57311 33.66125 -2.720 0.006887 **
PutOuts 0.27733 0.06453 4.298 2.31e-05 ***
Assists 0.24303 0.14251 1.705 0.089135 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 297.5 on 311 degrees of freedom
Multiple R-squared: 0.5302, Adjusted R-squared: 0.5151
F-statistic: 35.1 on 10 and 311 DF, p-value: < 2.2e-16
# Stepwise-AIC linear regression ("lmStepAIC") of Salary on the IFDM data.
# Column 19 is excluded from the predictors; presumably that is the Salary
# column (it does not appear in the stepwise trace) -- TODO confirm.
# The seed is set right before train() so any random resampling is repeatable.
set.seed(3)
mod_regM <- train(
  x = IFDM[, -19],
  y = IFDM$Salary,
  method = "lmStepAIC",
  trControl = myControl,
  tuneLength = 10
)
Start: AIC=3712.93
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
Division + PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- RBI 1 319 28952621 3710.9
- NewLeague 1 1429 28953731 3710.9
- Runs 1 3236 28955538 3711.0
- CHmRun 1 8023 28960325 3711.0
- CHits 1 13442 28965744 3711.1
- League 1 21813 28974115 3711.2
- HmRun 1 61106 29013408 3711.6
- CRBI 1 80936 29033238 3711.8
- Years 1 97986 29050288 3712.0
- Errors 1 104235 29056537 3712.1
- CAtBat 1 138985 29091287 3712.5
<none> 28952302 3712.9
- CRuns 1 328328 29280631 3714.6
- Assists 1 370094 29322396 3715.0
- CWalks 1 501220 29453523 3716.5
- Hits 1 721273 29673576 3718.9
- Division 1 915491 29867793 3721.0
- PutOuts 1 940399 29892701 3721.2
- Walks 1 962245 29914547 3721.5
- AtBat 1 1161040 30113342 3723.6
Step: AIC=3710.93
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + Years + CAtBat +
CHits + CHmRun + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- NewLeague 1 1427 28954048 3708.9
- Runs 1 2931 28955552 3709.0
- CHmRun 1 7858 28960479 3709.0
- CHits 1 14658 28967279 3709.1
- League 1 21957 28974578 3709.2
- CRBI 1 91142 29043763 3709.9
- Years 1 98069 29050690 3710.0
- Errors 1 105229 29057850 3710.1
- HmRun 1 136727 29089348 3710.4
- CAtBat 1 139349 29091970 3710.5
<none> 28952621 3710.9
- CRuns 1 331297 29283918 3712.6
- Assists 1 370453 29323074 3713.0
- CWalks 1 508155 29460776 3714.5
- Hits 1 790144 29742765 3717.6
- Division 1 927793 29880414 3719.1
- PutOuts 1 940643 29893264 3719.2
- Walks 1 999923 29952544 3719.9
- AtBat 1 1173935 30126556 3721.7
Step: AIC=3708.95
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + Years + CAtBat +
CHits + CHmRun + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Runs 1 2973 28957021 3707.0
- CHmRun 1 8403 28962452 3707.0
- CHits 1 13818 28967866 3707.1
- League 1 56112 29010160 3707.6
- CRBI 1 93604 29047652 3708.0
- Years 1 97479 29051527 3708.0
- Errors 1 103922 29057971 3708.1
- CAtBat 1 138317 29092365 3708.5
- HmRun 1 138889 29092937 3708.5
<none> 28954048 3708.9
- CRuns 1 338989 29293038 3710.7
- Assists 1 370384 29324432 3711.0
- CWalks 1 508668 29462716 3712.6
- Hits 1 799337 29753385 3715.7
- Division 1 926530 29880578 3717.1
- PutOuts 1 941091 29895140 3717.2
- Walks 1 998511 29952560 3717.9
- AtBat 1 1191649 30145697 3719.9
Step: AIC=3706.98
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CHits +
CHmRun + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors
Df Sum of Sq RSS AIC
- CHmRun 1 6396 28963417 3705.1
- CHits 1 23007 28980028 3705.2
- League 1 58802 29015823 3705.6
- CRBI 1 90635 29047656 3706.0
- Years 1 95503 29052524 3706.0
- Errors 1 106178 29063199 3706.2
- HmRun 1 146344 29103365 3706.6
- CAtBat 1 152204 29109225 3706.7
<none> 28957021 3707.0
- Assists 1 382752 29339773 3709.2
- CRuns 1 405920 29362941 3709.5
- CWalks 1 521603 29478624 3710.7
- Division 1 926126 29883147 3715.1
- PutOuts 1 955165 29912186 3715.4
- Hits 1 1058614 30015635 3716.5
- Walks 1 1136073 30093095 3717.4
- AtBat 1 1188680 30145701 3717.9
Step: AIC=3705.05
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CHits +
CRuns + CRBI + CWalks + League + Division + PutOuts + Assists +
Errors
Df Sum of Sq RSS AIC
- League 1 57849 29021266 3703.7
- CHits 1 71950 29035367 3703.9
- Years 1 95024 29058441 3704.1
- Errors 1 106740 29070157 3704.2
- HmRun 1 140778 29104195 3704.6
- CAtBat 1 166662 29130079 3704.9
<none> 28963417 3705.1
- Assists 1 386825 29350241 3707.3
- CRBI 1 426936 29390352 3707.8
- CWalks 1 517953 29481370 3708.8
- CRuns 1 562248 29525665 3709.2
- Division 1 929741 29893158 3713.2
- PutOuts 1 956576 29919993 3713.5
- Hits 1 1066910 30030327 3714.7
- Walks 1 1154504 30117921 3715.6
- AtBat 1 1192590 30156007 3716.0
Step: AIC=3703.69
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CHits +
CRuns + CRBI + CWalks + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- CHits 1 78242 29099509 3702.6
- Errors 1 96242 29117508 3702.8
- Years 1 114674 29135940 3703.0
- HmRun 1 125255 29146521 3703.1
- CAtBat 1 157492 29178758 3703.4
<none> 29021266 3703.7
- Assists 1 376990 29398256 3705.8
- CRBI 1 425627 29446893 3706.4
- CWalks 1 523941 29545208 3707.5
- CRuns 1 526193 29547460 3707.5
- Division 1 942742 29964009 3712.0
- PutOuts 1 974148 29995414 3712.3
- Hits 1 1056415 30077681 3713.2
- AtBat 1 1204691 30225958 3714.8
- Walks 1 1216619 30237885 3714.9
Step: AIC=3702.56
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Errors 1 79590 29179099 3701.4
- HmRun 1 88036 29187545 3701.5
- CAtBat 1 88702 29188210 3701.5
- Years 1 154396 29253905 3702.3
<none> 29099509 3702.6
- Assists 1 345832 29445340 3704.4
- CRBI 1 476442 29575951 3705.8
- CWalks 1 906246 30005755 3710.4
- Division 1 954487 30053996 3711.0
- CRuns 1 989217 30088726 3711.3
- PutOuts 1 1094991 30194500 3712.5
- Walks 1 1288078 30387587 3714.5
- Hits 1 1747754 30847262 3719.3
- AtBat 1 1825725 30925234 3720.2
Step: AIC=3701.44
.outcome ~ AtBat + Hits + HmRun + Walks + Years + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- HmRun 1 76255 29255354 3700.3
- CAtBat 1 97466 29276564 3700.5
- Years 1 134468 29313567 3700.9
<none> 29179099 3701.4
- Assists 1 279169 29458268 3702.5
- CRBI 1 469093 29648192 3704.6
- CWalks 1 883612 30062710 3709.0
- Division 1 937092 30116191 3709.6
- CRuns 1 1004667 30183766 3710.3
- PutOuts 1 1037683 30216781 3710.7
- Walks 1 1316746 30495845 3713.7
- Hits 1 1836380 31015479 3719.1
- AtBat 1 1922926 31102025 3720.0
Step: AIC=3700.28
.outcome ~ AtBat + Hits + Walks + Years + CAtBat + CRuns + CRBI +
CWalks + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- Years 1 128845 29384199 3699.7
- CAtBat 1 146493 29401847 3699.9
<none> 29255354 3700.3
- Assists 1 217581 29472935 3700.7
- CRBI 1 785538 30040892 3706.8
- CWalks 1 849804 30105157 3707.5
- Division 1 924238 30179592 3708.3
- CRuns 1 1011697 30267051 3709.2
- PutOuts 1 1053680 30309034 3709.7
- Walks 1 1299323 30554677 3712.3
- Hits 1 1795269 31050623 3717.5
- AtBat 1 1863298 31118652 3718.2
Step: AIC=3699.7
.outcome ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
Division + PutOuts + Assists
Df Sum of Sq RSS AIC
<none> 29384199 3699.7
- Assists 1 274411 29658611 3700.7
- CAtBat 1 526476 29910675 3703.4
- CRBI 1 792476 30176675 3706.3
- Division 1 886295 30270495 3707.3
- CWalks 1 958090 30342289 3708.0
- PutOuts 1 1108613 30492812 3709.6
- Walks 1 1332602 30716801 3712.0
- CRuns 1 1384486 30768685 3712.5
- Hits 1 1774998 31159197 3716.6
- AtBat 1 1790601 31174801 3716.7
# Resampling performance metrics stored by train() for mod_regM.
mod_regM[["results"]]
parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 none 328.3498 0.3675606 239.91 49.58947 0.05841537 26.98391
# Coefficient table and fit statistics of the final model selected for mod_regM.
summary(mod_regM[["finalModel"]])
Call:
lm(formula = .outcome ~ AtBat + Hits + Walks + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists, data = dat)
Residuals:
Min 1Q Median 3Q Max
-756.69 -176.58 -34.86 148.37 1841.12
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 244.62649 55.02935 4.445 1.22e-05 ***
AtBat -2.10177 0.48279 -4.353 1.82e-05 ***
Hits 6.54319 1.50962 4.334 1.98e-05 ***
Walks 5.49861 1.46413 3.756 0.000206 ***
CAtBat -0.11470 0.04859 -2.361 0.018865 *
CRuns 1.34934 0.35250 3.828 0.000156 ***
CRBI 0.51178 0.17671 2.896 0.004046 **
CWalks -0.72995 0.22923 -3.184 0.001598 **
DivisionW -106.51923 34.77887 -3.063 0.002385 **
PutOuts 0.22837 0.06667 3.425 0.000696 ***
Assists 0.25094 0.14724 1.704 0.089340 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 307.4 on 311 degrees of freedom
Multiple R-squared: 0.455, Adjusted R-squared: 0.4374
F-statistic: 25.96 on 10 and 311 DF, p-value: < 2.2e-16
# Stepwise-AIC linear regression ("lmStepAIC") of Salary on the RD data.
# Column 19 is excluded from the predictors; presumably that is the Salary
# column (it does not appear in the stepwise trace) -- TODO confirm.
# Same seed as the other stepwise fits so any random resampling is repeatable.
set.seed(3)
mod_reg <- train(
  x = RD[, -19],
  y = RD$Salary,
  method = "lmStepAIC",
  trControl = myControl,
  tuneLength = 10
)
Start: AIC=3046.02
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
Division + PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- CHmRun 1 1138 24201837 3044.0
- CHits 1 3930 24204629 3044.1
- Years 1 7869 24208569 3044.1
- NewLeague 1 9784 24210484 3044.1
- RBI 1 16076 24216776 3044.2
- HmRun 1 48572 24249272 3044.6
- Errors 1 58324 24259023 3044.7
- League 1 62121 24262821 3044.7
- Runs 1 63291 24263990 3044.7
- CRBI 1 135439 24336138 3045.5
- CAtBat 1 159864 24360564 3045.8
<none> 24200700 3046.0
- Assists 1 280263 24480963 3047.1
- CRuns 1 374007 24574707 3048.1
- CWalks 1 609408 24810108 3050.6
- Division 1 834491 25035190 3052.9
- AtBat 1 971288 25171987 3054.4
- Hits 1 991242 25191941 3054.6
- Walks 1 1156606 25357305 3056.3
- PutOuts 1 1319628 25520328 3058.0
Step: AIC=3044.03
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- Years 1 7609 24209447 3042.1
- NewLeague 1 10268 24212106 3042.2
- CHits 1 14003 24215840 3042.2
- RBI 1 14955 24216793 3042.2
- HmRun 1 52777 24254614 3042.6
- Errors 1 59530 24261367 3042.7
- League 1 63407 24265244 3042.7
- Runs 1 64860 24266698 3042.7
- CAtBat 1 174992 24376830 3043.9
<none> 24201837 3044.0
- Assists 1 285766 24487603 3045.1
- CRuns 1 611358 24813196 3048.6
- CWalks 1 645627 24847464 3049.0
- Division 1 834637 25036474 3050.9
- CRBI 1 864220 25066057 3051.3
- AtBat 1 970861 25172699 3052.4
- Hits 1 1025981 25227819 3052.9
- Walks 1 1167378 25369216 3054.4
- PutOuts 1 1325273 25527110 3056.1
Step: AIC=3042.12
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- NewLeague 1 9931 24219377 3040.2
- RBI 1 15989 24225436 3040.3
- CHits 1 18291 24227738 3040.3
- HmRun 1 54144 24263591 3040.7
- Errors 1 57312 24266759 3040.7
- Runs 1 63172 24272619 3040.8
- League 1 65732 24275178 3040.8
<none> 24209447 3042.1
- CAtBat 1 266205 24475652 3043.0
- Assists 1 293479 24502926 3043.3
- CRuns 1 646350 24855797 3047.1
- CWalks 1 649269 24858716 3047.1
- Division 1 827511 25036958 3049.0
- CRBI 1 872121 25081568 3049.4
- AtBat 1 968713 25178160 3050.4
- Hits 1 1018379 25227825 3050.9
- Walks 1 1164536 25373983 3052.5
- PutOuts 1 1334525 25543972 3054.2
Step: AIC=3040.22
.outcome ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors
Df Sum of Sq RSS AIC
- RBI 1 15800 24235177 3038.4
- CHits 1 15859 24235237 3038.4
- Errors 1 54505 24273883 3038.8
- HmRun 1 54938 24274316 3038.8
- Runs 1 62294 24281671 3038.9
- League 1 107479 24326856 3039.4
<none> 24219377 3040.2
- CAtBat 1 261336 24480713 3041.1
- Assists 1 295536 24514914 3041.4
- CWalks 1 648860 24868237 3045.2
- CRuns 1 661449 24880826 3045.3
- Division 1 824672 25044049 3047.0
- CRBI 1 880429 25099806 3047.6
- AtBat 1 999057 25218434 3048.9
- Hits 1 1034463 25253840 3049.2
- Walks 1 1157205 25376583 3050.5
- PutOuts 1 1335173 25554550 3052.3
Step: AIC=3038.4
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CHits +
CRuns + CRBI + CWalks + League + Division + PutOuts + Assists +
Errors
Df Sum of Sq RSS AIC
- CHits 1 13483 24248660 3036.5
- HmRun 1 44586 24279763 3036.9
- Runs 1 54057 24289234 3037.0
- Errors 1 57656 24292833 3037.0
- League 1 108644 24343821 3037.6
<none> 24235177 3038.4
- CAtBat 1 252756 24487934 3039.1
- Assists 1 294674 24529851 3039.6
- CWalks 1 639690 24874868 3043.2
- CRuns 1 693535 24928712 3043.8
- Division 1 808984 25044161 3045.0
- CRBI 1 893830 25129008 3045.9
- Hits 1 1034884 25270061 3047.4
- AtBat 1 1042798 25277975 3047.5
- Walks 1 1145013 25380191 3048.5
- PutOuts 1 1340713 25575890 3050.6
Step: AIC=3036.54
.outcome ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CRuns +
CRBI + CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- HmRun 1 40487 24289148 3035.0
- Errors 1 51930 24300590 3035.1
- Runs 1 79343 24328003 3035.4
- League 1 114742 24363402 3035.8
<none> 24248660 3036.5
- Assists 1 283442 24532103 3037.6
- CAtBat 1 613356 24862016 3041.1
- Division 1 801474 25050134 3043.1
- CRBI 1 903248 25151908 3044.2
- CWalks 1 1011953 25260613 3045.3
- Walks 1 1246164 25494824 3047.7
- AtBat 1 1339620 25588280 3048.7
- CRuns 1 1390808 25639469 3049.2
- PutOuts 1 1406023 25654684 3049.4
- Hits 1 1607990 25856650 3051.4
Step: AIC=3034.98
.outcome ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Errors 1 44085 24333232 3033.5
- Runs 1 49068 24338215 3033.5
- League 1 103837 24392985 3034.1
<none> 24289148 3035.0
- Assists 1 247002 24536150 3035.6
- CAtBat 1 652746 24941894 3040.0
- Division 1 795643 25084791 3041.5
- CWalks 1 982896 25272044 3043.4
- Walks 1 1205823 25494971 3045.7
- AtBat 1 1300972 25590120 3046.7
- CRuns 1 1351200 25640348 3047.2
- CRBI 1 1353507 25642655 3047.2
- PutOuts 1 1429006 25718154 3048.0
- Hits 1 1574140 25863288 3049.5
Step: AIC=3033.46
.outcome ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- Runs 1 54113 24387345 3032.0
- League 1 91269 24424501 3032.4
<none> 24333232 3033.5
- Assists 1 220010 24553242 3033.8
- CAtBat 1 650513 24983746 3038.4
- Division 1 799455 25132687 3040.0
- CWalks 1 971260 25304493 3041.8
- Walks 1 1239533 25572765 3044.5
- CRBI 1 1331672 25664904 3045.5
- CRuns 1 1361070 25694302 3045.8
- AtBat 1 1378592 25711824 3045.9
- PutOuts 1 1391660 25724892 3046.1
- Hits 1 1649291 25982523 3048.7
Step: AIC=3032.04
.outcome ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- League 1 113056 24500402 3031.3
<none> 24387345 3032.0
- Assists 1 280689 24668034 3033.1
- CAtBat 1 596622 24983967 3036.4
- Division 1 780369 25167714 3038.3
- CWalks 1 946687 25334032 3040.1
- Walks 1 1212997 25600342 3042.8
- CRuns 1 1334397 25721742 3044.1
- CRBI 1 1361339 25748684 3044.3
- PutOuts 1 1455210 25842555 3045.3
- AtBat 1 1522760 25910105 3046.0
- Hits 1 1718870 26106215 3047.9
Step: AIC=3031.26
.outcome ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
Division + PutOuts + Assists
Df Sum of Sq RSS AIC
<none> 24500402 3031.3
- Assists 1 313650 24814051 3032.6
- CAtBat 1 534156 25034558 3034.9
- Division 1 798473 25298875 3037.7
- CWalks 1 965875 25466276 3039.4
- CRuns 1 1265082 25765484 3042.5
- Walks 1 1290168 25790569 3042.8
- CRBI 1 1326770 25827172 3043.1
- PutOuts 1 1551523 26051925 3045.4
- AtBat 1 1589780 26090181 3045.8
- Hits 1 1716068 26216469 3047.1
# Resampling performance metrics stored by train() for mod_reg.
mod_reg[["results"]]
parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 none 332.5762 0.469946 233.6482 44.15506 0.07766309 20.82597
# Coefficient table and fit statistics of the final model selected for mod_reg.
summary(mod_reg[["finalModel"]])
Call:
lm(formula = .outcome ~ AtBat + Hits + Walks + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists, data = dat)
Residuals:
Min 1Q Median 3Q Max
-939.11 -176.87 -34.08 130.90 1910.55
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 162.53544 66.90784 2.429 0.015830 *
AtBat -2.16865 0.53630 -4.044 7.00e-05 ***
Hits 6.91802 1.64665 4.201 3.69e-05 ***
Walks 5.77322 1.58483 3.643 0.000327 ***
CAtBat -0.13008 0.05550 -2.344 0.019858 *
CRuns 1.40825 0.39040 3.607 0.000373 ***
CRBI 0.77431 0.20961 3.694 0.000271 ***
CWalks -0.83083 0.26359 -3.152 0.001818 **
DivisionW -112.38006 39.21438 -2.866 0.004511 **
PutOuts 0.29737 0.07444 3.995 8.50e-05 ***
Assists 0.28317 0.15766 1.796 0.073673 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 311.8 on 252 degrees of freedom
Multiple R-squared: 0.5405, Adjusted R-squared: 0.5223
F-statistic: 29.64 on 10 and 252 DF, p-value: < 2.2e-16
Note: the data need to be encoded (e.g., with model.matrix()) before using glmnet
functions!
\(\alpha = 0 \rightarrow\) ridge; \(\alpha = 1 \rightarrow\) lasso
# Tune a glmnet model for Salary on the full HittersF data, comparing
# alpha = 0 and alpha = 1 over 20 evenly spaced lambda values in [0.0001, 100].
set.seed(435)
# model.matrix() expands factors into dummy columns, giving the numeric matrix
# that glmnet requires.
DM <- model.matrix(Salary ~ ., data = HittersF)
mod_RL <- train(
  x = DM[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = HittersF$Salary,
  method = "glmnet",
  trControl = myControl,
  tuneGrid = expand.grid(
    alpha = 0:1,
    # `length.out` spelled in full; the original `length =` only worked through
    # partial argument matching.
    lambda = seq(0.0001, 100, length.out = 20)
  )
)
# Resampling metrics for every (alpha, lambda) pair, rounded to 4 decimals.
round(mod_RL[["results"]], digits = 4)
alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 0 0.0001 312.8275 0.4655 219.4810 52.5345 0.0942 23.5352
2 0 5.2633 312.8275 0.4655 219.4810 52.5345 0.0942 23.5352
3 0 10.5264 312.8275 0.4655 219.4810 52.5345 0.0942 23.5352
4 0 15.7896 312.8275 0.4655 219.4810 52.5345 0.0942 23.5352
5 0 21.0527 312.8275 0.4655 219.4810 52.5345 0.0942 23.5352
6 0 26.3159 312.8207 0.4656 219.1784 52.7229 0.0950 23.8353
7 0 31.5790 312.8702 0.4649 218.5672 52.9128 0.0968 24.0638
8 0 36.8422 312.9324 0.4643 218.0795 53.0795 0.0983 24.1761
9 0 42.1053 312.9991 0.4638 217.6913 53.2287 0.0995 24.2416
10 0 47.3685 313.0663 0.4633 217.3716 53.3624 0.1005 24.3209
11 0 52.6316 313.1322 0.4629 217.0991 53.4869 0.1013 24.4050
12 0 57.8948 313.1957 0.4625 216.8515 53.5985 0.1020 24.4796
13 0 63.1579 313.2568 0.4621 216.6242 53.7059 0.1026 24.5438
14 0 68.4211 313.3139 0.4618 216.4157 53.8048 0.1031 24.6005
15 0 73.6842 313.3707 0.4615 216.2231 53.8986 0.1036 24.6504
16 0 78.9474 313.4273 0.4612 216.0437 53.9885 0.1040 24.6948
17 0 84.2105 313.4786 0.4610 215.8969 54.0729 0.1044 24.7393
18 0 89.4737 313.5287 0.4607 215.7704 54.1540 0.1047 24.7752
19 0 94.7368 313.5805 0.4605 215.6503 54.2325 0.1050 24.8088
20 0 100.0000 313.6296 0.4603 215.5411 54.3056 0.1053 24.8419
21 1 0.0001 319.3912 0.4541 227.5934 51.2095 0.0597 24.2360
22 1 5.2633 316.6939 0.4545 218.7426 52.5336 0.1021 29.3388
23 1 10.5264 316.0521 0.4540 215.9250 53.8211 0.1047 28.5333
24 1 15.7896 316.4139 0.4537 216.4106 54.7699 0.1047 28.3375
25 1 21.0527 317.3460 0.4522 217.2927 55.5754 0.1059 27.8541
26 1 26.3159 318.6386 0.4498 218.4059 56.3858 0.1075 27.3452
27 1 31.5790 320.2101 0.4470 219.9216 57.2137 0.1093 26.9738
28 1 36.8422 322.0216 0.4437 221.6239 57.9863 0.1111 26.6677
29 1 42.1053 323.8219 0.4409 223.6050 58.2463 0.1119 26.4199
30 1 47.3685 325.7105 0.4379 225.8548 58.4025 0.1121 26.2166
31 1 52.6316 327.7226 0.4348 228.2878 58.6124 0.1120 26.1270
32 1 57.8948 329.7521 0.4317 230.7360 58.9552 0.1127 26.4382
33 1 63.1579 331.7077 0.4292 233.2884 59.4409 0.1140 26.9513
34 1 68.4211 333.8022 0.4262 235.7589 59.8343 0.1148 27.0841
35 1 73.6842 336.0669 0.4228 238.4157 60.2130 0.1154 27.2202
36 1 78.9474 338.3631 0.4194 241.0617 60.6169 0.1165 27.2526
37 1 84.2105 340.6945 0.4161 243.6643 61.0109 0.1181 27.1885
38 1 89.4737 343.0321 0.4132 246.2320 61.3144 0.1197 27.0635
39 1 94.7368 345.3900 0.4105 248.7585 61.4852 0.1211 26.9867
40 1 100.0000 347.8063 0.4077 251.2971 61.5896 0.1222 26.8890
# Tuning-parameter combination selected by train() for mod_RL.
mod_RL[["bestTune"]]
alpha lambda
6 0 26.31586
# Plot the tuning profile, then list the coefficients at the selected lambda.
plot(mod_RL)
coef(mod_RL[["finalModel"]], s = mod_RL[["bestTune"]][["lambda"]])
20 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1.092261e+02
AtBat -5.703172e-01
Hits 2.180048e+00
HmRun -1.531211e-01
Runs 1.488494e+00
RBI 5.884955e-01
Walks 2.718316e+00
Years -7.746742e+00
CAtBat 1.820866e-03
CHits 1.082137e-01
CHmRun 7.418973e-01
CRuns 2.773815e-01
CRBI 1.753277e-01
CWalks -2.001928e-01
LeagueN 2.996154e+01
DivisionW -1.035995e+02
PutOuts 2.564265e-01
Assists 1.816449e-01
Errors -3.800791e+00
NewLeagueN -2.018531e+01
# Split HittersF 80/20 on Salary, then tune glmnet on the training part only.
set.seed(98)
trainIndex <- createDataPartition(
  y = HittersF$Salary,
  p = 0.80,
  list = FALSE,
  times = 1
)
training <- HittersF[trainIndex, ]
testing <- HittersF[-trainIndex, ]

# Separate seed for the tuning step so it can be reproduced on its own.
set.seed(44)
# Numeric design matrix (factors expanded to dummies) for glmnet.
DM <- model.matrix(Salary ~ ., data = training)
mod_RLTT <- train(
  x = DM[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = training$Salary,
  method = "glmnet",
  trControl = myControl,
  tuneGrid = expand.grid(
    alpha = 0:1,
    # `length.out` spelled in full; the original `length =` only worked through
    # partial argument matching.
    lambda = seq(0.0001, 100, length.out = 20)
  )
)
# Resampling metrics for every (alpha, lambda) pair, rounded to 4 decimals.
round(mod_RLTT[["results"]], digits = 4)
alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 0 0.0001 319.5231 0.4610 219.1847 70.8489 0.0938 22.6336
2 0 5.2633 319.5231 0.4610 219.1847 70.8489 0.0938 22.6336
3 0 10.5264 319.5231 0.4610 219.1847 70.8489 0.0938 22.6336
4 0 15.7896 319.5231 0.4610 219.1847 70.8489 0.0938 22.6336
5 0 21.0527 319.5231 0.4610 219.1847 70.8489 0.0938 22.6336
6 0 26.3159 319.7148 0.4604 219.1478 70.8462 0.0935 22.6525
7 0 31.5790 319.1643 0.4615 218.5287 71.3404 0.0934 22.9076
8 0 36.8422 318.6636 0.4626 217.9720 71.7960 0.0934 23.1945
9 0 42.1053 318.1949 0.4636 217.4321 72.2167 0.0936 23.4711
10 0 47.3685 317.7710 0.4646 216.9194 72.5992 0.0938 23.7303
11 0 52.6316 317.3813 0.4655 216.4495 72.9629 0.0941 23.9535
12 0 57.8948 317.0179 0.4663 216.0217 73.2953 0.0944 24.1313
13 0 63.1579 316.6850 0.4671 215.6292 73.6089 0.0947 24.2843
14 0 68.4211 316.3733 0.4678 215.3541 73.9047 0.0950 24.4045
15 0 73.6842 316.0847 0.4685 215.1053 74.1854 0.0953 24.5318
16 0 78.9474 315.8239 0.4692 214.8800 74.4468 0.0957 24.6552
17 0 84.2105 315.5814 0.4697 214.6612 74.6934 0.0960 24.7743
18 0 89.4737 315.3505 0.4703 214.4483 74.9298 0.0963 24.8929
19 0 94.7368 315.1407 0.4709 214.2453 75.1562 0.0967 25.0097
20 0 100.0000 314.9463 0.4714 214.0622 75.3729 0.0970 25.1141
21 1 0.0001 327.0071 0.4463 225.9898 69.9704 0.1149 26.7881
22 1 5.2633 321.7263 0.4563 219.2424 71.8999 0.0978 24.4929
23 1 10.5264 318.1856 0.4651 217.9882 74.4477 0.1008 26.2135
24 1 15.7896 317.7862 0.4651 218.0738 75.5927 0.1031 27.9932
25 1 21.0527 318.2833 0.4632 218.2173 76.9540 0.1068 29.5462
26 1 26.3159 319.0452 0.4608 218.9933 78.5887 0.1121 30.5130
27 1 31.5790 320.0435 0.4582 220.3331 80.1448 0.1177 31.6624
28 1 36.8422 321.0185 0.4562 221.7212 80.8235 0.1213 32.2085
29 1 42.1053 322.3984 0.4533 223.4415 81.0804 0.1243 32.5852
30 1 47.3685 324.1537 0.4494 225.5314 81.1067 0.1271 32.8993
31 1 52.6316 325.9976 0.4454 227.6966 81.0002 0.1293 33.1480
32 1 57.8948 327.7811 0.4418 229.6485 80.9546 0.1315 33.5769
33 1 63.1579 329.5421 0.4386 231.4863 80.7864 0.1334 33.9218
34 1 68.4211 331.2945 0.4359 233.5454 80.5384 0.1353 34.2624
35 1 73.6842 333.0261 0.4338 235.8559 80.3566 0.1372 34.6283
36 1 78.9474 334.7890 0.4321 238.3868 80.2113 0.1384 34.8362
37 1 84.2105 336.7651 0.4297 241.1485 79.9953 0.1391 34.8792
38 1 89.4737 338.8637 0.4271 244.0105 79.7939 0.1398 34.9215
39 1 94.7368 341.1071 0.4244 246.8200 79.5933 0.1403 35.0190
40 1 100.0000 343.4588 0.4215 249.5914 79.4286 0.1408 35.1521
# Tuning-parameter combination selected by train() for mod_RLTT.
mod_RLTT[["bestTune"]]
alpha lambda
20 0 100
# Plot the tuning profile of mod_RLTT, then list its coefficients at the
# selected lambda.
# Fix: the original called coef() on mod_RL (the full-data fit), so it simply
# repeated the earlier coefficient listing instead of showing mod_RLTT's.
# NOTE(review): the coefficient printout that follows predates this fix and
# needs to be regenerated.
plot(mod_RLTT)
coef(mod_RLTT$finalModel, mod_RLTT$bestTune$lambda)
20 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1.092261e+02
AtBat -5.703172e-01
Hits 2.180048e+00
HmRun -1.531211e-01
Runs 1.488494e+00
RBI 5.884955e-01
Walks 2.718316e+00
Years -7.746742e+00
CAtBat 1.820866e-03
CHits 1.082137e-01
CHmRun 7.418973e-01
CRuns 2.773815e-01
CRBI 1.753277e-01
CWalks -2.001928e-01
LeagueN 2.996154e+01
DivisionW -1.035995e+02
PutOuts 2.564265e-01
Assists 1.816449e-01
Errors -3.800791e+00
NewLeagueN -2.018531e+01
# Tune a lasso-only glmnet model (alpha fixed at 1) on the training split over
# 20 evenly spaced lambda values in [0.0001, 100].
set.seed(31)
# Numeric design matrix (factors expanded to dummies) for glmnet.
DM2 <- model.matrix(Salary ~ ., data = training)
mod_LASSO <- train(
  x = DM2[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = training$Salary,
  method = "glmnet",
  trControl = myControl,
  tuneGrid = expand.grid(
    alpha = 1,
    # `length.out` spelled in full; the original `length =` only worked through
    # partial argument matching.
    lambda = seq(0.0001, 100, length.out = 20)
  )
)
# Resampling metrics for every lambda on the lasso grid, rounded to 4 decimals.
round(mod_LASSO[["results"]], digits = 4)
alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 1 0.0001 314.2989 0.4827 218.8116 86.2625 0.1584 43.8712
2 1 5.2633 308.3799 0.4999 211.3176 87.0989 0.1403 43.9695
3 1 10.5264 309.1910 0.4971 210.6302 87.3560 0.1410 43.6382
4 1 15.7896 309.9509 0.4956 211.1762 87.4003 0.1418 43.0887
5 1 21.0527 311.2367 0.4922 212.2011 87.1222 0.1402 42.4205
6 1 26.3159 312.8366 0.4879 213.4981 86.8183 0.1378 41.6204
7 1 31.5790 314.7013 0.4831 215.2925 86.6427 0.1349 40.7466
8 1 36.8422 316.8164 0.4776 217.3900 86.3590 0.1318 40.0653
9 1 42.1053 318.9524 0.4717 219.7163 85.5037 0.1269 39.4715
10 1 47.3685 321.3233 0.4647 222.2402 84.5530 0.1212 38.8835
11 1 52.6316 323.6722 0.4584 224.7973 83.6374 0.1186 38.5026
12 1 57.8948 326.0417 0.4523 227.5510 82.8110 0.1169 37.8763
13 1 63.1579 328.3742 0.4464 230.2348 82.2104 0.1153 37.3599
14 1 68.4211 330.5280 0.4418 232.7388 81.8574 0.1151 37.0048
15 1 73.6842 332.5825 0.4384 234.9952 81.6612 0.1154 36.7622
16 1 78.9474 334.7179 0.4353 237.3300 81.5421 0.1165 36.5156
17 1 84.2105 336.9388 0.4322 240.0048 81.3930 0.1180 36.1623
18 1 89.4737 339.2129 0.4293 242.7379 81.3130 0.1196 35.9574
19 1 94.7368 341.5153 0.4270 245.3698 81.4073 0.1215 35.9115
20 1 100.0000 343.9122 0.4245 247.9775 81.5931 0.1235 35.9175
# Tuning-parameter combination selected by train() for mod_LASSO.
mod_LASSO[["bestTune"]]
alpha lambda
2 1 5.263253
# Plot the tuning profile, then list the lasso coefficients at the selected
# lambda (a "." in the printout marks a coefficient shrunk to zero).
plot(mod_LASSO)
coef(mod_LASSO[["finalModel"]], s = mod_LASSO[["bestTune"]][["lambda"]])
20 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 2.654997e+01
AtBat -4.041062e-01
Hits 2.852278e+00
HmRun .
Runs .
RBI .
Walks 3.126010e+00
Years .
CAtBat .
CHits .
CHmRun 1.549710e+00
CRuns 3.187537e-01
CRBI .
CWalks -1.963378e-02
LeagueN 4.423179e+01
DivisionW -1.080512e+02
PutOuts 2.119802e-01
Assists 8.609666e-03
Errors -3.527294e-01
NewLeagueN .
# Design matrix for the hold-out rows, built with the same formula as training.
TESTING <- model.matrix(Salary ~ ., data = testing)
# Hold-out RMSE of the tuned lasso model.
RMSE(pred = predict(mod_LASSO, newdata = TESTING), obs = testing$Salary)
[1] 335.3591
# Random forest ("rf") for Salary on the training split; tuneLength = 10 asks
# train() to evaluate 10 candidate values of the tuning parameter.
set.seed(11)
# Main-effects design matrix with factors expanded to dummy columns.
DM <- model.matrix(Salary ~ ., data = training)
mod_RF <- train(
  x = DM[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = training$Salary,
  method = "rf",
  trControl = myControl,
  tuneLength = 10
)
# Resampling metrics for each mtry value tried, rounded to 4 decimals.
round(mod_RF[["results"]], digits = 4)
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 248.5326 0.6692 149.2221 55.9208 0.0939 16.2105
2 3 247.9404 0.6661 148.6505 57.0167 0.1030 17.3468
3 5 245.6218 0.6692 148.2204 60.0892 0.1173 20.6598
4 7 245.5858 0.6679 148.0041 60.0953 0.1195 20.6268
5 9 247.2503 0.6624 149.9974 61.2587 0.1261 22.6297
6 11 246.6042 0.6636 149.7625 62.1640 0.1349 21.7880
7 13 247.0357 0.6626 149.6589 63.1852 0.1340 21.7843
8 15 249.3676 0.6563 150.5478 66.4893 0.1453 23.8123
9 17 249.2903 0.6553 150.3570 66.5079 0.1470 24.1863
10 19 250.3702 0.6540 151.0579 63.7505 0.1388 22.1006
# mtry value selected by train() for mod_RF.
mod_RF[["bestTune"]]
mtry
4 7
# Plot the tuning profile of the random forest.
plot(mod_RF)
# Design matrix for the hold-out rows, built with the same formula as training.
TESTING <- model.matrix(Salary ~ ., data = testing)
# Hold-out RMSE of the tuned random forest.
RMSE(pred = predict(mod_RF, newdata = TESTING), obs = testing$Salary)
[1] 311.3955
# Random forest ("rf") on trainingDum (defined outside this section).
# Column 23 is excluded from the predictors; presumably that is the Salary
# column -- TODO confirm.
set.seed(11)
mod_RF3 <- train(
  x = trainingDum[, -23],
  y = trainingDum$Salary,
  method = "rf",
  trControl = myControl,
  tuneLength = 10
)
# Resampling metrics for each mtry value tried, rounded to 4 decimals.
round(mod_RF3[["results"]], digits = 4)
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 230.9491 0.6968 140.5536 62.3361 0.0729 19.4150
2 4 227.8468 0.7030 138.9562 58.7571 0.0779 21.0813
3 6 231.2499 0.6932 140.2866 58.8790 0.0826 22.3477
4 8 231.3508 0.6958 140.9611 57.3898 0.0797 20.8297
5 10 232.1127 0.6926 140.9198 59.2188 0.0854 20.4562
6 13 233.1179 0.6915 141.5613 57.4110 0.0800 18.8509
7 15 235.1308 0.6839 143.2466 59.1683 0.0838 20.1612
8 17 233.6884 0.6876 142.8784 58.0625 0.0848 19.0271
9 19 233.9086 0.6875 143.3243 57.0655 0.0756 18.9361
10 22 235.0895 0.6852 143.4742 58.7374 0.0819 19.0339
# mtry value selected by train() for mod_RF3.
mod_RF3[["bestTune"]]
mtry
2 4
# Plot the tuning profile of mod_RF3.
plot(mod_RF3)
# Hold-out RMSE of mod_RF3; testingDum is passed to predict() as-is.
RMSE(pred = predict(mod_RF3, newdata = testingDum), obs = testingDum$Salary)
[1] 345.1002
# Random forest ("rf") for Salary using all main effects plus two-way
# interactions of the training predictors.
# Fix: the original built the interaction matrix DM2 but then trained on DM
# (the main-effects matrix left over from mod_RF), so mod_RF2 silently
# reproduced mod_RF.
# NOTE(review): the printed results that follow this call predate the fix and
# need to be regenerated.
set.seed(11)
DM2 <- model.matrix(Salary ~ .^2, data = training)
mod_RF2 <- train(
  x = DM2[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = training$Salary,
  method = "rf",
  trControl = myControl,
  tuneLength = 10
)
# Resampling metrics for each mtry value tried, rounded to 4 decimals.
round(mod_RF2[["results"]], digits = 4)
mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 2 248.5326 0.6692 149.2221 55.9208 0.0939 16.2105
2 3 247.9404 0.6661 148.6505 57.0167 0.1030 17.3468
3 5 245.6218 0.6692 148.2204 60.0892 0.1173 20.6598
4 7 245.5858 0.6679 148.0041 60.0953 0.1195 20.6268
5 9 247.2503 0.6624 149.9974 61.2587 0.1261 22.6297
6 11 246.6042 0.6636 149.7625 62.1640 0.1349 21.7880
7 13 247.0357 0.6626 149.6589 63.1852 0.1340 21.7843
8 15 249.3676 0.6563 150.5478 66.4893 0.1453 23.8123
9 17 249.2903 0.6553 150.3570 66.5079 0.1470 24.1863
10 19 250.3702 0.6540 151.0579 63.7505 0.1388 22.1006
# mtry value selected by train() for mod_RF2.
mod_RF2[["bestTune"]]
mtry
4 7
# Plot the tuning profile of mod_RF2.
plot(mod_RF2)
# Hold-out design matrix with main effects and all two-way interactions.
TESTING <- model.matrix(Salary ~ .^2, data = testing)
# Hold-out RMSE of mod_RF2.
RMSE(pred = predict(mod_RF2, newdata = TESTING), obs = testing$Salary)
[1] 311.3955
# Gradient boosting ("gbm") for Salary on the training split; tuneLength = 10
# asks train() to build a 10-level grid for the tuning parameters.
set.seed(11)
# Main-effects design matrix with factors expanded to dummy columns.
DM <- model.matrix(Salary ~ ., data = training)
mod_GB <- train(
  x = DM[, -1],  # drop the "(Intercept)" column added by model.matrix()
  y = training$Salary,
  method = "gbm",
  trControl = myControl,
  tuneLength = 10
)
Iter TrainDeviance ValidDeviance StepSize Improve
1 162930.3599 -nan 0.1000 18221.4565
2 146525.4982 -nan 0.1000 15270.6119
3 134562.6935 -nan 0.1000 10047.3127
4 123152.3362 -nan 0.1000 11074.4123
5 114176.6504 -nan 0.1000 9394.2904
6 104857.6148 -nan 0.1000 7459.6643
7 96710.4641 -nan 0.1000 4988.9218
8 89665.5019 -nan 0.1000 6997.5720
9 85253.1300 -nan 0.1000 3855.1506
10 81364.3235 -nan 0.1000 2868.5171
20 56410.8655 -nan 0.1000 185.2570
40 40322.4880 -nan 0.1000 -269.7680
50 36662.0200 -nan 0.1000 -350.6883
# Resampling metrics for every gbm tuning combination, rounded to 4 decimals.
round(mod_GB[["results"]], digits = 4)
shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
1 0.1 1 10 50 280.5950 0.5725
11 0.1 2 10 50 265.5576 0.6133
21 0.1 3 10 50 262.1542 0.6207
31 0.1 4 10 50 258.0676 0.6339
41 0.1 5 10 50 260.5901 0.6258
51 0.1 6 10 50 265.5246 0.6121
61 0.1 7 10 50 265.9296 0.6110
71 0.1 8 10 50 265.6036 0.6122
81 0.1 9 10 50 264.7138 0.6121
91 0.1 10 10 50 266.8751 0.6092
2 0.1 1 10 100 270.3301 0.5988
12 0.1 2 10 100 262.1037 0.6209
22 0.1 3 10 100 259.7899 0.6327
32 0.1 4 10 100 260.3542 0.6283
42 0.1 5 10 100 259.6890 0.6286
52 0.1 6 10 100 262.2293 0.6241
62 0.1 7 10 100 266.1681 0.6114
72 0.1 8 10 100 264.0092 0.6180
82 0.1 9 10 100 264.6379 0.6141
92 0.1 10 10 100 269.0340 0.6045
3 0.1 1 10 150 265.6170 0.6099
13 0.1 2 10 150 259.6564 0.6283
23 0.1 3 10 150 262.7302 0.6255
33 0.1 4 10 150 262.8734 0.6234
43 0.1 5 10 150 262.9160 0.6212
53 0.1 6 10 150 264.6104 0.6189
63 0.1 7 10 150 267.0137 0.6119
73 0.1 8 10 150 267.5449 0.6097
83 0.1 9 10 150 271.2090 0.5974
93 0.1 10 10 150 271.7214 0.6008
4 0.1 1 10 200 266.0158 0.6089
14 0.1 2 10 200 263.3370 0.6192
24 0.1 3 10 200 262.0874 0.6282
34 0.1 4 10 200 264.4059 0.6211
44 0.1 5 10 200 264.4267 0.6184
54 0.1 6 10 200 265.2065 0.6201
64 0.1 7 10 200 271.1293 0.6017
74 0.1 8 10 200 269.4948 0.6044
84 0.1 9 10 200 274.8607 0.5883
94 0.1 10 10 200 272.1245 0.6015
5 0.1 1 10 250 269.3076 0.6014
15 0.1 2 10 250 264.7997 0.6176
25 0.1 3 10 250 263.7189 0.6230
35 0.1 4 10 250 265.0964 0.6205
45 0.1 5 10 250 268.3025 0.6090
55 0.1 6 10 250 265.2630 0.6199
65 0.1 7 10 250 274.8204 0.5943
75 0.1 8 10 250 273.7677 0.5958
85 0.1 9 10 250 279.2271 0.5790
95 0.1 10 10 250 276.8465 0.5906
6 0.1 1 10 300 267.8486 0.6059
16 0.1 2 10 300 269.3949 0.6070
26 0.1 3 10 300 265.2474 0.6211
36 0.1 4 10 300 265.6137 0.6213
46 0.1 5 10 300 269.1325 0.6075
56 0.1 6 10 300 267.5363 0.6138
66 0.1 7 10 300 276.3511 0.5901
76 0.1 8 10 300 277.0337 0.5891
86 0.1 9 10 300 281.0756 0.5744
96 0.1 10 10 300 279.8074 0.5850
7 0.1 1 10 350 268.1845 0.6065
17 0.1 2 10 350 268.9829 0.6097
27 0.1 3 10 350 267.6002 0.6179
37 0.1 4 10 350 267.0527 0.6177
47 0.1 5 10 350 271.7948 0.6006
57 0.1 6 10 350 268.5267 0.6127
67 0.1 7 10 350 279.2927 0.5835
77 0.1 8 10 350 278.5842 0.5856
87 0.1 9 10 350 283.9218 0.5678
97 0.1 10 10 350 282.5985 0.5775
8 0.1 1 10 400 269.2629 0.6028
18 0.1 2 10 400 271.1650 0.6045
28 0.1 3 10 400 268.2370 0.6155
38 0.1 4 10 400 268.5656 0.6137
48 0.1 5 10 400 272.7713 0.5982
58 0.1 6 10 400 270.1026 0.6089
68 0.1 7 10 400 281.2895 0.5784
78 0.1 8 10 400 279.6129 0.5832
88 0.1 9 10 400 285.3156 0.5652
98 0.1 10 10 400 284.8127 0.5735
9 0.1 1 10 450 269.9367 0.6013
19 0.1 2 10 450 273.3723 0.6021
29 0.1 3 10 450 270.6560 0.6083
39 0.1 4 10 450 269.4159 0.6118
49 0.1 5 10 450 275.2261 0.5931
59 0.1 6 10 450 272.1346 0.6037
69 0.1 7 10 450 282.5772 0.5765
79 0.1 8 10 450 281.0108 0.5803
89 0.1 9 10 450 287.2466 0.5601
99 0.1 10 10 450 285.8673 0.5711
10 0.1 1 10 500 270.4261 0.6016
20 0.1 2 10 500 273.1998 0.6042
30 0.1 3 10 500 272.7496 0.6040
40 0.1 4 10 500 270.4870 0.6087
50 0.1 5 10 500 276.2331 0.5905
60 0.1 6 10 500 273.6781 0.6001
70 0.1 7 10 500 284.1271 0.5724
80 0.1 8 10 500 283.0534 0.5751
90 0.1 9 10 500 288.2241 0.5580
100 0.1 10 10 500 287.3111 0.5678
MAE RMSESD RsquaredSD MAESD
1 184.6751 60.5119 0.1327 19.4614
11 171.3124 63.8595 0.1326 21.9291
21 163.2097 59.1436 0.1263 17.1898
31 163.3569 64.2485 0.1322 22.7078
41 166.1030 56.8970 0.1118 14.5490
51 167.9663 64.8342 0.1325 19.3970
61 170.4883 74.0715 0.1546 32.5165
71 167.7576 65.9964 0.1282 17.2076
81 168.8243 73.9288 0.1524 27.9845
91 168.5632 67.9521 0.1323 19.4611
2 176.5980 65.9162 0.1459 23.4986
12 168.5208 66.6345 0.1403 26.2515
22 168.2400 59.7653 0.1228 18.7662
32 169.9733 63.8853 0.1309 25.2208
42 170.0766 59.1364 0.1143 16.4329
52 169.2963 63.1087 0.1253 23.4059
62 173.2431 72.6947 0.1492 31.5777
72 173.6026 62.0200 0.1186 16.2207
82 172.8935 75.4588 0.1541 31.3752
92 175.9056 73.3462 0.1481 30.3612
3 174.9474 67.5508 0.1494 22.5900
13 168.4714 66.4565 0.1463 24.2063
23 172.2844 61.0743 0.1291 21.0599
33 174.9908 63.7080 0.1308 22.1810
43 177.6245 55.9686 0.1078 16.8503
53 173.4055 64.2215 0.1278 25.1813
63 177.8949 73.2927 0.1483 31.0513
73 177.0979 62.6285 0.1189 18.1688
83 177.8081 73.4718 0.1536 29.1196
93 181.5480 74.4860 0.1491 33.0161
4 177.1612 66.6821 0.1496 20.3438
14 173.1027 65.6539 0.1423 22.7186
24 173.2924 60.6651 0.1276 19.5651
34 177.3287 61.4429 0.1247 21.0524
44 179.4276 58.7706 0.1169 17.1618
54 176.7854 66.2889 0.1341 25.2390
64 182.9727 72.1278 0.1473 29.3593
74 180.6727 64.0594 0.1208 20.8766
84 184.1703 69.1643 0.1444 26.3712
94 183.2714 73.8649 0.1464 33.8767
5 180.7478 64.8606 0.1426 18.6026
15 174.2111 64.1671 0.1413 23.0706
25 176.7163 62.5781 0.1353 23.4483
35 179.4146 60.9726 0.1231 22.9878
45 183.3865 60.5193 0.1208 19.2325
55 178.7575 66.5447 0.1334 24.9275
65 187.0574 73.7275 0.1513 31.3422
75 185.4289 65.6509 0.1229 22.4705
85 187.6945 70.4288 0.1479 27.2193
95 187.3539 73.4796 0.1476 34.4615
6 180.2003 65.5094 0.1426 20.7465
16 179.9017 62.7744 0.1414 23.5129
26 179.5328 62.4497 0.1331 23.5704
36 180.6647 60.3187 0.1221 21.9213
46 185.2049 59.6517 0.1172 19.2692
56 181.2830 65.5142 0.1299 25.9787
66 188.8125 74.6445 0.1531 31.2541
76 189.2413 65.2474 0.1217 22.2666
86 189.5205 69.0510 0.1450 26.4399
96 190.0157 73.0168 0.1469 34.9577
7 182.6549 65.3983 0.1446 19.7912
17 180.7706 63.4481 0.1423 22.9326
27 182.0436 60.9356 0.1289 21.9824
37 182.7250 59.3258 0.1196 20.7426
47 188.1304 59.7916 0.1188 19.1860
57 181.8160 65.8945 0.1308 25.4420
67 191.9959 74.3203 0.1524 30.6237
77 190.5583 66.6911 0.1252 23.8818
87 192.4491 68.3638 0.1429 25.8547
97 192.5166 73.0022 0.1484 35.3163
8 183.1791 63.1989 0.1398 17.6588
18 183.5495 61.4940 0.1396 23.2114
28 183.3130 61.2916 0.1316 22.0910
38 183.9948 58.6498 0.1181 20.6424
48 189.3492 59.7343 0.1193 19.9304
58 184.1570 65.1892 0.1291 24.9113
68 193.3909 74.8617 0.1531 31.1227
78 192.2602 66.9922 0.1253 24.6023
88 194.1057 68.3451 0.1417 24.6887
98 194.9379 71.9336 0.1463 36.0762
9 183.8016 61.1621 0.1332 17.0734
19 184.7534 60.9108 0.1391 24.1594
29 185.4010 60.2173 0.1314 23.0087
39 184.6765 59.2124 0.1192 20.8370
49 191.6222 59.3730 0.1180 19.6500
59 185.7924 64.5072 0.1279 24.7436
69 194.7674 75.2338 0.1535 31.9212
79 194.4138 66.2544 0.1236 24.3220
89 195.9168 68.1733 0.1423 25.4131
99 196.0717 72.4914 0.1468 37.1312
10 185.4729 60.8922 0.1310 18.2502
20 186.9216 61.9546 0.1401 23.9264
30 188.4782 60.5024 0.1315 22.2544
40 186.2120 59.2351 0.1196 21.1918
50 192.3455 59.3416 0.1177 20.2747
60 187.7316 64.3577 0.1276 25.0830
70 196.3914 75.0055 0.1526 31.7833
80 195.7702 66.3207 0.1236 24.6337
90 197.4782 67.3054 0.1407 24.2631
100 197.5501 72.7466 0.1468 37.2539
# Tuning-parameter combination that caret selected for the final gbm model.
mod_GB[["bestTune"]]
n.trees interaction.depth shrinkage n.minobsinnode
31 50 4 0.1 10
# Resampling profile of the boosted model across the tuning grid.
plot(mod_GB)
# Main-effects design matrix for the test set, matching the encoding used
# to build DM for mod_GB.
TESTING <- model.matrix(Salary ~ ., data = testing)
# Root mean squared error of mod_GB on the held-out test set.
RMSE(
  pred = predict(mod_GB, newdata = TESTING),
  obs = testing$Salary
)
[1] 287.5692
# Re-seed so this fit uses the same RNG starting point as mod_GB.
set.seed(11)
# Gradient boosting (gbm) on the dummy-encoded training data.
# NOTE(review): column 23 of trainingDum is dropped from the predictors --
# presumably that is the Salary response column; confirm against the code
# that builds trainingDum.
mod_GB2 <- train(
  x = trainingDum[, -23],
  y = trainingDum$Salary,
  method = "gbm",
  tuneLength = 10,
  trControl = myControl
)
Iter TrainDeviance ValidDeviance StepSize Improve
1 141633.7849 -nan 0.1000 11937.0041
2 127626.4781 -nan 0.1000 12042.9387
3 114141.0167 -nan 0.1000 9018.5796
4 101788.2120 -nan 0.1000 7051.6314
5 93889.5281 -nan 0.1000 6117.8966
6 87532.0820 -nan 0.1000 4925.1132
7 81161.6168 -nan 0.1000 4899.3490
8 74743.3102 -nan 0.1000 3305.9905
9 69788.7692 -nan 0.1000 3525.6447
10 65875.5922 -nan 0.1000 2679.8551
20 46995.4199 -nan 0.1000 -603.1296
40 34971.8942 -nan 0.1000 -589.9801
50 31457.5134 -nan 0.1000 -283.8696
# Resampling performance for every gbm tuning combination, 4 decimals.
round(mod_GB2[["results"]], digits = 4)
shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
1 0.1 1 10 50 258.7664 0.6071
11 0.1 2 10 50 250.1627 0.6237
21 0.1 3 10 50 246.2459 0.6363
31 0.1 4 10 50 240.2550 0.6550
41 0.1 5 10 50 246.8367 0.6419
51 0.1 6 10 50 243.7885 0.6461
61 0.1 7 10 50 244.4927 0.6447
71 0.1 8 10 50 244.4911 0.6437
81 0.1 9 10 50 247.4326 0.6362
91 0.1 10 10 50 244.8341 0.6420
2 0.1 1 10 100 251.2546 0.6219
12 0.1 2 10 100 243.4087 0.6390
22 0.1 3 10 100 243.3711 0.6408
32 0.1 4 10 100 245.1854 0.6420
42 0.1 5 10 100 248.4962 0.6356
52 0.1 6 10 100 246.3877 0.6363
62 0.1 7 10 100 244.6939 0.6432
72 0.1 8 10 100 247.1488 0.6389
82 0.1 9 10 100 246.3775 0.6417
92 0.1 10 10 100 248.7442 0.6299
3 0.1 1 10 150 249.6824 0.6211
13 0.1 2 10 150 246.1351 0.6308
23 0.1 3 10 150 247.0837 0.6301
33 0.1 4 10 150 250.1216 0.6286
43 0.1 5 10 150 251.4183 0.6254
53 0.1 6 10 150 249.6262 0.6286
63 0.1 7 10 150 244.7043 0.6398
73 0.1 8 10 150 249.1397 0.6285
83 0.1 9 10 150 249.7034 0.6305
93 0.1 10 10 150 251.3834 0.6212
4 0.1 1 10 200 246.8929 0.6269
14 0.1 2 10 200 247.4527 0.6261
24 0.1 3 10 200 247.2161 0.6317
34 0.1 4 10 200 250.0340 0.6256
44 0.1 5 10 200 252.1767 0.6181
54 0.1 6 10 200 250.5499 0.6241
64 0.1 7 10 200 248.0930 0.6303
74 0.1 8 10 200 248.0836 0.6292
84 0.1 9 10 200 249.4104 0.6305
94 0.1 10 10 200 254.1605 0.6097
5 0.1 1 10 250 247.8217 0.6252
15 0.1 2 10 250 248.8225 0.6194
25 0.1 3 10 250 250.5926 0.6198
35 0.1 4 10 250 253.3838 0.6178
45 0.1 5 10 250 253.8454 0.6122
55 0.1 6 10 250 252.6321 0.6185
65 0.1 7 10 250 249.8328 0.6233
75 0.1 8 10 250 248.0912 0.6272
85 0.1 9 10 250 251.3313 0.6242
95 0.1 10 10 250 256.2776 0.6044
6 0.1 1 10 300 247.7398 0.6277
16 0.1 2 10 300 251.4886 0.6092
26 0.1 3 10 300 250.6175 0.6186
36 0.1 4 10 300 256.2347 0.6108
46 0.1 5 10 300 254.4032 0.6101
56 0.1 6 10 300 254.5871 0.6121
66 0.1 7 10 300 250.8185 0.6215
76 0.1 8 10 300 249.7306 0.6226
86 0.1 9 10 300 255.2187 0.6133
96 0.1 10 10 300 257.4975 0.6009
7 0.1 1 10 350 248.2660 0.6209
17 0.1 2 10 350 254.5768 0.5991
27 0.1 3 10 350 251.0809 0.6153
37 0.1 4 10 350 256.4078 0.6085
47 0.1 5 10 350 256.2714 0.6042
57 0.1 6 10 350 256.4916 0.6080
67 0.1 7 10 350 252.1754 0.6171
77 0.1 8 10 350 250.4320 0.6213
87 0.1 9 10 350 256.0847 0.6110
97 0.1 10 10 350 258.9691 0.5971
8 0.1 1 10 400 249.6500 0.6189
18 0.1 2 10 400 254.5728 0.5989
28 0.1 3 10 400 253.6762 0.6085
38 0.1 4 10 400 258.1879 0.6036
48 0.1 5 10 400 256.4279 0.6038
58 0.1 6 10 400 258.4552 0.6029
68 0.1 7 10 400 252.8577 0.6154
78 0.1 8 10 400 251.8161 0.6166
88 0.1 9 10 400 258.3539 0.6048
98 0.1 10 10 400 259.9577 0.5947
9 0.1 1 10 450 250.0525 0.6191
19 0.1 2 10 450 255.9234 0.5946
29 0.1 3 10 450 256.0207 0.6026
39 0.1 4 10 450 259.1085 0.6011
49 0.1 5 10 450 257.7612 0.5989
59 0.1 6 10 450 259.5347 0.5996
69 0.1 7 10 450 253.3516 0.6132
79 0.1 8 10 450 251.6396 0.6174
89 0.1 9 10 450 259.3173 0.6013
99 0.1 10 10 450 261.1437 0.5909
10 0.1 1 10 500 251.6977 0.6131
20 0.1 2 10 500 255.4453 0.5960
30 0.1 3 10 500 257.6382 0.5961
40 0.1 4 10 500 260.0174 0.5981
50 0.1 5 10 500 258.8247 0.5957
60 0.1 6 10 500 260.5332 0.5968
70 0.1 7 10 500 253.8386 0.6127
80 0.1 8 10 500 253.2428 0.6131
90 0.1 9 10 500 259.6535 0.6007
100 0.1 10 10 500 261.9428 0.5888
MAE RMSESD RsquaredSD MAESD
1 166.6668 72.0016 0.0968 25.0801
11 161.3855 65.2393 0.0966 19.1760
21 154.7758 65.6643 0.0994 21.8176
31 150.8763 71.1564 0.0931 30.8685
41 151.1593 68.1589 0.0995 20.7306
51 150.9616 61.8356 0.0882 18.6443
61 154.1553 60.7578 0.0796 21.1789
71 155.0094 63.7946 0.0897 20.8024
81 155.3488 61.1916 0.0746 20.7705
91 155.4725 60.4203 0.0721 21.2201
2 161.0283 67.4798 0.0929 26.5110
12 158.4072 59.4396 0.0957 18.9072
22 155.2701 55.6092 0.0905 18.2915
32 157.4830 59.2798 0.0923 23.8170
42 155.4078 55.4536 0.0847 14.5791
52 158.6914 55.7212 0.1003 13.0893
62 157.8605 54.8418 0.0836 19.9697
72 159.6177 56.3012 0.0908 15.7001
82 157.9814 53.9891 0.0685 17.6646
92 161.1731 57.7350 0.0959 17.3418
3 161.8952 65.3708 0.0924 25.9089
13 161.6427 56.6823 0.0963 21.1047
23 157.2787 51.6326 0.0969 13.7219
33 159.9669 52.2771 0.0873 18.0972
43 160.1500 53.9681 0.0891 16.6229
53 161.5450 54.3733 0.1010 13.4901
63 159.3505 48.2449 0.0904 15.3718
73 162.5591 52.9697 0.0947 15.2713
83 164.8859 44.8300 0.0687 11.2581
93 165.6568 53.9090 0.1056 15.2066
4 158.8989 63.5368 0.0904 24.6811
14 164.9330 52.2182 0.0884 21.6571
24 157.3143 50.4123 0.0983 12.2780
34 162.5331 50.0686 0.0992 17.6640
44 161.9634 52.0608 0.0988 15.8198
54 164.2348 54.6886 0.1085 16.5016
64 162.3504 48.7835 0.0961 15.6030
74 161.3880 52.9261 0.1008 16.1088
84 165.7898 49.0373 0.0824 14.0143
94 168.0376 51.3491 0.1110 17.2045
5 161.3576 62.4824 0.0870 24.3320
15 167.0525 49.5790 0.0907 20.5963
25 158.2365 49.8544 0.1022 14.3627
35 165.0148 49.6902 0.1032 16.2258
45 166.8398 53.3932 0.1077 15.4055
55 167.2311 54.1633 0.1066 17.4073
65 164.1502 48.2820 0.1003 15.3770
75 164.0100 52.6201 0.1033 19.1759
85 167.7126 49.2638 0.0860 15.9709
95 170.4072 54.0027 0.1209 20.6391
6 160.3572 61.7373 0.0893 23.4539
16 169.7429 48.9644 0.0902 19.7103
26 160.0151 49.4071 0.1071 14.6579
36 168.5271 49.2123 0.1055 14.4545
46 168.2687 51.7884 0.1087 15.2834
56 169.8051 54.7969 0.1097 18.7454
66 165.8083 47.5810 0.1054 17.0650
76 166.0794 54.0889 0.1094 19.8815
86 170.6516 48.5953 0.0897 15.6518
96 171.3209 54.3078 0.1255 21.6264
7 161.5719 60.4788 0.0849 25.8221
17 172.9563 47.6271 0.0951 19.0992
27 162.1302 48.2939 0.1103 13.8869
37 169.6075 49.3536 0.1100 15.2868
47 169.6180 52.3956 0.1132 16.4988
57 171.3295 55.2745 0.1118 19.0284
67 166.8878 47.2578 0.1037 17.1312
77 166.8795 54.7471 0.1121 20.9787
87 172.4869 48.2611 0.0914 16.2284
97 173.1440 53.8631 0.1293 21.6811
8 164.9464 57.1620 0.0787 23.1944
18 173.1155 46.6062 0.0963 18.4688
28 164.2638 49.5821 0.1159 15.4470
38 172.0078 48.2808 0.1130 16.2826
48 170.1265 54.0410 0.1181 17.7810
58 173.4532 54.7059 0.1134 18.9175
68 167.6549 48.6095 0.1089 18.4151
78 168.4862 54.8129 0.1153 20.5216
88 174.7971 47.9245 0.0934 16.4448
98 174.3662 52.8596 0.1274 21.3510
9 164.9079 58.0728 0.0840 23.1200
19 175.6287 44.6906 0.0946 17.2435
29 166.6539 49.9757 0.1174 17.6009
39 173.0894 48.8510 0.1138 16.2560
49 172.2283 53.7657 0.1180 17.7893
59 174.4864 54.5986 0.1143 19.1092
69 169.0961 48.1577 0.1089 18.3821
79 169.0989 54.3424 0.1160 20.0748
89 175.9029 48.0433 0.0977 17.6847
99 175.5599 53.3244 0.1304 22.0183
10 166.0470 56.8573 0.0776 21.9569
20 175.8933 45.5333 0.0993 15.5762
30 167.6660 50.6423 0.1199 18.4412
40 173.7485 48.4832 0.1140 15.8224
50 172.9229 54.4543 0.1214 19.1621
60 176.2818 54.4026 0.1133 19.2248
70 169.3996 48.7795 0.1101 18.7695
80 171.0547 54.6848 0.1176 21.0035
90 176.4289 48.9490 0.1001 18.5323
100 176.5802 52.8848 0.1310 21.8718
# Tuning-parameter combination that caret selected for the final mod_GB2 fit.
mod_GB2[["bestTune"]]
n.trees interaction.depth shrinkage n.minobsinnode
31 50 4 0.1 10
# Resampling profile of mod_GB2 across the tuning grid.
plot(mod_GB2)
# Root mean squared error on the held-out, dummy-encoded test set.
# The model evaluated here must be mod_GB2 (the one trained on trainingDum);
# the earlier mod_GB was fit on the model.matrix() encoding and was already
# scored against TESTING above.
# NOTE(review): re-run this chunk to refresh the printed RMSE if it was
# produced with a different model object.
RMSE(
  pred = predict(mod_GB2, newdata = testingDum),
  obs = testingDum$Salary
)
[1] 276.3207