Reading the comma separated file from the Data
directory one level up from where this document is stored using the read.csv()
function. Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in one the first code chunks of your document.
housedata <- read.csv("../Data/housedata.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1)) # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedata <- housedata[, -1]
#### Perform same steps with test set
housedataT <- read.csv("../Data/housedataTEST.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1)) # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
library(DT)
datatable(housedata[, 2:10], rownames = FALSE)
Consider predicting the price (price
) of a house based on a certain feature (sqft_living
). Start by graphing the relationship.
library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point() +
theme_bw()
p1
Overplotting is problematic. What should we do?
alpha
).alpha
p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point(alpha = 0.05, color = "blue") +
theme_bw()
p2
p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
theme_bw()
p3
p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000)) +
theme_bw()
p4
p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
theme_bw()
p5
**Note* For both stat_bin2d
and stat_binhex
, if you manually specify the range, and there is a bin that falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.
p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
theme_bw()
p6
Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\).
slm <- lm(price ~ sqft_living, data = housedata)
summary(slm)
Call:
lm(formula = price ~ sqft_living, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1490607 -148265 -23758 105710 4349512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -47116.079 4923.344 -9.57 <2e-16 ***
sqft_living 281.959 2.164 130.29 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 263000 on 17382 degrees of freedom
Multiple R-squared: 0.4941, Adjusted R-squared: 0.4941
F-statistic: 1.698e+04 on 1 and 17382 DF, p-value: < 2.2e-16
predict(slm, newdata = data.frame(sqft_living = 2500))
1
657781
p6 + geom_smooth(method = "lm") +
geom_vline(xintercept = 2500,linetype = "dashed", color = "red") +
geom_hline(yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)), linetype = "dashed", color = "red") +
labs(x = "Living Space (square feet)", y = "Price ($)")
mod.zip
most basic model.
mod.zip <- lm(price ~ 1, data = housedata)
summary(mod.zip)
Call:
lm(formula = price ~ 1, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-464367 -219367 -89367 100633 7160633
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 539367 2804 192.4 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 369700 on 17383 degrees of freedom
mod.all
all current features except sqft_basement
.
mod.all <- lm(price ~ . - sqft_basement, data = housedata)
summary(mod.all)
Call:
lm(formula = price ~ . - sqft_basement, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1539648 -60120 2700 56110 3478116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.135e+07 6.410e+06 -6.450 1.15e-10 ***
date 1.145e+02 1.016e+01 11.269 < 2e-16 ***
bedrooms -1.431e+04 1.708e+03 -8.377 < 2e-16 ***
bathrooms 2.394e+04 2.764e+03 8.663 < 2e-16 ***
sqft_living 1.170e+02 3.714e+00 31.495 < 2e-16 ***
sqft_lot 2.574e-01 4.059e-02 6.342 2.32e-10 ***
floors -2.963e+04 3.344e+03 -8.862 < 2e-16 ***
waterfrontYes 6.641e+05 1.465e+04 45.331 < 2e-16 ***
view 4.953e+04 1.828e+03 27.102 < 2e-16 ***
conditionfair 9.809e+04 3.414e+04 2.873 0.004075 **
conditionaverage 1.042e+05 3.174e+04 3.282 0.001032 **
conditiongood 1.318e+05 3.175e+04 4.151 3.32e-05 ***
conditionvery good 1.814e+05 3.194e+04 5.677 1.39e-08 ***
grade10 1.173e+05 1.549e+05 0.757 0.448960
grade11 3.143e+05 1.552e+05 2.025 0.042869 *
grade12 7.525e+05 1.561e+05 4.821 1.44e-06 ***
grade13 1.661e+06 1.615e+05 10.287 < 2e-16 ***
grade3 -7.935e+02 1.776e+05 -0.004 0.996434
grade4 -9.356e+04 1.588e+05 -0.589 0.555737
grade5 -1.035e+05 1.548e+05 -0.668 0.503898
grade6 -1.122e+05 1.547e+05 -0.725 0.468200
grade7 -1.091e+05 1.547e+05 -0.705 0.480711
grade8 -8.819e+04 1.548e+05 -0.570 0.568766
grade9 -1.235e+04 1.548e+05 -0.080 0.936432
sqft_above 5.176e+01 3.858e+00 13.416 < 2e-16 ***
yr_built -3.312e+00 2.798e-01 -11.838 < 2e-16 ***
yr_renovated 2.591e+00 2.847e-01 9.103 < 2e-16 ***
zipcode98002 1.909e+04 1.516e+04 1.259 0.207972
zipcode98003 -1.207e+04 1.341e+04 -0.900 0.368288
zipcode98004 7.176e+05 2.455e+04 29.229 < 2e-16 ***
zipcode98005 2.534e+05 2.626e+04 9.650 < 2e-16 ***
zipcode98006 2.205e+05 2.140e+04 10.305 < 2e-16 ***
zipcode98007 2.136e+05 2.691e+04 7.938 2.18e-15 ***
zipcode98008 2.371e+05 2.569e+04 9.230 < 2e-16 ***
zipcode98010 1.108e+05 2.280e+04 4.860 1.18e-06 ***
zipcode98011 5.783e+04 3.358e+04 1.722 0.085099 .
zipcode98014 1.005e+05 3.709e+04 2.711 0.006722 **
zipcode98019 7.545e+04 3.613e+04 2.088 0.036804 *
zipcode98022 6.976e+04 2.010e+04 3.470 0.000521 ***
zipcode98023 -5.426e+04 1.240e+04 -4.376 1.21e-05 ***
zipcode98024 1.806e+05 3.184e+04 5.672 1.43e-08 ***
zipcode98027 1.737e+05 2.203e+04 7.885 3.33e-15 ***
zipcode98028 3.778e+04 3.257e+04 1.160 0.246053
zipcode98029 2.355e+05 2.510e+04 9.383 < 2e-16 ***
zipcode98030 1.126e+04 1.465e+04 0.769 0.441829
zipcode98031 1.564e+04 1.537e+04 1.018 0.308915
zipcode98032 -1.284e+04 1.792e+04 -0.717 0.473687
zipcode98033 2.958e+05 2.789e+04 10.608 < 2e-16 ***
zipcode98034 1.264e+05 2.990e+04 4.228 2.37e-05 ***
zipcode98038 7.791e+04 1.655e+04 4.708 2.53e-06 ***
zipcode98039 1.101e+06 3.245e+04 33.942 < 2e-16 ***
zipcode98040 4.680e+05 2.194e+04 21.327 < 2e-16 ***
zipcode98042 2.810e+04 1.418e+04 1.982 0.047456 *
zipcode98045 1.814e+05 3.066e+04 5.917 3.33e-09 ***
zipcode98052 1.967e+05 2.842e+04 6.922 4.62e-12 ***
zipcode98053 1.808e+05 3.037e+04 5.954 2.67e-09 ***
zipcode98055 2.146e+04 1.718e+04 1.249 0.211728
zipcode98056 6.367e+04 1.868e+04 3.409 0.000653 ***
zipcode98058 3.359e+04 1.620e+04 2.073 0.038186 *
zipcode98059 6.952e+04 1.837e+04 3.784 0.000155 ***
zipcode98065 1.346e+05 2.815e+04 4.782 1.75e-06 ***
zipcode98070 -7.192e+04 2.176e+04 -3.305 0.000950 ***
zipcode98072 9.011e+04 3.321e+04 2.713 0.006671 **
zipcode98074 1.588e+05 2.686e+04 5.912 3.45e-09 ***
zipcode98075 1.598e+05 2.576e+04 6.204 5.64e-10 ***
zipcode98077 6.050e+04 3.468e+04 1.745 0.081073 .
zipcode98092 -1.196e+03 1.326e+04 -0.090 0.928155
zipcode98102 4.390e+05 2.935e+04 14.959 < 2e-16 ***
zipcode98103 2.424e+05 2.709e+04 8.948 < 2e-16 ***
zipcode98105 3.997e+05 2.773e+04 14.414 < 2e-16 ***
zipcode98106 5.587e+04 2.006e+04 2.785 0.005359 **
zipcode98107 2.487e+05 2.780e+04 8.948 < 2e-16 ***
zipcode98108 6.070e+04 2.237e+04 2.714 0.006652 **
zipcode98109 4.148e+05 2.861e+04 14.498 < 2e-16 ***
zipcode98112 5.380e+05 2.544e+04 21.151 < 2e-16 ***
zipcode98115 2.480e+05 2.744e+04 9.040 < 2e-16 ***
zipcode98116 2.080e+05 2.232e+04 9.320 < 2e-16 ***
zipcode98117 2.180e+05 2.782e+04 7.834 4.99e-15 ***
zipcode98118 1.124e+05 1.948e+04 5.771 8.02e-09 ***
zipcode98119 4.068e+05 2.701e+04 15.061 < 2e-16 ***
zipcode98122 2.721e+05 2.421e+04 11.235 < 2e-16 ***
zipcode98125 1.097e+05 2.968e+04 3.695 0.000220 ***
zipcode98126 1.206e+05 2.068e+04 5.831 5.61e-09 ***
zipcode98133 5.335e+04 3.061e+04 1.743 0.081338 .
zipcode98136 1.768e+05 2.100e+04 8.420 < 2e-16 ***
zipcode98144 2.204e+05 2.244e+04 9.823 < 2e-16 ***
zipcode98146 3.043e+04 1.876e+04 1.622 0.104828
zipcode98148 3.650e+04 2.635e+04 1.385 0.165989
zipcode98155 4.353e+04 3.191e+04 1.364 0.172629
zipcode98166 5.831e+03 1.725e+04 0.338 0.735412
zipcode98168 9.246e+03 1.825e+04 0.507 0.612501
zipcode98177 1.079e+05 3.190e+04 3.382 0.000720 ***
zipcode98178 -5.908e+03 1.861e+04 -0.317 0.750888
zipcode98188 -4.184e+03 1.898e+04 -0.220 0.825497
zipcode98198 -3.087e+04 1.448e+04 -2.132 0.033009 *
zipcode98199 2.724e+05 2.639e+04 10.323 < 2e-16 ***
lat 2.233e+05 6.632e+04 3.367 0.000761 ***
long -2.361e+05 4.727e+04 -4.995 5.95e-07 ***
sqft_living15 1.521e+01 3.029e+00 5.023 5.15e-07 ***
sqft_lot15 -1.159e-01 6.429e-02 -1.803 0.071344 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared: 0.8359, Adjusted R-squared: 0.835
F-statistic: 889.4 on 99 and 17284 DF, p-value: < 2.2e-16
Your goal is to create a model with as small a test error as possible. Note that the square root of the training RSS from model mod.all
is \(1.5018076\times 10^{5}\).
library(ggmap)
KingMap <-
get_map(
location = c(lon = -122.1, lat = 47.48),
zoom = 10,
source = "google",
maptype = "roadmap"
)
ggmap(KingMap) +
geom_point(
aes(x = housedata$long, y = housedata$lat),
data = housedata,
alpha = .2,
color = "blue",
size = 0.01
) +
geom_point(
aes(x = housedataT$long, y = housedataT$lat),
data = housedataT,
alpha = .2,
color = "red",
size = 0.01
) +
ggtitle("Houses Sold in King County, Wa (2014-2015)") +
labs(x = "longitute", y = "latitude")
Submit a vector named (Yourlastname_Yourfirstname
) with the predicted house prices for your model using the data frame housedataT
.
Suppose your final model is mod.all
.
Yourlastname_Yourfirstname <- predict(mod.all, newdata = housedataT)
head(Yourlastname_Yourfirstname)
1 2 3 4 5 6
310196.9 845551.2 305946.0 532806.5 485256.6 469122.8
write.csv()
to create the vector Yourlastname_Yourfirstname.csv
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv")
I will compute your \(\sqrt{MSPE}\).
SMSPE <- sqrt(mean((head(Yourlastname_Yourfirstname) - c(310000, 650000, 233000, 580500, 535000, 605000))^2))
SMSPE
[1] 105493.4