Reading the comma separated file from the Data directory one level up from where this document is stored using the read.csv() function. Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in one the first code chunks of your document.

housedata <- read.csv("../Data/housedata.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))
housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1))  # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedata <- housedata[, -1]
#### Perform same steps with test set
housedataT <- read.csv("../Data/housedataTEST.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))
housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1))  # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
library(DT)
datatable(housedata[, 2:10], rownames = FALSE)

Consider predicting the price (price) of a house based on a certain feature (sqft_living). Start by graphing the relationship.

library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  theme_bw()
p1

Overplotting is problematic. What should we do?

Using alpha

p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        geom_point(alpha = 0.05, color = "blue") + 
        theme_bw() 
p2

Using rectangles

p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        theme_bw()
p3

p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000)) +
        theme_bw()
p4

Using hexagons

p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
        theme_bw()
p5

**Note* For both stat_bin2d and stat_binhex, if you manually specify the range, and there is a bin that falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.

p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
        theme_bw()
p6

What features might be visible in a scatterplot?

Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\).

slm <- lm(price ~ sqft_living, data = housedata)
summary(slm)

Call:
lm(formula = price ~ sqft_living, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1490607  -148265   -23758   105710  4349512 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -47116.079   4923.344   -9.57   <2e-16 ***
sqft_living    281.959      2.164  130.29   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 263000 on 17382 degrees of freedom
Multiple R-squared:  0.4941,    Adjusted R-squared:  0.4941 
F-statistic: 1.698e+04 on 1 and 17382 DF,  p-value: < 2.2e-16
predict(slm, newdata = data.frame(sqft_living = 2500))
     1 
657781 
p6 + geom_smooth(method = "lm") + 
  geom_vline(xintercept = 2500,linetype = "dashed", color = "red") +
  geom_hline(yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)), linetype = "dashed", color = "red") + 
  labs(x = "Living Space (square feet)", y = "Price ($)")

mod.zip most basic model.

mod.zip <- lm(price ~ 1, data = housedata)
summary(mod.zip)

Call:
lm(formula = price ~ 1, data = housedata)

Residuals:
    Min      1Q  Median      3Q     Max 
-464367 -219367  -89367  100633 7160633 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   539367       2804   192.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 369700 on 17383 degrees of freedom

mod.all all current features except sqft_basement.

mod.all <- lm(price ~ . - sqft_basement, data = housedata)
summary(mod.all)

Call:
lm(formula = price ~ . - sqft_basement, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1539648   -60120     2700    56110  3478116 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -4.135e+07  6.410e+06  -6.450 1.15e-10 ***
date                1.145e+02  1.016e+01  11.269  < 2e-16 ***
bedrooms           -1.431e+04  1.708e+03  -8.377  < 2e-16 ***
bathrooms           2.394e+04  2.764e+03   8.663  < 2e-16 ***
sqft_living         1.170e+02  3.714e+00  31.495  < 2e-16 ***
sqft_lot            2.574e-01  4.059e-02   6.342 2.32e-10 ***
floors             -2.963e+04  3.344e+03  -8.862  < 2e-16 ***
waterfrontYes       6.641e+05  1.465e+04  45.331  < 2e-16 ***
view                4.953e+04  1.828e+03  27.102  < 2e-16 ***
conditionfair       9.809e+04  3.414e+04   2.873 0.004075 ** 
conditionaverage    1.042e+05  3.174e+04   3.282 0.001032 ** 
conditiongood       1.318e+05  3.175e+04   4.151 3.32e-05 ***
conditionvery good  1.814e+05  3.194e+04   5.677 1.39e-08 ***
grade10             1.173e+05  1.549e+05   0.757 0.448960    
grade11             3.143e+05  1.552e+05   2.025 0.042869 *  
grade12             7.525e+05  1.561e+05   4.821 1.44e-06 ***
grade13             1.661e+06  1.615e+05  10.287  < 2e-16 ***
grade3             -7.935e+02  1.776e+05  -0.004 0.996434    
grade4             -9.356e+04  1.588e+05  -0.589 0.555737    
grade5             -1.035e+05  1.548e+05  -0.668 0.503898    
grade6             -1.122e+05  1.547e+05  -0.725 0.468200    
grade7             -1.091e+05  1.547e+05  -0.705 0.480711    
grade8             -8.819e+04  1.548e+05  -0.570 0.568766    
grade9             -1.235e+04  1.548e+05  -0.080 0.936432    
sqft_above          5.176e+01  3.858e+00  13.416  < 2e-16 ***
yr_built           -3.312e+00  2.798e-01 -11.838  < 2e-16 ***
yr_renovated        2.591e+00  2.847e-01   9.103  < 2e-16 ***
zipcode98002        1.909e+04  1.516e+04   1.259 0.207972    
zipcode98003       -1.207e+04  1.341e+04  -0.900 0.368288    
zipcode98004        7.176e+05  2.455e+04  29.229  < 2e-16 ***
zipcode98005        2.534e+05  2.626e+04   9.650  < 2e-16 ***
zipcode98006        2.205e+05  2.140e+04  10.305  < 2e-16 ***
zipcode98007        2.136e+05  2.691e+04   7.938 2.18e-15 ***
zipcode98008        2.371e+05  2.569e+04   9.230  < 2e-16 ***
zipcode98010        1.108e+05  2.280e+04   4.860 1.18e-06 ***
zipcode98011        5.783e+04  3.358e+04   1.722 0.085099 .  
zipcode98014        1.005e+05  3.709e+04   2.711 0.006722 ** 
zipcode98019        7.545e+04  3.613e+04   2.088 0.036804 *  
zipcode98022        6.976e+04  2.010e+04   3.470 0.000521 ***
zipcode98023       -5.426e+04  1.240e+04  -4.376 1.21e-05 ***
zipcode98024        1.806e+05  3.184e+04   5.672 1.43e-08 ***
zipcode98027        1.737e+05  2.203e+04   7.885 3.33e-15 ***
zipcode98028        3.778e+04  3.257e+04   1.160 0.246053    
zipcode98029        2.355e+05  2.510e+04   9.383  < 2e-16 ***
zipcode98030        1.126e+04  1.465e+04   0.769 0.441829    
zipcode98031        1.564e+04  1.537e+04   1.018 0.308915    
zipcode98032       -1.284e+04  1.792e+04  -0.717 0.473687    
zipcode98033        2.958e+05  2.789e+04  10.608  < 2e-16 ***
zipcode98034        1.264e+05  2.990e+04   4.228 2.37e-05 ***
zipcode98038        7.791e+04  1.655e+04   4.708 2.53e-06 ***
zipcode98039        1.101e+06  3.245e+04  33.942  < 2e-16 ***
zipcode98040        4.680e+05  2.194e+04  21.327  < 2e-16 ***
zipcode98042        2.810e+04  1.418e+04   1.982 0.047456 *  
zipcode98045        1.814e+05  3.066e+04   5.917 3.33e-09 ***
zipcode98052        1.967e+05  2.842e+04   6.922 4.62e-12 ***
zipcode98053        1.808e+05  3.037e+04   5.954 2.67e-09 ***
zipcode98055        2.146e+04  1.718e+04   1.249 0.211728    
zipcode98056        6.367e+04  1.868e+04   3.409 0.000653 ***
zipcode98058        3.359e+04  1.620e+04   2.073 0.038186 *  
zipcode98059        6.952e+04  1.837e+04   3.784 0.000155 ***
zipcode98065        1.346e+05  2.815e+04   4.782 1.75e-06 ***
zipcode98070       -7.192e+04  2.176e+04  -3.305 0.000950 ***
zipcode98072        9.011e+04  3.321e+04   2.713 0.006671 ** 
zipcode98074        1.588e+05  2.686e+04   5.912 3.45e-09 ***
zipcode98075        1.598e+05  2.576e+04   6.204 5.64e-10 ***
zipcode98077        6.050e+04  3.468e+04   1.745 0.081073 .  
zipcode98092       -1.196e+03  1.326e+04  -0.090 0.928155    
zipcode98102        4.390e+05  2.935e+04  14.959  < 2e-16 ***
zipcode98103        2.424e+05  2.709e+04   8.948  < 2e-16 ***
zipcode98105        3.997e+05  2.773e+04  14.414  < 2e-16 ***
zipcode98106        5.587e+04  2.006e+04   2.785 0.005359 ** 
zipcode98107        2.487e+05  2.780e+04   8.948  < 2e-16 ***
zipcode98108        6.070e+04  2.237e+04   2.714 0.006652 ** 
zipcode98109        4.148e+05  2.861e+04  14.498  < 2e-16 ***
zipcode98112        5.380e+05  2.544e+04  21.151  < 2e-16 ***
zipcode98115        2.480e+05  2.744e+04   9.040  < 2e-16 ***
zipcode98116        2.080e+05  2.232e+04   9.320  < 2e-16 ***
zipcode98117        2.180e+05  2.782e+04   7.834 4.99e-15 ***
zipcode98118        1.124e+05  1.948e+04   5.771 8.02e-09 ***
zipcode98119        4.068e+05  2.701e+04  15.061  < 2e-16 ***
zipcode98122        2.721e+05  2.421e+04  11.235  < 2e-16 ***
zipcode98125        1.097e+05  2.968e+04   3.695 0.000220 ***
zipcode98126        1.206e+05  2.068e+04   5.831 5.61e-09 ***
zipcode98133        5.335e+04  3.061e+04   1.743 0.081338 .  
zipcode98136        1.768e+05  2.100e+04   8.420  < 2e-16 ***
zipcode98144        2.204e+05  2.244e+04   9.823  < 2e-16 ***
zipcode98146        3.043e+04  1.876e+04   1.622 0.104828    
zipcode98148        3.650e+04  2.635e+04   1.385 0.165989    
zipcode98155        4.353e+04  3.191e+04   1.364 0.172629    
zipcode98166        5.831e+03  1.725e+04   0.338 0.735412    
zipcode98168        9.246e+03  1.825e+04   0.507 0.612501    
zipcode98177        1.079e+05  3.190e+04   3.382 0.000720 ***
zipcode98178       -5.908e+03  1.861e+04  -0.317 0.750888    
zipcode98188       -4.184e+03  1.898e+04  -0.220 0.825497    
zipcode98198       -3.087e+04  1.448e+04  -2.132 0.033009 *  
zipcode98199        2.724e+05  2.639e+04  10.323  < 2e-16 ***
lat                 2.233e+05  6.632e+04   3.367 0.000761 ***
long               -2.361e+05  4.727e+04  -4.995 5.95e-07 ***
sqft_living15       1.521e+01  3.029e+00   5.023 5.15e-07 ***
sqft_lot15         -1.159e-01  6.429e-02  -1.803 0.071344 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared:  0.8359,    Adjusted R-squared:  0.835 
F-statistic: 889.4 on 99 and 17284 DF,  p-value: < 2.2e-16

Your goal is to create a model with as small a test error as possible. Note that the square root of the training RSS from model mod.all is \(1.5018076\times 10^{5}\).

library(ggmap)
KingMap <-
  get_map(
    location = c(lon = -122.1, lat = 47.48),
    zoom = 10,
    source = "google",
    maptype = "roadmap"
  )
ggmap(KingMap) +
  geom_point(
    aes(x = housedata$long, y = housedata$lat),
    data = housedata,
    alpha = .2,
    color = "blue",
    size = 0.01
  ) +
  geom_point(
    aes(x = housedataT$long, y = housedataT$lat),
    data = housedataT,
    alpha = .2,
    color = "red",
    size = 0.01
  ) +
  ggtitle("Houses Sold in King County, Wa (2014-2015)") +
  labs(x = "longitute", y = "latitude")

Prediction

Yourlastname_Yourfirstname <- predict(mod.all, newdata = housedataT)
head(Yourlastname_Yourfirstname)
       1        2        3        4        5        6 
310196.9 845551.2 305946.0 532806.5 485256.6 469122.8 
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv")

I will compute your \(\sqrt{MSPE}\).

SMSPE <- sqrt(mean((head(Yourlastname_Yourfirstname) - c(310000, 650000, 233000, 580500, 535000, 605000))^2))
SMSPE
[1] 105493.4