Read the comma-separated file `housedata.csv` from the `Data` directory, one
level up from where this document is stored, using the `read.csv()` function.
# Read the housing data. The id, date, yr_built and zipcode columns are read
# as character so read.csv() does not coerce them to numbers.
housedata <- read.csv(
  "../Data/housedata.csv",
  colClasses = c(
    id = "character", date = "character", yr_built = "character",
    zipcode = "character"
  )
)

# The date column is text in %Y%m%d form; convert it to Date.
housedata$date <- as.Date(housedata$date, format = "%Y%m%d")

# Labels are applied to the sorted unique values of the column, so the smaller
# code becomes "No" and the larger "Yes".
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))

# A bare year must not be parsed with as.Date(x, "%Y"): the unspecified month
# and day are taken from the current date, so the result would change with the
# day the document is run. Anchor every year to September 1 instead.
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1))

# A renovation year of 0 is treated as missing (presumably "never renovated";
# confirm against the data dictionary). %in% is used so that rows that are
# already NA do not produce an NA subscript.
housedata$yr_renovated[housedata$yr_renovated %in% 0] <- NA
# Same September 1 anchor as yr_built, so the two columns are comparable and
# the parsed dates are reproducible. ISOdate() yields NA for an NA year.
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1))
library(DT)
# Interactive table of columns 2 through 10 of housedata, without row names.
datatable(data = housedata[2:10], rownames = FALSE)
Consider predicting the price (`price`) of a house based on a certain feature (`sqft_living`). Start by graphing the relationship.
library(ggplot2)
# Plain scatterplot of price against sqft_living; evaluating p1 renders it.
p1 <- ggplot(housedata, aes(sqft_living, price)) +
  geom_point() +
  theme_bw()
p1
Overplotting is problematic. What should we do?
One option is to make the points semi-transparent by setting `alpha`.
# Same scatterplot with nearly transparent blue points (alpha = 0.05), so that
# regions where many points overlap appear darker.
p2 <- ggplot(housedata, aes(sqft_living, price)) +
  geom_point(color = "blue", alpha = 0.05) +
  theme_bw()
p2
# Replace individual points with rectangular bins (50 per axis); the fill of
# each bin encodes how many observations fall in it.
p3 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_bin2d(bins = 50) +
  theme_bw()
p3
# Rectangular bins again, now with an explicit light-blue-to-red count
# gradient whose range is fixed to 0-1000.
p4 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_bin2d(bins = 50) +
  scale_fill_gradient(
    low = "lightblue", high = "red",
    limits = c(0, 1000)
  ) +
  theme_bw()
p4
# Hexagonal bins with the fill range fixed to 0-800 and legend breaks every
# 200. Bins whose count lies outside the limits are not mapped onto the
# gradient and are drawn with the scale's missing-value colour instead.
p5 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue", high = "red",
    limits = c(0, 800),
    breaks = seq(0, 800, by = 200)
  ) +
  theme_bw()
p5
**Note:** For both `stat_bin2d` and `stat_binhex`, if you manually specify the range and a bin falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the grey hexagons in the lower left corner of the above graph.
# Hexagonal bins with the fill range fixed to 0-1000 and legend breaks every
# 200.
p6 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue", high = "red",
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 200)
  ) +
  theme_bw()
p6
Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\) of living space.
# Simple linear regression of price on sqft_living, followed by the usual
# coefficient and fit summary.
slm <- lm(formula = price ~ sqft_living, data = housedata)
summary(slm)
Call:
lm(formula = price ~ sqft_living, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1490607 -148265 -23758 105710 4349512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -47116.079 4923.344 -9.57 <2e-16 ***
sqft_living 281.959 2.164 130.29 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 263000 on 17382 degrees of freedom
Multiple R-squared: 0.4941, Adjusted R-squared: 0.4941
F-statistic: 1.698e+04 on 1 and 17382 DF, p-value: < 2.2e-16
# Fitted price from slm at sqft_living = 2500.
predict(object = slm, newdata = data.frame(sqft_living = 2500))
1
657781
# Overlay the least-squares line on p6 and add dashed red guides that cross at
# sqft_living = 2500 and the price slm predicts there.
p6 +
  geom_smooth(method = "lm") +
  geom_vline(xintercept = 2500, color = "red", linetype = "dashed") +
  geom_hline(
    yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)),
    color = "red", linetype = "dashed"
  ) +
  labs(x = "Living Space (square feet)", y = "Price ($)")