library(tidyverse)
library(tidymodels)
library(schrute)
library(lubridate)
Data Splitting and Overfitting
Application exercise
Answers
Use theoffice
data from the schrute package to predict IMDB scores for episodes of The Office.
glimpse(theoffice)
Rows: 55,130
Columns: 12
$ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ episode <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ episode_name <chr> "Pilot", "Pilot", "Pilot", "Pilot", "Pilot", "Pilot",…
$ director <chr> "Ken Kwapis", "Ken Kwapis", "Ken Kwapis", "Ken Kwapis…
$ writer <chr> "Ricky Gervais;Stephen Merchant;Greg Daniels", "Ricky…
$ character <chr> "Michael", "Jim", "Michael", "Jim", "Michael", "Micha…
$ text <chr> "All right Jim. Your quarterlies look very good. How …
$ text_w_direction <chr> "All right Jim. Your quarterlies look very good. How …
$ imdb_rating <dbl> 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6…
$ total_votes <int> 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706,…
$ air_date <chr> "2005-03-24", "2005-03-24", "2005-03-24", "2005-03-24…
Fix air_date
for later use.
<- theoffice %>%
theoffice mutate(air_date = ymd(as.character(air_date)))
We will
- engineer features based on episode scripts
- train a model
- make predictions
- get performance metrics
Note: The episodes listed in theoffice
don’t match the ones listed in the data we used in the cross validation lesson.
%>%
theoffice distinct(season, episode)
# A tibble: 186 × 2
season episode
<int> <int>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
6 1 6
7 2 1
8 2 2
9 2 3
10 2 4
# ℹ 176 more rows
Exercise 1 - Calculate the percentage of lines spoken by Jim, Pam, Michael, and Dwight for each episode of The Office.
<- theoffice %>%
office_lines group_by(season, episode) %>%
mutate(
n_lines = n(),
lines_jim = sum(character == "Jim") / n_lines,
lines_pam = sum(character == "Pam") / n_lines,
lines_michael = sum(character == "Michael") / n_lines,
lines_dwight = sum(character == "Dwight") / n_lines,
%>%
) ungroup() %>%
select(season, episode, episode_name, contains("lines_")) %>%
distinct(season, episode, episode_name, .keep_all = TRUE)
Exercise 2 - Identify episodes that touch on Halloween, Valentine’s Day, and Christmas.
<- theoffice %>%
theoffice mutate(text = tolower(text))
<- theoffice %>%
halloween_episodes filter(str_detect(text, "halloween")) %>%
count(episode_name) %>%
filter(n > 1) %>%
mutate(halloween = 1) %>%
select(-n)
<- theoffice %>%
valentine_episodes filter(str_detect(text, "valentine")) %>%
count(episode_name) %>%
filter(n > 1) %>%
mutate(valentine = 1) %>%
select(-n)
<- theoffice %>%
christmas_episodes filter(str_detect(text, "christmas")) %>%
count(episode_name) %>%
filter(n > 1) %>%
mutate(christmas = 1) %>%
select(-n)
Exercise 3 Modify the following code to also create a new indicator variable called michael
which takes the value 1
if Michael Scott (Steve Carrell) was there, and 0
if not. Note: Michael Scott (Steve Carrell) left the show at the end of Season 7. (make sure to remove eval: true)
<- theoffice %>%
office_df select(season, episode, episode_name, imdb_rating, total_votes, air_date) %>%
distinct(season, episode, .keep_all = TRUE) %>%
left_join(halloween_episodes, by = "episode_name") %>%
left_join(valentine_episodes, by = "episode_name") %>%
left_join(christmas_episodes, by = "episode_name") %>%
replace_na(list(halloween = 0, valentine = 0, christmas = 0)) %>%
mutate(michael = if_else(season > 7, 0, 1)) %>%
mutate(across(halloween:michael, as.factor)) %>%
left_join(office_lines, by = c("season", "episode", "episode_name"))
Exercise 4 - Split the data into training (75%) and testing (25%).
set.seed(1122)
<- initial_split(office_df)
office_split <- training(office_split)
office_train <- testing(office_split) office_test
Exercise 5 - Specify a linear regression model.
<- linear_reg() %>%
office_mod set_engine("lm")
Exercise 6 - Create a recipe that updates the role of episode_name
to not be a predictor, removes air_date
as a predictor, and removes all zero variance predictors.
<- recipe(imdb_rating ~ ., data = office_train) %>%
office_rec update_role(episode_name, new_role = "id") %>%
step_rm(air_date) %>%
step_dummy(all_nominal(), -episode_name) %>%
step_zv(all_predictors())
Exercise 7 - Build a workflow for fitting the model specified earlier and using the recipe you developed to preprocess the data.
<- workflow() %>%
office_wflow add_model(office_mod) %>%
add_recipe(office_rec)
Exercise 8 - Fit the model to training data and interpret a couple of the slope coefficients.
<- office_wflow %>%
office_fit fit(data = office_train)
tidy(office_fit)
# A tibble: 12 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 6.34 0.298 21.2 1.24e-43
2 season 0.0542 0.0224 2.42 1.68e- 2
3 episode 0.0125 0.00439 2.85 5.05e- 3
4 total_votes 0.000372 0.0000390 9.55 1.25e-16
5 lines_jim 0.653 0.679 0.962 3.38e- 1
6 lines_pam 0.0329 0.696 0.0473 9.62e- 1
7 lines_michael 0.111 0.544 0.204 8.39e- 1
8 lines_dwight 0.806 0.522 1.54 1.25e- 1
9 halloween_X1 -0.00340 0.181 -0.0188 9.85e- 1
10 valentine_X1 -0.0573 0.180 -0.318 7.51e- 1
11 christmas_X1 0.285 0.129 2.22 2.82e- 2
12 michael_X1 0.585 0.141 4.15 6.01e- 5
Exercise 9 - Use your model to make predictions for the testing data and calculate the R2 and the RMSE.
<- predict(office_fit, new_data = office_test) %>%
office_test_pred bind_cols(office_test %>% select(imdb_rating, episode_name))
rmse(office_test_pred, truth = imdb_rating, estimate = .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.401
rsq(office_test_pred, truth = imdb_rating, estimate = .pred)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rsq standard 0.519