Data Splitting and Overfitting

Application exercise
Answers
library(tidyverse)
library(tidymodels)
library(schrute)
library(lubridate)

Use theoffice data from the schrute package to predict IMDB scores for episodes of The Office.

glimpse(theoffice)
Rows: 55,130
Columns: 12
$ index            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ season           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ episode          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ episode_name     <chr> "Pilot", "Pilot", "Pilot", "Pilot", "Pilot", "Pilot",…
$ director         <chr> "Ken Kwapis", "Ken Kwapis", "Ken Kwapis", "Ken Kwapis…
$ writer           <chr> "Ricky Gervais;Stephen Merchant;Greg Daniels", "Ricky…
$ character        <chr> "Michael", "Jim", "Michael", "Jim", "Michael", "Micha…
$ text             <chr> "All right Jim. Your quarterlies look very good. How …
$ text_w_direction <chr> "All right Jim. Your quarterlies look very good. How …
$ imdb_rating      <dbl> 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6…
$ total_votes      <int> 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706,…
$ air_date         <chr> "2005-03-24", "2005-03-24", "2005-03-24", "2005-03-24…

Fix air_date for later use.

theoffice <- theoffice %>%
  mutate(air_date = ymd(as.character(air_date)))

We will

Note: The episodes listed in theoffice don’t match the ones listed in the data we used in the cross validation lesson.

theoffice %>%
  distinct(season, episode)
# A tibble: 186 × 2
   season episode
    <int>   <int>
 1      1       1
 2      1       2
 3      1       3
 4      1       4
 5      1       5
 6      1       6
 7      2       1
 8      2       2
 9      2       3
10      2       4
# ℹ 176 more rows

Exercise 1 - Calculate the percentage of lines spoken by Jim, Pam, Michael, and Dwight for each episode of The Office.

office_lines <- theoffice %>%
  group_by(season, episode) %>%
  mutate(
    n_lines = n(),
    lines_jim = sum(character == "Jim") / n_lines,
    lines_pam = sum(character == "Pam") / n_lines,
    lines_michael = sum(character == "Michael") / n_lines,
    lines_dwight = sum(character == "Dwight") / n_lines,
  ) %>%
  ungroup() %>%
  select(season, episode, episode_name, contains("lines_")) %>%
  distinct(season, episode, episode_name, .keep_all = TRUE)

Exercise 2 - Identify episodes that touch on Halloween, Valentine’s Day, and Christmas.

theoffice <- theoffice %>%
  mutate(text = tolower(text))

halloween_episodes <- theoffice %>%
  filter(str_detect(text, "halloween")) %>% 
  count(episode_name) %>%
  filter(n > 1) %>%
  mutate(halloween = 1) %>%
  select(-n)

valentine_episodes <- theoffice %>%
  filter(str_detect(text, "valentine")) %>% 
  count(episode_name) %>%
  filter(n > 1) %>%
  mutate(valentine = 1) %>%
  select(-n)

christmas_episodes <- theoffice %>%
  filter(str_detect(text, "christmas")) %>% 
  count(episode_name) %>%
  filter(n > 1) %>%
  mutate(christmas = 1) %>%
  select(-n)

Exercise 3 Modify the following code to also create a new indicator variable called michael which takes the value 1 if Michael Scott (Steve Carrell) was there, and 0 if not. Note: Michael Scott (Steve Carrell) left the show at the end of Season 7. (make sure to remove eval: true)

office_df <- theoffice %>%
  select(season, episode, episode_name, imdb_rating, total_votes, air_date) %>%
  distinct(season, episode, .keep_all = TRUE) %>%
  left_join(halloween_episodes, by = "episode_name") %>% 
  left_join(valentine_episodes, by = "episode_name") %>% 
  left_join(christmas_episodes, by = "episode_name") %>% 
  replace_na(list(halloween = 0, valentine = 0, christmas = 0)) %>%
  mutate(michael = if_else(season > 7, 0, 1)) %>%
  mutate(across(halloween:michael, as.factor)) %>%
  left_join(office_lines, by = c("season", "episode", "episode_name"))

Exercise 4 - Split the data into training (75%) and testing (25%).

set.seed(1122)
office_split <- initial_split(office_df)
office_train <- training(office_split)
office_test <- testing(office_split)

Exercise 5 - Specify a linear regression model.

office_mod <- linear_reg() %>%
  set_engine("lm")

Exercise 6 - Create a recipe that updates the role of episode_name to not be a predictor, removes air_date as a predictor, and removes all zero variance predictors.

office_rec <- recipe(imdb_rating ~ ., data = office_train) %>%
  update_role(episode_name, new_role = "id") %>%
  step_rm(air_date) %>%
  step_dummy(all_nominal(), -episode_name) %>%
  step_zv(all_predictors())

Exercise 7 - Build a workflow for fitting the model specified earlier and using the recipe you developed to preprocess the data.

office_wflow <- workflow() %>%
  add_model(office_mod) %>%
  add_recipe(office_rec)

Exercise 8 - Fit the model to training data and interpret a couple of the slope coefficients.

office_fit <- office_wflow %>%
  fit(data = office_train)

tidy(office_fit)
# A tibble: 12 × 5
   term           estimate std.error statistic  p.value
   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)    6.34     0.298       21.2    1.24e-43
 2 season         0.0542   0.0224       2.42   1.68e- 2
 3 episode        0.0125   0.00439      2.85   5.05e- 3
 4 total_votes    0.000372 0.0000390    9.55   1.25e-16
 5 lines_jim      0.653    0.679        0.962  3.38e- 1
 6 lines_pam      0.0329   0.696        0.0473 9.62e- 1
 7 lines_michael  0.111    0.544        0.204  8.39e- 1
 8 lines_dwight   0.806    0.522        1.54   1.25e- 1
 9 halloween_X1  -0.00340  0.181       -0.0188 9.85e- 1
10 valentine_X1  -0.0573   0.180       -0.318  7.51e- 1
11 christmas_X1   0.285    0.129        2.22   2.82e- 2
12 michael_X1     0.585    0.141        4.15   6.01e- 5

Exercise 9 - Use your model to make predictions for the testing data and calculate the R2 and the RMSE.

office_test_pred <- predict(office_fit, new_data = office_test) %>%
  bind_cols(office_test %>% select(imdb_rating, episode_name))

rmse(office_test_pred, truth = imdb_rating, estimate = .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.401
rsq(office_test_pred, truth = imdb_rating, estimate = .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.519