Tidy Tuesday: NHL Goalscorers

Jonny Law
2020-03-03

First install the Tidy Tuesday R package.


# Install tidytuesdayR from GitHub only when it is not already available, so
# that re-running this document does not reinstall the package every time.
# install.packages("remotes")
if (!requireNamespace("tidytuesdayR", quietly = TRUE)) {
  remotes::install_github("thebioengineer/tidytuesdayR")
}

The data for this Tuesday can be downloaded using tt_load.


# Download the Tidy Tuesday data for the week of 2020-03-03.
tuesdata <- tidytuesdayR::tt_load("2020-03-03")

    Downloading file 1 of 3: `game_goals.csv`
    Downloading file 2 of 3: `season_goals.csv`
    Downloading file 3 of 3: `top_250.csv`

Unfortunately, this only loaded one file: the top 250 goalscorers. We look at that file first.

Top Career Goalscorers


# Horizontal bar chart of the top 30 players by career goals; the player
# factor is reordered by total goals so the bars appear sorted.
tuesdata$top_250 %>%
  top_n(n = 30, wt = total_goals) %>%
  mutate(player = forcats::fct_reorder(.f = player, .x = total_goals)) %>%
  ggplot(mapping = aes(x = player, y = total_goals)) +
  geom_col() +
  coord_flip()

Average goals per season


#' Parse the final year of a career from a "YYYY-YY" range string.
#'
#' The two-digit end year is first read in the same century as the start
#' year; when that does not fall after the start year, the range is taken
#' to have crossed into the following century.
#'
#' @param years Character vector of ranges such as "1979-99": a four-digit
#'   start year in characters 1-4 and a two-digit end year in characters 6-7.
#' @return Numeric vector of four-digit end years, the same length as `years`.
parse_end_year <- function(years) {
  start <- as.numeric(substr(years, 1, 4))
  end_tens <- as.numeric(substr(years, 6, 7))
  start_century <- (start %/% 100) * 100
  possible_end <- start_century + end_tens
  # Vectorised choice (a scalar `if` only works one element at a time), and
  # the rollover century is derived from the start year instead of being
  # fixed at 2000.
  ifelse(possible_end - start > 0, possible_end, possible_end + 100)
}

# Heavily penalise scientific notation so numbers are printed in fixed form.
options(scipen = 99)

# Horizontal bar chart of the top 30 players by average goals per season,
# with bars filled by the `active` column. `decade` is computed here but is
# not used by this plot.
tuesdata$top_250 %>%
  rowwise() %>%  # parse_end_year() is applied one row at a time
  mutate(
    yr_end = parse_end_year(years),
    seasons = yr_end - yr_start,
    average_goals_per_season = total_goals / seasons,
    decade = cut(
      yr_start,
      breaks = seq(from = 1920, to = 2020, by = 10),
      dig.lab = 10
    )
  ) %>%
  ungroup() %>%
  mutate(
    player = forcats::fct_reorder(.f = player, .x = average_goals_per_season)
  ) %>%
  top_n(n = 30, wt = average_goals_per_season) %>%
  ggplot(
    mapping = aes(x = player, y = average_goals_per_season, fill = active)
  ) +
  geom_col() +
  coord_flip()


  # facet_wrap( ~ decade, scales = "free_y")

# Per-player career summary: parsed end year, number of seasons (end year
# minus start year), average goals per season, and the decade bin of the
# start year.
goals_per_season <- tuesdata$top_250 %>%
  rowwise() %>%  # parse_end_year() is applied one row at a time
  mutate(
    yr_end = parse_end_year(years),
    seasons = yr_end - yr_start,
    average_goals_per_season = total_goals / seasons,
    decade = cut(yr_start, breaks = seq(1920, 2020, by = 10), dig.lab = 10)
  ) %>%
  ungroup()

# Scatter plot of seasons played against average goals per season for every
# player; the top 10 players by total goals get a repelled label showing
# their name and goal total, filled by the `active` column.
ggplot(
  data = goals_per_season,
  mapping = aes(x = seasons, y = average_goals_per_season)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = top_n(goals_per_season, n = 10, wt = total_goals),
    mapping = aes(label = paste(player, total_goals), fill = active)
  ) +
  labs(
    x = "Total Seasons in NHL",
    y = "Average Goals per Season",
    title = tools::toTitleCase(
      "average goals per season and total number of seasons \nfor the top 250 NHL Goal Scorers"
    ),
    subtitle = "Top 10 all-time goal scorers are labelled"
  ) +
  theme_minimal() +
  # Place the legend inside the panel near the top-right and hide its title.
  theme(legend.position = c(0.9, 0.9), legend.title = element_blank())


# ggsave("goals_per_season.png")

Game Goals


# Read the game-level goals CSV directly from the Tidy Tuesday repository.
game_goals <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-03/game_goals.csv"
)

# Look-up table (player, yr_start) for the two players singled out for
# highlighting.
highlighted_players <- tuesdata$top_250 %>%
  select(player, yr_start) %>%
  filter(player %in% c("Alex Ovechkin", "Wayne Gretzky"))

# Running total of goals for each player by age in whole years.
# Note: despite its name, `goals_per_season` holds the goals summed over each
# (player, age in years) pair, not per NHL season.
cumulative_goals <- game_goals %>%
  # `age` is assumed to start with the two-digit age in years — TODO confirm
  mutate(age_years = as.numeric(substr(age, 1, 2))) %>%
  group_by(player, age_years) %>%
  summarise(goals_per_season = sum(goals)) %>%
  # Group by player explicitly rather than relying on summarise() peeling off
  # the last grouping variable, so the running total restarts for each player.
  group_by(player) %>%
  arrange(player, age_years) %>%
  inner_join(tuesdata$top_250, by = "player") %>%
  mutate(cumulative_goals = cumsum(goals_per_season)) %>%
  # Return an ungrouped tibble so later steps are not silently group-aware.
  ungroup()

# Cumulative goals by age: one faint line per player, with the players in
# `highlighted_players` drawn in colour and labelled at age 30.
# Layers inherit the data and x/y mapping from ggplot() unless overridden.
ggplot(cumulative_goals, aes(x = age_years, y = cumulative_goals)) +
  geom_line(aes(group = player), alpha = 0.2) +
  geom_line(
    data = cumulative_goals %>% inner_join(highlighted_players, by = "player"),
    aes(colour = player)
  ) +
  # Namespace-qualified so the call works without ggrepel being attached.
  ggrepel::geom_label_repel(
    data =
      cumulative_goals %>%
      inner_join(highlighted_players, by = "player") %>%
      filter(age_years == 30),
    aes(label = player, colour = player)
  ) +
  xlab("Player Age") +
  ylab("Total Goals") +
  theme_minimal() +
  labs(
    title = "Cumulative Goals in the NHL by Age",
    subtitle = "Data from 42 of the all time 250 NHL scorers who started their career since game-level \ndata became available in 1979/80 season."
  ) +
  # The highlighted players are labelled directly, so no legend is shown.
  theme(legend.position = "none")