Apply data science to hockey and broader sports analytics
Resources:
!pip install selenium
!pip install webdriver-manager
import requests
import time
import csv
from datetime import date, datetime

import numpy as np
from numpy import array
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager

# Enforce incognito mode (used by the Chrome option below)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
int_year = 2022  # last year of the season to pull
int_year_start = int_year - 1  # year the season starts
str_season_last = [str(int_year_start) + str(int_year)]
# For data structure pages that have more than 10,000 records, break the pull down by month
str_date_start = [d.strftime('%Y-%m-%d') for d in pd.date_range(start=f"{int_year_start}-08-10", end=f"{int_year}-08-01", freq='MS')]
str_date_end = [d.strftime('%Y-%m-%d') for d in pd.date_range(start=f"{int_year_start}-09-10", end=f"{int_year}-09-01", freq='M')]
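As a quick check of the window construction (a sketch; the values assume pandas' 'MS' month-start and 'M' month-end anchors), the two lists pair one-to-one into month-long scraping windows:
for s, e in list(zip(str_date_start, str_date_end))[:3]:
    print(s, '->', e)
# 2021-09-01 -> 2021-09-30
# 2021-10-01 -> 2021-10-31
# 2021-11-01 -> 2021-11-30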
# Define the basics for the full historical pull
int_year = 2021  # last year of the season to pull
int_year_start = int_year - 1  # year the season starts
vec_season_start = list(map(str, range(1917, int_year_start + 1)))
vec_season_end = list(map(str, range(1918, int_year + 1)))
# Each season is pulled separately; the reversal lets the data be pulled newest-first
str_season = [vec_season_start[i] + vec_season_end[i] for i, val in enumerate(vec_season_start)][::-1]
# For data structure pages that have more than 10,000 records, break the pull down by month
str_date_start = [d.strftime('%Y-%m-%d') for d in pd.date_range(start="1917-11-10", end=f"{int_year}-06-01", freq='MS')]
str_date_end = [d.strftime('%Y-%m-%d') for d in pd.date_range(start="1917-12-10", end=f"{int_year}-07-01", freq='M')]
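The season strings come out in the concatenated format the NHL stats site expects, newest first after the reversal:
str_season[:3]  # ['20202021', '20192020', '20182019']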
str_data_type = ['teams', 'skaters', 'goalies']
#str_aggregate = ['', 'aggregate=0&']  # in case the aggregation parameter becomes a problem in the future
str_report_type = ['season', 'game']
str_datetype = ['season', 'date']
# Report pages other than the summary
str_page_team = ['faceoffpercentages', 'faceoffwins',
                 'goalsagainstbystrength', 'goalsbyperiod', 'goalsforbystrength', 'leadingtrailing', 'outshootoutshotby',
                 'realtime',
                 'penalties', 'penaltykill', 'penaltykilltime', 'powerplay', 'powerplaytime',
                 'summaryshooting', 'percentages', 'scoretrailfirst', 'shootout', 'shottype', 'goalgames']
str_page_skater = ['bios', 'faceoffpercentages', 'faceoffwins', 'goalsForAgainst',
                   'realtime',
                   'penalties', 'penaltykill', #'penaltyShots', # for some reason the penalty-shot page throws errors: the table cannot be grabbed
                   'powerplay', 'puckPossessions',
                   'summaryshooting', 'percentages', 'scoringRates', 'scoringpergame', 'shootout', 'shottype', 'timeonice']
str_page_goalie = ['advanced', 'bios', 'daysrest', 'penaltyShots', 'savesByStrength', 'shootout', 'startedVsRelieved']
# Option A: Firefox (geckodriver fetched by webdriver-manager)
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.delete_all_cookies()  # delete all cookies
driver.implicitly_wait(5)  # seconds
# Option B: Chrome (overrides the Firefox driver above; keep whichever you use)
driver = webdriver.Chrome(executable_path="chromedriver.exe", chrome_options=chrome_options)
driver.delete_all_cookies()  # delete all cookies
driver.implicitly_wait(3)  # seconds
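Note that executable_path and chrome_options are Selenium 3 keyword names. On Selenium 4+ the equivalent uses Service objects and options= (a sketch, assuming webdriver-manager is also installed for Chrome):
# Selenium 4 style - a sketch, not part of the original notebook
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
driver.delete_all_cookies()
driver.implicitly_wait(3)  # seconds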
# PULLING BY SEASON
def nhl_pull(str_url):
    driver.get(str_url)  # navigate to the URL
    # Wait until the table appears - the JavaScript table can render slower than the rest of the page
    try:
        element = WebDriverWait(driver, 50).until(
            EC.presence_of_element_located((By.CLASS_NAME, "rt-table"))
        )
    finally:
        pass
    time.sleep(2)  # just in case
    # Pull the page source and parse it
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")  # soupify
    # Get the table and its header
    rtheader = soup.find_all("div", {"class": "rt-table"})
    n_pagecount = int(soup.find_all("span", {"class": "-totalPages"})[0].text) - 1
    # NOTE: page numbers are zero-indexed, so n_pagecount is the index of the last page
    tableheader = soup.find_all("div", {"class": "tableHeaderDiv"})[0].find_all("div", {"class": "rt-header-cell"})
    str_titles = ["idx_row"]
    for temp_str in tableheader:
        temp_str_extract = temp_str.get('title')
        if temp_str_extract is not None:  # header cells without a title attribute are skipped
            str_titles.append(temp_str_extract)
    n_title = len(str_titles)
    # Pull the data rows
    table_data = soup.find_all("div", {"class": "rt-tbody"})[0].find_all("div", {"class": "rt-tr-group"})
    ary_data = []
    for idx_count, iter_row in enumerate(table_data):
        each_row = iter_row.find_all("div", {"class": "rt-td"})
        temp_vec = []
        for iter_col in each_row:
            temp_vec.append(iter_col.text)  # save the cell text in order
        if idx_count == 0:  # start building the array
            ary_data = np.array(temp_vec)
        else:  # stack rows vertically
            ary_data = np.vstack((ary_data, np.array(temp_vec)))
    # Convert to a data frame
    # Note: reshape in case the array collapsed to a single row
    df_data = pd.DataFrame(np.reshape(ary_data, (-1, len(str_titles))), columns=str_titles)
    # Pull the total record count
    n_recordcount = int(soup.find_all("span", {"class": "-totalInfo"})[0].text.split()[0])
    # Return the data frame plus the page and record counts, for multi-page pulls
    return {'df': df_data, 'n_pagecount': n_pagecount, 'n_title': n_title, 'n_recordcount': n_recordcount}
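A minimal smoke test of nhl_pull (a sketch with hypothetical parameter values; assumes one of the drivers above is live):
out = nhl_pull("http://www.nhl.com/stats/teams?aggregate=0&reportType=season&"
               "seasonFrom=20202021&seasonTo=20202021&gameType=2&"
               "filter=gamesPlayed,gte,1&page=0&pageSize=100")
print(out['n_recordcount'], out['n_pagecount'])  # total records, index of the last page
out['df'].head()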
def strip_right(df, suffix):
    # Note: str.rstrip() strips a trailing *character set*, not a suffix, so it would also
    # mangle column names that merely end in those characters; remove the exact suffix instead
    df.columns = [col[:-len(suffix)] if col.endswith(suffix) else col for col in df.columns]
    return df
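A quick illustration of the pitfall the exact-suffix version avoids (hypothetical column names):
# 'Index'.rstrip('_x') -> 'Inde', even though 'Index' does not end with '_x'
df_demo = pd.DataFrame(columns=['Index', 'Goals_x'])
print(strip_right(df_demo, '_x').columns.tolist())  # ['Index', 'Goals']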
# Build the stats URL for a given data type, report, period, and page
def url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype):
    URL_team_summary = (f"http://www.nhl.com/stats/"
                        f"{idx_data_type}?aggregate=0&{idx_report}reportType={idx_report_type}&"
                        f"{idx_datetype}From={iter_date_start}&{idx_datetype}To={iter_date_end}&"
                        f"gameType={str_gametype}&filter=gamesPlayed,gte,1&page={i_npage}&pageSize=100")
    # In a previous iteration idx_aggregate == 'aggregate=0&' was a parameter; it is fixed here
    # because the workflow pulls one season at a time.
    return URL_team_summary
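For example, the team season summary for 2020-21 resolves to:
url_team_pull('teams', '', 'season', '20202021', '20202021', 2, 0, 'season')
# 'http://www.nhl.com/stats/teams?aggregate=0&reportType=season&seasonFrom=20202021&seasonTo=20202021&gameType=2&filter=gamesPlayed,gte,1&page=0&pageSize=100'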
def nhl_pull_loop(str_date_start, str_date_end, str_page, idx_data_type, idx_report_type, idx_datetype):
    for idx, iter_date_start in enumerate(str_date_start):
        iter_date_end = str_date_end[idx]
        df_fin = []
        for idx_game, iter_game in enumerate(["regular", "playoff"]):
            # In-loop-specific initial settings
            str_gametype = idx_game + 2  # gameType=2 is the regular season, gameType=3 the playoffs
            i_npage = 0  # start with the first page (pages are zero-indexed)
            idx_report = ''  # start with the summary page
            # Pull the first summary page
            URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
            temp_pulled = nhl_pull(URL_team_summary)
            temp_df = temp_pulled['df']  # initialize the master frame for this period
            npage = temp_pulled['n_pagecount']
            nrecord = temp_pulled['n_recordcount']
            if nrecord == 0:
                continue  # no records for this period/game type; skip to the next one
            # Pull the remaining summary pages, if any
            if npage != 0:
                for i_npage in range(1, npage + 1):  # range() excludes the stop value, so add one
                    URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                    temp_pulled = nhl_pull(URL_team_summary)
                    temp_df = pd.concat([temp_df, temp_pulled['df']])  # stack the pages
            # All summary data pulled; remove the empty (non-breaking-space) rows
            temp_df = temp_df.loc[temp_df.idx_row != '\xa0', :]
            # Summary stats only, to check the record count:
            #temp_df.to_csv(f'df_{idx_data_type}_{idx_report_type}_{iter_season}_summaryOnly.csv',index = False)
            # Pull the other report pages - more specific statistics
            for temp_idx in str_page:
                # Set the report parameter for this category
                idx_report = "report=" + temp_idx + "&"
                i_npage = 0  # reset to the first page
                URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                # Pull the data
                temp_pulled = nhl_pull(URL_team_summary)
                # Each category gets its own partial frame, to be joined to the master frame below
                temp_df_partial = temp_pulled['df']
                npage = temp_pulled['n_pagecount']
                if npage != 0:  # pull the remaining pages
                    for i_npage in range(1, npage + 1):
                        URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                        temp_pulled = nhl_pull(URL_team_summary)  # pull the additional page
                        temp_df_partial = pd.concat([temp_df_partial, temp_pulled['df']])  # stack the pages
                # Clean up the empty rows first, to avoid joining empty-on-empty
                temp_df_partial = temp_df_partial.loc[temp_df_partial.idx_row != '\xa0', :]
                if temp_pulled['df'].size != 0:  # the page has at least one entry
                    if idx_data_type == 'teams':  # merging team statistics
                        if idx_report_type == 'season':
                            temp_df = pd.merge(temp_df, temp_df_partial, how='left', on="Team", suffixes=('_x', '_y'))
                        elif idx_report_type == 'game':
                            temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Team", "Game"], suffixes=('_x', '_y'))
                    else:  # skaters and goalies
                        if idx_report_type == 'season':
                            if temp_idx == 'bios':
                                if idx_data_type == 'skaters':
                                    # Match on several keys, in case two players in the period share a name
                                    temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=['Player Name', 'Player Position', 'Games Played'], suffixes=('', '_y'))
                                else:  # goalies
                                    temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=['Player Name', 'Goalie Catches'], suffixes=('', '_y'))
                            else:
                                temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Player Name", "Teams Played For"], suffixes=('', '_y'))
                        elif idx_report_type == 'game':
                            if temp_idx == 'bios':
                                # No common, reliable key at game level; rather than risk bad merges,
                                # rely on the bio columns already present in the summary file
                                pass
                            else:
                                temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Player Name", 'Game'], suffixes=('', '_y'))
            # End of appending the category data to the summary table ================================
            # Remove redundant columns - duplicates are resolved left to right (the first
            # occurrence is kept), which matches the order of the left joins above
            temp_df = strip_right(temp_df, '_x')
            temp_df = strip_right(temp_df, '_y')
            temp_df = temp_df.loc[:, ~temp_df.columns.duplicated()]
            # Record the game group type (regular season vs. playoff)
            temp_df['season_type'] = iter_game
            try:
                df_fin = pd.concat([df_fin, temp_df])
            except TypeError:  # first successful pull: df_fin is still the empty list
                df_fin = temp_df
        # End of pulling all data for this time period ============================================
        # Save the data for the period, if any rows were pulled
        try:
            df_fin.size  # if nothing was pulled, df_fin is still a list and this raises AttributeError
            df_fin.to_csv(f'df_{idx_data_type}_{idx_report_type}_{iter_date_start}.csv', index=False)
        except AttributeError:  # nothing to save
            pass
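Each call writes one CSV per time period, named df_{data_type}_{report_type}_{period}.csv. A sketch (assuming the files sit in the working directory) for stitching a set of them back together afterwards:
import glob
# Combine all per-season team files into a single frame
files = sorted(glob.glob('df_teams_season_*.csv'))
df_all = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)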
str_data_type = ['teams', 'skaters', 'goalies']
str_aggregate = ['', 'aggregate=0&']
str_report_type = ['season', 'game']
str_datetype = ['season', 'date']
# Data block to pull
idx_datetype = str_datetype[1]  # default is 'season'; 'date' is only used for player-by-game stats
# Test with skaters, per game
idx_data_type = str_data_type[1]
#idx_aggregate = str_aggregate[1]
idx_report_type = str_report_type[1]
str_page = str_page_skater
nhl_pull_loop(str_date_start[697:], str_date_end[697:], str_page, idx_data_type, idx_report_type, idx_datetype)  # the slice resumes part-way through the monthly windows
# Each block below selects a data type (team, skater, goalie), a report level (season or game),
# and the matching page list, then runs the pull. Only game-level player data uses the monthly
# 'date' windows; everything else pulls by season.
# Pull by team, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[0]
idx_report_type = str_report_type[0]
str_page = str_page_team
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by team, by game
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[0]
idx_report_type = str_report_type[1]
str_page = str_page_team
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by skater, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[1]
idx_report_type = str_report_type[0]
str_page = str_page_skater
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by skater, by game - game-level player data, so pull by monthly 'date' windows
idx_datetype = str_datetype[1]
idx_data_type = str_data_type[1]
idx_report_type = str_report_type[1]
str_page = str_page_skater
nhl_pull_loop(str_date_start, str_date_end, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by goalie, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[2]
idx_report_type = str_report_type[0]
str_page = str_page_goalie
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by goalie, by game
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[2]
idx_report_type = str_report_type[1]
str_page = str_page_goalie
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Inspect one season's skater file; collapse duplicate rows per (player, team) pairing
df_examin = pd.read_csv('df_skaters_season_20202021.csv')
df_examin.groupby(['Player Name', 'Teams Played For']).first().reset_index()
| | Player Name | Teams Played For | idx_row | Season | Skater Shoots | Player Position | Games Played | Goals | Assists | Points | ... | Even Strength Time On Ice | Even Strength Time On Ice Per Game Played | Power Play Time On Ice Per Game Played | Shorthanded Time On Ice Per Game Played | Overtime Time on Ice (since 2009-10) | Overtime Time on Ice Per Overtime Game Played (since 2009-10) | Shifts | Time On Ice Per Shift | Shifts Per Game Played (since 1997-98) | season_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A.J. Greer | NJD | 847 | 2020-21 | L | L | 1 | 0 | 0 | 0 | ... | 8:33 | 8:33 | 0:00 | 0:00 | 0:00 | -- | 12 | 0:43 | 12.0 | regular |
| 1 | Aaron Ness | ARI | 807 | 2020-21 | L | D | 1 | 0 | 0 | 0 | ... | 12:50 | 12:50 | 0:00 | 0:00 | 0:00 | 0:00 | 18 | 0:43 | 18.0 | regular |
| 2 | Adam Boqvist | CHI | 332 | 2020-21 | R | D | 35 | 2 | 14 | 16 | ... | 474:10 | 13:33 | 3:23 | 0:04 | 4:00 | 0:40 | 689 | 0:52 | 19.7 | regular |
| 3 | Adam Brooks | TOR | 584 | 2020-21 | L | C | 11 | 4 | 1 | 5 | ... | 111:15 | 10:07 | 0:08 | 0:26 | 0:00 | 0:00 | 171 | 0:41 | 15.5 | regular |
| 4 | Alec Regula | CHI | 889 | 2020-21 | R | D | 3 | 0 | 0 | 0 | ... | 35:31 | 11:50 | 0:32 | 0:00 | 0:00 | 0:00 | 43 | 0:52 | 14.3 | regular |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 432 | Zach Sanford | STL | 322 | 2020-21 | L | L | 52 | 10 | 6 | 16 | ... | 643:08 | 12:22 | 0:57 | 1:35 | 0:00 | 0:00 | 1,018 | 0:46 | 19.6 | regular |
| 433 | Zach Senyshyn | BOS | 852 | 2020-21 | R | R | 8 | 0 | 0 | 0 | ... | 80:34 | 10:04 | 0:01 | 0:02 | 0:00 | 0:00 | 127 | 0:38 | 15.9 | regular |
| 434 | Zach Werenski | CBJ | 264 | 2020-21 | L | D | 35 | 7 | 13 | 20 | ... | 711:14 | 20:19 | 2:10 | 1:52 | 15:12 | 1:23 | 987 | 0:52 | 28.2 | regular |
| 435 | Zach Whitecloud | VGK | 429 | 2020-21 | R | D | 51 | 2 | 10 | 12 | ... | 807:55 | 15:50 | 0:03 | 1:56 | 0:00 | 0:00 | 1,115 | 0:49 | 21.9 | regular |
| 436 | Zack Kassian | EDM | 595 | 2020-21 | R | R | 27 | 2 | 3 | 5 | ... | 314:55 | 11:40 | 0:11 | 0:00 | 0:00 | 0:00 | 400 | 0:48 | 14.8 | regular |

437 rows × 255 columns
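The groupby above collapses any duplicate rows per (player, team) pairing that the repeated left joins can leave behind; a quick way to see how many rows it collapsed (a sketch against the same frame):
# Count rows beyond the first occurrence of each (player, team) pairing
print(df_examin.duplicated(subset=['Player Name', 'Teams Played For']).sum())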