Apply data science to hockey and broader sports analytics
Resources:
!pip install selenium
!pip install webdriver-manager
import requests
import time
import csv
from datetime import date, datetime

import numpy as np
from numpy import array
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager

# Enforce incognito mode (used by the Chrome option below)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
int_year = 2022  # last year of the season to pull
int_year_start = int_year - 1  # year the season starts
str_season_last = [str(int_year_start) + str(int_year)]
# For data structure pages that have more than 10,000 records, break the pull down by month
str_date_start = [d.strftime('%Y-%m-%d') for d in pd.date_range(start=f"{int_year_start}-08-10", end=f"{int_year}-08-01", freq='MS')]
str_date_end = [d.strftime('%Y-%m-%d') for d in pd.date_range(start=f"{int_year_start}-09-10", end=f"{int_year}-09-01", freq='M')]
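As a quick check of the window construction (a sketch; the values assume pandas' 'MS' month-start and 'M' month-end anchors), the two lists pair one-to-one into month-long scraping windows:
for s, e in list(zip(str_date_start, str_date_end))[:3]:
    print(s, '->', e)
# 2021-09-01 -> 2021-09-30
# 2021-10-01 -> 2021-10-31
# 2021-11-01 -> 2021-11-30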
# Define the basics for the full historical pull
int_year = 2021  # last year of the season to pull
int_year_start = int_year - 1  # year the season starts
vec_season_start = list(map(str, range(1917, int_year_start + 1)))
vec_season_end = list(map(str, range(1918, int_year + 1)))
# Each season is pulled separately; the reversal lets the data be pulled newest-first
str_season = [vec_season_start[i] + vec_season_end[i] for i, val in enumerate(vec_season_start)][::-1]
# For data structure pages that have more than 10,000 records, break the pull down by month
str_date_start = [d.strftime('%Y-%m-%d') for d in pd.date_range(start="1917-11-10", end=f"{int_year}-06-01", freq='MS')]
str_date_end = [d.strftime('%Y-%m-%d') for d in pd.date_range(start="1917-12-10", end=f"{int_year}-07-01", freq='M')]
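The season strings come out in the concatenated format the NHL stats site expects, newest first after the reversal:
str_season[:3]  # ['20202021', '20192020', '20182019']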
str_data_type = ['teams', 'skaters', 'goalies']
#str_aggregate = ['', 'aggregate=0&']  # in case the aggregation parameter becomes a problem in the future
str_report_type = ['season', 'game']
str_datetype = ['season', 'date']
# Report pages other than the summary
str_page_team = ['faceoffpercentages', 'faceoffwins',
                 'goalsagainstbystrength', 'goalsbyperiod', 'goalsforbystrength', 'leadingtrailing', 'outshootoutshotby',
                 'realtime',
                 'penalties', 'penaltykill', 'penaltykilltime', 'powerplay', 'powerplaytime',
                 'summaryshooting', 'percentages', 'scoretrailfirst', 'shootout', 'shottype', 'goalgames']
str_page_skater = ['bios', 'faceoffpercentages', 'faceoffwins', 'goalsForAgainst',
                   'realtime',
                   'penalties', 'penaltykill', #'penaltyShots', # for some reason the penalty-shot page throws errors: the table cannot be grabbed
                   'powerplay', 'puckPossessions',
                   'summaryshooting', 'percentages', 'scoringRates', 'scoringpergame', 'shootout', 'shottype', 'timeonice']
str_page_goalie = ['advanced', 'bios', 'daysrest', 'penaltyShots', 'savesByStrength', 'shootout', 'startedVsRelieved']
# Option A: Firefox (geckodriver fetched by webdriver-manager)
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.delete_all_cookies()  # delete all cookies
driver.implicitly_wait(5)  # seconds
# Option B: Chrome (overrides the Firefox driver above; keep whichever you use)
driver = webdriver.Chrome(executable_path="chromedriver.exe", chrome_options=chrome_options)
driver.delete_all_cookies()  # delete all cookies
driver.implicitly_wait(3)  # seconds
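Note that executable_path and chrome_options are Selenium 3 keyword names. On Selenium 4+ the equivalent uses Service objects and options= (a sketch, assuming webdriver-manager is also installed for Chrome):
# Selenium 4 style - a sketch, not part of the original notebook
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)
driver.delete_all_cookies()
driver.implicitly_wait(3)  # seconds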
# PULLING BY SEASON
def nhl_pull(str_url):
    driver.get(str_url)  # navigate to the URL
    # Wait until the table appears - the JavaScript table can render slower than the rest of the page
    try:
        element = WebDriverWait(driver, 50).until(
            EC.presence_of_element_located((By.CLASS_NAME, "rt-table"))
        )
    finally:
        pass
    time.sleep(2)  # just in case
    # Pull the page source and parse it
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")  # soupify
    # Get the table and its header
    rtheader = soup.find_all("div", {"class": "rt-table"})
    n_pagecount = int(soup.find_all("span", {"class": "-totalPages"})[0].text) - 1
    # NOTE: page numbers are zero-indexed, so n_pagecount is the index of the last page
    tableheader = soup.find_all("div", {"class": "tableHeaderDiv"})[0].find_all("div", {"class": "rt-header-cell"})
    str_titles = ["idx_row"]
    for temp_str in tableheader:
        temp_str_extract = temp_str.get('title')
        if temp_str_extract is not None:  # header cells without a title attribute are skipped
            str_titles.append(temp_str_extract)
    n_title = len(str_titles)
    # Pull the data rows
    table_data = soup.find_all("div", {"class": "rt-tbody"})[0].find_all("div", {"class": "rt-tr-group"})
    ary_data = []
    for idx_count, iter_row in enumerate(table_data):
        each_row = iter_row.find_all("div", {"class": "rt-td"})
        temp_vec = []
        for iter_col in each_row:
            temp_vec.append(iter_col.text)  # save the cell text in order
        if idx_count == 0:  # start building the array
            ary_data = np.array(temp_vec)
        else:  # stack rows vertically
            ary_data = np.vstack((ary_data, np.array(temp_vec)))
    # Convert to a data frame
    # Note: reshape in case the array collapsed to a single row
    df_data = pd.DataFrame(np.reshape(ary_data, (-1, len(str_titles))), columns=str_titles)
    # Pull the total record count
    n_recordcount = int(soup.find_all("span", {"class": "-totalInfo"})[0].text.split()[0])
    # Return the data frame plus the page and record counts, for multi-page pulls
    return {'df': df_data, 'n_pagecount': n_pagecount, 'n_title': n_title, 'n_recordcount': n_recordcount}
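A minimal smoke test of nhl_pull (a sketch with hypothetical parameter values; assumes one of the drivers above is live):
out = nhl_pull("http://www.nhl.com/stats/teams?aggregate=0&reportType=season&"
               "seasonFrom=20202021&seasonTo=20202021&gameType=2&"
               "filter=gamesPlayed,gte,1&page=0&pageSize=100")
print(out['n_recordcount'], out['n_pagecount'])  # total records, index of the last page
out['df'].head()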
def strip_right(df, suffix):
    # Note: str.rstrip() strips a trailing *character set*, not a suffix, so it would also
    # mangle column names that merely end in those characters; remove the exact suffix instead
    df.columns = [col[:-len(suffix)] if col.endswith(suffix) else col for col in df.columns]
    return df
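A quick illustration of the pitfall the exact-suffix version avoids (hypothetical column names):
# 'Index'.rstrip('_x') -> 'Inde', even though 'Index' does not end with '_x'
df_demo = pd.DataFrame(columns=['Index', 'Goals_x'])
print(strip_right(df_demo, '_x').columns.tolist())  # ['Index', 'Goals']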
# Build the stats URL for a given data type, report, period, and page
def url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype):
    URL_team_summary = (f"http://www.nhl.com/stats/"
                        f"{idx_data_type}?aggregate=0&{idx_report}reportType={idx_report_type}&"
                        f"{idx_datetype}From={iter_date_start}&{idx_datetype}To={iter_date_end}&"
                        f"gameType={str_gametype}&filter=gamesPlayed,gte,1&page={i_npage}&pageSize=100")
    # In a previous iteration idx_aggregate == 'aggregate=0&' was a parameter; it is fixed here
    # because the workflow pulls one season at a time.
    return URL_team_summary
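For example, the team season summary for 2020-21 resolves to:
url_team_pull('teams', '', 'season', '20202021', '20202021', 2, 0, 'season')
# 'http://www.nhl.com/stats/teams?aggregate=0&reportType=season&seasonFrom=20202021&seasonTo=20202021&gameType=2&filter=gamesPlayed,gte,1&page=0&pageSize=100'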
def nhl_pull_loop(str_date_start, str_date_end, str_page, idx_data_type, idx_report_type, idx_datetype):
    for idx, iter_date_start in enumerate(str_date_start):
        iter_date_end = str_date_end[idx]
        df_fin = []
        for idx_game, iter_game in enumerate(["regular", "playoff"]):
            # In-loop-specific initial settings
            str_gametype = idx_game + 2  # gameType=2 is the regular season, gameType=3 the playoffs
            i_npage = 0  # start with the first page (pages are zero-indexed)
            idx_report = ''  # start with the summary page
            # Pull the first summary page
            URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
            temp_pulled = nhl_pull(URL_team_summary)
            temp_df = temp_pulled['df']  # initialize the master frame for this period
            npage = temp_pulled['n_pagecount']
            nrecord = temp_pulled['n_recordcount']
            if nrecord == 0:
                continue  # no records for this period/game type; skip to the next one
            # Pull the remaining summary pages, if any
            if npage != 0:
                for i_npage in range(1, npage + 1):  # range() excludes the stop value, so add one
                    URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                    temp_pulled = nhl_pull(URL_team_summary)
                    temp_df = pd.concat([temp_df, temp_pulled['df']])  # stack the pages
            # All summary data pulled; remove the empty (non-breaking-space) rows
            temp_df = temp_df.loc[temp_df.idx_row != '\xa0', :]
            # Summary stats only, to check the record count:
            #temp_df.to_csv(f'df_{idx_data_type}_{idx_report_type}_{iter_season}_summaryOnly.csv',index = False)
            # Pull the other report pages - more specific statistics
            for temp_idx in str_page:
                # Set the report parameter for this category
                idx_report = "report=" + temp_idx + "&"
                i_npage = 0  # reset to the first page
                URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                # Pull the data
                temp_pulled = nhl_pull(URL_team_summary)
                # Each category gets its own partial frame, to be joined to the master frame below
                temp_df_partial = temp_pulled['df']
                npage = temp_pulled['n_pagecount']
                if npage != 0:  # pull the remaining pages
                    for i_npage in range(1, npage + 1):
                        URL_team_summary = url_team_pull(idx_data_type, idx_report, idx_report_type, iter_date_start, iter_date_end, str_gametype, i_npage, idx_datetype)
                        temp_pulled = nhl_pull(URL_team_summary)  # pull the additional page
                        temp_df_partial = pd.concat([temp_df_partial, temp_pulled['df']])  # stack the pages
                # Clean up the empty rows first, to avoid joining empty-on-empty
                temp_df_partial = temp_df_partial.loc[temp_df_partial.idx_row != '\xa0', :]
                if temp_pulled['df'].size != 0:  # the page has at least one entry
                    if idx_data_type == 'teams':  # merging team statistics
                        if idx_report_type == 'season':
                            temp_df = pd.merge(temp_df, temp_df_partial, how='left', on="Team", suffixes=('_x', '_y'))
                        elif idx_report_type == 'game':
                            temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Team", "Game"], suffixes=('_x', '_y'))
                    else:  # skaters and goalies
                        if idx_report_type == 'season':
                            if temp_idx == 'bios':
                                if idx_data_type == 'skaters':
                                    # Match on several keys, in case two players in the period share a name
                                    temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=['Player Name', 'Player Position', 'Games Played'], suffixes=('', '_y'))
                                else:  # goalies
                                    temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=['Player Name', 'Goalie Catches'], suffixes=('', '_y'))
                            else:
                                temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Player Name", "Teams Played For"], suffixes=('', '_y'))
                        elif idx_report_type == 'game':
                            if temp_idx == 'bios':
                                # No common, reliable key at game level; rather than risk bad merges,
                                # rely on the bio columns already present in the summary file
                                pass
                            else:
                                temp_df = pd.merge(temp_df, temp_df_partial, how='left', on=["Player Name", 'Game'], suffixes=('', '_y'))
            # End of appending the category data to the summary table ================================
            # Remove redundant columns - duplicates are resolved left to right (the first
            # occurrence is kept), which matches the order of the left joins above
            temp_df = strip_right(temp_df, '_x')
            temp_df = strip_right(temp_df, '_y')
            temp_df = temp_df.loc[:, ~temp_df.columns.duplicated()]
            # Record the game group type (regular season vs. playoff)
            temp_df['season_type'] = iter_game
            try:
                df_fin = pd.concat([df_fin, temp_df])
            except TypeError:  # first successful pull: df_fin is still the empty list
                df_fin = temp_df
        # End of pulling all data for this time period ============================================
        # Save the data for the period, if any rows were pulled
        try:
            df_fin.size  # if nothing was pulled, df_fin is still a list and this raises AttributeError
            df_fin.to_csv(f'df_{idx_data_type}_{idx_report_type}_{iter_date_start}.csv', index=False)
        except AttributeError:  # nothing to save
            pass
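Each call writes one CSV per time period, named df_{data_type}_{report_type}_{period}.csv. A sketch (assuming the files sit in the working directory) for stitching a set of them back together afterwards:
import glob
# Combine all per-season team files into a single frame
files = sorted(glob.glob('df_teams_season_*.csv'))
df_all = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)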
str_data_type = ['teams', 'skaters', 'goalies']
str_aggregate = ['', 'aggregate=0&']
str_report_type = ['season', 'game']
str_datetype = ['season', 'date']
# Data block to pull
idx_datetype = str_datetype[1]  # default is 'season'; 'date' is only used for player-by-game stats
# Test with skaters, per game
idx_data_type = str_data_type[1]
#idx_aggregate = str_aggregate[1]
idx_report_type = str_report_type[1]
str_page = str_page_skater
nhl_pull_loop(str_date_start[697:], str_date_end[697:], str_page, idx_data_type, idx_report_type, idx_datetype)  # the slice resumes part-way through the monthly windows
# Each block below selects a data type (team, skater, goalie), a report level (season or game),
# and the matching page list, then runs the pull. Only game-level player data uses the monthly
# 'date' windows; everything else pulls by season.
# Pull by team, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[0]
idx_report_type = str_report_type[0]
str_page = str_page_team
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by team, by game
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[0]
idx_report_type = str_report_type[1]
str_page = str_page_team
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by skater, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[1]
idx_report_type = str_report_type[0]
str_page = str_page_skater
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by skater, by game - game-level player data, so pull by monthly 'date' windows
idx_datetype = str_datetype[1]
idx_data_type = str_data_type[1]
idx_report_type = str_report_type[1]
str_page = str_page_skater
nhl_pull_loop(str_date_start, str_date_end, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by goalie, by season
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[2]
idx_report_type = str_report_type[0]
str_page = str_page_goalie
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Pull by goalie, by game
idx_datetype = str_datetype[0]
idx_data_type = str_data_type[2]
idx_report_type = str_report_type[1]
str_page = str_page_goalie
nhl_pull_loop(str_season, str_season, str_page, idx_data_type, idx_report_type, idx_datetype)
# Inspect one season's skater file; collapse duplicate rows per (player, team) pairing
df_examin = pd.read_csv('df_skaters_season_20202021.csv')
df_examin.groupby(['Player Name', 'Teams Played For']).first().reset_index()
| | Player Name | Teams Played For | idx_row | Season | Skater Shoots | Player Position | Games Played | Goals | Assists | Points | ... | Even Strength Time On Ice | Even Strength Time On Ice Per Game Played | Power Play Time On Ice Per Game Played | Shorthanded Time On Ice Per Game Played | Overtime Time on Ice (since 2009-10) | Overtime Time on Ice Per Overtime Game Played (since 2009-10) | Shifts | Time On Ice Per Shift | Shifts Per Game Played (since 1997-98) | season_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A.J. Greer | NJD | 847 | 2020-21 | L | L | 1 | 0 | 0 | 0 | ... | 8:33 | 8:33 | 0:00 | 0:00 | 0:00 | -- | 12 | 0:43 | 12.0 | regular |
| 1 | Aaron Ness | ARI | 807 | 2020-21 | L | D | 1 | 0 | 0 | 0 | ... | 12:50 | 12:50 | 0:00 | 0:00 | 0:00 | 0:00 | 18 | 0:43 | 18.0 | regular |
| 2 | Adam Boqvist | CHI | 332 | 2020-21 | R | D | 35 | 2 | 14 | 16 | ... | 474:10 | 13:33 | 3:23 | 0:04 | 4:00 | 0:40 | 689 | 0:52 | 19.7 | regular |
| 3 | Adam Brooks | TOR | 584 | 2020-21 | L | C | 11 | 4 | 1 | 5 | ... | 111:15 | 10:07 | 0:08 | 0:26 | 0:00 | 0:00 | 171 | 0:41 | 15.5 | regular |
| 4 | Alec Regula | CHI | 889 | 2020-21 | R | D | 3 | 0 | 0 | 0 | ... | 35:31 | 11:50 | 0:32 | 0:00 | 0:00 | 0:00 | 43 | 0:52 | 14.3 | regular |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 432 | Zach Sanford | STL | 322 | 2020-21 | L | L | 52 | 10 | 6 | 16 | ... | 643:08 | 12:22 | 0:57 | 1:35 | 0:00 | 0:00 | 1,018 | 0:46 | 19.6 | regular |
| 433 | Zach Senyshyn | BOS | 852 | 2020-21 | R | R | 8 | 0 | 0 | 0 | ... | 80:34 | 10:04 | 0:01 | 0:02 | 0:00 | 0:00 | 127 | 0:38 | 15.9 | regular |
| 434 | Zach Werenski | CBJ | 264 | 2020-21 | L | D | 35 | 7 | 13 | 20 | ... | 711:14 | 20:19 | 2:10 | 1:52 | 15:12 | 1:23 | 987 | 0:52 | 28.2 | regular |
| 435 | Zach Whitecloud | VGK | 429 | 2020-21 | R | D | 51 | 2 | 10 | 12 | ... | 807:55 | 15:50 | 0:03 | 1:56 | 0:00 | 0:00 | 1,115 | 0:49 | 21.9 | regular |
| 436 | Zack Kassian | EDM | 595 | 2020-21 | R | R | 27 | 2 | 3 | 5 | ... | 314:55 | 11:40 | 0:11 | 0:00 | 0:00 | 0:00 | 400 | 0:48 | 14.8 | regular |

437 rows × 255 columns
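The groupby above collapses any duplicate rows per (player, team) pairing that the repeated left joins can leave behind; a quick way to see how many rows it collapsed (a sketch against the same frame):
# Count rows beyond the first occurrence of each (player, team) pairing
print(df_examin.duplicated(subset=['Player Name', 'Teams Played For']).sum())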