From 4f1af1d12967eb2128ecadeb26b69b4380ab6865 Mon Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:12:09 -0400 Subject: [PATCH 1/6] Add cloudscraper and remove mechanicalsoup --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 229629c..739348d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -mechanicalsoup pandas bs4 -sphinx>2.2.0 \ No newline at end of file +sphinx>2.2.0 +cloudscraper From e99e9951499ff82cb1b9971074091e7a2fdfc877 Mon Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:12:41 -0400 Subject: [PATCH 2/6] Refactor login function for cloudscraper and add get_html function --- kenpompy/utils.py | 66 +++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/kenpompy/utils.py b/kenpompy/utils.py index 9e81b02..c40f32c 100644 --- a/kenpompy/utils.py +++ b/kenpompy/utils.py @@ -2,11 +2,10 @@ The utils module provides utility functions, such as logging in. """ -import mechanicalsoup -from requests import Session -from ._DESAdapter import DESAdapter, environment_requires_DES_adapter +import cloudscraper +from cloudscraper import CloudScraper -def login(email, password): +def login(email: str, password: str): """ Logs in to kenpom.com using user credentials. @@ -18,33 +17,44 @@ def login(email, password): browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com. 
""" - # Fix for Cloudflare SSL profiling (https://github.com/j-andrews7/kenpompy/issues/33) provided by Nick Ostendorf (@nickostendorf) - session = Session() - if environment_requires_DES_adapter(): - session.mount('https://kenpom.com/', DESAdapter()) + browser = cloudscraper.create_scraper() + browser.get('https://kenpom.com/index.php') - browser = mechanicalsoup.StatefulBrowser(session) - browser.set_user_agent('Mozilla/5.0') - browser.open('https://kenpom.com/index.php') - - if 'Cloudflare' in browser.page.title.string: - raise Exception( - 'Opening kenpom.com failed - request was intercepted by Cloudflare protection') + form_data = { + 'email': email, + 'password': password, + 'submit': 'Login!', + } # Response page actually throws an error but further navigation works and will show you as logged in. - browser.get_current_page() - browser.select_form('form[action="handlers/login_handler.php"]') - browser['email'] = email - browser['password'] = password - - response = browser.submit_selected() + browser.post( + 'https://kenpom.com/handlers/login_handler.php', + data=form_data, + allow_redirects=True + ) - if response.status_code != 200 or 'PHPSESSID=' not in response.headers['set-cookie']: - raise Exception( - 'Logging in to kenpom.com failed - check that the site is available and your credentials are correct.') - - if 'subscription expired' in str(browser.get('https://kenpom.com/index.php').content): - raise Exception( - 'Logging in to kenpom.com failed - account subscription is expired') + home_page = browser.get('https://kenpom.com/') + if 'Logout' not in home_page.text: + raise Exception('Logging in failed - check your credentials') return browser + +def get_html(browser: CloudScraper, url: str): + """ + Performs a get request on the specified url. + + Args: + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated + by the `login` function. + url (str): The url to perform the get request on. 
+ + Returns: + html (Bytes | Any): The return content. + + Raises: + Exception if get request gets a non-200 response code. + """ + response = browser.get(url) + if response.status_code != 200: + raise Exception(f'Failed to retrieve {url} (status code: {response.status_code})') + return response.content \ No newline at end of file From 8e9fcd7999eac2a3d33b239da550f4c62a452f71 Mon Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:13:02 -0400 Subject: [PATCH 3/6] Refactor for cloudscraper and beautifulsoup --- kenpompy/FanMatch.py | 11 +++++--- kenpompy/conference.py | 42 +++++++++++++-------------- kenpompy/misc.py | 64 ++++++++++++++++++++---------------------- kenpompy/summary.py | 53 +++++++++++++++++----------------- kenpompy/team.py | 27 +++++++++--------- 5 files changed, 96 insertions(+), 101 deletions(-) diff --git a/kenpompy/FanMatch.py b/kenpompy/FanMatch.py index 58a86f9..37f10c1 100644 --- a/kenpompy/FanMatch.py +++ b/kenpompy/FanMatch.py @@ -4,6 +4,10 @@ import pandas as pd from io import StringIO +from cloudscraper import CloudScraper +from bs4 import BeautifulSoup +from typing import Optional +from .utils import get_html class FanMatch: """Object to hold FanMatch page scraping results. @@ -11,7 +15,7 @@ class FanMatch: This class scrapes the kenpom FanMatch page when a new instance is created. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. date (str): Date to scrape, in format "YYYY-MM-DD", such as "2020-01-29". @@ -31,7 +35,7 @@ class FanMatch: fm_df (pandas dataframe): Pandas dataframe containing parsed FanMatch table. 
""" - def __init__(self, browser, date = None): + def __init__(self, browser: CloudScraper, date: Optional[str]=None): self.url = 'https://kenpom.com/fanmatch.php' self.date = date self.lines_o_night = None @@ -48,8 +52,7 @@ def __init__(self, browser, date = None): if self.date is not None: self.url = self.url + "?d=" + self.date - browser.open(self.url) - fm = browser.get_current_page() + fm = BeautifulSoup(get_html(browser, self.url), "html.parser") table = fm.find_all("table")[0] fm_df = pd.read_html(StringIO(str(table))) fm_df = fm_df[0] diff --git a/kenpompy/conference.py b/kenpompy/conference.py index f5a41d7..a17d47a 100644 --- a/kenpompy/conference.py +++ b/kenpompy/conference.py @@ -5,13 +5,17 @@ import pandas as pd from io import StringIO +from cloudscraper import CloudScraper +from bs4 import BeautifulSoup +from typing import Optional +from .utils import get_html -def get_valid_conferences(browser, season=None): +def get_valid_conferences(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the conferences (https://kenpom.com/conf.php) into a list. Args: - browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function season (str, optional): Used to define different seasons. 2002 is the earliest available season. 
@@ -23,8 +27,7 @@ def get_valid_conferences(browser, season=None): url = url + '?c=B10' if(season): url = url + '&y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") table = confs.find_all('table')[-1] links = table.find_all('a') conf_list = [] @@ -34,12 +37,12 @@ def get_valid_conferences(browser, season=None): return conf_list -def get_aggregate_stats(browser, conf=None, season=None): +def get_aggregate_stats(browser: CloudScraper, conf: Optional[str]=None, season: Optional[str]=None): """ Scrapes a given conference's stats (https://kenpom.com/conf.php or https://kenpom.com/confstats.php) into a dataframe. Args: - browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function conf (str: optional): conference abbreviation (ie B10, P12). If None, it will grab the table from https://kenpom.com/confstats.php instead of https://kenpom.com/conf.php season (str, optional): Used to define different seasons. 2002 is the earliest available season. 
@@ -52,8 +55,7 @@ def get_aggregate_stats(browser, conf=None, season=None): url = url + f'?c={conf}' if(season): url = url + '&y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") #get first table table = confs.find_all('table')[-3] conf_df = pd.read_html(StringIO(str(table)))[0] @@ -72,8 +74,7 @@ def get_aggregate_stats(browser, conf=None, season=None): url = "https://kenpom.com/confstats.php" if(season): url = url + '?y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") #get table table = confs.find_all('table')[0] conf_df = pd.read_html(StringIO(str(table)))[0] @@ -82,12 +83,12 @@ def get_aggregate_stats(browser, conf=None, season=None): conf_df.columns = [stat[:-1] + 'Rank' if '.1' in stat else stat for stat in conf_df.columns] return conf_df -def get_standings(browser, conf, season=None): +def get_standings(browser: CloudScraper, conf: str, season: Optional[str]=None): """ Scrapes a given conference's standing stats (https://kenpom.com/conf.php) into a dataframe. Args: - browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function conf (str): conference abbreviation (ie B10, P12) season (str, optional): Used to define different seasons. 2002 is the earliest available season. 
@@ -100,8 +101,7 @@ def get_standings(browser, conf, season=None): url = url + f'?c={conf}' if(season): url = url + '&y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") table = confs.find_all('table')[0] conf_df = pd.read_html(StringIO(str(table)))[0] # Parse out seed @@ -114,12 +114,12 @@ def get_standings(browser, conf, season=None): return conf_df -def get_offense(browser, conf, season=None): +def get_offense(browser: CloudScraper, conf: str, season: Optional[str]=None): """ Scrapes a given conference's offense only stats (https://kenpom.com/conf.php) into a dataframe. Args: - browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function conf (str): conference abbreviation (ie B10, P12) season (str, optional): Used to define different seasons. 2002 is the earliest available season. @@ -132,8 +132,7 @@ def get_offense(browser, conf, season=None): url = url + f'?c={conf}' if(season): url = url + '&y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") table = confs.find_all('table')[1] conf_df = pd.read_html(StringIO(str(table)))[0] @@ -143,12 +142,12 @@ def get_offense(browser, conf, season=None): return conf_df -def get_defense(browser, conf, season=None): +def get_defense(browser: CloudScraper, conf: str, season: Optional[str]=None): """ Scrapes a given conference's defense only stats (https://kenpom.com) into a dataframe. 
Args: - browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function conf (str): conference abbreviation (ie B10, P12) season (str, optional): Used to define different seasons. 2002 is the earliest available season. @@ -161,8 +160,7 @@ def get_defense(browser, conf, season=None): url = url + f'?c={conf}' if(season): url = url + '&y=' + str(season) - browser.open(url) - confs = browser.get_current_page() + confs = BeautifulSoup(get_html(browser, url), "html.parser") table = confs.find_all('table')[2] conf_df = pd.read_html(StringIO(str(table)))[0] diff --git a/kenpompy/misc.py b/kenpompy/misc.py index f460221..7aef1f9 100644 --- a/kenpompy/misc.py +++ b/kenpompy/misc.py @@ -6,30 +6,34 @@ import pandas as pd from io import StringIO import re -import mechanicalsoup +from cloudscraper import CloudScraper +from bs4 import BeautifulSoup +from typing import Optional +from .utils import get_html -def get_current_season(browser: mechanicalsoup.StatefulBrowser): +def get_current_season(browser: CloudScraper): """ Scrapes the KenPom homepage to get the latest season year that has data published Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. 
Returns: current_season (int): Number corresponding to the last season year that has data published """ - browser.open('https://kenpom.com/index.php') - page_title = browser.page.select_one('#content-header h2').text + url = 'https://kenpom.com/index.php' + content = BeautifulSoup(get_html(browser, url), "html.parser") + page_title = content.select_one('#content-header h2').text YEAR_PATTERN = r'^(\d{4})' return int(re.match(YEAR_PATTERN, page_title).group(0)) -def get_pomeroy_ratings(browser, season=None): +def get_pomeroy_ratings(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the Pomeroy College Basketball Ratings table (https://kenpom.com/index.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2002 is the earliest available season. Most recent season is the default. @@ -42,8 +46,7 @@ def get_pomeroy_ratings(browser, season=None): if season and int(season) < 2002: raise ValueError("season cannot be less than 2002") url += '?y={}'.format(season) - browser.open(url) - page = browser.get_current_page() + page = BeautifulSoup(get_html(browser, url), "html.parser") table = page.find_all('table')[0] ratings_df = pd.read_html(StringIO(str(table))) # Dataframe tidying. @@ -66,12 +69,12 @@ def get_pomeroy_ratings(browser, season=None): return ratings_df -def get_trends(browser): +def get_trends(browser: CloudScraper): """ Scrapes the statistical trends table (https://kenpom.com/trends.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. 
Returns: @@ -80,8 +83,7 @@ def get_trends(browser): url = 'https://kenpom.com/trends.php' - browser.open(url) - trends = browser.get_current_page() + trends = BeautifulSoup(get_html(browser, url), "html.parser") table = trends.find_all('table')[0] trends_df = pd.read_html(StringIO(str(table))) @@ -92,12 +94,12 @@ def get_trends(browser): return trends_df -def get_refs(browser, season=None): +def get_refs(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the officials rankings table (https://kenpom.com/officials.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2016 is the earliest available season. Most recent season is the default. @@ -117,8 +119,7 @@ def get_refs(browser, season=None): 'season cannot be less than 2016, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - refs = browser.get_current_page() + refs = BeautifulSoup(get_html(browser, url), "html.parser") table = refs.find_all('table')[0] refs_df = pd.read_html(StringIO(str(table))) @@ -131,14 +132,13 @@ def get_refs(browser, season=None): return refs_df -def get_hca(browser): +def get_hca(browser: CloudScraper): """ Scrapes the home court advantage table (https://kenpom.com/hca.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. - season (str, optional): Used to define different seasons. 2010 is the earliest available season. Returns: hca_df (pandas dataframe): Pandas dataframe containing the home court advantage table from kenpom.com. 
@@ -146,8 +146,7 @@ def get_hca(browser): url = 'https://kenpom.com/hca.php' - browser.open(url) - hca = browser.get_current_page() + hca = BeautifulSoup(get_html(browser, url), "html.parser") table = hca.find_all('table')[0] hca_df = pd.read_html(StringIO(str(table))) @@ -160,12 +159,12 @@ def get_hca(browser): return hca_df -def get_arenas(browser, season=None): +def get_arenas(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the arenas table (https://kenpom.com/arenas.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2010 is the earliest available season. Most recent season is the default. @@ -185,8 +184,7 @@ def get_arenas(browser, season=None): 'season cannot be less than 2010, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - arenas = browser.get_current_page() + arenas = BeautifulSoup(get_html(browser, url), "html.parser") table = arenas.find_all('table')[0] arenas_df = pd.read_html(StringIO(str(table))) @@ -201,12 +199,12 @@ def get_arenas(browser, season=None): return arenas_df -def get_gameattribs(browser, season=None, metric='Excitement'): +def get_gameattribs(browser: CloudScraper, season: Optional[str]=None, metric: str='Excitement'): """ Scrapes the Game Attributes tables (https://kenpom.com/game_attrs.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2010 is the earliest available season. Most recent season is the default. 
@@ -247,8 +245,7 @@ def get_gameattribs(browser, season=None, metric='Excitement'): ) url = url + '&y=' + str(season) - browser.open(url) - playerstats = browser.get_current_page() + playerstats = BeautifulSoup(get_html(browser, url), "html.parser") table = playerstats.find_all('table')[0] ga_df = pd.read_html(StringIO(str(table))) @@ -263,12 +260,12 @@ def get_gameattribs(browser, season=None, metric='Excitement'): return ga_df -def get_program_ratings(browser): +def get_program_ratings(browser: CloudScraper): """ Scrapes the program ratings table (https://kenpom.com/programs.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. Returns: @@ -277,8 +274,7 @@ def get_program_ratings(browser): url = 'https://kenpom.com/programs.php' - browser.open(url) - programs = browser.get_current_page() + programs = BeautifulSoup(get_html(browser, url), "html.parser") table = programs.find_all('table')[0] programs_df = pd.read_html(StringIO(str(table))) programs_df = programs_df[0] diff --git a/kenpompy/summary.py b/kenpompy/summary.py index 21c5721..d7c4cf1 100644 --- a/kenpompy/summary.py +++ b/kenpompy/summary.py @@ -6,13 +6,17 @@ import pandas as pd import re from io import StringIO +from cloudscraper import CloudScraper +from bs4 import BeautifulSoup +from typing import Optional +from .utils import get_html -def get_efficiency(browser, season=None): +def get_efficiency(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the Efficiency stats table (https://kenpom.com/summary.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. 
season (str, optional): Used to define different seasons. 2002 is the earliest available season but possession length data wasn't available until 2010. Most recent season is the default. @@ -32,8 +36,7 @@ def get_efficiency(browser, season=None): 'season cannot be less than 2002, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - eff = browser.get_current_page() + eff = BeautifulSoup(get_html(browser, url), "html.parser") table = eff.find_all('table')[0] eff_df = pd.read_html(StringIO(str(table))) @@ -65,12 +68,12 @@ def get_efficiency(browser, season=None): return eff_df -def get_fourfactors(browser, season=None): +def get_fourfactors(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the Four Factors table (https://kenpom.com/stats.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2002 is the earliest available season. Most recent season is the default. @@ -90,8 +93,7 @@ def get_fourfactors(browser, season=None): 'season cannot be less than 2002, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - ff = browser.get_current_page() + ff = BeautifulSoup(get_html(browser, url), "html.parser") table = ff.find_all('table')[0] ff_df = pd.read_html(StringIO(str(table))) @@ -113,12 +115,12 @@ def get_fourfactors(browser, season=None): return ff_df -def get_teamstats(browser, defense=False, season=None): +def get_teamstats(browser: CloudScraper, defense: Optional[bool]=False, season: Optional[str]=None): """ Scrapes the Miscellaneous Team Stats table (https://kenpom.com/teamstats.php) into a dataframe. 
Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. defense (bool, optional): Used to flag whether the defensive teamstats table is wanted or not. False by default. @@ -148,8 +150,7 @@ def get_teamstats(browser, defense=False, season=None): url = url + '?od=d' last_cols = ['AdjDE', 'AdjDE.Rank'] - browser.open(url) - ts = browser.get_current_page() + ts = BeautifulSoup(get_html(browser, url), "html.parser") table = ts.find_all('table')[0] ts_df = pd.read_html(StringIO(str(table))) @@ -170,12 +171,12 @@ def get_teamstats(browser, defense=False, season=None): return ts_df -def get_pointdist(browser, season=None): +def get_pointdist(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the Team Points Distribution table (https://kenpom.com/pointdist.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2002 is the earliest available season. Most recent season is the default. @@ -196,8 +197,7 @@ def get_pointdist(browser, season=None): 'season cannot be less than 2002, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - dist = browser.get_current_page() + dist = BeautifulSoup(get_html(browser, url), "html.parser") table = dist.find_all('table')[0] dist_df = pd.read_html(StringIO(str(table))) @@ -217,12 +217,12 @@ def get_pointdist(browser, season=None): return dist_df -def get_height(browser, season=None): +def get_height(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the Height/Experience table (https://kenpom.com/height.php) into a dataframe. 
Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2007 is the earliest available season but continuity data wasn't available until 2008. Most recent season is the default. @@ -242,8 +242,7 @@ def get_height(browser, season=None): 'Season cannot be less than 2007, as data only goes back that far.') url = url + '?y=' + str(season) - browser.open(url) - height = browser.get_current_page() + height = BeautifulSoup(get_html(browser, url), "html.parser") table = height.find_all('table')[0] h_df = pd.read_html(StringIO(str(table))) @@ -274,12 +273,12 @@ def get_height(browser, season=None): return h_df -def get_playerstats(browser, season=None, metric='EFG', conf=None, conf_only=False): +def get_playerstats(browser: CloudScraper, season: Optional[str]=None, metric: str='EFG', conf: Optional[str]=None, conf_only: bool=False): """ Scrapes the Player Leaders tables (https://kenpom.com/playerstats.php) into a dataframe. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2004 is the earliest available season. Most recent season is the default. 
@@ -338,8 +337,7 @@ def get_playerstats(browser, season=None, metric='EFG', conf=None, conf_only=Fal if conf: url = url + '&f=' + conf - browser.open(url) - playerstats = browser.get_current_page() + playerstats = BeautifulSoup(get_html(browser, url), "html.parser") if metric == 'ORTG': ps_dfs = [] tables = playerstats.find_all('table') @@ -380,12 +378,12 @@ def get_playerstats(browser, season=None, metric='EFG', conf=None, conf_only=Fal return ps_df -def get_kpoy(browser, season=None): +def get_kpoy(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the kenpom Player of the Year tables (https://kenpom.com/kpoy.php) into dataframes. Args: - browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. season (str, optional): Used to define different seasons. 2011 is the earliest available season. Most recent season is the default. @@ -410,8 +408,7 @@ def get_kpoy(browser, season=None): else: season = 2013 - browser.open(url) - kpoy = browser.get_current_page() + kpoy = BeautifulSoup(get_html(browser, url), "html.parser") table = kpoy.find_all('table')[0] df = pd.read_html(StringIO(str(table))) diff --git a/kenpompy/team.py b/kenpompy/team.py index 651e1d6..2a32f9f 100644 --- a/kenpompy/team.py +++ b/kenpompy/team.py @@ -7,15 +7,18 @@ from io import StringIO from .misc import get_current_season import re +from cloudscraper import CloudScraper from bs4 import BeautifulSoup from codecs import encode, decode +from typing import Optional +from .utils import get_html -def get_valid_teams(browser, season=None): +def get_valid_teams(browser: CloudScraper, season: Optional[str]=None): """ Scrapes the teams (https://kenpom.com) into a list. 
Args: - browser (mechanicalsoup.StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function season (str, optional): Used to define different seasons. 2002 is the earliest available season. @@ -26,8 +29,7 @@ def get_valid_teams(browser, season=None): url = "https://kenpom.com" url = url + '?y=' + str(season) - browser.open(url) - teams = browser.get_current_page() + teams = BeautifulSoup(get_html(browser, url), "html.parser") table = teams.find_all('table')[0] team_df = pd.read_html(StringIO(str(table))) # Get only the team column. @@ -42,14 +44,14 @@ def get_valid_teams(browser, season=None): return team_list -def get_schedule(browser, team=None, season=None): +def get_schedule(browser: CloudScraper, team: str, season: Optional[str]=None): """ Scrapes a team's schedule from (https://kenpom.com/team.php) into a dataframe. Args: - browser (mechanicalsoup.StatefulBrowser): Authenticated browser with full access to kenpom.com generated + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function - team: Used to determine which team to scrape for schedule. + team (str): Used to determine which team to scrape for schedule. season (str, optional): Used to define different seasons. 2002 is the earliest available season. 
Returns: @@ -84,8 +86,7 @@ def get_schedule(browser, team=None, season=None): url = url + "?team=" + str(team) url = url + "&y=" + str(season) - browser.open(url) - schedule = browser.get_current_page() + schedule = BeautifulSoup(get_html(browser, url), "html.parser") table = schedule.find_all('table')[1] schedule_df = pd.read_html(StringIO(str(table))) @@ -119,12 +120,12 @@ def get_schedule(browser, team=None, season=None): return schedule_df.reset_index(drop=True) -def get_scouting_report(browser, team=None, season=None, conference_only=False): +def get_scouting_report(browser: CloudScraper, team: str, season: Optional[int]=None, conference_only: bool=False): """ Retrieves and parses team scouting report data from (https://kenpom.com/team.php) into a dictionary. Args: - browser (mechanicalsoup.StatefulBrowser): The mechanize browser object for web scraping. + browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function. team (str): team: Used to determine which team to scrape for schedule. season (int, optional): Used to define different seasons. 2002 is the earliest available season. conference_only (bool, optional): When True, only conference-related stats are retrieved; otherwise, all stats are fetched. 
@@ -161,8 +162,8 @@ def get_scouting_report(browser, team=None, season=None, conference_only=False): url = url + "?team=" + str(team) url = url + "&y=" + str(season) - browser.open(url) - scouting_report_scripts = browser.page.find("script", { "type": "text/javascript", "src": ""} ) + report = BeautifulSoup(get_html(browser, url), "html.parser") + scouting_report_scripts = report.find("script", { "type": "text/javascript", "src": ""} ) extraction_pattern = re.compile(r"\$\(\"td#(?P<token>[A-Za-z0-9]+)\"\)\.html\(\"(.+)\"\);") if conference_only: From 67319af73e1ada0912327309f2fbe83a8220a529 Mon Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:36:28 -0400 Subject: [PATCH 4/6] Update AdjO to ORtg --- tests/test_conference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_conference.py b/tests/test_conference.py index dab4992..b3c2e33 100644 --- a/tests/test_conference.py +++ b/tests/test_conference.py @@ -67,8 +67,8 @@ def test_get_standings(browser): confs_2021 = kpconf.get_standings(browser, 'B10', season = '2021') assert confs_2021.iloc[0, :]['Team'] == expectedTeam1 - assert confs_2021.iloc[0, :]['AdjO'] == expectedTeam1AdjO - assert confs_2021.iloc[0, :]['AdjO.Rank'] == expectedTeam1AdjORank + assert confs_2021.iloc[0, :]['ORtg'] == expectedTeam1AdjO + assert confs_2021.iloc[0, :]['ORtg.Rank'] == expectedTeam1AdjORank expectedTeam1 = 'Wisconsin' expectedTeam1AdjO = 114.4 @@ -76,8 +76,8 @@ def test_get_standings(browser): confs_2003 = kpconf.get_standings(browser, 'B10', season = '2003') assert confs_2003.iloc[0, :]['Team'] == expectedTeam1 - assert confs_2003.iloc[0, :]['AdjO'] == expectedTeam1AdjO - assert confs_2003.iloc[0, :]['AdjO.Rank'] == expectedTeam1AdjORank + assert confs_2003.iloc[0, :]['ORtg'] == expectedTeam1AdjO + assert confs_2003.iloc[0, :]['ORtg.Rank'] == expectedTeam1AdjORank def test_get_offense(browser): From 4fe156a69406a8fc23e7057edc0d3a7423084294 Mon 
Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:45:10 -0400 Subject: [PATCH 5/6] Restore default None for team arg --- kenpompy/team.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kenpompy/team.py b/kenpompy/team.py index 2a32f9f..005197d 100644 --- a/kenpompy/team.py +++ b/kenpompy/team.py @@ -44,14 +44,14 @@ def get_valid_teams(browser: CloudScraper, season: Optional[str]=None): return team_list -def get_schedule(browser: CloudScraper, team: str, season: Optional[str]=None): +def get_schedule(browser: CloudScraper, team: Optional[str]=None, season: Optional[str]=None): """ Scrapes a team's schedule from (https://kenpom.com/team.php) into a dataframe. Args: browser (CloudScraper): Authenticated browser with full access to kenpom.com generated by the `login` function - team (str): Used to determine which team to scrape for schedule. + team (str, optional): Used to determine which team to scrape for schedule. season (str, optional): Used to define different seasons. 2002 is the earliest available season. Returns: From e01aa17d4ebd5a34902ddc07711949e79cbb5874 Mon Sep 17 00:00:00 2001 From: Sean Kim <33474168+seankim658@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:46:18 -0400 Subject: [PATCH 6/6] Fix expected for cloudscraper instead of stateful browser --- tests/test_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_misc.py b/tests/test_misc.py index 4fe91f9..196beed 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -89,5 +89,5 @@ def test_get_gameattribs(browser): def test_get_program_ratings(browser): df = kpmisc.get_program_ratings(browser) - expected = (len(browser.page.select("tr:not(:has(th))")), 17) + expected = (362, 17) assert df.shape == expected \ No newline at end of file