Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor for new cloudflare requirements #95

Merged
merged 6 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions kenpompy/FanMatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@

import pandas as pd
from io import StringIO
from cloudscraper import CloudScraper
from bs4 import BeautifulSoup
from typing import Optional
from .utils import get_html

class FanMatch:
"""Object to hold FanMatch page scraping results.

This class scrapes the kenpom FanMatch page when a new instance is created.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
date (str): Date to scrape, in format "YYYY-MM-DD", such as "2020-01-29".

Expand All @@ -31,7 +35,7 @@ class FanMatch:
fm_df (pandas dataframe): Pandas dataframe containing parsed FanMatch table.
"""

def __init__(self, browser, date = None):
def __init__(self, browser: CloudScraper, date: Optional[str]=None):
self.url = 'https://kenpom.com/fanmatch.php'
self.date = date
self.lines_o_night = None
Expand All @@ -48,8 +52,7 @@ def __init__(self, browser, date = None):
if self.date is not None:
self.url = self.url + "?d=" + self.date

browser.open(self.url)
fm = browser.get_current_page()
fm = BeautifulSoup(get_html(browser, self.url), "html.parser")
table = fm.find_all("table")[0]
fm_df = pd.read_html(StringIO(str(table)))
fm_df = fm_df[0]
Expand Down
42 changes: 20 additions & 22 deletions kenpompy/conference.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@

import pandas as pd
from io import StringIO
from cloudscraper import CloudScraper
from bs4 import BeautifulSoup
from typing import Optional
from .utils import get_html

def get_valid_conferences(browser, season=None):
def get_valid_conferences(browser: CloudScraper, season: Optional[str]=None):
"""
Scrapes the conferences (https://kenpom.com/conf.php) into a list.

Args:
browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function
season (str, optional): Used to define different seasons. 2002 is the earliest available season.

Expand All @@ -23,8 +27,7 @@ def get_valid_conferences(browser, season=None):
url = url + '?c=B10'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
table = confs.find_all('table')[-1]
links = table.find_all('a')
conf_list = []
Expand All @@ -34,12 +37,12 @@ def get_valid_conferences(browser, season=None):
return conf_list


def get_aggregate_stats(browser, conf=None, season=None):
def get_aggregate_stats(browser: CloudScraper, conf: Optional[str]=None, season: Optional[str]=None):
"""
Scrapes a given conference's stats (https://kenpom.com/conf.php or https://kenpom.com/confstats.php) into a dataframe.

Args:
browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function
conf (str, optional): conference abbreviation (ie B10, P12). If None, it will grab the table from https://kenpom.com/confstats.php instead of https://kenpom.com/conf.php
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Expand All @@ -52,8 +55,7 @@ def get_aggregate_stats(browser, conf=None, season=None):
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
#get first table
table = confs.find_all('table')[-3]
conf_df = pd.read_html(StringIO(str(table)))[0]
Expand All @@ -72,8 +74,7 @@ def get_aggregate_stats(browser, conf=None, season=None):
url = "https://kenpom.com/confstats.php"
if(season):
url = url + '?y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
#get table
table = confs.find_all('table')[0]
conf_df = pd.read_html(StringIO(str(table)))[0]
Expand All @@ -82,12 +83,12 @@ def get_aggregate_stats(browser, conf=None, season=None):
conf_df.columns = [stat[:-1] + 'Rank' if '.1' in stat else stat for stat in conf_df.columns]
return conf_df

def get_standings(browser, conf, season=None):
def get_standings(browser: CloudScraper, conf: str, season: Optional[str]=None):
"""
Scrapes a given conference's standing stats (https://kenpom.com/conf.php) into a dataframe.

Args:
browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function
conf (str): conference abbreviation (ie B10, P12)
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Expand All @@ -100,8 +101,7 @@ def get_standings(browser, conf, season=None):
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
table = confs.find_all('table')[0]
conf_df = pd.read_html(StringIO(str(table)))[0]
# Parse out seed
Expand All @@ -114,12 +114,12 @@ def get_standings(browser, conf, season=None):
return conf_df


def get_offense(browser, conf, season=None):
def get_offense(browser: CloudScraper, conf: str, season: Optional[str]=None):
"""
Scrapes a given conference's offense only stats (https://kenpom.com/conf.php) into a dataframe.

Args:
browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function
conf (str): conference abbreviation (ie B10, P12)
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Expand All @@ -132,8 +132,7 @@ def get_offense(browser, conf, season=None):
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
table = confs.find_all('table')[1]
conf_df = pd.read_html(StringIO(str(table)))[0]

Expand All @@ -143,12 +142,12 @@ def get_offense(browser, conf, season=None):
return conf_df


def get_defense(browser, conf, season=None):
def get_defense(browser: CloudScraper, conf: str, season: Optional[str]=None):
"""
Scrapes a given conference's defense only stats (https://kenpom.com/conf.php) into a dataframe.

Args:
browser (mechanicalsoul StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function
conf (str): conference abbreviation (ie B10, P12)
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Expand All @@ -161,8 +160,7 @@ def get_defense(browser, conf, season=None):
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
confs = BeautifulSoup(get_html(browser, url), "html.parser")
table = confs.find_all('table')[2]
conf_df = pd.read_html(StringIO(str(table)))[0]

Expand Down
64 changes: 30 additions & 34 deletions kenpompy/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,34 @@
import pandas as pd
from io import StringIO
import re
import mechanicalsoup
from cloudscraper import CloudScraper
from bs4 import BeautifulSoup
from typing import Optional
from .utils import get_html

def get_current_season(browser: mechanicalsoup.StatefulBrowser):
def get_current_season(browser: CloudScraper):
"""
Scrapes the KenPom homepage to get the latest season year that has data published

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.

Returns:
current_season (int): Number corresponding to the last season year that has data published
"""
browser.open('https://kenpom.com/index.php')
page_title = browser.page.select_one('#content-header h2').text
url = 'https://kenpom.com/index.php'
content = BeautifulSoup(get_html(browser, url), "html.parser")
page_title = content.select_one('#content-header h2').text
YEAR_PATTERN = r'^(\d{4})'
return int(re.match(YEAR_PATTERN, page_title).group(0))

def get_pomeroy_ratings(browser, season=None):
def get_pomeroy_ratings(browser: CloudScraper, season: Optional[str]=None):
"""
Scrapes the Pomeroy College Basketball Ratings table (https://kenpom.com/index.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Most recent season is the default.
Expand All @@ -42,8 +46,7 @@ def get_pomeroy_ratings(browser, season=None):
if season and int(season) < 2002:
raise ValueError("season cannot be less than 2002")
url += '?y={}'.format(season)
browser.open(url)
page = browser.get_current_page()
page = BeautifulSoup(get_html(browser, url), "html.parser")
table = page.find_all('table')[0]
ratings_df = pd.read_html(StringIO(str(table)))
# Dataframe tidying.
Expand All @@ -66,12 +69,12 @@ def get_pomeroy_ratings(browser, season=None):
return ratings_df


def get_trends(browser):
def get_trends(browser: CloudScraper):
"""
Scrapes the statistical trends table (https://kenpom.com/trends.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.

Returns:
Expand All @@ -80,8 +83,7 @@ def get_trends(browser):

url = 'https://kenpom.com/trends.php'

browser.open(url)
trends = browser.get_current_page()
trends = BeautifulSoup(get_html(browser, url), "html.parser")
table = trends.find_all('table')[0]
trends_df = pd.read_html(StringIO(str(table)))

Expand All @@ -92,12 +94,12 @@ def get_trends(browser):
return trends_df


def get_refs(browser, season=None):
def get_refs(browser: CloudScraper, season: Optional[str]=None):
"""
Scrapes the officials rankings table (https://kenpom.com/officials.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2016 is the earliest available season.
Most recent season is the default.
Expand All @@ -117,8 +119,7 @@ def get_refs(browser, season=None):
'season cannot be less than 2016, as data only goes back that far.')
url = url + '?y=' + str(season)

browser.open(url)
refs = browser.get_current_page()
refs = BeautifulSoup(get_html(browser, url), "html.parser")
table = refs.find_all('table')[0]
refs_df = pd.read_html(StringIO(str(table)))

Expand All @@ -131,23 +132,21 @@ def get_refs(browser, season=None):
return refs_df


def get_hca(browser):
def get_hca(browser: CloudScraper):
"""
Scrapes the home court advantage table (https://kenpom.com/hca.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2010 is the earliest available season.

Returns:
hca_df (pandas dataframe): Pandas dataframe containing the home court advantage table from kenpom.com.
"""

url = 'https://kenpom.com/hca.php'

browser.open(url)
hca = browser.get_current_page()
hca = BeautifulSoup(get_html(browser, url), "html.parser")
table = hca.find_all('table')[0]
hca_df = pd.read_html(StringIO(str(table)))

Expand All @@ -160,12 +159,12 @@ def get_hca(browser):
return hca_df


def get_arenas(browser, season=None):
def get_arenas(browser: CloudScraper, season: Optional[str]=None):
"""
Scrapes the arenas table (https://kenpom.com/arenas.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2010 is the earliest available season.
Most recent season is the default.
Expand All @@ -185,8 +184,7 @@ def get_arenas(browser, season=None):
'season cannot be less than 2010, as data only goes back that far.')
url = url + '?y=' + str(season)

browser.open(url)
arenas = browser.get_current_page()
arenas = BeautifulSoup(get_html(browser, url), "html.parser")
table = arenas.find_all('table')[0]
arenas_df = pd.read_html(StringIO(str(table)))

Expand All @@ -201,12 +199,12 @@ def get_arenas(browser, season=None):
return arenas_df


def get_gameattribs(browser, season=None, metric='Excitement'):
def get_gameattribs(browser: CloudScraper, season: Optional[str]=None, metric: str='Excitement'):
"""
Scrapes the Game Attributes tables (https://kenpom.com/game_attrs.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2010 is the earliest available season.
Most recent season is the default.
Expand Down Expand Up @@ -247,8 +245,7 @@ def get_gameattribs(browser, season=None, metric='Excitement'):
)
url = url + '&y=' + str(season)

browser.open(url)
playerstats = browser.get_current_page()
playerstats = BeautifulSoup(get_html(browser, url), "html.parser")

table = playerstats.find_all('table')[0]
ga_df = pd.read_html(StringIO(str(table)))
Expand All @@ -263,12 +260,12 @@ def get_gameattribs(browser, season=None, metric='Excitement'):
return ga_df


def get_program_ratings(browser):
def get_program_ratings(browser: CloudScraper):
"""
Scrapes the program ratings table (https://kenpom.com/programs.php) into a dataframe.

Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
browser (CloudScraper): Authenticated browser with full access to kenpom.com generated
by the `login` function.

Returns:
Expand All @@ -277,8 +274,7 @@ def get_program_ratings(browser):

url = 'https://kenpom.com/programs.php'

browser.open(url)
programs = browser.get_current_page()
programs = BeautifulSoup(get_html(browser, url), "html.parser")
table = programs.find_all('table')[0]
programs_df = pd.read_html(StringIO(str(table)))
programs_df = programs_df[0]
Expand Down
Loading
Loading