Skip to content

Commit

Permalink
Merge pull request #79 from j-andrews7/revert-72-master
Browse files: browse the repository at this point in the history
Revert "Refactor building URLs with urllib.parse.urlencode"
  • Loading branch information
j-andrews7 authored Nov 22, 2023
2 parents 418b655 + 9c38379 commit 5c395d4
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 141 deletions.
54 changes: 20 additions & 34 deletions kenpompy/conference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from bs4 import BeautifulSoup
import datetime
from io import StringIO
import urllib.parse

def get_valid_conferences(browser, season=None):
"""
Expand All @@ -24,14 +23,10 @@ def get_valid_conferences(browser, season=None):
conference_list (list): List containing all valid conferences for the given season on kenpom.com.
"""

params = {}
url = "https://kenpom.com/conf.php"
params['c'] = 'B10'
if season:
params['y'] = str(season)

url = url + '?' + urllib.parse.urlencode(params)

url = url + '?c=B10'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
table = confs.find_all('table')[-1]
Expand All @@ -56,16 +51,11 @@ def get_aggregate_stats(browser, conf=None, season=None):
Returns:
conference_df (dataframe): Dataframe containing aggregate stats of the conference for the given season on kenpom.com.
"""
params = {}

if conf:
if(conf):
url = "https://kenpom.com/conf.php"
params['c'] = conf
if season:
params['y'] = str(season)

url = url + '?' + urllib.parse.urlencode(params)

url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
#get first table
Expand All @@ -84,9 +74,8 @@ def get_aggregate_stats(browser, conf=None, season=None):
return conf_df
else:
url = "https://kenpom.com/confstats.php"
if season:
params['y'] = str(season)
url = url + '?' + urllib.parse.urlencode(params)
if(season):
url = url + '?y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
#get table
Expand All @@ -110,12 +99,11 @@ def get_standings(browser, conf, season=None):
Returns:
conference_df (dataframe): Dataframe containing standing stats of the conference for the given season on kenpom.com.
"""
params = {}

url = "https://kenpom.com/conf.php"
params['c'] = conf
if season:
params['y'] = str(season)
url = url + '?' + urllib.parse.urlencode(params)
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
table = confs.find_all('table')[0]
Expand Down Expand Up @@ -144,12 +132,10 @@ def get_offense(browser, conf, season=None):
conference_df (dataframe): Dataframe containing offensive stats of the conference for the given season on kenpom.com.
"""

params = {}

url = "https://kenpom.com/conf.php"
params['c'] = conf
if season:
params['y'] = str(season)
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
table = confs.find_all('table')[1]
Expand All @@ -174,11 +160,11 @@ def get_defense(browser, conf, season=None):
Returns:
conference_df (dataframe): Dataframe containing defensive stats of the conference for the given season on kenpom.com.
"""
params = {}

url = "https://kenpom.com/conf.php"
params['c'] = conf
if season:
params['y'] = str(season)
url = url + f'?c={conf}'
if(season):
url = url + '&y=' + str(season)
browser.open(url)
confs = browser.get_current_page()
table = confs.find_all('table')[2]
Expand Down
104 changes: 43 additions & 61 deletions kenpompy/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import urllib.parse
import re

def get_current_season(browser):
Expand All @@ -27,51 +26,45 @@ def get_current_season(browser):
return int(re.match(YEAR_PATTERN, page_title).group(0))

def get_pomeroy_ratings(browser, season=None):
"""
Scrapes the Pomeroy College Basketball Ratings table (https://kenpom.com/index.php) into a dataframe.
Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Most recent season is the default.
Returns:
refs_df (pandas dataframe): Pandas dataframe containing the Pomeroy College Basketball Ratings table from kenpom.com.
Raises:
ValueError: If `season` is less than 2002.
"""
params = {}
url = 'https://kenpom.com/index.php'

if season and int(season) < 2002:
raise ValueError("season cannot be less than 2002")
"""
Scrapes the Pomeroy College Basketball Ratings table (https://kenpom.com/index.php) into a dataframe.
params['y'] = str(season)

url = url + '?' + urllib.parse.urlencode(params)

browser.open(url)
page = browser.get_current_page()
table = page.find_all('table')[0]
ratings_df = pd.read_html(StringIO(str(table)))
# Dataframe tidying.
ratings_df = ratings_df[0]
ratings_df.columns = ratings_df.columns.map(lambda x: x[1])
ratings_df.dropna(inplace=True)
ratings_df = ratings_df[ratings_df['Rk'] != 'Rk']
ratings_df.reset_index(drop=True, inplace=True)
# Parse out seed, most current won't have this
tmp = ratings_df['Team'].str.extract(r'(?P<Team>[a-zA-Z.&\'\s]+(?<!\s))\s*(?P<Seed>\d*)')
ratings_df["Team"] = tmp["Team"]
ratings_df["Seed"] = tmp["Seed"]

# Disambiguate column names for easier reference
ratings_df.columns = ['Rk', 'Team', 'Conf', 'W-L', 'AdjEM', 'AdjO',
'AdjO.Rank', 'AdjD', 'AdjD.Rank', 'AdjT', 'AdjT.Rank',
'Luck', 'Luck.Rank', 'SOS-AdjEM', 'SOS-AdjEM.Rank', 'SOS-OppO', 'SOS-OppO.Rank',
'SOS-OppD', 'SOS-OppD.Rank', 'NCSOS-AdjEM', 'NCSOS-AdjEM.Rank', 'Seed']

return ratings_df
Args:
browser (mechanicalsoup StatefulBrowser): Authenticated browser with full access to kenpom.com generated
by the `login` function.
season (str, optional): Used to define different seasons. 2002 is the earliest available season.
Most recent season is the default.
Returns:
refs_df (pandas dataframe): Pandas dataframe containing the Pomeroy College Basketball Ratings table from kenpom.com.
Raises:
ValueError: If `season` is less than 2002.
"""
url = 'https://kenpom.com/index.php'
if season and int(season) < 2002:
raise ValueError("season cannot be less than 2002")
url += '?y={}'.format(season)
browser.open(url)
page = browser.get_current_page()
table = page.find_all('table')[0]
ratings_df = pd.read_html(StringIO(str(table)))
# Dataframe tidying.
ratings_df = ratings_df[0]
ratings_df.columns = ratings_df.columns.map(lambda x: x[1])
ratings_df.dropna(inplace=True)
ratings_df = ratings_df[ratings_df['Rk'] != 'Rk']
ratings_df.reset_index(drop=True, inplace=True)
# Parse out seed, most current won't have this
tmp = ratings_df['Team'].str.extract(r'(?P<Team>[a-zA-Z.&\'\s]+(?<!\s))\s*(?P<Seed>\d*)')
ratings_df["Team"] = tmp["Team"]
ratings_df["Seed"] = tmp["Seed"]

# Disambiguate column names for easier reference
ratings_df.columns = ['Rk', 'Team', 'Conf', 'W-L', 'AdjEM', 'AdjO',
'AdjO.Rank', 'AdjD', 'AdjD.Rank', 'AdjT', 'AdjT.Rank',
'Luck', 'Luck.Rank', 'SOS-AdjEM', 'SOS-AdjEM.Rank', 'SOS-OppO', 'SOS-OppO.Rank',
'SOS-OppD', 'SOS-OppD.Rank', 'NCSOS-AdjEM', 'NCSOS-AdjEM.Rank', 'Seed']

return ratings_df


def get_trends(browser):
Expand Down Expand Up @@ -117,16 +110,13 @@ def get_refs(browser, season=None):
ValueError: If `season` is less than 2016.
"""

params = {}
url = 'https://kenpom.com/officials.php'

if season:
if int(season) < 2016:
raise ValueError(
'season cannot be less than 2016, as data only goes back that far.')
params['y'] = str(season)

url = url + '?' + urllib.parse.urlencode(params)
url = url + '?y=' + str(season)

browser.open(url)
refs = browser.get_current_page()
Expand Down Expand Up @@ -188,17 +178,13 @@ def get_arenas(browser, season=None):
ValueError: If `season` is less than 2010.
"""

params = {}
url = 'https://kenpom.com/arenas.php'

if season:
if int(season) < 2010:
raise ValueError(
'season cannot be less than 2010, as data only goes back that far.')

params['y'] = str(season)

url = url + '?' + urllib.parse.urlencode(params)
url = url + '?y=' + str(season)

browser.open(url)
arenas = browser.get_current_page()
Expand Down Expand Up @@ -238,8 +224,6 @@ def get_gameattribs(browser, season=None, metric='Excitement'):
KeyError: If `metric` is invalid.
"""

params = {}

# `metric` parameter checking.
metric = metric.upper()
metrics = {'EXCITEMENT': 'Excitement', 'TENSION': 'Tension', 'DOMINANCE': 'Dominance', 'COMEBACK': 'MinWP',
Expand All @@ -249,9 +233,9 @@ def get_gameattribs(browser, season=None, metric='Excitement'):
"""Metric is invalid, must be one of: 'Excitement',
'Tension', 'Dominance', 'ComeBack', 'FanMatch', 'Upsets', and 'Busts'""")
else:
params['s'] = metrics[metric]
met_url = 's=' + metrics[metric]

url = 'https://kenpom.com/game_attrs.php?'
url = 'https://kenpom.com/game_attrs.php?' + met_url

# Season selection and an additional check.
if season:
Expand All @@ -262,9 +246,7 @@ def get_gameattribs(browser, season=None, metric='Excitement'):
raise ValueError(
'FanMatch, Upsets, and Busts tables only available for seasons after 2010.'
)
params['y'] = str(season)

url = url + urllib.parse.urlencode(params)
url = url + '&y=' + str(season)

browser.open(url)
playerstats = browser.get_current_page()
Expand Down
Loading

0 comments on commit 5c395d4

Please sign in to comment.