Skip to content

Commit

Permalink
failed speedup thru api
Browse files Browse the repository at this point in the history
  • Loading branch information
N-Masi committed Jan 4, 2024
1 parent 731b8d1 commit 9c0362e
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 3 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
# TODO:
* Change the `get_data` method of the `ACSDataSource` class to fetch data dynamically from the Census API instead of downloading it
* [NOTE] Turns out that using the web API is 12-44x slower than downloading the CSV 😭 (run `api_tests.py`)
* [NOTE] This supports a use case where the user only needs the data for one problem and will not
reuse the data source for many different problems
* Support the household case of `get_data`
* Support CPS data in addition to ACS by making a new subclass of `DataSource`: `CPSDataSource`
* [NOTE] (CPS variables here https://api.census.gov/data/2021/cps/basic/jun/variables.html)
* [NOTE] Folktables only works with Census Bureau microdata, not aggregated data (as I did for 1701X)
* Support multiple years at the same time


[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg?color=g&style=plastic)](https://opensource.org/licenses/MIT)
[![Downloads](https://static.pepy.tech/personalized-badge/folktables?period=total&units=international_system&left_color=black&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/folktables)
Expand Down
21 changes: 21 additions & 0 deletions api_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from folktables import ACSDataSource, ACSPublicCoverage
import requests
import datetime

def req():
    """Time the folktables path for the 2018 1-Year California person data.

    Builds an ``ACSDataSource``, calls ``get_data`` with ``download=True``,
    converts the result with ``ACSPublicCoverage.df_to_numpy`` and prints the
    elapsed time as a ``datetime.timedelta``.

    NOTE(review): whether a network download actually happens on every run
    presumably depends on what is already cached under the data source's
    root directory -- confirm before comparing numbers across runs.
    """
    # Local import keeps this block self-contained; perf_counter is monotonic,
    # unlike datetime.now(), so clock adjustments cannot skew the measurement.
    import time

    start = time.perf_counter()
    data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=["CA"], download=True)
    # The conversion is part of the timed work; its outputs are not needed here.
    ACSPublicCoverage.df_to_numpy(acs_data)
    # Wrap in a timedelta so the printed format matches the previous output.
    delta = datetime.timedelta(seconds=time.perf_counter() - start)
    print(delta)

def req_api(timeout=300):
    """Time a single Census Bureau API request for the same California data.

    Requests the 2018 ACS 1-Year PUMS variables listed in the URL for
    ``state:06`` and prints the elapsed time as a ``datetime.timedelta``.
    Only the HTTP request is timed; the response body is not parsed, so this
    is a lower bound on the API path's cost.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot hang the benchmark.

    Raises:
        requests.HTTPError: If the API answers with an error status; timing
            an error response would not measure a real data fetch.
    """
    # Local import keeps this block self-contained; perf_counter is monotonic,
    # unlike datetime.now(), so clock adjustments cannot skew the measurement.
    import time

    start = time.perf_counter()
    resp = requests.get(
        'https://api.census.gov/data/2018/acs/acs1/pums?get=AGEP,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,PINCP,ESR,ST,FER,RAC1P,PUBCOV&in=state:06',
        timeout=timeout,
    )
    # Wrap in a timedelta so the printed format matches the previous output.
    delta = datetime.timedelta(seconds=time.perf_counter() - start)
    # Fail loudly instead of reporting a timing for a failed request.
    resp.raise_for_status()
    print(delta)

if __name__ == "__main__":
    # Run the benchmarks only when executed as a script, so importing this
    # module does not trigger a data download or a network request.
    req()

    req_api()
8 changes: 6 additions & 2 deletions folktables/acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
class ACSDataSource(folktables.DataSource):
"""Data source implementation for ACS PUMS data."""

def __init__(self, survey_year, horizon, survey, root_dir="data"):
def __init__(self, survey_year, horizon, survey, use_api=False, root_dir="data"):
"""Create data source around PUMS data for specific year, time horizon, survey type.
Args:
survey_year: String. Year of ACS PUMS data, e.g., '2018'
horizon: String. Must be '1-Year' or '5-Year'
survey: String. Must be 'person' or 'household'
use_api: Boolean. Setting to True accesses ACS data from the Census Bureau API,
eliminating the need for local download
Returns:
ACSDataSource
Expand All @@ -25,6 +27,7 @@ def __init__(self, survey_year, horizon, survey, root_dir="data"):
self._survey_year = survey_year
self._horizon = horizon
self._survey = survey
self._use_api = use_api
self._root_dir = root_dir

def get_data(self, states=None, density=1.0, random_seed=0, join_household=False, download=False):
Expand All @@ -46,7 +49,8 @@ def get_data(self, states=None, density=1.0, random_seed=0, join_household=False
horizon=self._horizon,
survey='household',
serial_filter_list=list(data['SERIALNO']),
download=download)
download=download,
use_api=self._use_api)

# We only want to keep the columns in the household dataframe that don't appear in the person
# dataframe, but we *do* want to include the SERIALNO column to merge on.
Expand Down
2 changes: 1 addition & 1 deletion folktables/load_acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def initialize_and_download(datadir, state, year, horizon, survey, download=Fals
def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
survey='person', density=1, random_seed=1,
serial_filter_list=None,
download=False):
download=False, use_api=False):
"""
Load sample of ACS PUMS data from Census csv files into DataFrame.
Expand Down

0 comments on commit 9c0362e

Please sign in to comment.