diff --git a/README.md b/README.md index 3dfe628..d95007a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +# TODO: +* Change the `get_data` method of the `ACSDataSource` class to fetch data dynamically from the Census API instead of downloading it + * [NOTE] Turns out that using the web API is 12-44x slower than downloading the CSV 😭 (run `api_tests.py`) + * [NOTE] This supports a use case where the user only needs the data for one problem, and will not continually + use the data source for many different problems + * Support the household case of `get_data` +* Support CPS data in addition to ACS by making a new subclass of `DataSource`: `CPSDataSource` + * [NOTE] (CPS variables here https://api.census.gov/data/2021/cps/basic/jun/variables.html) + * [NOTE] Folktables only works with Census Bureau microdata, not aggregated data (as I did for 1701X) +* Support multiple years at the same time + [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg?color=g&style=plastic)](https://opensource.org/licenses/MIT) [![pypy: MIT](https://static.pepy.tech/personalized-badge/folktables?period=total&units=international_system&left_color=black&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/folktables) diff --git a/api_tests.py b/api_tests.py new file mode 100644 index 0000000..c468e00 --- /dev/null +++ b/api_tests.py @@ -0,0 +1,21 @@ +from folktables import ACSDataSource, ACSPublicCoverage +import requests +import datetime + +def req(): + start = datetime.datetime.now() + data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person') + acs_data = data_source.get_data(states=["CA"], download=True) + features, label, group = ACSPublicCoverage.df_to_numpy(acs_data) + delta = datetime.datetime.now() - start + print(delta) + +def req_api(): + start = datetime.datetime.now() + resp = 
requests.get('https://api.census.gov/data/2018/acs/acs1/pums?get=AGEP,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,PINCP,ESR,ST,FER,RAC1P,PUBCOV&in=state:06') + delta = datetime.datetime.now() - start + print(delta) + +req() + +req_api() \ No newline at end of file diff --git a/folktables/acs.py b/folktables/acs.py index d5d142b..46fc11b 100644 --- a/folktables/acs.py +++ b/folktables/acs.py @@ -9,13 +9,15 @@ class ACSDataSource(folktables.DataSource): """Data source implementation for ACS PUMS data.""" - def __init__(self, survey_year, horizon, survey, root_dir="data"): + def __init__(self, survey_year, horizon, survey, use_api=False, root_dir="data"): """Create data source around PUMS data for specific year, time horizon, survey type. Args: survey_year: String. Year of ACS PUMS data, e.g., '2018' horizon: String. Must be '1-Year' or '5-Year' survey: String. Must be 'person' or 'household' + use_api: Boolean. Setting to True accesses ACS data from the Census Bureau API, + eliminating the need for local download Returns: ACSDataSource @@ -25,6 +27,7 @@ def __init__(self, survey_year, horizon, survey, root_dir="data"): self._survey_year = survey_year self._horizon = horizon self._survey = survey + self._use_api = use_api self._root_dir = root_dir def get_data(self, states=None, density=1.0, random_seed=0, join_household=False, download=False): @@ -46,7 +49,8 @@ def get_data(self, states=None, density=1.0, random_seed=0, join_household=False horizon=self._horizon, survey='household', serial_filter_list=list(data['SERIALNO']), - download=download) + download=download, + use_api=self._use_api) # We only want to keep the columns in the household dataframe that don't appear in the person # dataframe, but we *do* want to include the SERIALNO column to merge on. 
diff --git a/folktables/load_acs.py b/folktables/load_acs.py index 4c53205..16267e3 100644 --- a/folktables/load_acs.py +++ b/folktables/load_acs.py @@ -83,7 +83,7 @@ def initialize_and_download(datadir, state, year, horizon, survey, download=Fals def load_acs(root_dir, states=None, year=2018, horizon='1-Year', survey='person', density=1, random_seed=1, serial_filter_list=None, - download=False): + download=False, use_api=False): """ Load sample of ACS PUMS data from Census csv files into DataFrame.