failed speedup thru api

N-Masi · Jan 4, 2024 · 9c0362e · 9c0362e
1 parent 731b8d1
commit 9c0362e
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -1,3 +1,14 @@
+# TODO:
+* Change the `get_data` method of the `ACSDataSource` class to fetch data dynamically from the Census API instead of downloading it
+    * [NOTE] Turns out that using the web API is 12-44x slower than downloading the CSV 😭 (run `api_tests.py`)
+    * [NOTE] This supports the use case where the user needs the data for only one problem and will not
+      repeatedly reuse the data source for many different problems
+    * Support the household case of `get_data`
+* Support CPS data in addition to ACS by making a new subclass of `DataSource`: `CPSDataSource`
+    * [NOTE] (CPS variables here https://api.census.gov/data/2021/cps/basic/jun/variables.html)
+    * [NOTE] Folktables only works with Census Bureau microdata, not aggregated data (as I did for 1701X)
+* Support multiple years at the same time
+
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg?color=g&style=plastic)](https://opensource.org/licenses/MIT)
 [![pypy: MIT](https://static.pepy.tech/personalized-badge/folktables?period=total&units=international_system&left_color=black&right_color=brightgreen&left_text=Downloads)](https://pepy.tech/project/folktables)

diff --git a/api_tests.py b/api_tests.py
@@ -0,0 +1,21 @@
+from folktables import ACSDataSource, ACSPublicCoverage
+import requests
+import datetime
+
+def req():
+    """Time the CSV path: load CA person data with download=True and convert it for ACSPublicCoverage.
+
+    Prints the elapsed wall-clock time as a datetime.timedelta.
+    """
+    started_at = datetime.datetime.now()
+    source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
+    ca_frame = source.get_data(states=["CA"], download=True)
+    # The arrays are not used afterwards; the conversion runs so its cost is part of the timing.
+    features, label, group = ACSPublicCoverage.df_to_numpy(ca_frame)
+    print(datetime.datetime.now() - started_at)
+
+def req_api():
+    """Time one Census API PUMS request (2018 ACS 1-Year, state:06) and print the elapsed time.
+
+    The elapsed wall-clock time is printed as a datetime.timedelta.
+
+    Raises:
+        requests.HTTPError: if the API answers with an error status, so that a
+            failed request is never reported as a valid timing.
+    """
+    start = datetime.datetime.now()
+    resp = requests.get('https://api.census.gov/data/2018/acs/acs1/pums?get=AGEP,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,PINCP,ESR,ST,FER,RAC1P,PUBCOV&in=state:06')
+    delta = datetime.datetime.now() - start
+    # Validate after taking the measurement so the check itself is not timed.
+    resp.raise_for_status()
+    print(delta)
+
+# Run the benchmarks only when this file is executed as a script, so that
+# importing it does not trigger a data download and a network request.
+if __name__ == "__main__":
+    req()
+    req_api()
diff --git a/folktables/acs.py b/folktables/acs.py
@@ -9,13 +9,15 @@
 class ACSDataSource(folktables.DataSource):
     """Data source implementation for ACS PUMS data."""
 
-    def __init__(self, survey_year, horizon, survey, root_dir="data"):
+    def __init__(self, survey_year, horizon, survey, use_api=False, root_dir="data"):
         """Create data source around PUMS data for specific year, time horizon, survey type.
 
         Args:
             survey_year: String. Year of ACS PUMS data, e.g., '2018'
             horizon: String. Must be '1-Year' or '5-Year'
             survey: String. Must be 'person' or 'household'
+            use_api: Boolean. Setting to True accesses ACS data from the Census Bureau API,
+                     eliminating the need for local download
 
         Returns:
             ACSDataSource
@@ -25,6 +27,7 @@ def __init__(self, survey_year, horizon, survey, root_dir="data"):
         self._survey_year = survey_year
         self._horizon = horizon
         self._survey = survey
+        self._use_api = use_api
         self._root_dir = root_dir
 
     def get_data(self, states=None, density=1.0, random_seed=0, join_household=False, download=False):
@@ -46,7 +49,8 @@ def get_data(self, states=None, density=1.0, random_seed=0, join_household=False
                                       horizon=self._horizon,
                                       survey='household',
                                       serial_filter_list=list(data['SERIALNO']),
-                                      download=download)
+                                      download=download,
+                                      use_api=self._use_api)
 
             # We only want to keep the columns in the household dataframe that don't appear in the person
             # dataframe, but we *do* want to include the SERIALNO column to merge on.

diff --git a/folktables/load_acs.py b/folktables/load_acs.py
@@ -83,7 +83,7 @@ def initialize_and_download(datadir, state, year, horizon, survey, download=Fals
 def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
              survey='person', density=1, random_seed=1,
              serial_filter_list=None,
-             download=False):
+             download=False, use_api=False):
     """
     Load sample of ACS PUMS data from Census csv files into DataFrame.