From 22f6dc1a66ea8cacc8c9b863ed2508269ffb4db0 Mon Sep 17 00:00:00 2001 From: Andrea Baraldi Date: Mon, 27 Mar 2023 11:37:43 +0200 Subject: [PATCH] Improved load_acs Improved loading function using pandas only instead of StringIO --- folktables/load_acs.py | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/folktables/load_acs.py b/folktables/load_acs.py index dee48d4..4c53205 100644 --- a/folktables/load_acs.py +++ b/folktables/load_acs.py @@ -111,38 +111,15 @@ def load_acs(root_dir, states=None, year=2018, horizon='1-Year', initialize_and_download(base_datadir, state, year, horizon, survey, download=download) ) - sample = io.StringIO() - - first = True - + dtypes = {'PINCP': np.float64, 'RT': str, 'SOCP': str, 'SERIALNO': str, 'NAICSP': str} + df_list = [] for file_name in file_names: - - with open(file_name, 'r') as f: - - if first: - sample.write(next(f)) - first = False - else: - next(f) - - if serial_filter_list is None: - for line in f: - if random.uniform(0, 1) < density: - # strip whitespace found in some early files - sample.write(line.replace(' ','')) - else: - for line in f: - serialno = line.split(',')[1] - if serialno in serial_filter_list: - # strip whitespace found in some early files - sample.write(line.replace(' ','')) - - - sample.seek(0) - - dtypes = {'PINCP' : np.float64, 'RT' : str, 'SOCP' : str, 'SERIALNO' : str, 'NAICSP' : str} - - return pd.read_csv(sample, dtype=dtypes) + df = pd.read_csv(file_name, dtype=dtypes).replace(' ','') + if serial_filter_list is not None: + df = df[df['SERIALNO'].isin(serial_filter_list)] + df_list.append(df) + all_df = pd.concat(df_list) + return all_df def load_definitions(root_dir, year=2018, horizon='1-Year', download=False): @@ -214,4 +191,4 @@ def generate_categories(features, definition_df): del mapping_dict[-99999999999999.0] categories[feature] = mapping_dict - return categories \ No newline at end of file + return categories