From 22f6dc1a66ea8cacc8c9b863ed2508269ffb4db0 Mon Sep 17 00:00:00 2001
From: Andrea Baraldi <baraldian@gmail.com>
Date: Mon, 27 Mar 2023 11:37:43 +0200
Subject: [PATCH] Improved load_acs

Improved loading function using pandas only instead of StringIO
---
 folktables/load_acs.py | 41 +++++++++--------------------------------
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/folktables/load_acs.py b/folktables/load_acs.py
index dee48d4..4c53205 100644
--- a/folktables/load_acs.py
+++ b/folktables/load_acs.py
@@ -111,38 +111,15 @@ def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
             initialize_and_download(base_datadir, state, year, horizon, survey, download=download)
         )
 
-    sample = io.StringIO()
-
-    first = True
-    
+    dtypes = {'PINCP': np.float64, 'RT': str, 'SOCP': str, 'SERIALNO': str, 'NAICSP': str}
+    df_list = []
     for file_name in file_names:
-      
-        with open(file_name, 'r') as f:
-            
-            if first:
-                sample.write(next(f))
-                first = False
-            else:
-                next(f)
-
-            if serial_filter_list is None:
-                for line in f:
-                    if random.uniform(0, 1) < density:
-                        # strip whitespace found in some early files
-                        sample.write(line.replace(' ',''))
-            else:
-                for line in f:
-                    serialno = line.split(',')[1]
-                    if serialno in serial_filter_list:
-                        # strip whitespace found in some early files
-                        sample.write(line.replace(' ',''))
-
-            
-    sample.seek(0)
-    
-    dtypes = {'PINCP' : np.float64, 'RT' : str, 'SOCP' : str, 'SERIALNO' : str, 'NAICSP' : str}
-                    
-    return pd.read_csv(sample, dtype=dtypes)
+        df = pd.read_csv(file_name, dtype=dtypes).replace(' ','')
+        if serial_filter_list is not None:
+            df = df[df['SERIALNO'].isin(serial_filter_list)]
+        df_list.append(df)
+    all_df = pd.concat(df_list)
+    return all_df
 
 
 def load_definitions(root_dir, year=2018, horizon='1-Year', download=False):
@@ -214,4 +191,4 @@ def generate_categories(features, definition_df):
         del mapping_dict[-99999999999999.0]
 
         categories[feature] = mapping_dict
-    return categories
\ No newline at end of file
+    return categories