modin-project · anmyachev · Dec 1, 2020 · Oct 26, 2020 · Dec 1, 2020 · Dec 1, 2020
@@ -313,6 +313,7 @@ def TestReadCSVFixture():
         "test_read_csv_regular",
         "test_read_csv_blank_lines",
         "test_read_csv_yes_no",
+        "test_read_csv_nans",
     ]
     # each xdist worker spawned in separate process with separate namespace and dataset
     pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
@@ -330,6 +331,12 @@ def TestReadCSVFixture():
         filename=pytest.csvs_names["test_read_csv_blank_lines"],
         add_blank_lines=True,
     )
+    # test_read_csv_nans_handling
+    _make_csv_file(filenames)(
+        filename=pytest.csvs_names["test_read_csv_nans"],
+        add_blank_lines=True,
+        additional_col_values=["<NA>", "N/A", "NA", "NULL", "custom_nan", "73"],
+    )
 
     yield
     # Delete csv files that were created
@@ -553,12 +560,6 @@ class TestReadCSV:
     def test_read_csv_delimiters(
         self, make_csv_file, sep, delimiter, decimal, thousands
     ):
-        kwargs = {
-            "delimiter": delimiter,
-            "sep": sep,
-            "decimal": decimal,
-            "thousands": thousands,
-        }
         unique_filename = get_unique_filename()
         make_csv_file(
             filename=unique_filename,
@@ -568,9 +569,13 @@ def test_read_csv_delimiters(
         )
 
         eval_io(
-            filepath_or_buffer=unique_filename,
             fn_name="read_csv",
-            **kwargs,
+            # read_csv kwargs
+            filepath_or_buffer=unique_filename,
+            delimiter=delimiter,
+            sep=sep,
+            decimal=decimal,
+            thousands=thousands,
         )
 
     # Column and Index Locations and Names tests
@@ -603,19 +608,16 @@ def test_read_csv_col_handling(
                 "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
             )
 
-        kwargs = {
-            "header": header,
-            "index_col": index_col,
-            "prefix": prefix,
-            "names": names,
-            "usecols": usecols,
-            "skip_blank_lines": skip_blank_lines,
-        }
-
         eval_io(
-            filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"],
             fn_name="read_csv",
-            **kwargs,
+            # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"],
+            header=header,
+            index_col=index_col,
+            prefix=prefix,
+            names=names,
+            usecols=usecols,
+            skip_blank_lines=skip_blank_lines,
         )
 
     # General Parsing Configuration
@@ -645,28 +647,26 @@ def test_read_csv_parsing_1(
             pytest.xfail(
                 "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
             )
-        kwargs = {
-            "dtype": dtype,
-            "engine": engine,
-            "converters": converters,
-            "skipfooter": skipfooter,
-        }
 
-        if kwargs["dtype"]:
-            kwargs["dtype"] = {
+        if dtype:
+            dtype = {
                 col: "object"
                 for col in pandas.read_csv(
                     pytest.csvs_names["test_read_csv_regular"], nrows=1
                 ).columns
             }
 
         eval_io(
-            filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
             fn_name="read_csv",
             check_exception_type=None,  # issue #2320
             raising_exceptions=None,
             check_kwargs_callable=not callable(converters),
-            **kwargs,
+            # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+            dtype=dtype,
+            engine=engine,
+            converters=converters,
+            skipfooter=skipfooter,
         )
 
     @pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None])
@@ -691,22 +691,20 @@ def test_read_csv_parsing_2(
             pytest.xfail(
                 "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
             )
-        kwargs = {
-            "true_values": true_values,
-            "false_values": false_values,
-            "skiprows": skiprows,
-            "skipfooter": skipfooter,
-            "nrows": nrows,
-            "names": names,
-        }
 
         eval_io(
-            filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
             fn_name="read_csv",
             check_exception_type=None,  # issue #2320
             raising_exceptions=None,
             check_kwargs_callable=not callable(skiprows),
-            **kwargs,
+            # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
+            true_values=true_values,
+            false_values=false_values,
+            skiprows=skiprows,
+            skipfooter=skipfooter,
+            nrows=nrows,
+            names=names,
         )
 
     def test_read_csv_skipinitialspace(self, make_csv_file):
@@ -746,6 +744,31 @@ def test_read_csv_mangle_dupe_cols(self):
         str_non_unique_cols = "col,col,col,col\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n"
         eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True)
 
+    # NA and Missing Data Handling tests
+    @pytest.mark.parametrize("na_values", ["custom_nan", "73"])
+    @pytest.mark.parametrize("keep_default_na", [True, False])
+    @pytest.mark.parametrize("na_filter", [True, False])
+    @pytest.mark.parametrize("verbose", [True, False])
+    @pytest.mark.parametrize("skip_blank_lines", [True, False])
+    def test_read_csv_nans_handling(
+        self, na_values, keep_default_na, na_filter, verbose, skip_blank_lines
+    ):
+        """Exercise ``read_csv`` NA-handling parameters through ``eval_io``.
+
+        Reads the ``test_read_csv_nans`` fixture file, which is created with
+        ``add_blank_lines=True`` and NA-like ``additional_col_values``.
+        """
+        eval_io(
+            fn_name="read_csv",
+            # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
+            na_values=na_values,
+            keep_default_na=keep_default_na,
+            na_filter=na_filter,
+            verbose=verbose,
+            skip_blank_lines=skip_blank_lines,
+        )
+
     # Datetime Handling tests
     @pytest.mark.parametrize(
         "parse_dates",
@@ -792,21 +815,18 @@ def test_read_csv_datetime(
             raising_exceptions = list(io_ops_bad_exc)
             raising_exceptions.remove(TypeError)
 
-        kwargs = {
-            "parse_dates": parse_dates,
-            "infer_datetime_format": infer_datetime_format,
-            "keep_date_col": keep_date_col,
-            "date_parser": date_parser,
-            "dayfirst": dayfirst,
-            "cache_dates": cache_dates,
-        }
-
         eval_io(
-            filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
             fn_name="read_csv",
             check_kwargs_callable=not callable(date_parser),
             raising_exceptions=raising_exceptions,
-            **kwargs,
+            # read_csv kwargs
+            filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
+            parse_dates=parse_dates,
+            infer_datetime_format=infer_datetime_format,
+            keep_date_col=keep_date_col,
+            date_parser=date_parser,
+            dayfirst=dayfirst,
+            cache_dates=cache_dates,
         )