From 6ea7711f290786a8df9e2e1b96f085791727128f Mon Sep 17 00:00:00 2001 From: Alexander Myskov Date: Mon, 26 Oct 2020 05:19:28 -0500 Subject: [PATCH 1/4] TEST-#2291: add NAs handling tests Signed-off-by: Alexander Myskov --- modin/pandas/test/test_io.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 910e10bf0d6..b4eebaf5ca1 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -746,6 +746,40 @@ def test_read_csv_mangle_dupe_cols(self): str_non_unique_cols = "col,col,col,col\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n" eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True) + # NA and Missing Data Handling tests + @pytest.mark.parametrize("na_values", ["custom_nan", "73"]) + @pytest.mark.parametrize("keep_default_na", [True, False]) + @pytest.mark.parametrize("na_filter", [True, False]) + @pytest.mark.parametrize("verbose", [True, False]) + @pytest.mark.parametrize("skip_blank_lines", [True, False]) + def test_read_csv_na_handling( + self, + make_csv_file, + na_values, + keep_default_na, + na_filter, + verbose, + skip_blank_lines, + ): + kwargs = { + "na_values": na_values, + "keep_default_na": keep_default_na, + "na_filter": na_filter, + "verbose": verbose, + "skip_blank_lines": skip_blank_lines, + } + unique_name = get_unique_filename("test_read_csv_na_handling", kwargs) + make_csv_file( + filename=unique_name, + add_blank_lines=True, + additional_col_values=["", "N/A", "NA", "NULL", "custom_nan", "73"], + ) + eval_io( + filepath_or_buffer=unique_name, + fn_name="read_csv", + **kwargs, + ) + # Datetime Handling tests @pytest.mark.parametrize( "parse_dates", From 1851494f5b79582f0883da7b203e8907b02b15f3 Mon Sep 17 00:00:00 2001 From: Alexander Myskov Date: Tue, 1 Dec 2020 05:55:27 -0600 Subject: [PATCH 2/4] TEST-#2291: use class scope fixture Signed-off-by: Alexander Myskov --- modin/pandas/test/test_io.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index b4eebaf5ca1..22b0177d451 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -313,6 +313,7 @@ def TestReadCSVFixture(): "test_read_csv_regular", "test_read_csv_blank_lines", "test_read_csv_yes_no", + "test_read_csv_nans", ] # each xdist worker spawned in separate process with separate namespace and dataset pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids} @@ -330,6 +331,12 @@ def TestReadCSVFixture(): filename=pytest.csvs_names["test_read_csv_blank_lines"], add_blank_lines=True, ) + # test_read_csv_nans_handling + _make_csv_file(filenames)( + filename=pytest.csvs_names["test_read_csv_nans"], + add_blank_lines=True, + additional_col_values=["", "N/A", "NA", "NULL", "custom_nan", "73"], + ) yield # Delete csv files that were created @@ -752,9 +759,8 @@ def test_read_csv_mangle_dupe_cols(self): @pytest.mark.parametrize("na_filter", [True, False]) @pytest.mark.parametrize("verbose", [True, False]) @pytest.mark.parametrize("skip_blank_lines", [True, False]) - def test_read_csv_na_handling( + def test_read_csv_nans_handling( self, - make_csv_file, na_values, keep_default_na, na_filter, @@ -768,14 +774,8 @@ def test_read_csv_na_handling( "verbose": verbose, "skip_blank_lines": skip_blank_lines, } - unique_name = get_unique_filename("test_read_csv_na_handling", kwargs) - make_csv_file( - filename=unique_name, - add_blank_lines=True, - additional_col_values=["", "N/A", "NA", "NULL", "custom_nan", "73"], - ) eval_io( - filepath_or_buffer=unique_name, + filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"], fn_name="read_csv", **kwargs, ) From de909c3caf0e8df0810512be79856e9b64eeb3b3 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Tue, 1 Dec 2020 16:32:35 +0300 Subject: [PATCH 3/4] TEST-#2291: Update modin/pandas/test/test_io.py Co-authored-by: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Signed-off-by: Alexander Myskov --- modin/pandas/test/test_io.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 22b0177d451..a866eb7ab5d 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -777,7 +777,12 @@ def test_read_csv_nans_handling( eval_io( filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"], fn_name="read_csv", - **kwargs, + # read_csv kwargs + na_values=na_values, + keep_default_na=keep_default_na, + na_filter=na_filter, + verbose=verbose, + skip_blank_lines=skip_blank_lines, ) # Datetime Handling tests From d2d269fc869b4f710c80abd0adc015b1f08755ea Mon Sep 17 00:00:00 2001 From: Alexander Myskov Date: Tue, 1 Dec 2020 07:51:49 -0600 Subject: [PATCH 4/4] TEST-#2291: minor refactoring Signed-off-by: Alexander Myskov --- modin/pandas/test/test_io.py | 95 +++++++++++++++--------------------- 1 file changed, 38 insertions(+), 57 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index a866eb7ab5d..8f67858f311 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -560,12 +560,6 @@ class TestReadCSV: def test_read_csv_delimiters( self, make_csv_file, sep, delimiter, decimal, thousands ): - kwargs = { - "delimiter": delimiter, - "sep": sep, - "decimal": decimal, - "thousands": thousands, - } unique_filename = get_unique_filename() make_csv_file( filename=unique_filename, @@ -575,9 +569,13 @@ def test_read_csv_delimiters( ) eval_io( - filepath_or_buffer=unique_filename, fn_name="read_csv", - **kwargs, + # read_csv kwargs + filepath_or_buffer=unique_filename, + delimiter=delimiter, + sep=sep, + decimal=decimal, + thousands=thousands, ) # Column and Index Locations and Names tests @@ -610,19 +608,16 @@ def test_read_csv_col_handling( "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" ) - kwargs = { - "header": header, - "index_col": index_col, - "prefix": prefix, - "names": names, - "usecols": usecols, - "skip_blank_lines": skip_blank_lines, - } - eval_io( - filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"], fn_name="read_csv", - **kwargs, + # read_csv kwargs + filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"], + header=header, + index_col=index_col, + prefix=prefix, + names=names, + usecols=usecols, + skip_blank_lines=skip_blank_lines, ) # General Parsing Configuration @@ -652,15 +647,9 @@ def test_read_csv_parsing_1( pytest.xfail( "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" ) - kwargs = { - "dtype": dtype, - "engine": engine, - "converters": converters, - "skipfooter": skipfooter, - } - if kwargs["dtype"]: - kwargs["dtype"] = { + if dtype: + dtype = { col: "object" for col in pandas.read_csv( pytest.csvs_names["test_read_csv_regular"], nrows=1 @@ -668,12 +657,16 @@ def test_read_csv_parsing_1( } eval_io( - filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], fn_name="read_csv", check_exception_type=None, # issue #2320 raising_exceptions=None, check_kwargs_callable=not callable(converters), - **kwargs, + # read_csv kwargs + filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], + dtype=dtype, + engine=engine, + converters=converters, + skipfooter=skipfooter, ) @pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None]) @@ -698,22 +691,20 @@ def test_read_csv_parsing_2( pytest.xfail( "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" ) - kwargs = { - "true_values": true_values, - "false_values": false_values, - "skiprows": skiprows, - "skipfooter": skipfooter, - "nrows": nrows, - "names": names, - } eval_io( - filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"], fn_name="read_csv", check_exception_type=None, # issue #2320 raising_exceptions=None, check_kwargs_callable=not callable(skiprows), - **kwargs, + # read_csv kwargs + filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"], + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + skipfooter=skipfooter, + nrows=nrows, + names=names, ) def test_read_csv_skipinitialspace(self, make_csv_file): @@ -767,13 +758,6 @@ def test_read_csv_nans_handling( verbose, skip_blank_lines, ): - kwargs = { - "na_values": na_values, - "keep_default_na": keep_default_na, - "na_filter": na_filter, - "verbose": verbose, - "skip_blank_lines": skip_blank_lines, - } eval_io( filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"], fn_name="read_csv", @@ -831,21 +815,18 @@ def test_read_csv_datetime( raising_exceptions = list(io_ops_bad_exc) raising_exceptions.remove(TypeError) - kwargs = { - "parse_dates": parse_dates, - "infer_datetime_format": infer_datetime_format, - "keep_date_col": keep_date_col, - "date_parser": date_parser, - "dayfirst": dayfirst, - "cache_dates": cache_dates, - } - eval_io( - filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], fn_name="read_csv", check_kwargs_callable=not callable(date_parser), raising_exceptions=raising_exceptions, - **kwargs, + # read_csv kwargs + filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"], + parse_dates=parse_dates, + infer_datetime_format=infer_datetime_format, + keep_date_col=keep_date_col, + date_parser=date_parser, + dayfirst=dayfirst, + cache_dates=cache_dates, )