Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#2291: Cover by tests NA and Missing Data Handling parameters of read_csv #2337

Merged
merged 4 commits into from
Dec 1, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 70 additions & 50 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ def TestReadCSVFixture():
"test_read_csv_regular",
"test_read_csv_blank_lines",
"test_read_csv_yes_no",
"test_read_csv_nans",
]
# each xdist worker spawned in separate process with separate namespace and dataset
pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
Expand All @@ -330,6 +331,12 @@ def TestReadCSVFixture():
filename=pytest.csvs_names["test_read_csv_blank_lines"],
add_blank_lines=True,
)
# test_read_csv_nans_handling
_make_csv_file(filenames)(
filename=pytest.csvs_names["test_read_csv_nans"],
add_blank_lines=True,
additional_col_values=["<NA>", "N/A", "NA", "NULL", "custom_nan", "73"],
)

yield
# Delete csv files that were created
Expand Down Expand Up @@ -553,12 +560,6 @@ class TestReadCSV:
def test_read_csv_delimiters(
self, make_csv_file, sep, delimiter, decimal, thousands
):
kwargs = {
"delimiter": delimiter,
"sep": sep,
"decimal": decimal,
"thousands": thousands,
}
unique_filename = get_unique_filename()
make_csv_file(
filename=unique_filename,
Expand All @@ -568,9 +569,13 @@ def test_read_csv_delimiters(
)

eval_io(
filepath_or_buffer=unique_filename,
fn_name="read_csv",
**kwargs,
# read_csv kwargs
filepath_or_buffer=unique_filename,
delimiter=delimiter,
sep=sep,
decimal=decimal,
thousands=thousands,
)

# Column and Index Locations and Names tests
Expand Down Expand Up @@ -603,19 +608,16 @@ def test_read_csv_col_handling(
"The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
)

kwargs = {
"header": header,
"index_col": index_col,
"prefix": prefix,
"names": names,
"usecols": usecols,
"skip_blank_lines": skip_blank_lines,
}

eval_io(
filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"],
fn_name="read_csv",
**kwargs,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_blank_lines"],
header=header,
index_col=index_col,
prefix=prefix,
names=names,
usecols=usecols,
skip_blank_lines=skip_blank_lines,
)

# General Parsing Configuration
Expand Down Expand Up @@ -645,28 +647,26 @@ def test_read_csv_parsing_1(
pytest.xfail(
"The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
)
kwargs = {
"dtype": dtype,
"engine": engine,
"converters": converters,
"skipfooter": skipfooter,
}

if kwargs["dtype"]:
kwargs["dtype"] = {
if dtype:
dtype = {
col: "object"
for col in pandas.read_csv(
pytest.csvs_names["test_read_csv_regular"], nrows=1
).columns
}

eval_io(
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
fn_name="read_csv",
check_exception_type=None, # issue #2320
raising_exceptions=None,
check_kwargs_callable=not callable(converters),
**kwargs,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
dtype=dtype,
engine=engine,
converters=converters,
skipfooter=skipfooter,
)

@pytest.mark.parametrize("true_values", [["Yes"], ["Yes", "true"], None])
Expand All @@ -691,22 +691,20 @@ def test_read_csv_parsing_2(
pytest.xfail(
"The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
)
kwargs = {
"true_values": true_values,
"false_values": false_values,
"skiprows": skiprows,
"skipfooter": skipfooter,
"nrows": nrows,
"names": names,
}

eval_io(
filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
fn_name="read_csv",
check_exception_type=None, # issue #2320
raising_exceptions=None,
check_kwargs_callable=not callable(skiprows),
**kwargs,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_yes_no"],
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
skipfooter=skipfooter,
nrows=nrows,
names=names,
)

def test_read_csv_skipinitialspace(self, make_csv_file):
Expand Down Expand Up @@ -746,6 +744,31 @@ def test_read_csv_mangle_dupe_cols(self):
str_non_unique_cols = "col,col,col,col\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n"
eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True)

# NA and Missing Data Handling tests
@pytest.mark.parametrize("na_values", ["custom_nan", "73"])
@pytest.mark.parametrize("keep_default_na", [True, False])
@pytest.mark.parametrize("na_filter", [True, False])
@pytest.mark.parametrize("verbose", [True, False])
@pytest.mark.parametrize("skip_blank_lines", [True, False])
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
def test_read_csv_nans_handling(
    self,
    na_values,
    keep_default_na,
    na_filter,
    verbose,
    skip_blank_lines,
):
    """Exercise the NA / missing-data parameters of ``read_csv`` via ``eval_io``.

    Each argument is supplied by the ``pytest.mark.parametrize`` decorators
    stacked on this test and is forwarded unchanged to ``read_csv``. The
    input is the CSV file registered under the ``"test_read_csv_nans"`` key
    of ``pytest.csvs_names``.
    """
    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=pytest.csvs_names["test_read_csv_nans"],
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        verbose=verbose,
        skip_blank_lines=skip_blank_lines,
    )

# Datetime Handling tests
@pytest.mark.parametrize(
"parse_dates",
Expand Down Expand Up @@ -792,21 +815,18 @@ def test_read_csv_datetime(
raising_exceptions = list(io_ops_bad_exc)
raising_exceptions.remove(TypeError)

kwargs = {
"parse_dates": parse_dates,
"infer_datetime_format": infer_datetime_format,
"keep_date_col": keep_date_col,
"date_parser": date_parser,
"dayfirst": dayfirst,
"cache_dates": cache_dates,
}

eval_io(
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
fn_name="read_csv",
check_kwargs_callable=not callable(date_parser),
raising_exceptions=raising_exceptions,
**kwargs,
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
parse_dates=parse_dates,
infer_datetime_format=infer_datetime_format,
keep_date_col=keep_date_col,
date_parser=date_parser,
dayfirst=dayfirst,
cache_dates=cache_dates,
)


Expand Down