Skip to content

Commit

Permalink
TEST-#2289: Columns, Index Locations and Names parameters of read_csv
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: revert test_usecols.csv

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: fix

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: mark xfailed cloud tests

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: addressing review comments

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: remove class scope fixtures

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: addressing review comment

Signed-off-by: Alexander Myskov <[email protected]>

TEST-#2289: Apply suggestions from code review

Co-authored-by: Anatoly Myachev <[email protected]>

TEST-#2289: correct str_non_unique_cols

Signed-off-by: Alexander Myskov <[email protected]>
  • Loading branch information
amyskov committed Nov 12, 2020
1 parent 45ef859 commit 112c135
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 33 deletions.
10 changes: 0 additions & 10 deletions modin/pandas/test/data/issue_621.csv

This file was deleted.

101 changes: 78 additions & 23 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
insert_lines_to_csv,
IO_OPS_DATA_DIR,
io_ops_bad_exc,
eval_io_from_str,
)

from modin.config import Engine, Backend, IsExperimental
Expand Down Expand Up @@ -528,6 +529,83 @@ def test_read_csv_delimiters(
**kwargs,
)

# Column and Index Locations and Names tests
@pytest.mark.xfail(
    Engine.get() != "Python",
    reason="many parameters combiantions fails: issue #2312, #2307",
)
@pytest.mark.parametrize("header", ["infer", None, 0])
@pytest.mark.parametrize("index_col", [None, "col1"])
@pytest.mark.parametrize("prefix", [None, "_", "col"])
@pytest.mark.parametrize(
    "names", [None, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6", "c7"]]
)
@pytest.mark.parametrize(
    "usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]]
)
@pytest.mark.parametrize("skip_blank_lines", [True, False])
def test_read_csv_col_handling(
    self,
    make_csv_file,
    request,
    header,
    index_col,
    prefix,
    names,
    usecols,
    skip_blank_lines,
):
    """Run `read_csv` through `eval_io` for column/index location and name options.

    Every combination of `header`, `index_col`, `prefix`, `names`, `usecols`
    and `skip_blank_lines` is forwarded unchanged to `eval_io`. The input
    file is produced by the `make_csv_file` fixture with blank lines added.
    """
    # Cloud-simulation runs are marked as expected failures up front.
    simulate_cloud = request.config.getoption("--simulate-cloud").lower()
    if simulate_cloud != "off":
        pytest.xfail(
            "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
        )

    # Key order is kept stable because the same mapping is also handed to
    # `get_unique_filename` to derive the file name.
    read_csv_kwargs = dict(
        header=header,
        index_col=index_col,
        prefix=prefix,
        names=names,
        usecols=usecols,
        skip_blank_lines=skip_blank_lines,
    )

    csv_path = get_unique_filename("test_read_csv_col_handling", read_csv_kwargs)
    make_csv_file(filename=csv_path, add_blank_lines=True)
    eval_io(filepath_or_buffer=csv_path, fn_name="read_csv", **read_csv_kwargs)

@pytest.mark.xfail(reason="infinite recursion error - issue #2032")
@pytest.mark.parametrize(
    "test_case", ["single_element", "single_column", "multiple_columns"]
)
def test_read_csv_squeeze(self, test_case):
    """Run `read_csv` with `squeeze=True` on small inline csv payloads.

    Each payload is evaluated twice through `eval_io_from_str`: once with
    only `squeeze=True`, and once with `header=None` added.
    """
    csv_text_by_case = {
        "single_element": "1",
        "single_column": "1\n2\n3\n",
        "multiple_columns": (
            "1, 2, 3, 4\n"
            "5, 6, 7, 8\n"
            "9, 10, 11, 12\n"
        ),
    }
    csv_path = get_unique_filename("test_read_csv_squeeze")
    csv_text = csv_text_by_case[test_case]

    # First pass uses no extra kwargs, second pass adds header=None.
    for header_kwargs in ({}, {"header": None}):
        eval_io_from_str(csv_text, csv_path, **header_kwargs, squeeze=True)

def test_read_csv_mangle_dupe_cols(self):
    """Run `read_csv` with `mangle_dupe_cols=True` on a csv whose header repeats `col`."""
    duplicated_header_csv = (
        "col,col,col,col\n"
        "5, 6, 7, 8\n"
        "9, 10, 11, 12\n"
    )
    csv_path = get_unique_filename("test_read_csv_mangle_dupe_cols")
    eval_io_from_str(duplicated_header_csv, csv_path, mangle_dupe_cols=True)

# Datetime Handling tests
@pytest.mark.parametrize(
"parse_dates",
Expand Down Expand Up @@ -1171,21 +1249,6 @@ def test_parse_dates_read_csv():
df_equals(modin_df, pandas_df)


@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": None, "usecols": [0, 7]},
        {"usecols": [0, 7]},
        {"names": [0, 7], "usecols": [0, 7]},
    ],
)
def test_from_csv_with_args(kwargs):
    """Read the issue_621 csv with `pandas` and with `pd`, then compare via `df_equals`.

    NOTE(review): `pd` is presumably Modin's pandas API (the original locals
    were named `modin_df` / `pandas_df`) - confirm against the file imports.
    """
    csv_path = "modin/pandas/test/data/issue_621.csv"
    # pandas is read first, then `pd`, matching the original call order.
    expected_df, actual_df = (
        reader.read_csv(csv_path, **kwargs) for reader in (pandas, pd)
    )
    df_equals(actual_df, expected_df)


def test_from_table(make_csv_file):
make_csv_file(delimiter="\t")

Expand All @@ -1200,14 +1263,6 @@ def test_from_table(make_csv_file):
df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("usecols", [["a"], ["a", "b", "e"], [0, 1, 4]])
def test_from_csv_with_usecols(usecols):
    """Read test_usecols.csv with a `usecols` selection via `pandas` and `pd`, then compare.

    NOTE(review): `pd` is presumably Modin's pandas API - confirm against
    the file imports.
    """
    csv_path = "modin/pandas/test/data/test_usecols.csv"
    read_kwargs = {"usecols": usecols}
    # pandas is read first, then `pd`, matching the original call order.
    reference_df = pandas.read_csv(csv_path, **read_kwargs)
    candidate_df = pd.read_csv(csv_path, **read_kwargs)
    df_equals(candidate_df, reference_df)


@pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation")
def test_from_csv_s3(make_csv_file):
dataset_url = "s3://noaa-ghcn-pds/csv/1788.csv"
Expand Down
25 changes: 25 additions & 0 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,31 @@ def applyier(module, *args, **kwargs):
)


def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs):
    """Evaluate I/O operation outputs equality check by using `csv_str`
    data passed as python str (csv test file will be created from `csv_str`).

    The file is written to `unique_filename`, `eval_io` is invoked for
    `read_csv` on it, and the file is removed afterwards whether or not
    the evaluation succeeded.

    Parameters
    ----------
    csv_str: str
        Test data for storing to csv file.
    unique_filename: str
        csv file name.
    **kwargs : dict
        Extra keyword arguments forwarded unchanged to `eval_io`.
    """
    try:
        with open(unique_filename, "w") as f:
            f.write(csv_str)

        eval_io(
            filepath_or_buffer=unique_filename,
            fn_name="read_csv",
            **kwargs,
        )

    finally:
        # If `open` itself failed, the file was never created. Removing it
        # unconditionally would raise a second FileNotFoundError from the
        # cleanup and hide the real cause, so a missing file is tolerated.
        try:
            os.remove(unique_filename)
        except FileNotFoundError:
            pass


def create_test_dfs(*args, **kwargs):
    """Build two DataFrames from the same constructor arguments.

    Returns
    -------
    tuple
        ``(pd.DataFrame(...), pandas.DataFrame(...))``, in that order.
        NOTE(review): `pd` is presumably Modin's pandas API - confirm
        against the module imports.
    """
    # `pd` frame is constructed first, then the plain pandas one.
    pd_frame = pd.DataFrame(*args, **kwargs)
    pandas_frame = pandas.DataFrame(*args, **kwargs)
    return pd_frame, pandas_frame

Expand Down

0 comments on commit 112c135

Please sign in to comment.