Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#2295: Cover by tests Quoting, Compression, and File Format parameters of read_csv #2495

Merged
merged 5 commits into from
Dec 3, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 142 additions & 69 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@
# Number of rows in the test file
NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Small"])

# Files compression to extension mapping
COMP_TO_EXT = {"gzip": "gz", "bz2": "bz2", "xz": "xz", "zip": "zip"}

# Make sure the directory for generated test data exists. ``exist_ok=True``
# replaces the separate existence check, so there is no window in which
# another process can create the directory between the check and the call.
os.makedirs(IO_OPS_DATA_DIR, exist_ok=True)

Expand Down Expand Up @@ -235,11 +238,11 @@ def _csv_file_maker(
df["col6"] = df["col6"].apply(
lambda x: f"{x:,f}".replace(",", thousands_separator)
)

if compression == "gzip":
filename = "{}.gz".format(filename)
elif compression == "zip" or compression == "xz" or compression == "bz2":
filename = "{fname}.{comp}".format(fname=filename, comp=compression)
filename = (
f"{filename}.{COMP_TO_EXT[compression]}"
if compression != "infer"
else filename
)
df.to_csv(
filename,
sep=delimiter,
Expand Down Expand Up @@ -866,6 +869,140 @@ def test_read_csv_iteration(self, make_csv_file, iterator):

df_equals(modin_df, pd_df)

# Quoting, Compression, and File Format parameters tests
@pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
@pytest.mark.parametrize(
    "encoding",
    [None, "latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"],
)
@pytest.mark.parametrize("engine", [None, "python", "c"])
def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
    """Run ``read_csv`` through ``eval_io`` for every compression/encoding/engine mix.

    A CSV file is produced by the ``make_csv_file`` fixture with the requested
    ``encoding`` and ``compression`` and is then read back with the same
    ``compression``, ``encoding`` and ``engine`` arguments.
    """
    base_name = get_unique_filename()
    make_csv_file(filename=base_name, encoding=encoding, compression=compression)

    # For an explicit compression the file to read carries the mapped
    # extension; with "infer" the bare name is used as is.
    if compression == "infer":
        csv_path = base_name
    else:
        csv_path = f"{base_name}.{COMP_TO_EXT[compression]}"

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=csv_path,
        compression=compression,
        encoding=encoding,
        engine=engine,
    )

@pytest.mark.parametrize("thousands", [None, ",", "_", " "])
@pytest.mark.parametrize("decimal", [".", "_"])
@pytest.mark.parametrize("lineterminator", [None, "x", "\n"])
@pytest.mark.parametrize("escapechar", [None, "d", "x"])
@pytest.mark.parametrize("dialect", ["test_csv_dialect", None])
def test_read_csv_file_format(
    self,
    request,
    make_csv_file,
    thousands,
    decimal,
    lineterminator,
    escapechar,
    dialect,
):
    """Run ``read_csv`` through ``eval_io`` with file-format related parameters.

    When ``dialect`` is given, a csv dialect is registered under that name,
    the file is generated from the dialect's parameters and the resolved
    dialect object is passed to ``read_csv``. Otherwise the file is generated
    from ``thousands``, ``decimal``, ``escapechar`` and ``lineterminator``.

    The dialect registered here is removed again once the test body finishes,
    so the process-wide ``csv`` dialect registry is left as it was found.
    """
    # Known failures, tracked in the referenced issues.
    if request.config.getoption("--simulate-cloud").lower() != "off":
        pytest.xfail(
            "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
        )
    elif Engine.get() != "Python" and lineterminator == "x":
        pytest.xfail("read_csv with Ray engine outputs empty frame - issue #2493")
    elif Engine.get() != "Python" and escapechar:
        pytest.xfail(
            "read_csv with Ray engine fails with some 'escapechar' parameters - issue #2494"
        )
    elif Engine.get() != "Python" and dialect:
        pytest.xfail(
            "read_csv with Ray engine fails with `dialect` parameter - issue #2508"
        )

    unique_filename = get_unique_filename()
    # Name under which a dialect was registered by this test (None if none was);
    # kept separately because ``dialect`` is rebound to the dialect object below.
    registered_dialect_name = None
    try:
        if dialect:
            test_csv_dialect_params = {
                "delimiter": "_",
                "doublequote": False,
                "escapechar": "\\",
                "quotechar": "d",
                "quoting": csv.QUOTE_ALL,
            }
            csv.register_dialect(dialect, **test_csv_dialect_params)
            registered_dialect_name = dialect
            dialect = csv.get_dialect(dialect)
            make_csv_file(filename=unique_filename, **test_csv_dialect_params)
        else:
            make_csv_file(
                filename=unique_filename,
                thousands_separator=thousands,
                decimal_separator=decimal,
                escapechar=escapechar,
                line_terminator=lineterminator,
            )

        eval_io(
            check_exception_type=None,  # issue #2320
            raising_exceptions=None,
            fn_name="read_csv",
            # read_csv kwargs
            filepath_or_buffer=unique_filename,
            thousands=thousands,
            decimal=decimal,
            lineterminator=lineterminator,
            escapechar=escapechar,
            dialect=dialect,
        )
    finally:
        # Undo the global registration even if file creation or the read fails.
        if registered_dialect_name is not None:
            csv.unregister_dialect(registered_dialect_name)

@pytest.mark.parametrize(
    "quoting",
    [csv.QUOTE_ALL, csv.QUOTE_MINIMAL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE],
)
@pytest.mark.parametrize("quotechar", ['"', "_", "d"])
@pytest.mark.parametrize("doublequote", [True, False])
@pytest.mark.parametrize("comment", [None, "#", "x"])
def test_read_csv_quoting(
    self,
    make_csv_file,
    quoting,
    quotechar,
    doublequote,
    comment,
):
    """Run ``read_csv`` through ``eval_io`` with quoting related parameters.

    The file is generated with the same ``quoting``, ``quotechar``,
    ``doublequote`` and ``escapechar`` values that are later handed to
    ``read_csv``; ``comment`` is passed to the file maker as
    ``comment_col_char``.
    """
    unique_filename = get_unique_filename()

    # In these cases an escapechar has to be set, otherwise an error occurs:
    # "_csv.Error: need to escape, but no escapechar set"
    needs_escapechar = not (
        doublequote or quotechar == '"' or quoting == csv.QUOTE_NONE
    )
    if needs_escapechar:
        escapechar = "\\"
    else:
        escapechar = None

    # Arguments shared verbatim by the writer fixture and the reader.
    quoting_kwargs = {
        "quoting": quoting,
        "quotechar": quotechar,
        "doublequote": doublequote,
        "escapechar": escapechar,
    }

    make_csv_file(
        filename=unique_filename,
        comment_col_char=comment,
        **quoting_kwargs,
    )

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        comment=comment,
        **quoting_kwargs,
    )

# Error Handling parameters tests
@pytest.mark.xfail(
Engine.get() != "Python",
Expand Down Expand Up @@ -1288,58 +1425,6 @@ def test_from_csv_categories():
df_equals(modin_df, pandas_df)


def test_from_csv_gzip(make_csv_file):
    """Read a gzip-compressed CSV with ``pandas`` and ``pd`` and compare the frames."""
    make_csv_file(compression="gzip")
    gzip_path = f"{TEST_CSV_FILENAME}.gz"

    # First pass: no compression argument; second pass: compression named explicitly.
    for read_kwargs in ({}, {"compression": "gzip"}):
        pandas_df = pandas.read_csv(gzip_path, **read_kwargs)
        modin_df = pd.read_csv(gzip_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_bz2(make_csv_file):
    """Read a bz2-compressed CSV with ``pandas`` and ``pd`` and compare the frames."""
    make_csv_file(compression="bz2")
    bz2_path = f"{TEST_CSV_FILENAME}.bz2"

    # First pass: no compression argument; second pass: compression named explicitly.
    for read_kwargs in ({}, {"compression": "bz2"}):
        pandas_df = pandas.read_csv(bz2_path, **read_kwargs)
        modin_df = pd.read_csv(bz2_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_xz(make_csv_file):
    """Read an xz-compressed CSV with ``pandas`` and ``pd`` and compare the frames."""
    make_csv_file(compression="xz")
    xz_path = f"{TEST_CSV_FILENAME}.xz"

    # First pass: no compression argument; second pass: compression named explicitly.
    for read_kwargs in ({}, {"compression": "xz"}):
        pandas_df = pandas.read_csv(xz_path, **read_kwargs)
        modin_df = pd.read_csv(xz_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_zip(make_csv_file):
    """Read a zip-compressed CSV with ``pandas`` and ``pd`` and compare the frames."""
    make_csv_file(compression="zip")
    zip_path = f"{TEST_CSV_FILENAME}.zip"

    # First pass: no compression argument; second pass: compression named explicitly.
    for read_kwargs in ({}, {"compression": "zip"}):
        pandas_df = pandas.read_csv(zip_path, **read_kwargs)
        modin_df = pd.read_csv(zip_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_parse_dates_read_csv():
pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
Expand Down Expand Up @@ -1525,18 +1610,6 @@ def test_from_csv_skiprows_names(names, skiprows):
df_equals(pandas_df, modin_df)


@pytest.mark.parametrize(
    "encoding", ["latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"]
)
def test_from_csv_encoding(make_csv_file, encoding):
    """Read a CSV written with ``encoding`` via ``pandas`` and ``pd`` and compare the frames."""
    make_csv_file(encoding=encoding)

    # The same keyword arguments go to both readers; pandas is read first.
    read_kwargs = {"encoding": encoding}
    pandas_df = pandas.read_csv(TEST_CSV_FILENAME, **read_kwargs)
    modin_df = pd.read_csv(TEST_CSV_FILENAME, **read_kwargs)

    df_equals(modin_df, pandas_df)


def test_from_csv_default_to_pandas_behavior(make_csv_file):
make_csv_file()

Expand Down