Skip to content

Commit

Permalink
TEST-#2288: Cover by tests delimiters parameters of read_csv (#2310)
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Myskov <[email protected]>
  • Loading branch information
amyskov authored Oct 28, 2020
1 parent 5cabeb9 commit ed69517
Show file tree
Hide file tree
Showing 4 changed files with 337 additions and 47 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ jobs:
conda info
conda list
- shell: bash -l {0}
run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::test_from_csv
run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::TestReadCSV
- shell: bash -l {0}
run: bash <(curl -s https://codecov.io/bash)

Expand Down Expand Up @@ -557,4 +557,4 @@ jobs:
conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- shell: bash -l {0}
run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
2 changes: 1 addition & 1 deletion .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -324,4 +324,4 @@ jobs:
conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- shell: bash -l {0}
run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
190 changes: 146 additions & 44 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@
json_short_bytes,
json_long_string,
json_long_bytes,
eval_general,
random_state,
eval_io,
get_unique_filename,
get_random_string,
insert_lines_to_csv,
IO_OPS_DATA_DIR,
)

from modin.config import Engine, Backend
Expand Down Expand Up @@ -61,24 +66,8 @@
SMALL_ROW_SIZE = 2000


def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
    """Run the same I/O function on both modin and pandas and compare results.

    Parameters
    ----------
    path : str
        Path forwarded to the I/O function under test (as keyword ``path``).
    fn_name : str
        Name of the I/O function (e.g. ``"read_csv"``) looked up with
        ``getattr`` on each module.
    comparator : callable, default: df_equals
        NOTE(review): accepted but not referenced in this body — comparison
        appears to be delegated entirely to ``eval_general``; confirm whether
        this parameter is dead.
    cast_to_str : bool, default: False
        If True, cast the whole result frame to ``str`` before comparison.
    *args, **kwargs
        Forwarded verbatim to the I/O function via ``eval_general``.
    """

    def applyier(module, *args, **kwargs):
        # Resolve e.g. pd.read_csv vs. pandas.read_csv dynamically by name so
        # the same applyier works for both modules.
        result = getattr(module, fn_name)(*args, **kwargs)
        # There could be some mismatches in dtypes, so we're
        # casting the whole frame to `str` before comparison.
        # See issue #1931 for details.
        if cast_to_str:
            result = result.astype(str)
        return result

    # eval_general executes applyier against both modin's pd and plain pandas
    # and asserts the two outcomes agree.
    eval_general(
        pd,
        pandas,
        applyier,
        path=path,
        *args,
        **kwargs,
    )
# Ensure the directory that holds generated csv fixtures exists.
# os.makedirs(..., exist_ok=True) creates missing parent directories and,
# unlike the check-then-mkdir pattern, has no time-of-check/time-of-use race
# when several test workers (e.g. pytest-xdist) start simultaneously.
os.makedirs(IO_OPS_DATA_DIR, exist_ok=True)


@pytest.fixture
Expand Down Expand Up @@ -173,48 +162,131 @@ def teardown_test_file(test_path):
os.remove(test_path)


def _make_csv_file(filenames):
    """Build a csv-file factory bound to *filenames*.

    Parameters
    ----------
    filenames : list
        Shared list the factory appends every created file path to, so the
        owning fixture can remove the files on teardown.

    Returns
    -------
    callable
        ``_csv_file_maker`` — writes a csv file with configurable delimiters,
        separators, quoting and optional blank/bad/NaN lines, and returns the
        pandas.DataFrame that was written (or None if the file already existed
        and ``force`` is False).
    """

    def _csv_file_maker(
        filename=TEST_CSV_FILENAME,
        row_size=SMALL_ROW_SIZE,
        force=True,
        delimiter=",",
        encoding=None,
        compression="infer",
        additional_col_values=None,
        add_blank_lines=False,
        add_bad_lines=False,
        add_nan_lines=False,
        thousands_separator=None,
        decimal_separator=None,
        lineterminator=None,  # NOTE(review): accepted but unused below — confirm
        comment_col_char=None,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
        doublequote=True,
        escapechar=None,
        line_terminator=os.linesep,
    ):
        if os.path.exists(filename) and not force:
            # File already generated by a previous call; nothing to do.
            pass
        else:
            dates = pandas.date_range("2000", freq="h", periods=row_size)
            data = {
                "col1": np.arange(row_size) * 10,
                "col2": [str(x.date()) for x in dates],
                "col3": np.arange(row_size) * 10,
                "col4": [str(x.time()) for x in dates],
                "col5": [get_random_string() for _ in range(row_size)],
                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
            }

            if additional_col_values is not None:
                assert isinstance(additional_col_values, (list, tuple))
                data.update(
                    {
                        "col7": random_state.choice(
                            additional_col_values, size=row_size
                        ),
                    }
                )
            df = pandas.DataFrame(data)
            if add_nan_lines:
                # Blank out roughly every tenth row with an all-NaN record.
                for i in range(0, row_size, row_size // (row_size // 10)):
                    df.loc[i] = pandas.Series()
            if comment_col_char:
                char = comment_col_char if isinstance(comment_col_char, str) else "#"
                # NOTE(review): (x + 2) == 0 is never true for x >= 0, so no
                # comment characters are actually inserted — confirm intent.
                df.insert(
                    loc=0,
                    column="col_with_comments",
                    value=[char if (x + 2) == 0 else x for x in range(row_size)],
                )

            if thousands_separator:
                # Re-render numeric columns as strings with the requested
                # thousands separator so read_csv's `thousands` can be tested.
                for col_id in ["col1", "col3"]:
                    df[col_id] = df[col_id].apply(
                        lambda x: f"{x:,d}".replace(",", thousands_separator)
                    )
                df["col6"] = df["col6"].apply(
                    lambda x: f"{x:,f}".replace(",", thousands_separator)
                )

            # pandas.to_csv infers the compression from the file extension,
            # so append the matching suffix.
            if compression == "gzip":
                filename = "{}.gz".format(filename)
            elif compression == "zip" or compression == "xz" or compression == "bz2":
                filename = "{fname}.{comp}".format(fname=filename, comp=compression)

            df.to_csv(
                filename,
                sep=delimiter,
                encoding=encoding,
                compression=compression,
                index=False,
                decimal=decimal_separator if decimal_separator else ".",
                line_terminator=line_terminator,
                quoting=quoting,
                quotechar=quotechar,
                doublequote=doublequote,
                escapechar=escapechar,
            )
            # Dialect parameters forwarded to the csv reader/writer used by
            # insert_lines_to_csv so inserted lines match the file's dialect.
            csv_reader_writer_params = {
                "delimiter": delimiter,
                "doublequote": doublequote,
                "escapechar": escapechar,
                "lineterminator": line_terminator,
                "quotechar": quotechar,
                "quoting": quoting,
            }
            if add_blank_lines:
                insert_lines_to_csv(
                    csv_name=filename,
                    lines_positions=[
                        x for x in range(5, row_size, row_size // (row_size // 10))
                    ],
                    lines_type="blank",
                    encoding=encoding,
                    **csv_reader_writer_params,
                )
            if add_bad_lines:
                insert_lines_to_csv(
                    csv_name=filename,
                    lines_positions=[
                        x for x in range(6, row_size, row_size // (row_size // 10))
                    ],
                    lines_type="bad",
                    encoding=encoding,
                    **csv_reader_writer_params,
                )
            # Register the file (with its final, possibly suffixed name) for
            # teardown by the owning fixture.
            filenames.append(filename)
            return df

    return _csv_file_maker


@pytest.fixture
def make_csv_file():
"""Pytest fixture factory that makes temp csv files for testing.
Yields:
Function that generates csv files
"""
filenames = []

yield _make_csv_file(filenames)

# Delete csv files that were created
for filename in filenames:
Expand Down Expand Up @@ -423,6 +495,36 @@ def teardown_fwf_file():
pass


class TestReadCSV:
    # delimiter tests
    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
    @pytest.mark.parametrize("decimal", [".", "_"])
    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
    def test_read_csv_delimiters(
        self, make_csv_file, sep, delimiter, decimal, thousands
    ):
        """Check read_csv separator parameters against pandas for every
        combination of `sep`, `delimiter`, `decimal` and `thousands`."""
        read_kwargs = dict(
            delimiter=delimiter,
            sep=sep,
            decimal=decimal,
            thousands=thousands,
        )
        # Encode the parameter combination into the file name so parallel
        # parametrized runs do not collide on the same csv fixture file.
        unique_filename = get_unique_filename("test_read_csv_delimiter", read_kwargs)
        make_csv_file(
            filename=unique_filename,
            delimiter=delimiter,
            thousands_separator=thousands,
            decimal_separator=decimal,
        )

        eval_io(
            fn_name="read_csv",
            filepath_or_buffer=unique_filename,
            **read_kwargs,
        )


def test_from_parquet(make_parquet_file):
make_parquet_file(SMALL_ROW_SIZE)

Expand Down Expand Up @@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
@pytest.mark.parametrize("skiprows", [4, 1, 500, None])
def test_from_csv_newlines_in_quotes(nrows, skiprows):
eval_io(
path="modin/pandas/test/data/newlines.csv",
filepath_or_buffer="modin/pandas/test/data/newlines.csv",
fn_name="read_csv",
nrows=nrows,
skiprows=skiprows,
Expand Down
Loading

0 comments on commit ed69517

Please sign in to comment.