From ed69517eb5f2ab58ac0e328d0dcac66db426e6fa Mon Sep 17 00:00:00 2001
From: amyskov <55585026+amyskov@users.noreply.github.com>
Date: Wed, 28 Oct 2020 13:31:51 +0300
Subject: [PATCH] TEST-#2288: Cover by tests delimiters parameters of read_csv
 (#2310)

Signed-off-by: Alexander Myskov
---
 .github/workflows/ci.yml     |   4 +-
 .github/workflows/push.yml   |   2 +-
 modin/pandas/test/test_io.py | 190 +++++++++++++++++++++++++++--------
 modin/pandas/test/utils.py   | 188 ++++++++++++++++++++++++++++++++++
 4 files changed, 337 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f656703c550..37a6acb3edc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -433,7 +433,7 @@ jobs:
           conda info
           conda list
       - shell: bash -l {0}
-        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::TestReadCSV
       - shell: bash -l {0}
         run: bash <(curl -s https://codecov.io/bash)
@@ -557,4 +557,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index ff8677e4aa9..29dece692d2 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -324,4 +324,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index b382151927c..019db34294a 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -32,7 +32,12 @@
     json_short_bytes,
     json_long_string,
     json_long_bytes,
-    eval_general,
+    random_state,
+    eval_io,
+    get_unique_filename,
+    get_random_string,
+    insert_lines_to_csv,
+    IO_OPS_DATA_DIR,
 )
 from modin.config import Engine, Backend
 
@@ -61,24 +66,8 @@
 SMALL_ROW_SIZE = 2000
 
 
-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
-    def applyier(module, *args, **kwargs):
-        result = getattr(module, fn_name)(*args, **kwargs)
-        # There could be some missmatches in dtypes, so we're
-        # casting the whole frame to `str` before comparison.
-        # See issue #1931 for details.
-        if cast_to_str:
-            result = result.astype(str)
-        return result
-
-    eval_general(
-        pd,
-        pandas,
-        applyier,
-        path=path,
-        *args,
-        **kwargs,
-    )
+if not os.path.exists(IO_OPS_DATA_DIR):
+    os.mkdir(IO_OPS_DATA_DIR)
 
 
 @pytest.fixture
@@ -173,48 +162,131 @@ def teardown_test_file(test_path):
     os.remove(test_path)
 
 
-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
-    """Pytest fixture factory that makes temp csv files for testing.
-
-    Yields:
-        Function that generates csv files
-    """
-    filenames = []
-
-    def _make_csv_file(
+def _make_csv_file(filenames):
+    def _csv_file_maker(
         filename=TEST_CSV_FILENAME,
         row_size=SMALL_ROW_SIZE,
         force=True,
-        delimiter=delimiter,
+        delimiter=",",
         encoding=None,
-        compression=compression,
+        compression="infer",
+        additional_col_values=None,
+        add_blank_lines=False,
+        add_bad_lines=False,
+        add_nan_lines=False,
+        thousands_separator=None,
+        decimal_separator=None,
+        lineterminator=None,
+        comment_col_char=None,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        doublequote=True,
+        escapechar=None,
+        line_terminator=os.linesep,
     ):
         if os.path.exists(filename) and not force:
             pass
         else:
             dates = pandas.date_range("2000", freq="h", periods=row_size)
-            df = pandas.DataFrame(
-                {
-                    "col1": np.arange(row_size),
-                    "col2": [str(x.date()) for x in dates],
-                    "col3": np.arange(row_size),
-                    "col4": [str(x.time()) for x in dates],
-                }
-            )
+            data = {
+                "col1": np.arange(row_size) * 10,
+                "col2": [str(x.date()) for x in dates],
+                "col3": np.arange(row_size) * 10,
+                "col4": [str(x.time()) for x in dates],
+                "col5": [get_random_string() for _ in range(row_size)],
+                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+            }
+
+            if additional_col_values is not None:
+                assert isinstance(additional_col_values, (list, tuple))
+                data.update(
+                    {
+                        "col7": random_state.choice(
+                            additional_col_values, size=row_size
+                        ),
+                    }
+                )
+            df = pandas.DataFrame(data)
+            if add_nan_lines:
+                for i in range(0, row_size, row_size // (row_size // 10)):
+                    df.loc[i] = pandas.Series()
+            if comment_col_char:
+                char = comment_col_char if isinstance(comment_col_char, str) else "#"
+                # mark roughly every tenth row as a comment line
+                df.insert(
+                    loc=0,
+                    column="col_with_comments",
+                    value=[char if (x + 2) % 10 == 0 else x for x in range(row_size)],
+                )
+
+            if thousands_separator:
+                for col_id in ["col1", "col3"]:
+                    df[col_id] = df[col_id].apply(
+                        lambda x: f"{x:,d}".replace(",", thousands_separator)
+                    )
+                df["col6"] = df["col6"].apply(
+                    lambda x: f"{x:,f}".replace(",", thousands_separator)
+                )
+
             if compression == "gzip":
                 filename = "{}.gz".format(filename)
             elif compression == "zip" or compression == "xz" or compression == "bz2":
                 filename = "{fname}.{comp}".format(fname=filename, comp=compression)
             df.to_csv(
-                filename, sep=delimiter, encoding=encoding, compression=compression
+                filename,
+                sep=delimiter,
+                encoding=encoding,
+                compression=compression,
+                index=False,
+                decimal=decimal_separator if decimal_separator else ".",
+                line_terminator=line_terminator,
+                quoting=quoting,
+                quotechar=quotechar,
+                doublequote=doublequote,
+                escapechar=escapechar,
             )
+            csv_reader_writer_params = {
+                "delimiter": delimiter,
+                "doublequote": doublequote,
+                "escapechar": escapechar,
+                "lineterminator": line_terminator,
+                "quotechar": quotechar,
+                "quoting": quoting,
+            }
+            if add_blank_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(5, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="blank",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
+            if add_bad_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(6, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="bad",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
        filenames.append(filename)
         return df
 
-    # Return function that generates csv files
-    yield _make_csv_file
+    return _csv_file_maker
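For illustration, a test built on top of this factory could look like the
following sketch (the test name and parameter values are hypothetical and not
part of this patch; `make_csv_file`, `get_unique_filename` and `eval_io` are
the helpers introduced here, assumed to be imported as shown above):

    # Hypothetical usage sketch: generate a semicolon-delimited file and
    # check that modin.pandas.read_csv matches pandas.read_csv on it.
    def test_read_csv_semicolon(make_csv_file):
        filename = get_unique_filename("test_read_csv_semicolon", {"sep": ";"})
        # the fixture writes the file and registers it for cleanup
        make_csv_file(filename=filename, delimiter=";")
        eval_io(fn_name="read_csv", filepath_or_buffer=filename, sep=";")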
+
+
+@pytest.fixture
+def make_csv_file():
+    """Pytest fixture factory that makes temp csv files for testing.
+
+    Yields:
+        Function that generates csv files
+    """
+    filenames = []
+
+    yield _make_csv_file(filenames)
 
     # Delete csv files that were created
     for filename in filenames:
@@ -423,6 +495,36 @@ def teardown_fwf_file():
         pass
 
 
+class TestReadCSV:
+    # delimiter tests
+    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("decimal", [".", "_"])
+    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+    def test_read_csv_delimiters(
+        self, make_csv_file, sep, delimiter, decimal, thousands
+    ):
+        kwargs = {
+            "delimiter": delimiter,
+            "sep": sep,
+            "decimal": decimal,
+            "thousands": thousands,
+        }
+        unique_filename = get_unique_filename("test_read_csv_delimiter", kwargs)
+        make_csv_file(
+            filename=unique_filename,
+            delimiter=delimiter,
+            thousands_separator=thousands,
+            decimal_separator=decimal,
+        )
+
+        eval_io(
+            filepath_or_buffer=unique_filename,
+            fn_name="read_csv",
+            **kwargs,
+        )
+
+
 def test_from_parquet(make_parquet_file):
     make_parquet_file(SMALL_ROW_SIZE)
 
@@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
 @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
 def test_from_csv_newlines_in_quotes(nrows, skiprows):
     eval_io(
-        path="modin/pandas/test/data/newlines.csv",
+        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
         fn_name="read_csv",
         nrows=nrows,
         skiprows=skiprows,
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 4704542ad34..baa2daa7ffe 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -25,6 +25,9 @@
 from modin.utils import to_pandas
 from modin.config import TestDatasetSize
 from io import BytesIO
+import os
+from string import ascii_letters
+import csv
 
 random_state = np.random.RandomState(seed=42)
 
@@ -41,6 +44,9 @@
 RAND_LOW = 0
 RAND_HIGH = 100
 
+# Directory for storing I/O operations test data
+IO_OPS_DATA_DIR = os.path.join(os.path.dirname(__file__), "read_csv_data")
+
# Input data and functions for the tests
 # The test data that we will test our code against
 test_data = {
@@ -420,6 +426,11 @@
     "utf_8_sig",
 ]
 
+# Raising these exceptions usually signals unexpected behavior of an
+# I/O operation test rather than a genuine Modin/pandas mismatch, but
+# they can be missed by eval_io because both libraries raise the same
+# exception types
+io_ops_bad_exc = [TypeError, FileNotFoundError]
+
 
 def categories_equals(left, right):
     assert (left.ordered and right.ordered) or (not left.ordered and not right.ordered)
@@ -630,8 +641,13 @@ def eval_general(
     comparator=df_equals,
     __inplace__=False,
     check_exception_type=True,
+    raising_exceptions=None,
     **kwargs,
 ):
+    if raising_exceptions:
+        assert (
+            check_exception_type
+        ), "check_exception_type should be True if raising_exceptions is set"
     md_kwargs, pd_kwargs = {}, {}
 
     def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
@@ -645,6 +661,10 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
             repr(fn(modin_df, **md_kwargs))
         if check_exception_type:
             assert isinstance(md_e.value, type(pd_e))
+            if raising_exceptions:
+                assert not isinstance(
+                    md_e.value, tuple(raising_exceptions)
+                ), f"unacceptable exception type: {md_e.value}"
     else:
         md_result = fn(modin_df, **md_kwargs)
         return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df)
@@ -670,6 +690,53 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
     comparator(*values)
 
 
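To make the new `raising_exceptions` contract concrete, here is a hypothetical
sketch (the file path is invented, not part of the patch): both libraries
raise FileNotFoundError for a missing file, so the plain exception-type check
would pass, but listing the type in `raising_exceptions` turns it into a test
failure, since such an error points at a broken test rather than a real
Modin/pandas divergence.

    # Hypothetical sketch of the raising_exceptions contract.
    eval_general(
        pd,
        pandas,
        lambda module, **kwargs: module.read_csv(**kwargs),
        filepath_or_buffer="no_such_file.csv",  # invented path
        raising_exceptions=[FileNotFoundError],  # fail even if both raise it
    )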
+def eval_io(
+    fn_name,
+    comparator=df_equals,
+    cast_to_str=False,
+    check_exception_type=True,
+    raising_exceptions=io_ops_bad_exc,
+    *args,
+    **kwargs,
+):
+    """Evaluate the equality of an I/O operation's outputs in Modin and pandas.
+
+    Parameters
+    ----------
+    fn_name: str
+        I/O operation name ("read_csv" for example).
+    comparator: obj
+        Function to perform comparison.
+    cast_to_str: bool
+        There could be some mismatches in dtypes, so the whole frame is
+        cast to `str` before comparison.
+        See issue #1931 for details.
+    check_exception_type: bool
+        Whether to check exception types when the operation fails
+        (the exception types raised by pandas and Modin are compared).
+    raising_exceptions: Exception or list of Exceptions
+        Exceptions that are not acceptable even if they are raised
+        by both pandas and Modin (the check is evaluated only if
+        `check_exception_type` is passed as `True`).
+    """
+
+    def applyier(module, *args, **kwargs):
+        result = getattr(module, fn_name)(*args, **kwargs)
+        if cast_to_str:
+            result = result.astype(str)
+        return result
+
+    eval_general(
+        pd,
+        pandas,
+        applyier,
+        check_exception_type=check_exception_type,
+        raising_exceptions=raising_exceptions,
+        *args,
+        **kwargs,
+    )
+
+
 def create_test_dfs(*args, **kwargs):
     return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)
 
@@ -771,3 +838,124 @@ def generate_none_dfs():
         }
     )
     return df, df2
+
+
+def get_unique_filename(
+    test_name: str,
+    kwargs: dict = {},
+    extension: str = "csv",
+    data_dir: str = IO_OPS_DATA_DIR,
+    suffix: str = "",
+):
+    """Return a unique file name built from the specified parameters.
+
+    Parameters
+    ----------
+    test_name: str
+        Name of the test for which the unique file name is needed.
+    kwargs: dict
+        Unique combination of test parameters used to build the unique name.
+    extension: str
+        Extension of the unique file.
+    data_dir: str
+        Data directory where test files will be created.
+    suffix: str
+        String to append to the resulting name.
+
+    Returns
+    -------
+    Unique file name.
+    """
+    # shortcut if the kwargs parameter is not provided
+    if len(kwargs) == 0 and extension == "csv" and suffix == "":
+        return os.path.join(data_dir, test_name + f".{extension}")
+
+    assert "." not in extension, "please provide a pure extension name without '.'"
+    prohibited_chars = ['"', "\n"]
+    non_prohibited_char = "np_char"
+    char_counter = 0
+    kwargs_name = dict(kwargs)
+    for key, value in kwargs_name.items():
+        for char in prohibited_chars:
+            if (isinstance(value, str) and char in value) or callable(value):
+                kwargs_name[key] = non_prohibited_char + str(char_counter)
+                char_counter += 1
+    parameters_values = "_".join(
+        [
+            str(value)
+            if not isinstance(value, (list, tuple))
+            else "_".join([str(x) for x in value])
+            for value in kwargs_name.values()
+        ]
+    )
+    return os.path.join(data_dir, parameters_values + f"_{suffix}" + f".{extension}")
+
+
+def get_random_string():
+    random_string = "".join(
+        random_state.choice([x for x in ascii_letters], size=10).tolist()
+    )
+    return random_string
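A hypothetical call, to show the naming scheme (the argument values below are
invented): when `kwargs` is non-empty, the file name is built from the
parameter values rather than from `test_name`.

    # Hypothetical example, not part of the patch.
    name = get_unique_filename(
        "test_read_csv_delimiter", kwargs={"sep": ";", "thousands": None}
    )
    # the values are joined with "_", so the name ends with ";_None_.csv"
    # inside IO_OPS_DATA_DIR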
+def insert_lines_to_csv(
+    csv_name: str,
+    lines_positions: list,
+    lines_type: str = "blank",
+    encoding: str = None,
+    **csv_reader_writer_params,
+):
+    """Insert lines into a ".csv" file.
+
+    Parameters
+    ----------
+    csv_name: str
+        ".csv" file that should be modified.
+    lines_positions: list of ints
+        Positions of the lines that should be modified
+        (0-based serial numbers of the lines).
+    lines_type: str
+        Type of the lines that should be inserted into the ".csv" file.
+        Possible types:
+        "blank" - an empty line without any delimiters/separators,
+        "bad" - a line with len(lines_data) > cols_number
+    encoding: str
+        Encoding type that should be used during file reading and writing.
+    """
+    cols_number = len(pandas.read_csv(csv_name, nrows=1).columns)
+    if lines_type == "blank":
+        lines_data = []
+    elif lines_type == "bad":
+        lines_data = [x for x in range(cols_number + 1)]
+    else:
+        raise ValueError(
+            f"acceptable values for lines_type are ['blank', 'bad'], got {lines_type}"
+        )
+    lines = []
+    with open(csv_name, "r", encoding=encoding, newline="") as read_file:
+        try:
+            dialect = csv.Sniffer().sniff(read_file.read())
+        except Exception:
+            # fall back to the default dialect if sniffing fails
+            dialect = None
+        # rewind so the reader below sees the whole file even if sniffing failed
+        read_file.seek(0)
+
+        reader = csv.reader(
+            read_file,
+            dialect=dialect if dialect is not None else "excel",
+            **csv_reader_writer_params,
+        )
+        counter = 0
+        for row in reader:
+            if counter in lines_positions:
+                lines.append(lines_data)
+            else:
+                lines.append(row)
+            counter += 1
+    with open(csv_name, "w", encoding=encoding, newline="") as write_file:
+        writer = csv.writer(
+            write_file,
+            dialect=dialect if dialect is not None else "excel",
+            **csv_reader_writer_params,
+        )
+        writer.writerows(lines)
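For illustration, a hypothetical round trip through this helper (the file name
and data are invented, not part of the patch):

    # Write a small CSV, blank out the second data row, and check that
    # pandas skips the blank line by default.
    import pandas

    df = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df.to_csv("example.csv", index=False)
    # row 0 is the header, so position 2 is the second data row
    insert_lines_to_csv("example.csv", lines_positions=[2], lines_type="blank")
    assert len(pandas.read_csv("example.csv")) == 2  # skip_blank_lines=True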