From ed69517eb5f2ab58ac0e328d0dcac66db426e6fa Mon Sep 17 00:00:00 2001
From: amyskov <55585026+amyskov@users.noreply.github.com>
Date: Wed, 28 Oct 2020 13:31:51 +0300
Subject: [PATCH] TEST-#2288: Cover by tests delimiters parameters of read_csv
 (#2310)

Signed-off-by: Alexander Myskov
---
 .github/workflows/ci.yml     |   4 +-
 .github/workflows/push.yml   |   2 +-
 modin/pandas/test/test_io.py | 190 +++++++++++++++++++++++++++--------
 modin/pandas/test/utils.py   | 188 ++++++++++++++++++++++++++++++++++
 4 files changed, 337 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f656703c550..37a6acb3edc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -433,7 +433,7 @@ jobs:
           conda info
           conda list
       - shell: bash -l {0}
-        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::TestReadCSV
       - shell: bash -l {0}
         run: bash <(curl -s https://codecov.io/bash)
@@ -557,4 +557,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index ff8677e4aa9..29dece692d2 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -324,4 +324,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index b382151927c..019db34294a 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -32,7 +32,12 @@
     json_short_bytes,
     json_long_string,
     json_long_bytes,
-    eval_general,
+    random_state,
+    eval_io,
+    get_unique_filename,
+    get_random_string,
+    insert_lines_to_csv,
+    IO_OPS_DATA_DIR,
 )
 from modin.config import Engine, Backend
 
@@ -61,24 +66,8 @@
 SMALL_ROW_SIZE = 2000
 
 
-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
-    def applyier(module, *args, **kwargs):
-        result = getattr(module, fn_name)(*args, **kwargs)
-        # There could be some missmatches in dtypes, so we're
-        # casting the whole frame to `str` before comparison.
-        # See issue #1931 for details.
-        if cast_to_str:
-            result = result.astype(str)
-        return result
-
-    eval_general(
-        pd,
-        pandas,
-        applyier,
-        path=path,
-        *args,
-        **kwargs,
-    )
+if not os.path.exists(IO_OPS_DATA_DIR):
+    os.mkdir(IO_OPS_DATA_DIR)
 
 
 @pytest.fixture
@@ -173,48 +162,131 @@ def teardown_test_file(test_path):
     os.remove(test_path)
 
 
-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
-    """Pytest fixture factory that makes temp csv files for testing.
-
-    Yields:
-        Function that generates csv files
-    """
-    filenames = []
-
-    def _make_csv_file(
+def _make_csv_file(filenames):
+    def _csv_file_maker(
         filename=TEST_CSV_FILENAME,
         row_size=SMALL_ROW_SIZE,
         force=True,
-        delimiter=delimiter,
+        delimiter=",",
         encoding=None,
-        compression=compression,
+        compression="infer",
+        additional_col_values=None,
+        add_blank_lines=False,
+        add_bad_lines=False,
+        add_nan_lines=False,
+        thousands_separator=None,
+        decimal_separator=None,
+        lineterminator=None,
+        comment_col_char=None,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        doublequote=True,
+        escapechar=None,
+        line_terminator=os.linesep,
     ):
         if os.path.exists(filename) and not force:
             pass
         else:
             dates = pandas.date_range("2000", freq="h", periods=row_size)
-            df = pandas.DataFrame(
-                {
-                    "col1": np.arange(row_size),
-                    "col2": [str(x.date()) for x in dates],
-                    "col3": np.arange(row_size),
-                    "col4": [str(x.time()) for x in dates],
-                }
-            )
+            data = {
+                "col1": np.arange(row_size) * 10,
+                "col2": [str(x.date()) for x in dates],
+                "col3": np.arange(row_size) * 10,
+                "col4": [str(x.time()) for x in dates],
+                "col5": [get_random_string() for _ in range(row_size)],
+                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+            }
+
+            if additional_col_values is not None:
+                assert isinstance(additional_col_values, (list, tuple))
+                data.update(
+                    {
+                        "col7": random_state.choice(
+                            additional_col_values, size=row_size
+                        ),
+                    }
+                )
+            df = pandas.DataFrame(data)
+            if add_nan_lines:
+                for i in range(0, row_size, row_size // (row_size // 10)):
+                    df.loc[i] = pandas.Series()
+            if comment_col_char:
+                char = comment_col_char if isinstance(comment_col_char, str) else "#"
+                # mark roughly every tenth row as a comment line
+                df.insert(
+                    loc=0,
+                    column="col_with_comments",
+                    value=[char if (x + 2) % 10 == 0 else x for x in range(row_size)],
+                )
+
+            if thousands_separator:
+                for col_id in ["col1", "col3"]:
+                    df[col_id] = df[col_id].apply(
+                        lambda x: f"{x:,d}".replace(",", thousands_separator)
+                    )
+                df["col6"] = df["col6"].apply(
+                    lambda x: f"{x:,f}".replace(",", thousands_separator)
+                )
+
             if compression == "gzip":
                 filename = "{}.gz".format(filename)
             elif compression == "zip" or compression == "xz" or compression == "bz2":
                 filename = "{fname}.{comp}".format(fname=filename, comp=compression)
             df.to_csv(
-                filename, sep=delimiter, encoding=encoding, compression=compression
+                filename,
+                sep=delimiter,
+                encoding=encoding,
+                compression=compression,
+                index=False,
+                decimal=decimal_separator if decimal_separator else ".",
+                line_terminator=line_terminator,
+                quoting=quoting,
+                quotechar=quotechar,
+                doublequote=doublequote,
+                escapechar=escapechar,
             )
+            csv_reader_writer_params = {
+                "delimiter": delimiter,
+                "doublequote": doublequote,
+                "escapechar": escapechar,
+                "lineterminator": line_terminator,
+                "quotechar": quotechar,
+                "quoting": quoting,
+            }
+            if add_blank_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(5, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="blank",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
+            if add_bad_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(6, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="bad",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
        filenames.append(filename)
         return df
 
-    # Return function that generates csv files
-    yield _make_csv_file
+    return _csv_file_maker
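For illustration, a test built on top of this factory could look like the
following sketch (the test name and parameter values are hypothetical and not
part of this patch; `make_csv_file`, `get_unique_filename` and `eval_io` are
the helpers introduced here, assumed to be imported as shown above):

    # Hypothetical usage sketch: generate a semicolon-delimited file and
    # check that modin.pandas.read_csv matches pandas.read_csv on it.
    def test_read_csv_semicolon(make_csv_file):
        filename = get_unique_filename("test_read_csv_semicolon", {"sep": ";"})
        # the fixture writes the file and registers it for cleanup
        make_csv_file(filename=filename, delimiter=";")
        eval_io(fn_name="read_csv", filepath_or_buffer=filename, sep=";")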
+
+
+@pytest.fixture
+def make_csv_file():
+    """Pytest fixture factory that makes temp csv files for testing.
+
+    Yields:
+        Function that generates csv files
+    """
+    filenames = []
+
+    yield _make_csv_file(filenames)
 
     # Delete csv files that were created
     for filename in filenames:
@@ -423,6 +495,36 @@ def teardown_fwf_file():
         pass
 
 
+class TestReadCSV:
+    # delimiter tests
+    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("decimal", [".", "_"])
+    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+    def test_read_csv_delimiters(
+        self, make_csv_file, sep, delimiter, decimal, thousands
+    ):
+        kwargs = {
+            "delimiter": delimiter,
+            "sep": sep,
+            "decimal": decimal,
+            "thousands": thousands,
+        }
+        unique_filename = get_unique_filename("test_read_csv_delimiter", kwargs)
+        make_csv_file(
+            filename=unique_filename,
+            delimiter=delimiter,
+            thousands_separator=thousands,
+            decimal_separator=decimal,
+        )
+
+        eval_io(
+            filepath_or_buffer=unique_filename,
+            fn_name="read_csv",
+            **kwargs,
+        )
+
+
 def test_from_parquet(make_parquet_file):
     make_parquet_file(SMALL_ROW_SIZE)
 
@@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
 @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
 def test_from_csv_newlines_in_quotes(nrows, skiprows):
     eval_io(
-        path="modin/pandas/test/data/newlines.csv",
+        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
         fn_name="read_csv",
         nrows=nrows,
         skiprows=skiprows,
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 4704542ad34..baa2daa7ffe 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -25,6 +25,9 @@
 from modin.utils import to_pandas
 from modin.config import TestDatasetSize
 from io import BytesIO
+import os
+from string import ascii_letters
+import csv
 
 random_state = np.random.RandomState(seed=42)
 
@@ -41,6 +44,9 @@
 RAND_LOW = 0
 RAND_HIGH = 100
 
+# Directory for storing I/O operations test data
+IO_OPS_DATA_DIR = os.path.join(os.path.dirname(__file__), "read_csv_data")
+
# Input data and functions for the tests
 # The test data that we will test our code against
 test_data = {
@@ -420,6 +426,11 @@
     "utf_8_sig",
 ]
 
+# Raising these exceptions usually signals unexpected behavior of an
+# I/O operation test rather than a genuine Modin/pandas mismatch, but
+# they can be missed by eval_io because both libraries raise the same
+# exception types
+io_ops_bad_exc = [TypeError, FileNotFoundError]
+
 
 def categories_equals(left, right):
     assert (left.ordered and right.ordered) or (not left.ordered and not right.ordered)
@@ -630,8 +641,13 @@ def eval_general(
     comparator=df_equals,
     __inplace__=False,
     check_exception_type=True,
+    raising_exceptions=None,
     **kwargs,
 ):
+    if raising_exceptions:
+        assert (
+            check_exception_type
+        ), "check_exception_type should be True if raising_exceptions is set"
     md_kwargs, pd_kwargs = {}, {}
 
     def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
@@ -645,6 +661,10 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
             repr(fn(modin_df, **md_kwargs))
         if check_exception_type:
             assert isinstance(md_e.value, type(pd_e))
+            if raising_exceptions:
+                assert not isinstance(
+                    md_e.value, tuple(raising_exceptions)
+                ), f"unacceptable exception type: {md_e.value}"
     else:
         md_result = fn(modin_df, **md_kwargs)
         return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df)
@@ -670,6 +690,53 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
     comparator(*values)
 
 
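To make the new `raising_exceptions` contract concrete, here is a hypothetical
sketch (the file path is invented, not part of the patch): both libraries
raise FileNotFoundError for a missing file, so the plain exception-type check
would pass, but listing the type in `raising_exceptions` turns it into a test
failure, since such an error points at a broken test rather than a real
Modin/pandas divergence.

    # Hypothetical sketch of the raising_exceptions contract.
    eval_general(
        pd,
        pandas,
        lambda module, **kwargs: module.read_csv(**kwargs),
        filepath_or_buffer="no_such_file.csv",  # invented path
        raising_exceptions=[FileNotFoundError],  # fail even if both raise it
    )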
+def eval_io(
+    fn_name,
+    comparator=df_equals,
+    cast_to_str=False,
+    check_exception_type=True,
+    raising_exceptions=io_ops_bad_exc,
+    *args,
+    **kwargs,
+):
+    """Evaluate the equality of an I/O operation's outputs in Modin and pandas.
+
+    Parameters
+    ----------
+    fn_name: str
+        I/O operation name ("read_csv" for example).
+    comparator: obj
+        Function to perform comparison.
+    cast_to_str: bool
+        There could be some mismatches in dtypes, so the whole frame is
+        cast to `str` before comparison.
+        See issue #1931 for details.
+    check_exception_type: bool
+        Whether to check exception types when the operation fails
+        (the exception types raised by pandas and Modin are compared).
+    raising_exceptions: Exception or list of Exceptions
+        Exceptions that are not acceptable even if they are raised
+        by both pandas and Modin (the check is evaluated only if
+        `check_exception_type` is passed as `True`).
+    """
+
+    def applyier(module, *args, **kwargs):
+        result = getattr(module, fn_name)(*args, **kwargs)
+        if cast_to_str:
+            result = result.astype(str)
+        return result
+
+    eval_general(
+        pd,
+        pandas,
+        applyier,
+        check_exception_type=check_exception_type,
+        raising_exceptions=raising_exceptions,
+        *args,
+        **kwargs,
+    )
+
+
 def create_test_dfs(*args, **kwargs):
     return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)
 
@@ -771,3 +838,124 @@ def generate_none_dfs():
         }
     )
     return df, df2
+
+
+def get_unique_filename(
+    test_name: str,
+    kwargs: dict = {},
+    extension: str = "csv",
+    data_dir: str = IO_OPS_DATA_DIR,
+    suffix: str = "",
+):
+    """Return a unique file name built from the specified parameters.
+
+    Parameters
+    ----------
+    test_name: str
+        Name of the test for which the unique file name is needed.
+    kwargs: dict
+        Unique combination of test parameters used to build the unique name.
+    extension: str
+        Extension of the unique file.
+    data_dir: str
+        Data directory where test files will be created.
+    suffix: str
+        String to append to the resulting name.
+
+    Returns
+    -------
+    Unique file name.
+    """
+    # shortcut if the kwargs parameter is not provided
+    if len(kwargs) == 0 and extension == "csv" and suffix == "":
+        return os.path.join(data_dir, test_name + f".{extension}")
+
+    assert "." not in extension, "please provide a pure extension name without '.'"
+    prohibited_chars = ['"', "\n"]
+    non_prohibited_char = "np_char"
+    char_counter = 0
+    kwargs_name = dict(kwargs)
+    for key, value in kwargs_name.items():
+        for char in prohibited_chars:
+            if (isinstance(value, str) and char in value) or callable(value):
+                kwargs_name[key] = non_prohibited_char + str(char_counter)
+                char_counter += 1
+    parameters_values = "_".join(
+        [
+            str(value)
+            if not isinstance(value, (list, tuple))
+            else "_".join([str(x) for x in value])
+            for value in kwargs_name.values()
+        ]
+    )
+    return os.path.join(data_dir, parameters_values + f"_{suffix}" + f".{extension}")
+
+
+def get_random_string():
+    random_string = "".join(
+        random_state.choice([x for x in ascii_letters], size=10).tolist()
+    )
+    return random_string
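A hypothetical call, to show the naming scheme (the argument values below are
invented): when `kwargs` is non-empty, the file name is built from the
parameter values rather than from `test_name`.

    # Hypothetical example, not part of the patch.
    name = get_unique_filename(
        "test_read_csv_delimiter", kwargs={"sep": ";", "thousands": None}
    )
    # the values are joined with "_", so the name ends with ";_None_.csv"
    # inside IO_OPS_DATA_DIR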
+def insert_lines_to_csv(
+    csv_name: str,
+    lines_positions: list,
+    lines_type: str = "blank",
+    encoding: str = None,
+    **csv_reader_writer_params,
+):
+    """Insert lines into a ".csv" file.
+
+    Parameters
+    ----------
+    csv_name: str
+        ".csv" file that should be modified.
+    lines_positions: list of ints
+        Positions of the lines that should be modified
+        (0-based serial numbers of the lines).
+    lines_type: str
+        Type of the lines that should be inserted into the ".csv" file.
+        Possible types:
+        "blank" - an empty line without any delimiters/separators,
+        "bad" - a line with len(lines_data) > cols_number
+    encoding: str
+        Encoding type that should be used during file reading and writing.
+    """
+    cols_number = len(pandas.read_csv(csv_name, nrows=1).columns)
+    if lines_type == "blank":
+        lines_data = []
+    elif lines_type == "bad":
+        lines_data = [x for x in range(cols_number + 1)]
+    else:
+        raise ValueError(
+            f"acceptable values for lines_type are ['blank', 'bad'], got {lines_type}"
+        )
+    lines = []
+    with open(csv_name, "r", encoding=encoding, newline="") as read_file:
+        try:
+            dialect = csv.Sniffer().sniff(read_file.read())
+        except Exception:
+            # fall back to the default dialect if sniffing fails
+            dialect = None
+        # rewind so the reader below sees the whole file even if sniffing failed
+        read_file.seek(0)
+
+        reader = csv.reader(
+            read_file,
+            dialect=dialect if dialect is not None else "excel",
+            **csv_reader_writer_params,
+        )
+        counter = 0
+        for row in reader:
+            if counter in lines_positions:
+                lines.append(lines_data)
+            else:
+                lines.append(row)
+            counter += 1
+    with open(csv_name, "w", encoding=encoding, newline="") as write_file:
+        writer = csv.writer(
+            write_file,
+            dialect=dialect if dialect is not None else "excel",
+            **csv_reader_writer_params,
+        )
+        writer.writerows(lines)
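For illustration, a hypothetical round trip through this helper (the file name
and data are invented, not part of the patch):

    # Write a small CSV, blank out the second data row, and check that
    # pandas skips the blank line by default.
    import pandas

    df = pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df.to_csv("example.csv", index=False)
    # row 0 is the header, so position 2 is the second data row
    insert_lines_to_csv("example.csv", lines_positions=[2], lines_type="blank")
    assert len(pandas.read_csv("example.csv")) == 2  # skip_blank_lines=True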