From 740662977427f03000aa4352f7183bd2d67f18ac Mon Sep 17 00:00:00 2001 From: William Ma <12377941+williamma12@users.noreply.github.com> Date: Sun, 31 Jan 2021 15:46:23 -0600 Subject: [PATCH] FEAT-#2451: Linting Signed-off-by: William Ma --- modin/engines/base/io/file_dispatcher.py | 5 ++--- modin/engines/base/io/text/csv_glob_dispatcher.py | 12 +++++------- modin/experimental/engines/pandas_on_ray/io_exp.py | 4 +++- modin/experimental/pandas/io_exp.py | 1 + modin/experimental/pandas/test/test_io_exp.py | 11 ++++++++++- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/modin/engines/base/io/file_dispatcher.py b/modin/engines/base/io/file_dispatcher.py index 1b7cc980baf..162a65e77b8 100644 --- a/modin/engines/base/io/file_dispatcher.py +++ b/modin/engines/base/io/file_dispatcher.py @@ -68,7 +68,7 @@ def get_path(cls, file_path: str) -> str: String of strings of absolute file paths. """ if S3_ADDRESS_REGEX.search(file_path): - return _s3_path(file_path, False)[0] + return cls._s3_path(file_path, False)[0] else: return os.path.abspath(file_path) @@ -172,7 +172,6 @@ def get_file_path(fs_handle) -> List[str]: s3fs = S3FS.S3FileSystem(anon=True) return get_file_path(s3fs) - @classmethod def file_exists(cls, file_path: str) -> bool: """ @@ -190,7 +189,7 @@ def file_exists(cls, file_path: str) -> bool: """ if isinstance(file_path, str): if S3_ADDRESS_REGEX.search(file_path): - return len(_s3_path(file_path, False)) > 0 + return len(cls._s3_path(file_path, False)) > 0 return os.path.exists(file_path) @classmethod diff --git a/modin/engines/base/io/text/csv_glob_dispatcher.py b/modin/engines/base/io/text/csv_glob_dispatcher.py index df7872221ab..98b39a0b7a5 100644 --- a/modin/engines/base/io/text/csv_glob_dispatcher.py +++ b/modin/engines/base/io/text/csv_glob_dispatcher.py @@ -354,7 +354,6 @@ def partitioned_multiple_files( final_result = [] split_result = [] split_size = 0 - read_rows_counter = 0 for f, fname in zip(files, fnames): if skip_header: outside_quotes, read_rows = cls._read_rows( @@ -369,7 +368,7 @@ def partitioned_multiple_files( remainder_size = partition_size - split_size start = f.tell() if nrows: - outside_quotes, read_rows = cls._read_rows( + _, read_rows = cls._read_rows( f, nrows=remainder_size, quotechar=quotechar, @@ -379,14 +378,14 @@ def partitioned_multiple_files( nrows -= read_rows end = f.tell() else: - outside_quotes = cls.offset( + cls.offset( f, offset_size=remainder_size, quotechar=quotechar, is_quoting=is_quoting, ) end = f.tell() - split_size += (end - start) + split_size += end - start split_result.append((fname, start, end)) if split_size < partition_size: continue @@ -411,7 +410,7 @@ def partitioned_multiple_files( continue else: rows_read -= skiprows - + # Calculate if the last split needs to be carried over to the next file. if nrows: last_size = rows_read % partition_size @@ -419,7 +418,7 @@ def partitioned_multiple_files( nrows -= rows_read else: _, last_start, last_end = file_splits[-1] - last_size = (last_end - last_start) + last_size = last_end - last_start full_last_partition = last_size >= partition_size if full_last_partition: @@ -434,4 +433,3 @@ def partitioned_multiple_files( final_result.append(split_result) return final_result - diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py index dbd06177c15..33c3e262f7c 100644 --- a/modin/experimental/engines/pandas_on_ray/io_exp.py +++ b/modin/experimental/engines/pandas_on_ray/io_exp.py @@ -62,7 +62,9 @@ class ExperimentalPandasOnRayIO(PandasOnRayIO): query_compiler_cls=PandasQueryCompiler, frame_cls=PandasOnRayFrame, ) - read_csv = type("", (RayTask, PandasCSVGlobParser, CSVGlobDispatcher), build_args)._read + read_csv = type( + "", (RayTask, PandasCSVGlobParser, CSVGlobDispatcher), build_args + )._read read_parquet_remote_task = _read_parquet_columns @classmethod diff --git a/modin/experimental/pandas/io_exp.py b/modin/experimental/pandas/io_exp.py index cce2c4c9a52..d55c4f23fcc 100644 --- a/modin/experimental/pandas/io_exp.py +++ b/modin/experimental/pandas/io_exp.py @@ -73,6 +73,7 @@ def read_sql( _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs)) + # CSV and table def _make_parser_func(sep): """ diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py index 7459f514c93..74fdc9aecc7 100644 --- a/modin/experimental/pandas/test/test_io_exp.py +++ b/modin/experimental/pandas/test/test_io_exp.py @@ -17,6 +17,7 @@ from modin.config import Engine from modin.pandas.test.test_io import ( # noqa: F401 df_equals, + eval_io, make_sql_connection, make_csv_file, ) @@ -66,10 +67,11 @@ def test_from_sql_defaults(make_sql_connection): # noqa: F811 df_equals(modin_df_from_query, pandas_df) df_equals(modin_df_from_table, pandas_df) + @pytest.mark.skipif( Engine.get() != "Ray", reason="Currently only support Ray engine for glob paths." ) -def test_read_multiple_csv(make_csv_file): +def test_read_multiple_csv(make_csv_file): # noqa: F811 base_name = get_unique_filename(extension="") glob_path = "{}_*.csv".format(base_name) files = ["{}_{}.csv".format(base_name, i) for i in range(2)] @@ -89,3 +91,10 @@ def test_read_multiple_csv(make_csv_file): except AssertionError: df_equals(modin_df, pandas_df2) + +def test_read_csv_s3(self): + eval_io( + fn_name="read_csv", + # read_csv kwargs + filepath_or_buffer="s3://noaa-ghcn-pds/csv/178*.csv", + )