From a11e7c9bab7d5f640232ddb59e883ff7a8b57565 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Fri, 30 Oct 2020 07:25:53 +0300 Subject: [PATCH 01/42] DOCS-#2193: Add contributing doc in checklist (#2216) * DOCS-#2193: update contributing doc Signed-off-by: Anatoly Myachev --- docs/{developer/contributing.rst => CONTRIBUTING.rst} | 10 +++++----- docs/index.rst | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) rename docs/{developer/contributing.rst => CONTRIBUTING.rst} (96%) diff --git a/docs/developer/contributing.rst b/docs/CONTRIBUTING.rst similarity index 96% rename from docs/developer/contributing.rst rename to docs/CONTRIBUTING.rst index 4ce4e3091bb..3841c7a9812 100644 --- a/docs/developer/contributing.rst +++ b/docs/CONTRIBUTING.rst @@ -113,10 +113,10 @@ dependencies for running the tests and formatting the code: .. code-block:: bash + conda env create --file environment.yml + # or pip install -r requirements.txt -For developments under Windows, dependencies can be found in 'env_windows.yml' file. - Code Formatting and Lint ------------------------ @@ -128,13 +128,13 @@ that you run the following from the project root: black modin/ We also use flake8_ to check linting errors. Running the following from the project root -will ensure that it passes the lint checks on Travis: +will ensure that it passes the lint checks on Github Actions: .. code-block:: bash flake8 . -We test that this has been run on our `Travis CI`_ test suite. If you do this and find +We test that this has been run on our `Github Actions`_ test suite. If you do this and find that the tests are still failing, try updating your version of black and flake8. Adding a test @@ -181,6 +181,6 @@ More docs on this coming soon... .. _internal methods: .. _black: https://github.com/ambv/black .. _flake8: http://flake8.pycqa.org/en/latest/ -.. _Travis CI: https://travis-ci.org/ +.. _Github Actions: https://github.com/features/actions .. _testing: .. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev diff --git a/docs/index.rst b/docs/index.rst index 2f8e80822a5..7f3a3ee3b45 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -140,7 +140,7 @@ nature, you get a fast DataFrame at 1MB and 1TB+. .. toctree:: :caption: Developer Documentation - developer/contributing + contributing developer/architecture .. 
toctree:: From 737ec340f74dfd7f06ea73343c1ea7236bc25edd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Fri, 30 Oct 2020 14:04:35 +0300 Subject: [PATCH 02/42] REFACTOR-#2343: refactor offset, _read_rows, partitioned_file (#2344) Signed-off-by: Anatoly Myachev --- modin/engines/base/io/text/csv_reader.py | 9 +- modin/engines/base/io/text/fwf_reader.py | 9 +- .../engines/base/io/text/text_file_reader.py | 264 ++++++++---------- 3 files changed, 120 insertions(+), 162 deletions(-) diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py index 85a844c26c5..f7faf438847 100644 --- a/modin/engines/base/io/text/csv_reader.py +++ b/modin/engines/base/io/text/csv_reader.py @@ -120,12 +120,6 @@ def _read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - cls.offset( - f, - nrows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions @@ -163,8 +157,9 @@ def _read(cls, filepath_or_buffer, **kwargs): splits = cls.partitioned_file( f, - nrows=nrows, num_partitions=num_partitions, + nrows=nrows, + skiprows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) diff --git a/modin/engines/base/io/text/fwf_reader.py b/modin/engines/base/io/text/fwf_reader.py index 7506ce448c2..e5c6bd36680 100644 --- a/modin/engines/base/io/text/fwf_reader.py +++ b/modin/engines/base/io/text/fwf_reader.py @@ -116,12 +116,6 @@ def read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - cls.offset( - f, - nrows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions @@ -159,8 +153,9 @@ def read(cls, filepath_or_buffer, **kwargs): splits = cls.partitioned_file( f, - nrows=nrows, num_partitions=num_partitions, + nrows=nrows, + skiprows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) diff --git a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py index bc194cc2986..8b60c7e9866 100644 --- a/modin/engines/base/io/text/text_file_reader.py +++ b/modin/engines/base/io/text/text_file_reader.py @@ -56,96 +56,83 @@ def pathlib_or_pypath(cls, filepath_or_buffer): def offset( cls, f, - nrows=None, - skiprows=None, - chunk_size_bytes=None, - quotechar=b'"', - is_quoting=True, + offset_size: int, + quotechar: bytes = b'"', + is_quoting: bool = True, ): """ - Moves the file offset at the specified amount of bytes/rows. + Moves the file offset at the specified amount of bytes. Parameters ---------- - f: file object - nrows: int, number of rows to read. Optional, if not specified will only - consider `chunk_size_bytes` parameter. - chunk_size_bytes: int, Will read new rows while file pointer - is less than `chunk_size_bytes`. Optional, if not specified will only - consider `nrows` parameter. - skiprows: array or callable (optional), specifies rows to skip - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) + f: file object + offset_size: int + Number of bytes to read and ignore. + quotechar: bytes, default b'"' + Indicate quote in a file. 
+ is_quoting: bool, default True + Whether or not to consider quotes. Returns ------- - bool: If file pointer reached the end of the file, but did not find + bool + If file pointer reached the end of the file, but did not find closing quote returns `False`. `True` in any other case. """ - assert ( - nrows is not None or chunk_size_bytes is not None - ), "`nrows` and `chunk_size_bytes` can't be None at the same time" - - if nrows is not None or skiprows is not None: - return cls._read_rows( - f, - nrows=nrows, - skiprows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - max_bytes=chunk_size_bytes, - )[0] - - outside_quotes = True if is_quoting: - chunk = f.read(chunk_size_bytes) - line = f.readline() # Ensure we read up to a newline - # We need to ensure that one row isn't split across different partitions - outside_quotes = not ((chunk.count(quotechar) + line.count(quotechar)) % 2) - while not outside_quotes: - line = f.readline() - outside_quotes = line.count(quotechar) % 2 - if not line: - break + chunk = f.read(offset_size) + outside_quotes = not chunk.count(quotechar) % 2 else: - f.seek(chunk_size_bytes, os.SEEK_CUR) - f.readline() + f.seek(offset_size, os.SEEK_CUR) + outside_quotes = True + + # after we read `offset_size` bytes, we most likely break the line but + # the modin implementation doesn't work correctly in the case, so we must + # make sure that the line is read completely to the lineterminator, + # which is what the `_read_rows` does + outside_quotes, _ = cls._read_rows( + f, + nrows=1, + quotechar=quotechar, + is_quoting=is_quoting, + outside_quotes=outside_quotes, + ) + return outside_quotes @classmethod def partitioned_file( cls, f, - nrows=None, - skiprows=None, - num_partitions=None, - quotechar=b'"', - is_quoting=True, - from_begin=False, + num_partitions: int = None, + nrows: int = None, + skiprows: int = None, + quotechar: bytes = b'"', + is_quoting: bool = True, ): - """Computes chunk sizes in bytes for every partition. + """ + Compute chunk sizes in bytes for every partition. Parameters ---------- - f: file to be partitioned - nrows: int (optional), number of rows of file to read - skiprows: array or callable (optional), specifies rows to skip - num_partitions: int, for what number of partitions split a file. - Optional, if not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) - from_begin: bool, Whether or not to set the file pointer to the begining of the file - (optional, by default it's `False`) + f: file to be partitioned + num_partitions: int, optional + For what number of partitions split a file. + If not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` + nrows: int, optional + Number of rows of file to read. + skiprows: array or callable, optional + Specifies rows to skip. + quotechar: bytes, default b'"' + Indicate quote in a file. + is_quoting: bool, default True + Whether or not to consider quotes. Returns ------- - An array, where each element of array is a tuple of two ints: - beginning and the end offsets of the current chunk. + An array, where each element of array is a tuple of two ints: + beginning and the end offsets of the current chunk. 
""" if num_partitions is None: from modin.pandas import DEFAULT_NPARTITIONS @@ -153,46 +140,54 @@ def partitioned_file( num_partitions = DEFAULT_NPARTITIONS result = [] + file_size = cls.file_size(f) - old_position = f.tell() - if from_begin: - f.seek(0, os.SEEK_SET) - - current_start = f.tell() - total_bytes = cls.file_size(f) - - # if `nrows` are specified we want to use rows as a part measure - if nrows is not None: - chunk_size_bytes = None - rows_per_part = max(1, num_partitions, nrows // num_partitions) - else: - chunk_size_bytes = max(1, num_partitions, total_bytes // num_partitions) - rows_per_part = None - nrows = float("inf") - - rows_readed = 0 - while f.tell() < total_bytes and rows_readed < nrows: - if rows_per_part is not None and rows_readed + rows_per_part > nrows: - rows_per_part = nrows - rows_readed - - outside_quotes = cls.offset( + if skiprows: + outside_quotes, read_rows = cls._read_rows( f, - nrows=rows_per_part, - skiprows=skiprows, - chunk_size_bytes=chunk_size_bytes, + nrows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) - result.append((current_start, f.tell())) - current_start = f.tell() - if rows_per_part is not None: - rows_readed += rows_per_part - - if is_quoting and not outside_quotes: - warnings.warn("File has mismatched quotes") - - f.seek(old_position, os.SEEK_SET) + start = f.tell() + + if nrows: + read_rows_counter = 0 + partition_size = max(1, num_partitions, nrows // num_partitions) + while f.tell() < file_size and read_rows_counter < nrows: + if read_rows_counter + partition_size > nrows: + # it's possible only if is_quoting==True + partition_size = nrows - read_rows_counter + outside_quotes, read_rows = cls._read_rows( + f, + nrows=partition_size, + quotechar=quotechar, + is_quoting=is_quoting, + ) + result.append((start, f.tell())) + start = f.tell() + read_rows_counter += read_rows + + # add outside_quotes + if is_quoting and not outside_quotes: + warnings.warn("File has mismatched quotes") + else: + partition_size = max(1, num_partitions, file_size // num_partitions) + while f.tell() < file_size: + outside_quotes = cls.offset( + f, + offset_size=partition_size, + quotechar=quotechar, + is_quoting=is_quoting, + ) + + result.append((start, f.tell())) + start = f.tell() + + # add outside_quotes + if is_quoting and not outside_quotes: + warnings.warn("File has mismatched quotes") return result @@ -200,75 +195,48 @@ def partitioned_file( def _read_rows( cls, f, - nrows=None, - skiprows=None, - quotechar=b'"', - is_quoting=True, - max_bytes=None, + nrows: int, + quotechar: bytes = b'"', + is_quoting: bool = True, + outside_quotes: bool = True, ): """ - Moves the file offset at the specified amount of rows - Note: the difference between `offset` is that `_read_rows` is more - specific version of `offset` which is focused of reading **rows**. - In common case it's better to use `offset`. + Move the file offset at the specified amount of rows. Parameters ---------- - f: file object - nrows: int, number of rows to read. Optional, if not specified will only - consider `max_bytes` parameter. - skiprows: int, array or callable (optional), specifies rows to skip - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) - max_bytes: int, Will read new rows while file pointer - is less than `max_bytes`. Optional, if not specified will only - consider `nrows` parameter, if both not specified will read till - the end of the file. 
+ f: file object + nrows: int + Number of rows to read. + quotechar: bytes, default b'"' + Indicate quote in a file. + is_quoting: bool, default True + Whether or not to consider quotes. + outside_quotes: bool, default True + Whether the file pointer is within quotes or not at the time this function is called. Returns ------- - tuple of bool and int, - bool: If file pointer reached the end of the file, but did not find + tuple of bool and int, + bool: If file pointer reached the end of the file, but did not find closing quote returns `False`. `True` in any other case. - int: Number of rows that was readed. + int: Number of rows that was read. """ - assert skiprows is None or isinstance( - skiprows, int - ), f"Skiprows as a {type(skiprows)} is not supported yet." - - if nrows is None and max_bytes is None: - max_bytes = float("inf") - if nrows is not None and nrows <= 0: return True, 0 - # we need this condition to avoid unnecessary checks in `stop_condition` - # which executes in a huge for loop - if nrows is not None and max_bytes is None: - stop_condition = lambda rows_readed: rows_readed >= nrows # noqa (E731) - elif nrows is not None and max_bytes is not None: - stop_condition = ( - lambda rows_readed: f.tell() >= max_bytes or rows_readed >= nrows - ) # noqa (E731) - else: - stop_condition = lambda rows_readed: f.tell() >= max_bytes # noqa (E731) - - if max_bytes is not None: - max_bytes = max_bytes + f.tell() + rows_read = 0 - rows_readed = 0 - outside_quotes = True for line in f: if is_quoting and line.count(quotechar) % 2: outside_quotes = not outside_quotes if outside_quotes: - rows_readed += 1 - if stop_condition(rows_readed): + rows_read += 1 + if rows_read >= nrows: break + # case when EOF if not outside_quotes: - rows_readed += 1 + rows_read += 1 - return outside_quotes, rows_readed + return outside_quotes, rows_read From c86422afcfe311743d7f326a40063e1dc436c5d9 Mon Sep 17 00:00:00 2001 From: YarShev Date: Fri, 30 Oct 2020 16:04:51 +0300 Subject: [PATCH 03/42] FIX-#1927: Fix performance issue related to `sparse` attribute access (#2318) Signed-off-by: Igoshev, Yaroslav --- modin/pandas/accessor.py | 111 ++++++++++++++++++++ modin/pandas/dataframe.py | 5 +- modin/pandas/series.py | 5 +- modin/pandas/test/dataframe/test_default.py | 13 ++- modin/pandas/test/test_api.py | 1 + modin/pandas/test/test_series.py | 23 ++-- 6 files changed, 138 insertions(+), 20 deletions(-) create mode 100644 modin/pandas/accessor.py diff --git a/modin/pandas/accessor.py b/modin/pandas/accessor.py new file mode 100644 index 00000000000..b4895b7eabc --- /dev/null +++ b/modin/pandas/accessor.py @@ -0,0 +1,111 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
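The `_read_rows` loop above hinges on a quote-parity invariant: a physical line only terminates a CSV record when the running count of quote characters seen so far is even, so multi-line quoted fields are never split across partitions. A minimal standalone sketch of that idea (simplified, not the exact Modin code; `count_complete_rows` is a hypothetical name):

    import io

    def count_complete_rows(f, nrows, quotechar=b'"'):
        # Advance the binary file object by `nrows` logical CSV rows.
        # A physical line closes a row only while we are outside quotes,
        # tracked by flipping a flag on every line with an odd quote count.
        outside_quotes, rows_read = True, 0
        for line in f:
            if line.count(quotechar) % 2:
                outside_quotes = not outside_quotes
            if outside_quotes:
                rows_read += 1
                if rows_read >= nrows:
                    break
        return outside_quotes, rows_read

    f = io.BytesIO(b'a,"multi\nline",c\nx,y,z\n')
    print(count_complete_rows(f, 1))  # (True, 1); f now points at the second record

`partitioned_file` builds on the same primitive: it repeatedly advances by a byte budget (`offset`) or a row budget (`_read_rows`) and records a `(start, end)` offset pair for each partition.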
+ +import pandas +from pandas.core.arrays.sparse.dtype import SparseDtype + +from modin.utils import _inherit_docstrings + + +class BaseSparseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + def _default_to_pandas(self, op, *args, **kwargs): + return self._parent._default_to_pandas( + lambda parent: op(parent.sparse, *args, **kwargs) + ) + + +@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor) +class SparseFrameAccessor(BaseSparseAccessor): + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @property + def density(self): + return self._parent._default_to_pandas(pandas.DataFrame.sparse).density + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + return cls._default_to_pandas( + pandas.DataFrame.sparse.from_spmatrix, data, index=index, columns=columns + ) + + def to_dense(self): + return self._default_to_pandas(pandas.DataFrame.sparse.to_dense) + + def to_coo(self): + return self._default_to_pandas(pandas.DataFrame.sparse.to_coo) + + +@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor) +class SparseAccessor(BaseSparseAccessor): + def _validate(self, data): + if not isinstance(data.dtype, SparseDtype): + raise AttributeError(self._validation_msg) + + @property + def density(self): + return self._parent._default_to_pandas(pandas.Series.sparse).density + + @property + def fill_value(self): + return self._parent._default_to_pandas(pandas.Series.sparse).fill_value + + @property + def npoints(self): + return self._parent._default_to_pandas(pandas.Series.sparse).npoints + + @property + def sp_values(self): + return self._parent._default_to_pandas(pandas.Series.sparse).sp_values + + @classmethod + def from_coo(cls, A, dense_index=False): + return cls._default_to_pandas( + pandas.Series.sparse.from_coo, A, dense_index=dense_index + ) + + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + return self._default_to_pandas( + pandas.Series.sparse.to_coo, + row_levels=row_levels, + column_levels=column_levels, + sort_labels=sort_labels, + ) + + def to_dense(self): + return self._default_to_pandas(pandas.Series.sparse.to_dense) + + +@_inherit_docstrings(pandas.core.accessor.CachedAccessor) +class CachedAccessor: + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + return self._accessor + accessor_obj = self._accessor(obj) + object.__setattr__(obj, self._name, accessor_obj) + return accessor_obj diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 2d7ee67161e..4c6e6af9846 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -54,6 +54,7 @@ from .series import Series from .base import BasePandasDataset, _ATTRS_NO_LOOKUP from .groupby import DataFrameGroupBy +from .accessor import CachedAccessor, SparseFrameAccessor @_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__]) @@ -1594,9 +1595,7 @@ def set_index( if not inplace: return frame - @property - def sparse(self): - return self._default_to_pandas(pandas.DataFrame.sparse) + sparse = CachedAccessor("sparse", SparseFrameAccessor) def squeeze(self, axis=None): axis = self._get_axis_number(axis) if axis is not None else None diff 
--git a/modin/pandas/series.py b/modin/pandas/series.py index 6a1e11e4929..c3833cfe64a 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -41,6 +41,7 @@ from .base import BasePandasDataset, _ATTRS_NO_LOOKUP from .iterator import PartitionIterator from .utils import from_pandas, is_scalar +from .accessor import CachedAccessor, SparseAccessor @_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__]) @@ -1187,9 +1188,7 @@ def sort_values( result._query_compiler, inplace=inplace ) - @property - def sparse(self): - return self._default_to_pandas(pandas.Series.sparse) + sparse = CachedAccessor("sparse", SparseAccessor) def squeeze(self, axis=None): if axis is not None: diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index b8b39c203da..552a9fa7480 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -1151,6 +1151,13 @@ def test___bool__(data): eval_general(*create_test_dfs(data), lambda df: df.__bool__()) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_hasattr_sparse(data): - eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse")) +@pytest.mark.parametrize( + "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] +) +def test_hasattr_sparse(is_sparse_data): + modin_df, pandas_df = ( + create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values())) + if is_sparse_data + else create_test_dfs(test_data["float_nan_data"]) + ) + eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index 319ae2bf505..abb907f639b 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -48,6 +48,7 @@ def test_top_level_api_equality(): "DEFAULT_NPARTITIONS", "iterator", "series", + "accessor", "base", "utils", "dataframe", diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index ec4aba6879a..990c5c0292d 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -4397,17 +4397,18 @@ def test_encode(data, encoding_type): df_equals(modin_result, pandas_result) -@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) -def test_hasattr_sparse(data): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = hasattr(pandas_series, "sparse") - except Exception as e: - with pytest.raises(type(e)): - hasattr(modin_series, "sparse") - else: - modin_result = hasattr(modin_series, "sparse") - assert modin_result == pandas_result +@pytest.mark.parametrize( + "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] +) +def test_hasattr_sparse(is_sparse_data): + modin_df, pandas_df = ( + create_test_series( + pandas.arrays.SparseArray(test_data["float_nan_data"].values()) + ) + if is_sparse_data + else create_test_series(test_data["float_nan_data"]) + ) + eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) @pytest.mark.parametrize( From f1f82eebcddd4a65db527cab534d82bfa7ccec19 Mon Sep 17 00:00:00 2001 From: YarShev Date: Fri, 30 Oct 2020 16:28:33 +0300 Subject: [PATCH 04/42] FIX-#2269: Move `default_to_pandas` logic from API layer to backend (#2332) * FIX-#2269: Move `default_to_pandas` logic from API layer to backend Signed-off-by: Igoshev, Yaroslav * FIX-#2269: Added a test which calls _apply_agg_function Signed-off-by: Gregory Shimansky * FIX-#2269: Added required 
arguments for groupby_agg Moved wrap_udf_function into backend because omnisci doesn't support executing lambdas. Signed-off-by: Gregory Shimansky * FIX-#2269: Use correct default_to_pandas for groupby in backend, refactor default to pandas functions in BaseQC Signed-off-by: Igoshev, Yaroslav * FIX-#2269: Renamed new default_to_pandas_groupby function into private function of Pandas backend because it is not used anywhere else. Signed-off-by: Gregory Shimansky * FIX-#2269: Fixed specification of backend now it is possible to specify --backend=PandasOnDask, --backend=PandasOnRay or --backend=PandasOnPython, not just --backend=BaseOnPython. Signed-off-by: Gregory Shimansky * FIX-#2269: Fix BaseOnPython tests Signed-off-by: Igoshev, Yaroslav * FIX-#2269: Remove default_to_pandas_groupby Signed-off-by: Igoshev, Yaroslav * FIX-#2269: logic of dropping 'by' moved back to API level Signed-off-by: Dmitry Chigarev Co-authored-by: Gregory Shimansky Co-authored-by: Dmitry Chigarev --- modin/backends/base/query_compiler.py | 30 +++++++++-- modin/backends/pandas/query_compiler.py | 51 ++++++++++++++----- .../default_methods/groupby_default.py | 32 ++++++++---- .../backends/omnisci/query_compiler.py | 19 ++++++- .../omnisci_on_ray/test/test_dataframe.py | 22 ++++++++ modin/pandas/groupby.py | 22 +++----- modin/pandas/test/conftest.py | 2 +- 7 files changed, 137 insertions(+), 41 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index ceb582ae720..e9657ebbad6 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -58,7 +58,8 @@ class BaseQueryCompiler(abc.ABC): @abc.abstractmethod def default_to_pandas(self, pandas_op, *args, **kwargs): - """Default to pandas behavior. + """ + Default to pandas behavior. Parameters ---------- @@ -1396,14 +1397,35 @@ def groupby_size( drop=drop, ) - def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + if is_multi_by: + if isinstance(by, type(self)) and len(by.columns) == 1: + by = by.columns[0] if drop else by.to_pandas().squeeze() + elif isinstance(by, type(self)): + by = list(by.columns) + else: + by = by + else: + by = by.to_pandas().squeeze() if isinstance(by, type(self)) else by + return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)( self, by=by, + is_multi_by=is_multi_by, axis=axis, agg_func=agg_func, - groupby_args=groupby_args, - agg_args=agg_args, + groupby_args=groupby_kwargs, + agg_args=agg_kwargs, drop=drop, ) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 89014b197b8..b5484b0c3c2 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -190,7 +190,8 @@ def __init__(self, modin_frame): self._modin_frame = modin_frame def default_to_pandas(self, pandas_op, *args, **kwargs): - """Default to pandas behavior. + """ + Default to pandas behavior. Parameters ---------- @@ -206,8 +207,8 @@ def default_to_pandas(self, pandas_op, *args, **kwargs): PandasQueryCompiler The result of the `pandas_op`, converted back to PandasQueryCompiler - Note - ---- + Notes + ----- This operation takes a distributed object and converts it directly to pandas. 
""" op_name = getattr(pandas_op, "__name__", str(pandas_op)) @@ -2583,24 +2584,50 @@ def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): lambda df: df.groupby(by=by, **groupby_args).agg(func_dict, **agg_args) ) - def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): - # since we're going to modify `groupby_args` dict in a `groupby_agg_builder`, + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + agg_func = wrap_udf_function(agg_func) + + if is_multi_by: + return super().groupby_agg( + by=by, + is_multi_by=is_multi_by, + axis=axis, + agg_func=agg_func, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + groupby_kwargs=groupby_kwargs, + drop=drop, + ) + + by = by.to_pandas().squeeze() if isinstance(by, type(self)) else by + + # since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`, # we want to copy it to not propagate these changes into source dict, in case # of unsuccessful end of function - groupby_args = groupby_args.copy() + groupby_kwargs = groupby_kwargs.copy() - as_index = groupby_args.get("as_index", True) + as_index = groupby_kwargs.get("as_index", True) def groupby_agg_builder(df): # Set `as_index` to True to track the metadata of the grouping object # It is used to make sure that between phases we are constructing the # right index and placing columns in the correct order. - groupby_args["as_index"] = True + groupby_kwargs["as_index"] = True def compute_groupby(df): - grouped_df = df.groupby(by=by, axis=axis, **groupby_args) + grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs) try: - result = agg_func(grouped_df, **agg_args) + result = agg_func(grouped_df, **agg_kwargs) # This happens when the partition is filled with non-numeric data and a # numeric operation is done. We need to build the index here to avoid # issues with extracting the index. @@ -2628,13 +2655,13 @@ def compute_groupby(df): try: agg_func( pandas.DataFrame(index=[1], columns=[1]).groupby(level=0), - **agg_args, + **agg_kwargs, ) except Exception as e: raise type(e)("No numeric types to aggregate.") # Reset `as_index` because it was edited inplace. 
- groupby_args["as_index"] = as_index + groupby_kwargs["as_index"] = as_index if as_index: return result else: diff --git a/modin/data_management/functions/default_methods/groupby_default.py b/modin/data_management/functions/default_methods/groupby_default.py index 62bda259739..b6ae497c75f 100644 --- a/modin/data_management/functions/default_methods/groupby_default.py +++ b/modin/data_management/functions/default_methods/groupby_default.py @@ -61,22 +61,36 @@ def get_func(cls, grp, key, **kwargs): @classmethod def build_aggregate_method(cls, key): - def fn(df, by, groupby_args, agg_args, axis=0, drop=False, **kwargs): + def fn( + df, + by, + groupby_args, + agg_args, + axis=0, + is_multi_by=None, + drop=False, + **kwargs + ): by = cls.validate_by(by) - groupby_args = groupby_args.copy() - as_index = groupby_args.pop("as_index", True) - groupby_args["as_index"] = True + + if not is_multi_by: + groupby_args = groupby_args.copy() + as_index = groupby_args.pop("as_index", True) + groupby_args["as_index"] = True grp = df.groupby(by, axis=axis, **groupby_args) agg_func = cls.get_func(grp, key, **kwargs) result = agg_func(grp, **agg_args) - if as_index: - return result + if not is_multi_by: + if as_index: + return result + else: + if result.index.name is None or result.index.name in result.columns: + drop = False + return result.reset_index(drop=not drop) else: - if result.index.name is None or result.index.name in result.columns: - drop = False - return result.reset_index(drop=not drop) + return result return fn diff --git a/modin/experimental/backends/omnisci/query_compiler.py b/modin/experimental/backends/omnisci/query_compiler.py index f04d3cbfd6b..1851444e438 100644 --- a/modin/experimental/backends/omnisci/query_compiler.py +++ b/modin/experimental/backends/omnisci/query_compiler.py @@ -262,6 +262,23 @@ def groupby_count(self, by, axis, groupby_args, map_args, **kwargs): ) return self.__constructor__(new_frame) + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + # TODO: handle `is_multi_by`, `agg_args`, `drop` args + new_frame = self._modin_frame.groupby_agg( + by, axis, agg_func, groupby_kwargs, **agg_kwargs + ) + return self.__constructor__(new_frame) + def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): """Apply aggregation functions to a grouped dataframe per-column. @@ -283,7 +300,7 @@ def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): DFAlgQueryCompiler The result of the per-column aggregations on the grouped dataframe. 
""" - # TODO: handle drop arg + # TODO: handle `drop` arg new_frame = self._modin_frame.groupby_agg( by, 0, func_dict, groupby_args, **agg_args ) diff --git a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py index 22a471f3c7c..e019249772e 100644 --- a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py +++ b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py @@ -547,6 +547,17 @@ def groupby_count(df, cols, as_index, **kwargs): run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index) + @pytest.mark.xfail( + reason="Currently mean() passes a lambda into backend which cannot be executed on omnisci backend" + ) + @pytest.mark.parametrize("cols", cols_value) + @pytest.mark.parametrize("as_index", bool_arg_values) + def test_groupby_mean(self, cols, as_index): + def groupby_mean(df, cols, as_index, **kwargs): + return df.groupby(cols, as_index=as_index).mean() + + run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) + @pytest.mark.parametrize("cols", cols_value) @pytest.mark.parametrize("as_index", bool_arg_values) def test_groupby_proj_sum(self, cols, as_index): @@ -569,6 +580,17 @@ def groupby(df, **kwargs): run_and_compare(groupby, data=self.data) + @pytest.mark.xfail( + reason="Function specified as a string should be passed into backend API, but currently it is transformed into a lambda" + ) + @pytest.mark.parametrize("cols", cols_value) + @pytest.mark.parametrize("as_index", bool_arg_values) + def test_groupby_agg_mean(self, cols, as_index): + def groupby_mean(df, cols, as_index, **kwargs): + return df.groupby(cols, as_index=as_index).agg("mean") + + run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) + taxi_data = { "a": [1, 1, 2, 2], "b": [11, 21, 12, 11], diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 6feed8ff48b..0158b3c2819 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -29,7 +29,7 @@ import pandas.core.common as com from modin.error_message import ErrorMessage -from modin.utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas +from modin.utils import _inherit_docstrings, try_cast_to_pandas from modin.config import IsExperimental from .series import Series @@ -834,28 +834,22 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs): """ assert callable(f), "'{0}' object is not callable".format(type(f)) - f = wrap_udf_function(f) - if self._is_multi_by: - return self._default_to_pandas(f, *args, **kwargs) - - if isinstance(self._by, type(self._query_compiler)): - by = self._by.to_pandas().squeeze() - else: - by = self._by - # For aggregations, pandas behavior does this for the result. # For other operations it does not, so we wait until there is an aggregation to # actually perform this operation. 
- if self._idx_name is not None and drop and self._drop: + if not self._is_multi_by and self._idx_name is not None and drop and self._drop: groupby_qc = self._query_compiler.drop(columns=[self._idx_name]) else: groupby_qc = self._query_compiler + new_manager = groupby_qc.groupby_agg( - by=by, + by=self._by, + is_multi_by=self._is_multi_by, axis=self._axis, agg_func=f, - groupby_args=self._kwargs, - agg_args=kwargs, + agg_args=args, + agg_kwargs=kwargs, + groupby_kwargs=self._kwargs, drop=self._drop, ) if self._idx_name is not None and self._as_index: diff --git a/modin/pandas/test/conftest.py b/modin/pandas/test/conftest.py index 2cc83a8e068..5fa1eff9cc8 100644 --- a/modin/pandas/test/conftest.py +++ b/modin/pandas/test/conftest.py @@ -71,7 +71,7 @@ def pytest_configure(config): set_base_backend(BASE_BACKEND_NAME) else: partition, engine = backend.split("On") - modin.set_base_backend(engine=engine, partition=backend) + modin.set_backends(engine=engine, partition=partition) def pytest_runtest_call(item): From 067b8ac289d9c33f02b4d0e5d9bc3313df5bdbcf Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Fri, 30 Oct 2020 19:41:33 +0300 Subject: [PATCH 05/42] TEST-#2292: Cover by tests Datetime Handling parameters of read_csv (#2336) Signed-off-by: Alexander Myskov --- modin/pandas/test/test_io.py | 75 ++++++++++++++++++++++++++++++++++-- modin/pandas/test/utils.py | 5 ++- 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 019db34294a..05684f1bd3d 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -38,6 +38,7 @@ get_random_string, insert_lines_to_csv, IO_OPS_DATA_DIR, + io_ops_bad_exc, ) from modin.config import Engine, Backend @@ -176,13 +177,12 @@ def _csv_file_maker( add_nan_lines=False, thousands_separator=None, decimal_separator=None, - lineterminator=None, comment_col_char=None, quoting=csv.QUOTE_MINIMAL, quotechar='"', doublequote=True, escapechar=None, - line_terminator=os.linesep, + line_terminator=None, ): if os.path.exists(filename) and not force: pass @@ -248,7 +248,7 @@ def _csv_file_maker( "delimiter": delimiter, "doublequote": doublequote, "escapechar": escapechar, - "lineterminator": line_terminator, + "lineterminator": line_terminator if line_terminator else os.linesep, "quotechar": quotechar, "quoting": quoting, } @@ -524,6 +524,75 @@ def test_read_csv_delimiters( **kwargs, ) + # Datetime Handling tests + @pytest.mark.parametrize( + "parse_dates", + [ + True, + False, + ["col2"], + ["col2", "col4"], + [1, 3], + pytest.param( + {"foo": ["col2", "col4"]}, + marks=pytest.mark.xfail( + Engine.get() != "Python", + reason="Exception: Internal Error - issue #2073", + ), + ), + ], + ) + @pytest.mark.parametrize("infer_datetime_format", [True, False]) + @pytest.mark.parametrize("keep_date_col", [True, False]) + @pytest.mark.parametrize( + "date_parser", [None, lambda x: pd.datetime.strptime(x, "%Y-%m-%d")] + ) + @pytest.mark.parametrize("dayfirst", [True, False]) + @pytest.mark.parametrize("cache_dates", [True, False]) + def test_read_csv_datetime( + self, + make_csv_file, + request, + parse_dates, + infer_datetime_format, + keep_date_col, + date_parser, + dayfirst, + cache_dates, + ): + if request.config.getoption("--simulate-cloud").lower() != "off": + pytest.xfail( + "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" + ) + + raising_exceptions = io_ops_bad_exc # default value + if isinstance(parse_dates, dict) and 
callable(date_parser): + # In this case raised TypeError: () takes 1 positional argument but 2 were given + raising_exceptions = list(io_ops_bad_exc) + raising_exceptions.remove(TypeError) + + kwargs = { + "parse_dates": parse_dates, + "infer_datetime_format": infer_datetime_format, + "keep_date_col": keep_date_col, + "date_parser": date_parser, + "dayfirst": dayfirst, + "cache_dates": cache_dates, + } + + unique_name = get_unique_filename("test_read_csv_datetime", kwargs) + make_csv_file( + filename=unique_name, + ) + + eval_io( + filepath_or_buffer=unique_name, + fn_name="read_csv", + check_kwargs_callable=not callable(date_parser), + raising_exceptions=raising_exceptions, + **kwargs, + ) + def test_from_parquet(make_parquet_file): make_parquet_file(SMALL_ROW_SIZE) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index baa2daa7ffe..27a7375bb57 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -642,6 +642,7 @@ def eval_general( __inplace__=False, check_exception_type=True, raising_exceptions=None, + check_kwargs_callable=True, **kwargs, ): if raising_exceptions: @@ -670,7 +671,7 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}): return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df) for key, value in kwargs.items(): - if callable(value): + if check_kwargs_callable and callable(value): values = execute_callable(value) # that means, that callable raised an exception if values is None: @@ -696,6 +697,7 @@ def eval_io( cast_to_str=False, check_exception_type=True, raising_exceptions=io_ops_bad_exc, + check_kwargs_callable=True, *args, **kwargs, ): @@ -732,6 +734,7 @@ def applyier(module, *args, **kwargs): applyier, check_exception_type=check_exception_type, raising_exceptions=raising_exceptions, + check_kwargs_callable=check_kwargs_callable, *args, **kwargs, ) From 776d8e27132aa9537b63d388cc4bad337c884ad5 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Fri, 30 Oct 2020 21:59:46 +0300 Subject: [PATCH 06/42] FEAT-#2271: Add implementation of `groupby.shift` (#2323) Signed-off-by: Alexey Prutskov --- modin/pandas/base.py | 4 ++ modin/pandas/groupby.py | 64 +++++++++++++++++++++++++++++-- modin/pandas/test/test_groupby.py | 59 +++++++++++++++++++++++++++- modin/pandas/test/test_series.py | 1 + 4 files changed, 123 insertions(+), 5 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 07bf7f0501b..73d3043d38c 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2113,6 +2113,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): new_frame.columns = self.columns.copy() return new_frame else: + if not isinstance(self, DataFrame): + raise ValueError( + f"No axis named {axis} for object type {type(self)}" + ) res_columns = self.columns from .general import concat diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 0158b3c2819..4441d870f4c 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -22,6 +22,7 @@ Manually add documentation for methods which are not presented in pandas. 
""" +import numpy as np import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like @@ -30,6 +31,7 @@ from modin.error_message import ErrorMessage from modin.utils import _inherit_docstrings, try_cast_to_pandas +from modin.backends.base.query_compiler import BaseQueryCompiler from modin.config import IsExperimental from .series import Series @@ -171,10 +173,64 @@ def idxmax(self): def ndim(self): return 2 # ndim is always 2 for DataFrames - def shift(self, periods=1, freq=None, axis=0): - return self._default_to_pandas( - lambda df: df.shift(periods=periods, freq=freq, axis=axis) - ) + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def _shift(periods, freq, axis, fill_value, is_set_nan_rows=True): + from .dataframe import DataFrame + + result = self._df.shift(periods, freq, axis, fill_value) + + if ( + is_set_nan_rows + and isinstance(self._by, BaseQueryCompiler) + and ( + # Check using `issubset` is effective only in case of MultiIndex + set(self._by.columns).issubset(list(self._df.columns)) + if isinstance(self._by.columns, pandas.MultiIndex) + else len( + self._by.columns.unique() + .sort_values() + .difference(self._df.columns.unique().sort_values()) + ) + == 0 + ) + and DataFrame(query_compiler=self._by.isna()).any(axis=None) + ): + mask_nan_rows = self._df[self._by.columns].isna() + if (isinstance(mask_nan_rows, DataFrame)) and len( + mask_nan_rows.columns + ) == 1: + mask_nan_rows = mask_nan_rows.squeeze(axis=1) + idx_nan_rows = mask_nan_rows[ + mask_nan_rows.any(axis=1) + if (isinstance(mask_nan_rows, DataFrame)) + else mask_nan_rows + ].index + result.loc[idx_nan_rows] = np.nan + return result + + if freq is None and axis == 1 and self._axis == 0: + result = _shift(periods, freq, axis, fill_value) + elif ( + freq is not None + and axis == 0 + and self._axis == 0 + and isinstance(self._by, BaseQueryCompiler) + ): + result = _shift(periods, freq, axis, fill_value, is_set_nan_rows=False) + new_idx_lvl_arrays = np.concatenate( + [self._df[self._by.columns].values.T, [list(result.index)]] + ) + result.index = pandas.MultiIndex.from_arrays( + new_idx_lvl_arrays, + names=[col_name for col_name in self._by.columns] + [result.index.name], + ) + result = result.dropna(subset=self._by.columns).sort_index() + else: + result = self._apply_agg_function( + lambda df: df.shift(periods, freq, axis, fill_value) + ) + result.index.name = None + return result def nth(self, n, dropna=None): return self._default_to_pandas(lambda df: df.nth(n, dropna=dropna)) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 7a98c43313a..b522e26f673 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -104,6 +104,7 @@ def test_mixed_dtypes_groupby(as_index): modin_df_almost_equals_pandas, is_default=True, ) + eval_shift(modin_groupby, pandas_groupby) eval_mean(modin_groupby, pandas_groupby) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) @@ -298,6 +299,7 @@ def maybe_get_columns(df, by): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( modin_groupby, @@ -437,6 +439,7 @@ def test_single_group_row_groupby(): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) 
eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -552,6 +555,7 @@ def test_large_row_groupby(is_by_category): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -666,6 +670,7 @@ def test_simple_col_groupby(): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -796,6 +801,7 @@ def test_series_groupby(by, as_index_series_or_dataframe): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_general( modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True ) @@ -1069,7 +1075,26 @@ def eval_groups(modin_groupby, pandas_groupby): def eval_shift(modin_groupby, pandas_groupby): - assert modin_groupby.groups == pandas_groupby.groups + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(periods=0), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(periods=-3), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(axis=1, fill_value=777), + ) def test_groupby_on_index_values_with_loop(): @@ -1122,6 +1147,38 @@ def test_groupby_multiindex(): df_equals(modin_df.groupby(by=by).count(), pandas_df.groupby(by=by).count()) +@pytest.mark.parametrize("groupby_axis", [0, 1]) +@pytest.mark.parametrize("shift_axis", [0, 1]) +def test_shift_freq(groupby_axis, shift_axis): + pandas_df = pandas.DataFrame( + { + "col1": [1, 0, 2, 3], + "col2": [4, 5, np.NaN, 7], + "col3": [np.NaN, np.NaN, 12, 10], + "col4": [17, 13, 16, 15], + } + ) + modin_df = from_pandas(pandas_df) + + new_index = pandas.date_range("1/12/2020", periods=4, freq="S") + if groupby_axis == 0 and shift_axis == 0: + pandas_df.index = modin_df.index = new_index + by = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]] + else: + pandas_df.index = modin_df.index = new_index + pandas_df.columns = modin_df.columns = new_index + by = [[0, 1, 0, 2]] + + for _by in by: + pandas_groupby = pandas_df.groupby(by=_by, axis=groupby_axis) + modin_groupby = modin_df.groupby(by=_by, axis=groupby_axis) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(axis=shift_axis, freq="S"), + ) + + def test_agg_func_None_rename(): pandas_df = pandas.DataFrame( { diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 990c5c0292d..d524537148d 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2903,6 +2903,7 @@ def test_shift(data): df_equals(modin_series.shift(fill_value=777), pandas_series.shift(fill_value=777)) df_equals(modin_series.shift(periods=7), pandas_series.shift(periods=7)) df_equals(modin_series.shift(periods=-3), pandas_series.shift(periods=-3)) + eval_general(modin_series, pandas_series, lambda df: df.shift(axis=1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 
5251135117b70f8240a2c8744a23ef1df62e74b7 Mon Sep 17 00:00:00 2001 From: YarShev Date: Fri, 30 Oct 2020 22:03:15 +0300 Subject: [PATCH 07/42] FIX-#2348: Fix default to pandas warnings (#2349) Signed-off-by: Igoshev, Yaroslav --- modin/backends/pandas/query_compiler.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index b5484b0c3c2..2dc8d5fe748 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2559,31 +2559,6 @@ def _callable_func(self, func, axis, *args, **kwargs): method="size", ) - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - """Apply aggregation functions to a grouped dataframe per-column. - - Parameters - ---------- - by : PandasQueryCompiler - The column to group by - func_dict : dict of str, callable/string - The dictionary mapping of column to function - groupby_args : dict - The dictionary of keyword arguments for the group by. - agg_args : dict - The dictionary of keyword arguments for the aggregation functions - drop : bool - Whether or not to drop the column from the data. - - Returns - ------- - PandasQueryCompiler - The result of the per-column aggregations on the grouped dataframe. - """ - return self.default_to_pandas( - lambda df: df.groupby(by=by, **groupby_args).agg(func_dict, **agg_args) - ) - def groupby_agg( self, by, From e924a1f3b38454dcb6191aded17b4a05a7b78e30 Mon Sep 17 00:00:00 2001 From: YarShev Date: Mon, 2 Nov 2020 14:54:44 +0300 Subject: [PATCH 08/42] FIX-#2357: Fix path to documentation for contributing (#2358) Signed-off-by: Igoshev, Yaroslav --- .github/PULL_REQUEST_TEMPLATE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 01e071a5c8d..0f9cc4ae33f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,6 @@ @@ -8,7 +8,7 @@ if you have questions about contributing. 
-- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/developer/contributing.html) +- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/CONTRIBUTING.html) - [ ] passes `flake8 modin` - [ ] passes `black --check modin` - [ ] signed commit with `git commit -s` From 6fcaef18309773a329bf319de92b32f4a7aad8b8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 2 Nov 2020 16:18:59 +0300 Subject: [PATCH 09/42] FIX-#2352: remove deprecated option: 'num-redis-shards' (#2353) Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/ray-autoscaler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml index 94bd63f3d96..640e6277457 100644 --- a/modin/experimental/cloud/ray-autoscaler.yml +++ b/modin/experimental/cloud/ray-autoscaler.yml @@ -155,7 +155,7 @@ head_start_ray_commands: echo 'export MEMORY_STORE_SIZE=$(awk "/MemFree/ { printf \"%d \\n\", \$2*1024*0.8}" /proc/meminfo)' >> ~/.bashrc echo 'export TMPDIR="$(dirname $(mktemp tmp.XXXXXXXXXX -ut))"' >> ~/.bashrc - ulimit -n 65536; ray start --head --num-redis-shards=1 --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR + ulimit -n 65536; ray start --head --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR # Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: From 11574a8f8ab2d5d908ed21a22a1e2a8dd95801b1 Mon Sep 17 00:00:00 2001 From: YarShev Date: Mon, 2 Nov 2020 17:25:11 +0300 Subject: [PATCH 10/42] FIX-#2339: Fix links to documentation (#2361) Signed-off-by: Igoshev, Yaroslav --- README.md | 2 +- docs/UsingSQLonRay/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aa2a1add823..5b31a126f8c 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ and improve: ![Architecture](docs/img/modin_architecture.png) -Visit the [Documentation](https://modin.readthedocs.io/en/latest/architecture.html) for +Visit the [Documentation](https://modin.readthedocs.io/en/latest/developer/architecture.html) for more information! **`modin.pandas` is currently under active development. Requests and contributions are welcome!** diff --git a/docs/UsingSQLonRay/index.rst b/docs/UsingSQLonRay/index.rst index 82f3fb7b2a0..ce9dcd2beaf 100644 --- a/docs/UsingSQLonRay/index.rst +++ b/docs/UsingSQLonRay/index.rst @@ -30,4 +30,4 @@ Modin has a query compiler that acts as an intermediate layer between the query 0 1 2.0 A String of information True 1 6 17.0 A String of different information False -.. _architecture: https://modin.readthedocs.io/en/latest/architecture.html +.. 
_architecture: https://modin.readthedocs.io/en/latest/developer/architecture.html From 5382769c72a3e2e7c9fc533c2815ed89b1429aed Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 2 Nov 2020 19:11:07 +0300 Subject: [PATCH 11/42] FIX-#2354: use conda activate instead of conda run (#2355) Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/rayscale.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index c77676616b9..906e5c55391 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -262,6 +262,7 @@ def wrap_cmd(self, cmd: list): [ "bash", "-ic", - subprocess.list2cmdline(["conda", "run", "-n", "modin"] + cmd), + # workaround for https://github.com/conda/conda/issues/8385 + subprocess.list2cmdline(["conda", "activate", "modin", "&&"] + cmd), ] ) From 8c00a8f3ab943ebf572f6b335afc00cfe5ff33f7 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 3 Nov 2020 19:09:11 +0300 Subject: [PATCH 12/42] FEAT-#2363: introduce getter and setter for index name (#2368) Signed-off-by: ienkovich --- modin/backends/base/query_compiler.py | 44 +++++++++++++ .../backends/omnisci/query_compiler.py | 12 ++++ .../engines/omnisci_on_ray/frame/data.py | 63 +++++++++++++++++++ .../omnisci_on_ray/test/test_dataframe.py | 24 +++++++ modin/pandas/groupby.py | 6 +- 5 files changed, 146 insertions(+), 3 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index e9657ebbad6..9a453bc9026 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1584,6 +1584,50 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self.columns, pandas.MultiIndex) + def get_index_name(self): + """ + Get index name. + + Returns + ------- + hashable + Index name, None for MultiIndex. + """ + return self.index.name + + def set_index_name(self, name): + """ + Set index name. + + Parameters + ---------- + name: hashable + New index name. + """ + self.index.name = name + + def get_index_names(self): + """ + Get index names. + + Returns + ------- + list + Index names. + """ + return self.index.names + + def set_index_names(self, names): + """ + Set index names. + + Parameters + ---------- + names: list + New index names. 
+ """ + self.index.names = names + # DateTime methods dt_ceil = DateTimeDefault.register(pandas.Series.dt.ceil) diff --git a/modin/experimental/backends/omnisci/query_compiler.py b/modin/experimental/backends/omnisci/query_compiler.py index 1851444e438..eadee462d9f 100644 --- a/modin/experimental/backends/omnisci/query_compiler.py +++ b/modin/experimental/backends/omnisci/query_compiler.py @@ -646,6 +646,18 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self.columns, pandas.MultiIndex) + def get_index_name(self): + return self._modin_frame.get_index_name() + + def set_index_name(self, name): + self._modin_frame = self._modin_frame.set_index_name(name) + + def get_index_names(self): + return self._modin_frame.get_index_names() + + def set_index_names(self, names): + self._modin_frame = self._modin_frame.set_index_names(names) + def free(self): return diff --git a/modin/experimental/engines/omnisci_on_ray/frame/data.py b/modin/experimental/engines/omnisci_on_ray/frame/data.py index fc5fd6627fe..88bd6c3cb24 100644 --- a/modin/experimental/engines/omnisci_on_ray/frame/data.py +++ b/modin/experimental/engines/omnisci_on_ray/frame/data.py @@ -1235,6 +1235,69 @@ def has_multiindex(self): return isinstance(self._index_cache, MultiIndex) return self._index_cols is not None and len(self._index_cols) > 1 + def get_index_name(self): + if self._index_cols is None: + return None + if len(self._index_cols) > 1: + return None + return self._index_cols[0] + + def set_index_name(self, name): + if self.has_multiindex(): + ErrorMessage.single_warning("Scalar name for MultiIndex is not supported!") + return self + + if self._index_cols is None and name is None: + return self + + names = self._mangle_index_names([name]) + if self._index_cols is None: + exprs = OrderedDict() + exprs[name] = self.ref("__rowid__") + else: + exprs = self._index_exprs() + + for col in self.columns: + exprs[col] = self.ref(col) + + return self.__constructor__( + columns=self.columns, + dtypes=self._dtypes_for_exprs(exprs), + op=TransformNode(self, exprs), + index_cols=names, + uses_rowid=self._index_cols is None, + force_execution_mode=self._force_execution_mode, + ) + + def get_index_names(self): + if self.has_multiindex(): + return self._index_cols.copy() + return [self.get_index_name()] + + def set_index_names(self, names): + if not self.has_multiindex(): + raise ValueError("Can set names for MultiIndex only") + + if len(names) != len(self._index_cols): + raise ValueError( + f"Unexpected names count: expected {len(self._index_cols)} got {len(names)}" + ) + + names = self._mangle_index_names(names) + exprs = OrderedDict() + for old, new in zip(self._index_cols, names): + exprs[new] = self.ref(old) + for col in self.columns: + exprs[col] = self.ref(col) + + return self.__constructor__( + columns=self.columns, + dtypes=self._dtypes_for_exprs(exprs), + op=TransformNode(self, exprs), + index_cols=names, + force_execution_mode=self._force_execution_mode, + ) + def to_pandas(self): self._execute() diff --git a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py index e019249772e..9bd59b0ee8a 100644 --- a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py +++ b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py @@ -392,6 +392,30 @@ def applier(lib): eval_general(pd, pandas, applier) + def test_set_index_name(self): + index = pandas.Index.__new__(pandas.Index, data=[i for i in range(24)]) + + pandas_df = 
pandas.DataFrame(self.data, index=index) + pandas_df.index.name = "new_name" + modin_df = pd.DataFrame(self.data, index=index) + modin_df._query_compiler.set_index_name("new_name") + + df_equals(pandas_df, modin_df) + + def test_set_index_names(self): + index = pandas.MultiIndex.from_tuples( + [(i, j, k) for i in range(2) for j in range(3) for k in range(4)] + ) + + pandas_df = pandas.DataFrame(self.data, index=index) + pandas_df.index.names = ["new_name1", "new_name2", "new_name3"] + modin_df = pd.DataFrame(self.data, index=index) + modin_df._query_compiler.set_index_names( + ["new_name1", "new_name2", "new_name3"] + ) + + df_equals(pandas_df, modin_df) + class TestFillna: data = {"a": [1, 1, None], "b": [None, None, 2], "c": [3, None, None]} diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 4441d870f4c..5eedd42759c 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -909,10 +909,10 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs): drop=self._drop, ) if self._idx_name is not None and self._as_index: - new_manager.index.name = self._idx_name + new_manager.set_index_name(self._idx_name) result = type(self._df)(query_compiler=new_manager) - if result.index.name == "__reduced__": - result.index.name = None + if result._query_compiler.get_index_name() == "__reduced__": + result._query_compiler.set_index_name(None) if self._kwargs.get("squeeze", False): return result.squeeze() return result From a3e06c795a982db5848a5e27d7a30eab5f82f9d3 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 5 Nov 2020 04:40:55 +0300 Subject: [PATCH 13/42] FEAT-#1844: upgrade pyarrow to 1.0 (#2347) Signed-off-by: Anatoly Myachev --- environment.yml | 2 +- modin/engines/base/io/column_stores/feather_reader.py | 7 +++---- modin/pandas/test/test_io.py | 6 +++++- requirements.txt | 2 +- requirements/env_omnisci.yml | 1 + setup.py | 2 +- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/environment.yml b/environment.yml index d50ca19a1f8..77688620c0e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: - pandas==1.1.3 - numpy - - pyarrow<0.17 + - pyarrow==1.0 - dask[complete]>=2.12.0,<=2.19.0 - distributed>=2.12.0,<=2.19.0 - xarray diff --git a/modin/engines/base/io/column_stores/feather_reader.py b/modin/engines/base/io/column_stores/feather_reader.py index 7b311b40f7b..95738f54342 100644 --- a/modin/engines/base/io/column_stores/feather_reader.py +++ b/modin/engines/base/io/column_stores/feather_reader.py @@ -32,8 +32,7 @@ def _read(cls, path, columns=None, **kwargs): https://arrow.apache.org/docs/python/api.html#feather-format """ if columns is None: - from pyarrow.feather import FeatherReader + from pyarrow.feather import read_feather - fr = FeatherReader(path) - columns = [fr.get_column_name(i) for i in range(fr.num_columns)] - return cls.build_query_compiler(path, columns, use_threads=False) + df = read_feather(path) + return cls.build_query_compiler(path, df.columns, use_threads=False) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 05684f1bd3d..b2080dd1efc 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -41,7 +41,7 @@ io_ops_bad_exc, ) -from modin.config import Engine, Backend +from modin.config import Engine, Backend, IsExperimental if Backend.get() == "Pandas": import modin.pandas as pd @@ -495,6 +495,10 @@ def teardown_fwf_file(): pass +@pytest.mark.skipif( + IsExperimental.get() and 
Backend.get() == "Pyarrow", + reason="Segmentation fault; see PR #2347 ffor details", +) class TestReadCSV: # delimiter tests @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"]) diff --git a/requirements.txt b/requirements.txt index 4b7640f6b3c..c60bf62d8f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pandas==1.1.3 numpy -pyarrow<0.17 +pyarrow==1.0 dask[complete]>=2.12.0,<=2.19.0 distributed>=2.12.0,<=2.19.0 ray>=1.0.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 26e17c64008..eabd27089de 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -4,6 +4,7 @@ channels: - conda-forge dependencies: - pandas==1.1.3 + - pyarrow==1.0 - numpy - pip - pytest>=6.0.1 diff --git a/setup.py b/setup.py index ec536b28dc6..ed1dc9e43f6 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def is_pure(self): dask_deps = ["dask>=2.12.0,<=2.19.0", "distributed>=2.12.0,<=2.19.0"] -ray_deps = ["ray>=1.0.0", "pyarrow<0.17"] +ray_deps = ["ray>=1.0.0", "pyarrow==1.0"] remote_deps = ["rpyc==4.1.5", "cloudpickle==1.4.1", "boto3==1.4.8"] all_deps = dask_deps + ray_deps + remote_deps From f4f3a1e15c5472d6588aefa8f2c2d08832e272f3 Mon Sep 17 00:00:00 2001 From: YarShev Date: Thu, 5 Nov 2020 16:49:54 +0300 Subject: [PATCH 14/42] FIX-#2365: Fix `Series.value_counts` when `dropna=False` (#2366) Signed-off-by: Igoshev, Yaroslav --- modin/backends/pandas/query_compiler.py | 19 +++++++++++-------- modin/pandas/test/test_series.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 2dc8d5fe748..99f9ed4445d 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -732,18 +732,21 @@ def reduce_func(df, *args, **kwargs): dropna = kwargs.get("dropna", True) try: - result = df.squeeze(axis=1).groupby(df.index, sort=False).sum() + result = ( + df.squeeze(axis=1) + .groupby(df.index, sort=False, dropna=dropna) + .sum() + ) # This will happen with Arrow buffer read-only errors. We don't want to copy # all the time, so this will try to fast-path the code first. 
except (ValueError): - result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum() - - if not dropna and np.nan in df.index: - result = result.append( - pandas.Series( - [df.squeeze(axis=1).loc[[np.nan]].sum()], index=[np.nan] - ) + result = ( + df.copy() + .squeeze(axis=1) + .groupby(df.index, sort=False, dropna=dropna) + .sum() ) + if normalize: result = result / df.squeeze(axis=1).sum() diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index d524537148d..8d814762b13 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3444,6 +3444,26 @@ def sort_index_for_equal_values(result, ascending): ) df_equals(modin_result, pandas_result) + # from issue #2365 + arr = np.random.rand(2 ** 6) + arr[::10] = np.nan + modin_series, pandas_series = create_test_series(arr) + modin_result = modin_series.value_counts(dropna=False, ascending=True) + pandas_result = sort_index_for_equal_values( + pandas_series.value_counts(dropna=False, ascending=True), True + ) + if get_current_backend() == "BaseOnPython": + modin_result = sort_index_for_equal_values(modin_result, ascending=True) + df_equals(modin_result, pandas_result) + + modin_result = modin_series.value_counts(dropna=False, ascending=False) + pandas_result = sort_index_for_equal_values( + pandas_series.value_counts(dropna=False, ascending=False), False + ) + if get_current_backend() == "BaseOnPython": + modin_result = sort_index_for_equal_values(modin_result, ascending=False) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_values(data): From fc3485273ff89b10c8559b08626ce2eb2198b345 Mon Sep 17 00:00:00 2001 From: YarShev Date: Thu, 5 Nov 2020 21:04:50 +0300 Subject: [PATCH 15/42] FIX-#2369: Update pandas version to 1.1.4 (#2371) Signed-off-by: Igoshev, Yaroslav --- environment.yml | 2 +- modin/engines/base/io/file_reader.py | 4 +- modin/engines/dask/task_wrapper.py | 4 +- .../engines/pandas_on_ray/io_exp.py | 2 +- .../pyarrow_on_ray/frame/axis_partition.py | 6 +- modin/pandas/__init__.py | 2 +- modin/pandas/base.py | 6 -- modin/pandas/test/dataframe/test_binary.py | 16 +--- modin/pandas/test/dataframe/test_udf.py | 28 ++----- modin/pandas/test/test_series.py | 84 ++++++------------- requirements.txt | 2 +- requirements/env_omnisci.yml | 2 +- setup.py | 2 +- 13 files changed, 46 insertions(+), 114 deletions(-) diff --git a/environment.yml b/environment.yml index 77688620c0e..559f74ccf12 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: modin channels: - conda-forge dependencies: - - pandas==1.1.3 + - pandas==1.1.4 - numpy - pyarrow==1.0 - dask[complete]>=2.12.0,<=2.19.0 diff --git a/modin/engines/base/io/file_reader.py b/modin/engines/base/io/file_reader.py index 8a8ea6bd1ef..879444881cf 100644 --- a/modin/engines/base/io/file_reader.py +++ b/modin/engines/base/io/file_reader.py @@ -137,10 +137,10 @@ def file_exists(cls, file_path): return os.path.exists(file_path) @classmethod - def deploy(cls, func, args, num_return_vals): + def deploy(cls, func, args, num_returns): raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) - def parse(self, func, args, num_return_vals): + def parse(self, func, args, num_returns): raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod diff --git a/modin/engines/dask/task_wrapper.py b/modin/engines/dask/task_wrapper.py index 04e5ed2a3b9..af717625afe 100644 --- a/modin/engines/dask/task_wrapper.py +++ b/modin/engines/dask/task_wrapper.py @@ -16,12 
+16,12 @@ class DaskTask: @classmethod - def deploy(cls, func, num_return_vals, kwargs): + def deploy(cls, func, num_returns, kwargs): client = _get_global_client() remote_task_future = client.submit(func, **kwargs) return [ client.submit(lambda l, i: l[i], remote_task_future, i) - for i in range(num_return_vals) + for i in range(num_returns) ] @classmethod diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py index c093e93708c..38b8170445f 100644 --- a/modin/experimental/engines/pandas_on_ray/io_exp.py +++ b/modin/experimental/engines/pandas_on_ray/io_exp.py @@ -148,7 +148,7 @@ def read_sql( columns, chunksize, ), - num_return_vals=num_splits + 1, + num_returns=num_splits + 1, ) partition_ids.append( [PandasOnRayFramePartition(obj) for obj in partition_id[:-1]] diff --git a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py index be82e790e7b..b7cdb2eaa94 100644 --- a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py +++ b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py @@ -46,7 +46,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): for obj in deploy_ray_func_between_two_axis_partitions._remote( args=(self.axis, func, num_splits, len(self.list_of_blocks), kwargs) + tuple(self.list_of_blocks + other_axis_partition.list_of_blocks), - num_return_vals=num_splits, + num_returns=num_splits, ) ] @@ -54,7 +54,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): args.extend(self.list_of_blocks) return [ PyarrowOnRayFramePartition(obj) - for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits) + for obj in deploy_ray_axis_func._remote(args, num_returns=num_splits) ] def shuffle(self, func, num_splits=None, **kwargs): @@ -74,7 +74,7 @@ def shuffle(self, func, num_splits=None, **kwargs): args.extend(self.list_of_blocks) return [ PyarrowOnRayFramePartition(obj) - for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits) + for obj in deploy_ray_axis_func._remote(args, num_returns=num_splits) ] diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 5ddc9c33c07..7979784d023 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -13,7 +13,7 @@ import pandas -__pandas_version__ = "1.1.3" +__pandas_version__ = "1.1.4" if pandas.__version__ != __pandas_version__: import warnings diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 73d3043d38c..e300ea15f96 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -488,9 +488,6 @@ def add(self, other, axis="columns", level=None, fill_value=None): ) def aggregate(self, func=None, axis=0, *args, **kwargs): - warnings.warn( - "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189." - ) axis = self._get_axis_number(axis) result = None @@ -686,9 +683,6 @@ def apply( args=(), **kwds, ): - warnings.warn( - "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189." 
- ) axis = self._get_axis_number(axis) ErrorMessage.non_verified_udf() if isinstance(func, str): diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index 5346c0e8932..a4449798c12 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -135,7 +135,7 @@ def test_math_alias(math_op, alias): assert getattr(pd.DataFrame, math_op) == getattr(pd.DataFrame, alias) -@pytest.mark.parametrize("other", ["as_left", 4, 4.0]) +@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_comparison(data, op, other): @@ -145,20 +145,6 @@ def test_comparison(data, op, other): ) -@pytest.mark.xfail_backends( - ["BaseOnPython"], - reason="Test is failing because of mismathing of thrown exceptions. See pandas issue #36377", -) -@pytest.mark.parametrize("other", ["a"]) -@pytest.mark.parametrize("op", ["ge", "gt", "le", "lt", "eq", "ne"]) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_comparison_except(data, op, other): - eval_general( - *create_test_dfs(data), - lambda df: getattr(df, op)(other), - ) - - @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_multi_level_comparison(data, op): diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py index 4b39cf7cd22..651feab1e40 100644 --- a/modin/pandas/test/dataframe/test_udf.py +++ b/modin/pandas/test/dataframe/test_udf.py @@ -49,16 +49,10 @@ ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply(axis, func, op): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_dfs(test_data["float_nan_data"]), - lambda df: getattr(df, op)(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, op)(func, axis), + ) @pytest.mark.parametrize("axis", ["rows", "columns"]) @@ -69,16 +63,10 @@ def test_agg_apply(axis, func, op): ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply_axis_names(axis, func, op): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_dfs(test_data["int_data"]), - lambda df: getattr(df, op)(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: getattr(df, op)(func, axis), + ) def test_aggregate_alias(): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 8d814762b13..1f40d7a590e 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -593,16 +593,10 @@ def test_add_suffix(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_agg(data, func): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. 
- try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -624,16 +618,10 @@ def test_agg_numeric(request, data, func): request.node.name, numeric_dfs ): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -656,16 +644,10 @@ def test_agg_numeric_except(request, data, func): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_aggregate(data, func): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.aggregate(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.aggregate(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -688,16 +670,10 @@ def test_aggregate_numeric(request, data, func): request.node.name, numeric_dfs ): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -823,16 +799,10 @@ def test_append(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply(data, func): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.apply(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.apply(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -871,16 +841,10 @@ def test_apply_external_lib(): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply_numeric(request, data, func): if name_contains(request.node.name, numeric_dfs): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. 
- try: - eval_general( - *create_test_series(data), - lambda df: df.apply(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.apply(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) diff --git a/requirements.txt b/requirements.txt index c60bf62d8f0..a3183876ef4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pandas==1.1.3 +pandas==1.1.4 numpy pyarrow==1.0 dask[complete]>=2.12.0,<=2.19.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index eabd27089de..e8432f00898 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -3,7 +3,7 @@ channels: - intel/label/modin - conda-forge dependencies: - - pandas==1.1.3 + - pandas==1.1.4 - pyarrow==1.0 - numpy - pip diff --git a/setup.py b/setup.py index ed1dc9e43f6..b5bbd4a5cc8 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def is_pure(self): url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.1.3", "packaging"], + install_requires=["pandas==1.1.4", "packaging"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps, From a13384c83808e018b5b17ed4aa372efaa38c4551 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Mon, 9 Nov 2020 15:23:23 +0300 Subject: [PATCH 16/42] FIX-#2322: add aligning partition' blocks (#2367) Signed-off-by: Anatoly Myachev --- .github/workflows/ci.yml | 2 + .github/workflows/push.yml | 35 ++++++++++++++ modin/engines/base/frame/data.py | 44 +++++++++++------ modin/engines/base/frame/partition_manager.py | 48 ++++++++++++------- .../ray/pandas_on_ray/frame/axis_partition.py | 3 +- modin/test/backends/pandas/test_internals.py | 40 ++++++++++++++++ 6 files changed, 140 insertions(+), 32 deletions(-) create mode 100644 modin/test/backends/pandas/test_internals.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37a6acb3edc..f0de02b29d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -189,6 +189,8 @@ jobs: run: python -m pytest modin/config/test - shell: bash -l {0} run: python -m pytest modin/test/test_envvar_catcher.py + - shell: bash -l {0} + run: python -m pytest modin/test/backends/pandas/test_internals.py test-defaults: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 29dece692d2..5c56af9fe5b 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -29,6 +29,41 @@ jobs: architecture: "x64" - run: pip install "ray>=1.0.0" + test-internals: + needs: prepare-cache + runs-on: ubuntu-latest + name: test-internals + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: goanpeca/setup-miniconda@v1.6.0 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.6 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
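+      # This step only logs the resolved environment (conda info / conda list)
+      # so CI runs record the exact package versions in use.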
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - name: Internals tests + shell: bash -l {0} + run: python -m pytest modin/data_management/factories/test/test_dispatcher.py modin/experimental/cloud/test/test_cloud.py + - shell: bash -l {0} + run: python -m pytest modin/config/test + - shell: bash -l {0} + run: python -m pytest modin/test/test_envvar_catcher.py + - shell: bash -l {0} + run: python -m pytest modin/test/backends/pandas/test_internals.py + test-defaults: needs: prepare-cache runs-on: ubuntu-latest diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index c09898f86d8..3d57cd6b754 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -1646,6 +1646,8 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): """ Copartition two dataframes. + Perform aligning of partitions, index and partition blocks. + Parameters ---------- axis : 0 or 1 @@ -1694,6 +1696,7 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): [self._simple_shuffle(axis, o) for o in other], self.axes[axis].copy(), ) + index_other_obj = [o.axes[axis] for o in other] joined_index = self._join_index_objects(axis, index_other_obj, how, sort) # We have to set these because otherwise when we perform the functions it may @@ -1701,32 +1704,45 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): left_old_idx = self.axes[axis] right_old_idxes = index_other_obj - is_avoid_reindex = len(joined_index) != len(joined_index.unique()) and axis == 0 + def make_map_func(): + if not joined_index.is_unique and axis == 0: + return lambda df: df + return lambda df: df.reindex(joined_index, axis=axis) + # Start with this and we'll repartition the first time, and then not again. 
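+        # The condition below reuses `self._partitions` untouched when alignment
+        # was already applied, or when no repartition is forced and the joined
+        # index equals the existing one; only otherwise are partitions remapped
+        # along `axis`.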
- if ( - not is_aligning_applied - and not is_avoid_reindex - and (force_repartition or not left_old_idx.equals(joined_index)) + if is_aligning_applied or ( + not force_repartition and left_old_idx.equals(joined_index) ): + reindexed_self = self._partitions + else: reindexed_self = self._frame_mgr_cls.map_axis_partitions( - axis, self._partitions, lambda df: df.reindex(joined_index, axis=axis) + axis, + self._partitions, + make_map_func(), ) - else: - reindexed_self = self._partitions - reindexed_other_list = [] + def get_column_widths(partitions): + if len(partitions) > 0: + return [obj.width() for obj in partitions[0]] + + def get_row_lengths(partitions): + if len(partitions.T) > 0: + return [obj.length() for obj in partitions.T[0]] + + reindexed_other_list = [] for i in range(len(other)): - if ( - is_aligning_applied - or is_avoid_reindex - or (not force_repartition and right_old_idxes[i].equals(joined_index)) + if is_aligning_applied or ( + not force_repartition and right_old_idxes[i].equals(joined_index) ): reindexed_other = other[i]._partitions else: reindexed_other = other[i]._frame_mgr_cls.map_axis_partitions( axis, other[i]._partitions, - lambda df: df.reindex(joined_index, axis=axis), + make_map_func(), + lengths=get_row_lengths(reindexed_self) + if axis == 0 + else get_column_widths(reindexed_self), ) reindexed_other_list.append(reindexed_other) return reindexed_self, reindexed_other_list, joined_index diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index ff00a3e8559..5f23ffd98a0 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -214,22 +214,25 @@ def broadcast_axis_partitions( left, right, keep_partitioning=False, + lengths=None, ): """ Broadcast the right partitions to left and apply a function along full axis. Parameters ---------- - axis : The axis to apply and broadcast over. - apply_func : The function to apply. - left : The left partitions. - right : The right partitions. - keep_partitioning : boolean. Default is False - The flag to keep partitions for Modin Frame. + axis : The axis to apply and broadcast over. + apply_func : The function to apply. + left : The left partitions. + right : The right partitions. + keep_partitioning : boolean. Default is False + The flag to keep partitions for Modin Frame. + lengths : list(int) + The list of lengths to shuffle the object. Returns ------- - A new `np.array` of partition objects. + A new `np.array` of partition objects. """ # Since we are already splitting the DataFrame back up after an # operation, we will just use this time to compute the number of @@ -245,12 +248,19 @@ def broadcast_axis_partitions( # may want to line to partitioning up with another BlockPartitions object. Since # we don't need to maintain the partitioning, this gives us the opportunity to # load-balance the data as well. + kw = { + "num_splits": num_splits, + "other_axis_partition": right_partitions, + } + if lengths: + kw["_lengths"] = lengths + kw["manual_partition"] = True + result_blocks = np.array( [ part.apply( preprocessed_map_func, - num_splits=num_splits, - other_axis_partition=right_partitions, + **kw, ) for part in left_partitions ] @@ -295,20 +305,23 @@ def map_axis_partitions( partitions, map_func, keep_partitioning=False, + lengths=None, ): """ Applies `map_func` to every partition. Parameters ---------- - axis : 0 or 1 - The axis to perform the map across (0 - index, 1 - columns). 
- partitions : NumPy array - The partitions of Modin Frame. - map_func : callable - The function to apply. - keep_partitioning : boolean. Default is False - The flag to keep partitions for Modin Frame. + axis : 0 or 1 + The axis to perform the map across (0 - index, 1 - columns). + partitions : NumPy array + The partitions of Modin Frame. + map_func : callable + The function to apply. + keep_partitioning : bool. Default is False + The flag to keep partitions for Modin Frame. + lengths : list(int) + The list of lengths to shuffle the object. Returns ------- @@ -326,6 +339,7 @@ def map_axis_partitions( apply_func=map_func, keep_partitioning=keep_partitioning, right=None, + lengths=lengths, ) @classmethod diff --git a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py index 2099ea9fe93..9ec6620eae9 100644 --- a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py +++ b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py @@ -33,6 +33,7 @@ def __init__(self, list_of_blocks): def deploy_axis_func( cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions ): + lengths = kwargs.get("_lengths", None) return deploy_ray_func._remote( args=( PandasFrameAxisPartition.deploy_axis_func, @@ -43,7 +44,7 @@ def deploy_axis_func( maintain_partitioning, ) + tuple(partitions), - num_returns=num_splits * 3, + num_returns=num_splits * 3 if lengths is None else len(lengths) * 3, ) @classmethod diff --git a/modin/test/backends/pandas/test_internals.py b/modin/test/backends/pandas/test_internals.py new file mode 100644 index 00000000000..266c6d7ff8e --- /dev/null +++ b/modin/test/backends/pandas/test_internals.py @@ -0,0 +1,40 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import modin.pandas as pd + + +def test_aligning_blocks(): + # Test problem when modin frames have the same number of rows, but different + # blocks (partition.list_of_blocks). See #2322 for details + accm = pd.DataFrame(["-22\n"] * 162) + accm = accm.iloc[2:, :] + accm.reset_index(drop=True, inplace=True) + accm["T"] = pd.Series(["24.67\n"] * 145) + + # see #2322 for details + repr(accm) + + +def test_aligning_blocks_with_duplicated_index(): + # Same problem as in `test_aligning_blocks` but with duplicated values in index. 
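+    # A non-unique joined index cannot be realigned via `reindex` (pandas
+    # raises "cannot reindex from a duplicate axis"), which is why
+    # `_copartition` falls back to an identity map for such frames.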
+ data11 = [0, 1] + data12 = [2, 3] + + data21 = [0] + data22 = [1, 2, 3] + + df1 = pd.DataFrame(data11).append(pd.DataFrame(data12)) + df2 = pd.DataFrame(data21).append(pd.DataFrame(data22)) + + repr(df1 - df2) From 3395b595d5bb460fecfd706c7d098b4610c9c9dd Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 9 Nov 2020 17:05:55 -0600 Subject: [PATCH 17/42] Bump version to 0.8.2 (#2383) Signed-off-by: Devin Petersohn --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b31a126f8c..a8474c2e432 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - +

To use Modin, replace the pandas import:
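The snippet the README shows at this point is presumably the canonical one-line swap:

    # import pandas as pd
    import modin.pandas as pd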

From c5203b9ddba807fcf6385533167fb419e4ec211b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Tue, 10 Nov 2020 17:19:27 +0300 Subject: [PATCH 18/42] FIX-#2386: add new location for import ray functions (#2387) Signed-off-by: Anatoly Myachev --- modin/experimental/cloud/rayscale.py | 41 ++++++++++----------- modin/experimental/cloud/test/test_cloud.py | 14 +++---- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index 906e5c55391..a7cb7799691 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -21,12 +21,23 @@ import subprocess import yaml -from ray.autoscaler.commands import ( - create_or_update_cluster, - teardown_cluster, - get_head_node_ip, - _bootstrap_config, -) + +try: + # for ray>=1.0.1 + from ray.autoscaler.sdk import ( + create_or_update_cluster, + teardown_cluster, + get_head_node_ip, + bootstrap_config, + ) +except ModuleNotFoundError: + # for ray==1.0.0 + from ray.autoscaler.commands import ( + create_or_update_cluster, + teardown_cluster, + get_head_node_ip, + _bootstrap_config as bootstrap_config, + ) from .base import ( CannotSpawnCluster, @@ -140,7 +151,7 @@ def __make_config(self): res = self._update_conda_requirements(config["setup_commands"][0]) config["setup_commands"][0] = res - return _bootstrap_config(config) + return bootstrap_config(config) def _conda_requirements(self): import shlex @@ -197,15 +208,9 @@ def __do_spawn(self): try: create_or_update_cluster( self.config_file, - override_min_workers=None, - override_max_workers=None, no_restart=False, restart_only=False, - yes=True, - override_cluster_name=None, no_config_cache=False, - redirect_command_output=False, - use_login_shells=True, ) # need to re-load the config, as create_or_update_cluster() modifies it with open(self.config_file) as inp: @@ -220,13 +225,7 @@ def __do_spawn(self): def __do_destroy(self): try: - teardown_cluster( - self.config_file, - yes=True, - workers_only=False, - override_cluster_name=None, - keep_min_workers=0, - ) + teardown_cluster(self.config_file) self.ready = False self.config = None except BaseException as ex: @@ -244,7 +243,7 @@ def _get_connection_details(self) -> ConnectionDetails: return ConnectionDetails( user_name=self.config["auth"]["ssh_user"], key_file=self.config["auth"]["ssh_private_key"], - address=get_head_node_ip(self.config_file, override_cluster_name=None), + address=get_head_node_ip(self.config_file), ) def _get_main_python(self) -> str: diff --git a/modin/experimental/cloud/test/test_cloud.py b/modin/experimental/cloud/test/test_cloud.py index a7e4c5b3c83..1d17f3ed746 100644 --- a/modin/experimental/cloud/test/test_cloud.py +++ b/modin/experimental/cloud/test/test_cloud.py @@ -15,20 +15,20 @@ import pytest from collections import namedtuple from inspect import signature -from modin.experimental.cloud.rayscale import RayCluster -from modin.experimental.cloud.cluster import Provider -from ray.autoscaler.commands import ( +from modin.experimental.cloud.rayscale import ( + RayCluster, create_or_update_cluster, teardown_cluster, get_head_node_ip, - _bootstrap_config, + bootstrap_config, ) +from modin.experimental.cloud.cluster import Provider @pytest.fixture def make_bootstrap_config_mock(): def bootstrap_config_mock(config, *args, **kwargs): - signature(_bootstrap_config).bind(config, *args, **kwargs) + signature(bootstrap_config).bind(config, *args, **kwargs) 
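+        # `Signature.bind` raises TypeError if the mock is ever called with
+        # arguments that the real `bootstrap_config` signature cannot accept.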
config["auth"]["ssh_user"] = "modin" config["auth"]["ssh_private_key"] = "X" * 20 return config @@ -59,7 +59,7 @@ def make_create_or_update_cluster_mock(): def make_ray_cluster(make_bootstrap_config_mock): def ray_cluster(conda_packages=None): with mock.patch( - "modin.experimental.cloud.rayscale._bootstrap_config", + "modin.experimental.cloud.rayscale.bootstrap_config", make_bootstrap_config_mock, ): ray_cluster = RayCluster( @@ -71,7 +71,7 @@ def ray_cluster(conda_packages=None): return ray_cluster -def test__bootstrap_config(make_ray_cluster): +def test_bootstrap_config(make_ray_cluster): make_ray_cluster() From eba3abc3a23dd4426fc359be124648a8ba851d54 Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Tue, 10 Nov 2020 11:10:36 -0600 Subject: [PATCH 19/42] FIX-#2388: Fixed requirements for omnisci binaries (#2389) Signed-off-by: Gregory Shimansky --- environment.yml | 2 +- requirements.txt | 2 +- requirements/env_omnisci.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 559f74ccf12..802241f50b7 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: - pandas==1.1.4 - numpy - - pyarrow==1.0 + - pyarrow>=1.0.0 - dask[complete]>=2.12.0,<=2.19.0 - distributed>=2.12.0,<=2.19.0 - xarray diff --git a/requirements.txt b/requirements.txt index a3183876ef4..c31cab38814 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pandas==1.1.4 numpy -pyarrow==1.0 +pyarrow>=1.0.0 dask[complete]>=2.12.0,<=2.19.0 distributed>=2.12.0,<=2.19.0 ray>=1.0.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index e8432f00898..70c170d8408 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -4,7 +4,7 @@ channels: - conda-forge dependencies: - pandas==1.1.4 - - pyarrow==1.0 + - pyarrow>=1.0.0 - numpy - pip - pytest>=6.0.1 From c6a1d935ed566d26eeed4654cdd51bef2482d1a1 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Wed, 11 Nov 2020 13:03:28 +0300 Subject: [PATCH 20/42] FIX-#2380: don't ignore lengths parameter for dask engine (#2381) Signed-off-by: Anatoly Myachev --- modin/data_management/utils.py | 9 +++++---- modin/engines/base/frame/axis_partition.py | 9 +-------- .../dask/pandas_on_dask/frame/axis_partition.py | 10 +++++----- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/modin/data_management/utils.py b/modin/data_management/utils.py index 0d0a4aafa18..8a4058beb3d 100644 --- a/modin/data_management/utils.py +++ b/modin/data_management/utils.py @@ -82,18 +82,19 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None) list A list of Pandas DataFrames. 
""" - if num_splits == 1: - return result if length_list is not None: length_list.insert(0, 0) sums = np.cumsum(length_list) - if axis == 0: + if axis == 0 or isinstance(result, pandas.Series): return [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)] else: return [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)] + + if num_splits == 1: + return [result] # We do this to restore block partitioning chunksize = compute_chunksize(result, num_splits, axis=axis) - if axis == 0: + if axis == 0 or isinstance(result, pandas.Series): return [ result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits) ] diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py index aaacacce875..765ac55777a 100644 --- a/modin/engines/base/frame/axis_partition.py +++ b/modin/engines/base/frame/axis_partition.py @@ -91,10 +91,7 @@ def shuffle(self, func, lengths, **kwargs): partition_type = None def _wrap_partitions(self, partitions): - if isinstance(partitions, self.instance_type): - return [self.partition_type(partitions)] - else: - return [self.partition_type(obj) for obj in partitions] + return [self.partition_type(obj) for obj in partitions] class PandasFrameAxisPartition(BaseFrameAxisPartition): @@ -216,10 +213,6 @@ def deploy_axis_func( dataframe = pandas.concat(list(partitions), axis=axis, copy=False) result = func(dataframe, **kwargs) - if isinstance(result, pandas.Series): - if num_splits == 1: - return result - return [result] + [pandas.Series([]) for _ in range(num_splits - 1)] if manual_partition: # The split function is expecting a list diff --git a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py index b3f98a0fb65..dbd5538aaaf 100644 --- a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py +++ b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py @@ -43,13 +43,15 @@ def deploy_axis_func( *partitions, pure=False, ) - if num_splits == 1: - return axis_result + + lengths = kwargs.get("_lengths", None) + result_num_splits = len(lengths) if lengths else num_splits + # We have to do this to split it back up. It is already split, but we need to # get futures for each. return [ client.submit(lambda l: l[i], axis_result, pure=False) - for i in range(num_splits) + for i in range(result_num_splits) ] @classmethod @@ -68,8 +70,6 @@ def deploy_func_between_two_axis_partitions( *partitions, pure=False, ) - if num_splits == 1: - return axis_result # We have to do this to split it back up. It is already split, but we need to # get futures for each. 
return [ From b424036555f7c7a6806f85d63a75cdcc3a6eb45b Mon Sep 17 00:00:00 2001 From: YarShev Date: Wed, 11 Nov 2020 14:47:55 +0300 Subject: [PATCH 21/42] FIX-#2390: Fix inserting Series into DataFrame (#2391) Signed-off-by: Igoshev, Yaroslav --- modin/backends/base/query_compiler.py | 5 ++--- modin/pandas/dataframe.py | 6 +++++- modin/pandas/test/dataframe/test_indexing.py | 7 +++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 9a453bc9026..980f7ac2467 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -137,10 +137,9 @@ def concat(df, axis, other, **kwargs): else: if isinstance(other, (list, np.ndarray)) and len(other) == 1: other = other[0] - how = kwargs.pop("join", None) ignore_index = kwargs.pop("ignore_index", None) - kwargs["how"] = how - result = df.join(other, **kwargs) + kwargs["how"] = kwargs.pop("join", None) + result = df.join(other, rsuffix="r_", **kwargs) if ignore_index: if axis == 0: result = result.reset_index(drop=True) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 4c6e6af9846..9ffbb82ef55 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1972,7 +1972,11 @@ def __setitem__(self, key, value): self._query_compiler = value._query_compiler.copy() else: self._create_or_update_from_compiler( - self._query_compiler.concat(1, value._query_compiler), + self._query_compiler.concat( + 1, + value._query_compiler, + join="left", + ), inplace=True, ) # Now that the data is appended, we need to update the column name for diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 1d2d723da71..a2b38aa7d88 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -1161,6 +1161,13 @@ def test___setitem__(data): df_equals(modin_df, pandas_df) + # from issue #2390 + modin_df = pd.DataFrame({"a": [1, 2, 3]}) + pandas_df = pandas.DataFrame({"a": [1, 2, 3]}) + modin_df["b"] = pd.Series([4, 5, 6, 7, 8]) + pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8]) + df_equals(modin_df, pandas_df) + def test___setitem__with_mismatched_partitions(): fname = "200kx99.csv" From 45ef859eedc1828bb23eedb9537f1e96998c2379 Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Wed, 11 Nov 2020 20:18:23 +0300 Subject: [PATCH 22/42] FIX-2200: Enable Calcite by default in OmniSci backend (#2385) Signed-off-by: Alexander Myskov --- modin/config/envvars.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 5dbf65f2070..129ec39bc30 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -196,6 +196,7 @@ class DoUseCalcite(EnvironmentVariable, type=bool): """ varname = "MODIN_USE_CALCITE" + default = True class TestDatasetSize(EnvironmentVariable, type=str): From ba006fb9f608bdde64c76617d7aea948bf35f46e Mon Sep 17 00:00:00 2001 From: amyskov <55585026+amyskov@users.noreply.github.com> Date: Thu, 12 Nov 2020 19:17:16 +0300 Subject: [PATCH 23/42] TEST-#2289: Columns, Index Locations and Names parameters of read_csv (#2319) Signed-off-by: Alexander Myskov --- modin/pandas/test/data/issue_621.csv | 10 --- modin/pandas/test/test_io.py | 101 +++++++++++++++++++++------ modin/pandas/test/utils.py | 25 +++++++ 3 files changed, 103 insertions(+), 33 deletions(-) delete mode 100644 modin/pandas/test/data/issue_621.csv diff --git 
a/modin/pandas/test/data/issue_621.csv b/modin/pandas/test/data/issue_621.csv deleted file mode 100644 index c0d924616ef..00000000000 --- a/modin/pandas/test/data/issue_621.csv +++ /dev/null @@ -1,10 +0,0 @@ -ins_74901673,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,j_217,10,Terminated,673795,673797,m_2637,1,1,13,16,0.02,0.02 -ins_815802872,M1,j_1527,1,Terminated,158478,158520,m_3430,1,1,3,19,0.13,0.18 -ins_564677701,M1,j_2014,1,Terminated,372602,372616,m_1910,1,1,87,116,0.04,0.05 -ins_257566161,M1,j_2014,1,Terminated,372602,372615,m_2485,1,1,91,123,0.05,0.05 -ins_688679908,M1,j_2014,1,Terminated,372602,372615,m_993,1,1,93,141,0.05,0.05 -ins_929638393,M1,j_2014,1,Terminated,372603,372615,m_2808,1,1,100,137,0.05,0.05 -ins_1349024140,M1,j_2014,1,Terminated,372603,372617,m_3736,1,1,82,111,0.05,0.05 -ins_330247444,M1,j_2014,1,Terminated,372603,372617,m_1176,1,1,84,110,0.05,0.05 -ins_833551291,M1,j_2014,1,Terminated,372602,372614,m_2682,1,1,90,159,0.05,0.05 -ins_833550789,M1,j_2014,1,Terminated,372603,372619,m_3625,1,1,78,105,0.05,0.05 diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index b2080dd1efc..65bd6d1a62e 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -39,6 +39,7 @@ insert_lines_to_csv, IO_OPS_DATA_DIR, io_ops_bad_exc, + eval_io_from_str, ) from modin.config import Engine, Backend, IsExperimental @@ -528,6 +529,83 @@ def test_read_csv_delimiters( **kwargs, ) + # Column and Index Locations and Names tests + @pytest.mark.xfail( + Engine.get() != "Python", + reason="many parameters combiantions fails: issue #2312, #2307", + ) + @pytest.mark.parametrize("header", ["infer", None, 0]) + @pytest.mark.parametrize("index_col", [None, "col1"]) + @pytest.mark.parametrize("prefix", [None, "_", "col"]) + @pytest.mark.parametrize( + "names", [None, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6", "c7"]] + ) + @pytest.mark.parametrize( + "usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]] + ) + @pytest.mark.parametrize("skip_blank_lines", [True, False]) + def test_read_csv_col_handling( + self, + make_csv_file, + request, + header, + index_col, + prefix, + names, + usecols, + skip_blank_lines, + ): + if request.config.getoption("--simulate-cloud").lower() != "off": + pytest.xfail( + "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" + ) + + kwargs = { + "header": header, + "index_col": index_col, + "prefix": prefix, + "names": names, + "usecols": usecols, + "skip_blank_lines": skip_blank_lines, + } + + unique_name = get_unique_filename("test_read_csv_col_handling", kwargs) + make_csv_file( + filename=unique_name, + add_blank_lines=True, + ) + eval_io( + filepath_or_buffer=unique_name, + fn_name="read_csv", + **kwargs, + ) + + @pytest.mark.xfail(reason="infinite recursion error - issue #2032") + @pytest.mark.parametrize( + "test_case", ["single_element", "single_column", "multiple_columns"] + ) + def test_read_csv_squeeze(self, test_case): + unique_filename = get_unique_filename("test_read_csv_squeeze") + + str_single_element = "1" + str_single_col = "1\n2\n3\n" + str_four_cols = "1, 2, 3, 4\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n" + case_to_data = { + "single_element": str_single_element, + "single_column": str_single_col, + "multiple_columns": str_four_cols, + } + + eval_io_from_str(case_to_data[test_case], unique_filename, squeeze=True) + eval_io_from_str( + case_to_data[test_case], unique_filename, header=None, squeeze=True + ) + + def test_read_csv_mangle_dupe_cols(self): + unique_filename = 
get_unique_filename("test_read_csv_mangle_dupe_cols") + str_non_unique_cols = "col,col,col,col\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n" + eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True) + # Datetime Handling tests @pytest.mark.parametrize( "parse_dates", @@ -1171,21 +1249,6 @@ def test_parse_dates_read_csv(): df_equals(modin_df, pandas_df) -@pytest.mark.parametrize( - "kwargs", - [ - {"header": None, "usecols": [0, 7]}, - {"usecols": [0, 7]}, - {"names": [0, 7], "usecols": [0, 7]}, - ], -) -def test_from_csv_with_args(kwargs): - file_name = "modin/pandas/test/data/issue_621.csv" - pandas_df = pandas.read_csv(file_name, **kwargs) - modin_df = pd.read_csv(file_name, **kwargs) - df_equals(modin_df, pandas_df) - - def test_from_table(make_csv_file): make_csv_file(delimiter="\t") @@ -1200,14 +1263,6 @@ def test_from_table(make_csv_file): df_equals(modin_df, pandas_df) -@pytest.mark.parametrize("usecols", [["a"], ["a", "b", "e"], [0, 1, 4]]) -def test_from_csv_with_usecols(usecols): - fname = "modin/pandas/test/data/test_usecols.csv" - pandas_df = pandas.read_csv(fname, usecols=usecols) - modin_df = pd.read_csv(fname, usecols=usecols) - df_equals(modin_df, pandas_df) - - @pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation") def test_from_csv_s3(make_csv_file): dataset_url = "s3://noaa-ghcn-pds/csv/1788.csv" diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 27a7375bb57..bb4a0099e68 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -740,6 +740,31 @@ def applyier(module, *args, **kwargs): ) +def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): + """Evaluate I/O operation outputs equality check by using `csv_str` + data passed as python str (csv test file will be created from `csv_str`). + + Parameters + ---------- + csv_str: str + Test data for storing to csv file. + unique_filename: str + csv file name. 
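+    **kwargs: dict
+        Keyword arguments to pass through to `eval_io` (and on to `read_csv`).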
+ """ + try: + with open(unique_filename, "w") as f: + f.write(csv_str) + + eval_io( + filepath_or_buffer=unique_filename, + fn_name="read_csv", + **kwargs, + ) + + finally: + os.remove(unique_filename) + + def create_test_dfs(*args, **kwargs): return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs) From e91687477f547859af2d6f52cd194191fe3e78fe Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Thu, 12 Nov 2020 20:30:07 +0300 Subject: [PATCH 24/42] REFACTOR-#2397: remove redundant assigment (#2398) Signed-off-by: Anatoly Myachev --- modin/backends/base/query_compiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 980f7ac2467..06b74d43e3a 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1412,8 +1412,6 @@ def groupby_agg( by = by.columns[0] if drop else by.to_pandas().squeeze() elif isinstance(by, type(self)): by = list(by.columns) - else: - by = by else: by = by.to_pandas().squeeze() if isinstance(by, type(self)) else by From 2b0b755753988cefbab38a71db973a26cbe581f4 Mon Sep 17 00:00:00 2001 From: ienkovich Date: Fri, 13 Nov 2020 10:14:51 +0300 Subject: [PATCH 25/42] FEAT-#2363: fix index name setter in OmniSci backend (#2379) Signed-off-by: ienkovich --- modin/experimental/engines/omnisci_on_ray/frame/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modin/experimental/engines/omnisci_on_ray/frame/data.py b/modin/experimental/engines/omnisci_on_ray/frame/data.py index 88bd6c3cb24..42d59f70e11 100644 --- a/modin/experimental/engines/omnisci_on_ray/frame/data.py +++ b/modin/experimental/engines/omnisci_on_ray/frame/data.py @@ -1251,11 +1251,11 @@ def set_index_name(self, name): return self names = self._mangle_index_names([name]) + exprs = OrderedDict() if self._index_cols is None: - exprs = OrderedDict() - exprs[name] = self.ref("__rowid__") + exprs[names[0]] = self.ref("__rowid__") else: - exprs = self._index_exprs() + exprs[names[0]] = self.ref(self._index_cols[0]) for col in self.columns: exprs[col] = self.ref(col) From 5c9398c23da933043045cb9636e19216f834ae2d Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Fri, 13 Nov 2020 13:54:17 -0600 Subject: [PATCH 26/42] Merged groupby_agg and groupby_dict_agg to implement dictionary functions aggregations (#2317) * FIX-#2254: Added dictionary functions to groupby aggregate tests Signed-off-by: Gregory Shimansky * FIX-#2254: Initial implementation of dictionary functions aggregation Signed-off-by: Gregory Shimansky * FIX-#2254: Remove lambda wrapper to allow dictionary to go to backend Signed-off-by: Gregory Shimansky * FIX-#2254: Fixed AttributeError not being thrown from getattr Signed-off-by: Gregory Shimansky * FIX-#2254: Lint fixes Signed-off-by: Gregory Shimansky * FEAT-#2363: fix index name setter in OmniSci backend Signed-off-by: ienkovich * FIX-#2254: Removed obsolete groupby_dict_agg API function Signed-off-by: Gregory Shimansky * FIX-#2254: Fixed dict aggregate for base backend Signed-off-by: Gregory Shimansky * FIX-#2254: Address reformatting comments Signed-off-by: Gregory Shimansky * FIX-#2254: Remove whitespace Signed-off-by: Gregory Shimansky * FIX-#2254: Removed redundant argument conversion because it is already done inside of base backend. 
Signed-off-by: Gregory Shimansky Co-authored-by: ienkovich --- modin/backends/base/query_compiler.py | 10 ---- modin/backends/pandas/query_compiler.py | 13 +++-- .../default_methods/groupby_default.py | 6 ++- .../backends/omnisci/query_compiler.py | 27 ---------- modin/pandas/groupby.py | 52 +++++++------------ modin/pandas/test/test_groupby.py | 16 ++++-- 6 files changed, 47 insertions(+), 77 deletions(-) diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 06b74d43e3a..e9cc7067796 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1426,16 +1426,6 @@ def groupby_agg( drop=drop, ) - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)( - self, - by=by, - func_dict=func_dict, - groupby_args=groupby_args, - agg_args=agg_args, - drop=drop, - ) - # END Manual Partitioning methods def unstack(self, level, fill_value): diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 99f9ed4445d..2567f33e026 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -2573,7 +2573,8 @@ def groupby_agg( groupby_kwargs, drop=False, ): - agg_func = wrap_udf_function(agg_func) + if callable(agg_func): + agg_func = wrap_udf_function(agg_func) if is_multi_by: return super().groupby_agg( @@ -2605,7 +2606,11 @@ def groupby_agg_builder(df): def compute_groupby(df): grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs) try: - result = agg_func(grouped_df, **agg_kwargs) + result = ( + grouped_df.agg(agg_func) + if isinstance(agg_func, dict) + else agg_func(grouped_df, **agg_kwargs) + ) # This happens when the partition is filled with non-numeric data and a # numeric operation is done. We need to build the index here to avoid # issues with extracting the index. @@ -2631,7 +2636,9 @@ def compute_groupby(df): # determening type of raised exception by applying `aggfunc` # to empty DataFrame try: - agg_func( + pandas.DataFrame(index=[1], columns=[1]).agg(agg_func) if isinstance( + agg_func, dict + ) else agg_func( pandas.DataFrame(index=[1], columns=[1]).groupby(level=0), **agg_kwargs, ) diff --git a/modin/data_management/functions/default_methods/groupby_default.py b/modin/data_management/functions/default_methods/groupby_default.py index b6ae497c75f..e6cd40675e7 100644 --- a/modin/data_management/functions/default_methods/groupby_default.py +++ b/modin/data_management/functions/default_methods/groupby_default.py @@ -80,7 +80,11 @@ def fn( grp = df.groupby(by, axis=axis, **groupby_args) agg_func = cls.get_func(grp, key, **kwargs) - result = agg_func(grp, **agg_args) + result = ( + grp.agg(agg_func, **agg_args) + if isinstance(agg_func, dict) + else agg_func(grp, **agg_args) + ) if not is_multi_by: if as_index: diff --git a/modin/experimental/backends/omnisci/query_compiler.py b/modin/experimental/backends/omnisci/query_compiler.py index eadee462d9f..35ce16e9917 100644 --- a/modin/experimental/backends/omnisci/query_compiler.py +++ b/modin/experimental/backends/omnisci/query_compiler.py @@ -279,33 +279,6 @@ def groupby_agg( ) return self.__constructor__(new_frame) - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - """Apply aggregation functions to a grouped dataframe per-column. 
- - Parameters - ---------- - by : DFAlgQueryCompiler - The column to group by - func_dict : dict of str, callable/string - The dictionary mapping of column to function - groupby_args : dict - The dictionary of keyword arguments for the group by. - agg_args : dict - The dictionary of keyword arguments for the aggregation functions - drop : bool - Whether or not to drop the column from the data. - - Returns - ------- - DFAlgQueryCompiler - The result of the per-column aggregations on the grouped dataframe. - """ - # TODO: handle `drop` arg - new_frame = self._modin_frame.groupby_agg( - by, 0, func_dict, groupby_args, **agg_args - ) - return self.__constructor__(new_frame) - def count(self, **kwargs): return self._agg("count", **kwargs) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 5eedd42759c..3329a0412c1 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -357,6 +357,8 @@ def aggregate(self, func=None, *args, **kwargs): # This is not implemented in pandas, # so we throw a different message raise NotImplementedError("axis other than 0 is not supported") + + relabeling_required = False if isinstance(func, dict) or func is None: def _reconstruct_func(func, **kwargs): @@ -380,50 +382,32 @@ def _reconstruct_func(func, **kwargs): from pandas.core.base import SpecificationError raise SpecificationError("nested renamer is not supported") - if isinstance(self._by, type(self._query_compiler)): - by = list(self._by.columns) - else: - by = self._by - - subset_cols = list(func_dict.keys()) + ( - list(self._by.columns) - if isinstance(self._by, type(self._query_compiler)) - and all(c in self._df.columns for c in self._by.columns) - else [] - ) - result = type(self._df)( - query_compiler=self._df[subset_cols]._query_compiler.groupby_dict_agg( - by=by, - func_dict=func_dict, - groupby_args=self._kwargs, - agg_args=kwargs, - drop=self._drop, - ) - ) - - if relabeling_required: - result = result.iloc[:, order] - result.columns = new_columns - - return result - - if is_list_like(func): + func = func_dict + elif is_list_like(func): return self._default_to_pandas( lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), *args, **kwargs, ) - if isinstance(func, str): - agg_func = getattr(self, func, None) + elif isinstance(func, str): + # Using "getattr" here masks possible AttributeError which we throw + # in __getattr__, so we should call __getattr__ directly instead. + agg_func = self.__getattr__(func) if callable(agg_func): return agg_func(*args, **kwargs) - return self._apply_agg_function( - lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), + + result = self._apply_agg_function( + func, drop=self._as_index, *args, **kwargs, ) + if relabeling_required: + result = result.iloc[:, order] + result.columns = new_columns + return result + agg = aggregate def last(self, **kwargs): @@ -888,7 +872,9 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs): ------- A new combined DataFrame with the result of all groups. """ - assert callable(f), "'{0}' object is not callable".format(type(f)) + assert callable(f) or isinstance( + f, dict + ), "'{0}' object is not callable and not a dict".format(type(f)) # For aggregations, pandas behavior does this for the result. 
        # For other operations it does not, so we wait until there is an aggregation to

diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index b522e26f673..e75c223346a 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -149,7 +149,12 @@ def test_mixed_dtypes_groupby(as_index):
     eval_var(modin_groupby, pandas_groupby)
     eval_skew(modin_groupby, pandas_groupby)

-    agg_functions = ["min", "max"]
+    agg_functions = [
+        "min",
+        "max",
+        {"col2": "sum"},
+        {"col2": "max", "col4": "sum", "col5": "min"},
+    ]
     for func in agg_functions:
         eval_agg(modin_groupby, pandas_groupby, func)
         eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -479,7 +484,12 @@ def test_single_group_row_groupby():
     eval_prod(modin_groupby, pandas_groupby)
     eval_std(modin_groupby, pandas_groupby)

-    agg_functions = ["min", "max"]
+    agg_functions = [
+        "min",
+        "max",
+        {"col2": "sum"},
+        {"col2": "max", "col4": "sum", "col5": "min"},
+    ]
     for func in agg_functions:
         eval_agg(modin_groupby, pandas_groupby, func)
         eval_aggregate(modin_groupby, pandas_groupby, func)
@@ -595,7 +605,7 @@ def test_large_row_groupby(is_by_category):
     # eval_prod(modin_groupby, pandas_groupby) causes overflows
     eval_std(modin_groupby, pandas_groupby)

-    agg_functions = ["min", "max"]
+    agg_functions = ["min", "max", {"A": "sum"}, {"A": "max", "B": "sum", "C": "min"}]
     for func in agg_functions:
         eval_agg(modin_groupby, pandas_groupby, func)
         eval_aggregate(modin_groupby, pandas_groupby, func)

From 86ebc316b8aeda692f5e2abee4e7c64ec74a2697 Mon Sep 17 00:00:00 2001
From: Gregory Shimansky
Date: Sat, 14 Nov 2020 08:30:49 -0600
Subject: [PATCH 27/42] FIX-#2406: filter dictionary aggregation keys to limit
 them to keys only present in current partition (#2407)

* FIX-#2406: Added test to detect this bug

Signed-off-by: Gregory Shimansky

* FIX-#2406: Added filter for keys absent in current partition

Signed-off-by: Gregory Shimansky

* FIX-#2406: Attempt to fix broken test on BaseOnPython backend

This test gets a corrupted dataframe with "col2" removed by previous test cases.

Signed-off-by: Gregory Shimansky
---
 modin/backends/pandas/query_compiler.py | 14 +++++----
 modin/pandas/test/test_groupby.py       | 40 +++++++++++++++++--------
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index 2567f33e026..4f35a3133ae 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -2606,11 +2606,15 @@ def groupby_agg_builder(df):
             def compute_groupby(df):
                 grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs)
                 try:
-                    result = (
-                        grouped_df.agg(agg_func)
-                        if isinstance(agg_func, dict)
-                        else agg_func(grouped_df, **agg_kwargs)
-                    )
+                    if isinstance(agg_func, dict):
+                        # Filter out keys that don't exist in this partition. This happens when some
+                        # columns from the original dataframe didn't end up in every partition.
+                        partition_dict = {
+                            k: v for k, v in agg_func.items() if k in df.columns
+                        }
+                        result = grouped_df.agg(partition_dict)
+                    else:
+                        result = agg_func(grouped_df, **agg_kwargs)
                 # This happens when the partition is filled with non-numeric data and a
                 # numeric operation is done. We need to build the index here to avoid
                 # issues with extracting the index.
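The hunk above hinges on one detail of Modin's layout: a frame is split into column
partitions, so any single partition may hold only some of the columns named in a dict
aggregation, and calling ``agg`` with an absent column raises an error. The following is
a minimal sketch of the same pruning in plain pandas; the frames and column names are
invented for illustration and are not Modin's actual partition objects:

```python
import pandas as pd

# Two hypothetical column partitions of one logical frame; both carry the
# grouping key, but each holds a different subset of the data columns.
part_a = pd.DataFrame({"key": [1, 1, 2], "col2": [10, 20, 30]})
part_b = pd.DataFrame({"key": [1, 1, 2], "col4": [1.0, 2.0, 3.0]})

agg_func = {"col2": "sum", "col4": "min"}

results = []
for df in (part_a, part_b):
    # Prune the dict to the keys this partition actually holds, as the patch
    # does; passing an absent column to .agg() would raise an error.
    partition_dict = {k: v for k, v in agg_func.items() if k in df.columns}
    results.append(df.groupby("key").agg(partition_dict))

# Reassemble the per-partition results along the column axis.
combined = pd.concat(results, axis=1)
print(combined)
```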
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index e75c223346a..cbccb85f344 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -22,6 +22,7 @@ check_df_columns_have_nans, create_test_dfs, eval_general, + test_data, test_data_values, modin_df_almost_equals_pandas, ) @@ -1189,23 +1190,36 @@ def test_shift_freq(groupby_axis, shift_axis): ) -def test_agg_func_None_rename(): - pandas_df = pandas.DataFrame( +@pytest.mark.parametrize( + "by_and_agg_dict", + [ { - "col1": np.random.randint(0, 100, size=1000), - "col2": np.random.randint(0, 100, size=1000), - "col3": np.random.randint(0, 100, size=1000), - "col4": np.random.randint(0, 100, size=1000), + "by": [ + list(test_data["int_data"].keys())[0], + list(test_data["int_data"].keys())[1], + ], + "agg_dict": { + "max": (list(test_data["int_data"].keys())[2], np.max), + "min": (list(test_data["int_data"].keys())[2], np.min), + }, }, - index=["row{}".format(i) for i in range(1000)], - ) - modin_df = from_pandas(pandas_df) + { + "by": ["col1"], + "agg_dict": { + "max": (list(test_data["int_data"].keys())[0], np.max), + "min": (list(test_data["int_data"].keys())[-1], np.min), + }, + }, + ], +) +def test_agg_func_None_rename(by_and_agg_dict): + modin_df, pandas_df = create_test_dfs(test_data["int_data"]) - modin_result = modin_df.groupby(["col1", "col2"]).agg( - max=("col3", np.max), min=("col3", np.min) + modin_result = modin_df.groupby(by_and_agg_dict["by"]).agg( + **by_and_agg_dict["agg_dict"] ) - pandas_result = pandas_df.groupby(["col1", "col2"]).agg( - max=("col3", np.max), min=("col3", np.min) + pandas_result = pandas_df.groupby(by_and_agg_dict["by"]).agg( + **by_and_agg_dict["agg_dict"] ) df_equals(modin_result, pandas_result) From 1da519858cfd1d265d01bfa65a5aaac8c40726cd Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sat, 14 Nov 2020 13:38:34 -0600 Subject: [PATCH 28/42] DOCS-#2413: Add examples page to documentation (#2414) * Resolves #2413 Signed-off-by: Devin Petersohn --- docs/examples/index.rst | 4 ++++ docs/index.rst | 5 +++++ 2 files changed, 9 insertions(+) create mode 100644 docs/examples/index.rst diff --git a/docs/examples/index.rst b/docs/examples/index.rst new file mode 100644 index 00000000000..9feb05553f6 --- /dev/null +++ b/docs/examples/index.rst @@ -0,0 +1,4 @@ +Examples +======== + +Coming Soon... \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 7f3a3ee3b45..e61a77f832e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -128,6 +128,11 @@ nature, you get a fast DataFrame at 1MB and 1TB+. using_modin out_of_core +.. toctree:: + :caption: Examples + + examples/index + .. 
toctree:: :caption: Supported APIs From 40ae5a8a65a98c26df9aecf332f20d2b2448563e Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sat, 14 Nov 2020 14:02:56 -0600 Subject: [PATCH 29/42] DOCS-#2415: Add comparisons section to documentation with stubs (#2416) Signed-off-by: Devin Petersohn --- docs/comparisons/dask.rst | 4 ++++ docs/comparisons/index.rst | 4 ++++ docs/comparisons/pandas.rst | 4 ++++ docs/comparisons/spark.rst | 4 ++++ docs/index.rst | 8 ++++++++ 5 files changed, 24 insertions(+) create mode 100644 docs/comparisons/dask.rst create mode 100644 docs/comparisons/index.rst create mode 100644 docs/comparisons/pandas.rst create mode 100644 docs/comparisons/spark.rst diff --git a/docs/comparisons/dask.rst b/docs/comparisons/dask.rst new file mode 100644 index 00000000000..aced4215900 --- /dev/null +++ b/docs/comparisons/dask.rst @@ -0,0 +1,4 @@ +Modin vs. Dask Dataframe +======================== + +Coming Soon... diff --git a/docs/comparisons/index.rst b/docs/comparisons/index.rst new file mode 100644 index 00000000000..40647d065d9 --- /dev/null +++ b/docs/comparisons/index.rst @@ -0,0 +1,4 @@ +How is Modin unique? +==================== + +Coming Soon... diff --git a/docs/comparisons/pandas.rst b/docs/comparisons/pandas.rst new file mode 100644 index 00000000000..dfbaf02aba3 --- /dev/null +++ b/docs/comparisons/pandas.rst @@ -0,0 +1,4 @@ +Modin vs. Pandas +================ + +Coming Soon... diff --git a/docs/comparisons/spark.rst b/docs/comparisons/spark.rst new file mode 100644 index 00000000000..bf60963f710 --- /dev/null +++ b/docs/comparisons/spark.rst @@ -0,0 +1,4 @@ +Modin vs. Koalas and Spark +========================== + +Coming Soon... diff --git a/docs/index.rst b/docs/index.rst index e61a77f832e..e2c94b81bc3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -133,6 +133,14 @@ nature, you get a fast DataFrame at 1MB and 1TB+. examples/index +.. toctree:: + :caption: How is Modin different from ...? + + comparisons/index + comparisons/pandas + comparisons/dask + comparisons/spark + .. toctree:: :caption: Supported APIs From b01f91fb3a628ce374aa830964d094480f73afca Mon Sep 17 00:00:00 2001 From: Reshama Shaikh Date: Sat, 14 Nov 2020 17:52:05 -0500 Subject: [PATCH 30/42] DOCS-#2417: add sklearn example (#2425) Signed-off-by: reshamas --- docs/examples/index.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/examples/index.rst b/docs/examples/index.rst index 9feb05553f6..e7aba021fa3 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -1,4 +1,9 @@ Examples ======== -Coming Soon... \ No newline at end of file +scikit-learn with LinearRegression +---------------------------------- +Here is a Jupyter Notebook example which uses Modin with scikit-learn +and linear regression `sklearn LinearRegression`_. + +.. 
_sklearn LinearRegression: https://github.com/modin-project/modin/blob/master/examples/modin-scikit-learn-example.ipynb

From 140988ecbb7417356b0d1471ab7fb788246cd8dd Mon Sep 17 00:00:00 2001
From: vfdev
Date: Sun, 15 Nov 2020 00:11:55 +0100
Subject: [PATCH 31/42] DOCS-#2421: Fixes bad link on contributing from
 architecture.rst (#2427)

Signed-off-by: Victor Fomin
---
 docs/developer/architecture.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst
index 934e9781053..efa3a9ee24f 100644
--- a/docs/developer/architecture.rst
+++ b/docs/developer/architecture.rst
@@ -234,7 +234,7 @@ Supported Execution Frameworks and Memory Formats

 This is the list of execution frameworks and memory formats supported in Modin. If you
 would like to contribute a new execution framework or memory format, please see the
-documentation page on Contributing_.
+documentation page on :doc:`../CONTRIBUTING`.

 - `Pandas on Ray`_
   - Uses the Ray_ execution framework.

From 525228202cfb0fbeb26f32d780985b67ae1732eb Mon Sep 17 00:00:00 2001
From: vfdev
Date: Sun, 15 Nov 2020 00:12:28 +0100
Subject: [PATCH 32/42] DOCS-#2419: Updated CONTRIBUTING.rst (#2423)

Signed-off-by: Victor Fomin
---
 docs/CONTRIBUTING.rst | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/CONTRIBUTING.rst b/docs/CONTRIBUTING.rst
index 3841c7a9812..588057a7b5a 100644
--- a/docs/CONTRIBUTING.rst
+++ b/docs/CONTRIBUTING.rst
@@ -164,6 +164,26 @@ subset of the test suite. In order to run a specific test run:

 The entire test suite is automatically run for each pull request.

+Building documentation
+----------------------
+
+To build the documentation, please follow the steps below from the project root:
+
+.. code-block:: bash
+
+    cd docs
+    pip install -r requirements-doc.txt
+    sphinx-build -b html . build
+
+To visualize the documentation locally, run the following from the `build` folder:
+
+.. code-block:: bash
+
+    python -m http.server
+    # python -m http.server 1234
+
+then open the browser at `0.0.0.0:<port>` (e.g. `0.0.0.0:1234`).
+
 Contributing a new execution framework or in-memory format
 ----------------------------------------------------------

From bcf931d2c05e89b577a97f38a892e661d3386bd0 Mon Sep 17 00:00:00 2001
From: vfdev
Date: Sun, 15 Nov 2020 13:24:12 +0100
Subject: [PATCH 33/42] DOCS-#2426,DOCS-#2424: Fixed two issues (#2431)

- Closes #2424, CONTRIBUTING.rst does not render the commit message
  formatting example
- Closes #2426, Bad links in index.rst
- Renamed CONTRIBUTING.rst into contributing.rst

Signed-off-by: Victor Fomin
---
 docs/{CONTRIBUTING.rst => contributing.rst} |  5 +++--
 docs/developer/architecture.rst             |  2 +-
 docs/index.rst                              | 14 ++++----------
 3 files changed, 8 insertions(+), 13 deletions(-)
 rename docs/{CONTRIBUTING.rst => contributing.rst} (98%)

diff --git a/docs/CONTRIBUTING.rst b/docs/contributing.rst
similarity index 98%
rename from docs/CONTRIBUTING.rst
rename to docs/contributing.rst
index 588057a7b5a..af8b2319ca9 100644
--- a/docs/CONTRIBUTING.rst
+++ b/docs/contributing.rst
@@ -49,7 +49,6 @@ with this project or the open source license(s) involved."

     Signed-off-by: Awesome Developer

-.
 Code without a proper signoff cannot be merged into the master branch.
 Note: You must use your real name (sorry, no pseudonyms or anonymous
 contributions.)
@@ -88,7 +87,9 @@ To ensure that all commit messages in the master branch follow a specific format
 enforce that all commit messages must follow the following format:

 ..
code-block:: bash - FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations + + FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations + The ``FEAT`` component represents the type of commit. This component of the commit message can be one of the following: diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst index efa3a9ee24f..c8ed4162a75 100644 --- a/docs/developer/architecture.rst +++ b/docs/developer/architecture.rst @@ -234,7 +234,7 @@ Supported Execution Frameworks and Memory Formats This is the list of execution frameworks and memory formats supported in Modin. If you would like to contribute a new execution framework or memory format, please see the -documentation page on :doc:`../CONTRIBUTING`. +documentation page on :doc:`../contributing`. - `Pandas on Ray`_ - Uses the Ray_ execution framework. diff --git a/docs/index.rst b/docs/index.rst index e2c94b81bc3..9b69b53cfa5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -136,10 +136,10 @@ nature, you get a fast DataFrame at 1MB and 1TB+. .. toctree:: :caption: How is Modin different from ...? - comparisons/index - comparisons/pandas - comparisons/dask - comparisons/spark + comparisons/index + comparisons/pandas + comparisons/dask + comparisons/spark .. toctree:: :caption: Supported APIs @@ -164,12 +164,6 @@ nature, you get a fast DataFrame at 1MB and 1TB+. UsingPyarrowonRay/index UsingSQLonRay/index -.. toctree:: - :caption: Contributing to Modin - - contributing - architecture - .. toctree:: :caption: Help From 3edf6d2fb50c1bff121819410c9682564b27b30d Mon Sep 17 00:00:00 2001 From: Mohammed Kashif Date: Sun, 15 Nov 2020 18:03:34 +0530 Subject: [PATCH 34/42] DOCS-#2420: Changed documentation to numpydoc style (#2429) Signed-off-by: Mohammed Kashif Co-authored-by: Mohammed Kashif --- .github/workflows/ci.yml | 1 + modin/engines/base/frame/axis_partition.py | 57 +++-- modin/engines/base/frame/data.py | 220 +++++++++++------- modin/engines/base/frame/partition.py | 46 ++-- modin/engines/base/frame/partition_manager.py | 109 ++++++--- 5 files changed, 278 insertions(+), 155 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0de02b29d3..062f32fe4b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,7 @@ jobs: - run: pydocstyle --convention=numpy --add-ignore=D101,D102 modin/pandas/series_utils.py - run: pydocstyle --convention=numpy --add-ignore=D103 modin/pandas/general.py - run: pydocstyle --convention=numpy modin/pandas/plotting.py modin/pandas/utils.py modin/pandas/iterator.py modin/pandas/indexing.py + - run: pydocstyle --convention=numpy --add-ignore=D100,D104 modin/engines/base/frame lint-flake8: name: lint (flake8) diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py index 765ac55777a..cf3c9bfa511 100644 --- a/modin/engines/base/frame/axis_partition.py +++ b/modin/engines/base/frame/axis_partition.py @@ -18,9 +18,9 @@ class BaseFrameAxisPartition(object): # pragma: no cover - """This abstract class represents the Parent class for any - `ColumnPartition` or `RowPartition` class. This class is intended to - simplify the way that operations are performed + """An abstract class that represents the Parent class for any `ColumnPartition` or `RowPartition` class. + + This class is intended to simplify the way that operations are performed. 
Note 0: The procedures that use this class and its methods assume that they have some global knowledge about the entire axis. This may @@ -46,7 +46,7 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies a function to a full axis. + """Apply a function to a full axis. Note: The procedures that invoke this method assume full axis knowledge. Implement this method accordingly. @@ -69,7 +69,8 @@ def apply( orientation (the lengths will remain the same). This is ignored between two axis partitions. - Returns: + Returns + ------- A list of `BaseFramePartition` objects. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -81,7 +82,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -95,9 +97,9 @@ def _wrap_partitions(self, partitions): class PandasFrameAxisPartition(BaseFrameAxisPartition): - """This abstract class is created to simplify and consolidate the code for - AxisPartitions that run pandas. Because much of the code is similar, this allows - us to reuse this code. + """An abstract class is created to simplify and consolidate the code for AxisPartitions that run pandas. + + Because much of the code is similar, this allows us to reuse this code. Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition` objects and creates something interpretable as a pandas DataFrame. @@ -115,23 +117,28 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies func to the object in the plasma store. + """Apply func to the object in the plasma store. See notes in Parent class about this method. - Args: - func: The function to apply. - num_splits: The number of times to split the result object. - other_axis_partition: Another `PandasOnRayFrameAxisPartition` object to apply to - func with this one. - maintain_partitioning: Whether or not to keep the partitioning in the same - orientation as it was previously. This is important because we may be - operating on an individual AxisPartition and not touching the rest. - In this case, we have to return the partitioning to its previous - orientation (the lengths will remain the same). This is ignored between - two axis partitions. + Parameters + ---------- + func: callable + The function to apply. + num_splits: int + The number of times to split the result object. + other_axis_partition: PandasOnRayFrameAxisPartition object + Another `PandasOnRayFrameAxisPartition` object to apply to func with this one. + maintain_partitioning: boolean + Whether or not to keep the partitioning in the same + orientation as it was previously. This is important because we may be + operating on an individual AxisPartition and not touching the rest. + In this case, we have to return the partitioning to its previous + orientation (the lengths will remain the same). This is ignored between + two axis partitions. - Returns: + Returns + ------- A list of `RayRemotePartition` objects. """ if num_splits is None: @@ -177,7 +184,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ num_splits = len(lengths) @@ -204,7 +212,8 @@ def deploy_axis_func( If False, create a new partition layout. 
partitions: All partitions that make up the full axis (row or column) - Returns: + Returns + ------- A list of Pandas DataFrames. """ # Pop these off first because they aren't expected by the function. diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index 3d57cd6b754..9bae5a85c92 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -25,13 +25,17 @@ class BasePandasFrame(object): + """An abstract class that represents the Parent class for any Pandas DataFrame class. + + This class is intended to simplify the way that operations are performed + """ _frame_mgr_cls = None _query_compiler_cls = PandasQueryCompiler @property def __constructor__(self): - """The constructor for this object. A convenience method""" + """Create a new instance of this object.""" return type(self) def __init__( @@ -87,7 +91,8 @@ def __init__( def _row_lengths(self): """Compute the row lengths if they are not cached. - Returns: + Returns + ------- A list of row lengths. """ if self._row_lengths_cache is None: @@ -103,7 +108,8 @@ def _row_lengths(self): def _column_widths(self): """Compute the column widths if they are not cached. - Returns: + Returns + ------- A list of column widths. """ if self._column_widths_cache is None: @@ -115,14 +121,15 @@ def _column_widths(self): @property def _axes_lengths(self): - """The row lengths, column widths that can be accessed with an `axis` integer.""" + """Row lengths, column widths that can be accessed with an `axis` integer.""" return [self._row_lengths, self._column_widths] @property def dtypes(self): """Compute the data types if they are not cached. - Returns: + Returns + ------- A pandas Series containing the data types for this dataframe. """ if self._dtypes is None: @@ -132,7 +139,8 @@ def dtypes(self): def _compute_dtypes(self): """Compute the dtypes via MapReduce. - Returns: + Returns + ------- The data types of this dataframe. """ @@ -154,13 +162,17 @@ def dtype_builder(df): _columns_cache = None def _validate_set_axis(self, new_labels, old_labels): - """Validates the index or columns replacement against the old labels. + """Validate the index or columns replacement against the old labels. - Args: - new_labels: The labels to replace with. - old_labels: The labels to replace. + Parameters + ---------- + new_labels: list-like + The labels to replace with. + old_labels: list-like + The labels to replace. - Returns: + Returns + ------- The validated labels. """ new_labels = ensure_index(new_labels) @@ -174,26 +186,30 @@ def _validate_set_axis(self, new_labels, old_labels): return new_labels def _get_index(self): - """Gets the index from the cache object. + """Get the index from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the row labels. """ return self._index_cache def _get_columns(self): - """Gets the columns from the cache object. + """Get the columns from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the column labels. """ return self._columns_cache def _set_index(self, new_index): - """Replaces the current row labels with new labels. + """Replace the current row labels with new labels. - Args: - new_index: The replacement row labels. + Parameters + ---------- + new_index: list-like + The replacement row labels. 
""" if self._index_cache is None: self._index_cache = ensure_index(new_index) @@ -203,10 +219,12 @@ def _set_index(self, new_index): self._apply_index_objs(axis=0) def _set_columns(self, new_columns): - """Replaces the current column labels with new labels. + """Replace the current column labels with new labels. - Args: - new_columns: The replacement column labels. + Parameters + ---------- + new_columns: list-like + The replacement column labels. """ if self._columns_cache is None: self._columns_cache = ensure_index(new_columns) @@ -218,7 +236,7 @@ def _set_columns(self, new_columns): self._apply_index_objs(axis=1) def _set_axis(self, axis, new_axis, cache_only=False): - """Replaces the current labels at the specified axis with the new one + """Replace the current labels at the specified axis with the new one. Parameters ---------- @@ -246,12 +264,12 @@ def _set_axis(self, axis, new_axis, cache_only=False): @property def axes(self): - """The index, columns that can be accessed with an `axis` integer.""" + """Index, columns that can be accessed with an `axis` integer.""" return [self.index, self.columns] def _compute_axis_labels(self, axis: int, partitions=None): """ - Computes labels for specific `axis` + Compute the labels for specific `axis`. Parameters ---------- @@ -273,7 +291,7 @@ def _compute_axis_labels(self, axis: int, partitions=None): ) def _filter_empties(self): - """Removes empty partitions to avoid triggering excess computation.""" + """Remove empty partitions to avoid triggering excess computation.""" if len(self.axes[0]) == 0 or len(self.axes[1]) == 0: # This is the case for an empty frame. We don't want to completely remove # all metadata and partitions so for the moment, we won't prune if the frame @@ -296,7 +314,7 @@ def _filter_empties(self): def _validate_axis_equality(self, axis: int, force: bool = False): """ - Validates internal and external indices of modin_frame at the specified axis. + Validate internal and external indices of modin_frame at the specified axis. Parameters ---------- @@ -329,8 +347,9 @@ def _validate_axis_equality(self, axis: int, force: bool = False): def _validate_internal_indices(self, mode=None, **kwargs): """ - Validates and optionally updates internal and external indices - of modin_frame in specified mode. There is 4 modes supported: + Validate and optionally updates internal and external indices of modin_frame in specified mode. + + There are 4 modes supported: 1. "reduced" - validates on that axis where external indices is ["__reduced__"] for not force 2. "reduced+other" - validates on axis where external @@ -394,7 +413,8 @@ def _apply_index_objs(self, axis=None): Args: axis: The axis to apply to, None applies to both axes. - Returns: + Returns + ------- A new 2D array of partitions that have the index assignment added to the call queue. """ @@ -680,7 +700,8 @@ def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None): def copy(self): """Copy this object. - Returns: + Returns + ------- A copied version of this object. """ return self.__constructor__( @@ -694,13 +715,14 @@ def copy(self): @classmethod def combine_dtypes(cls, list_of_dtypes, column_names): - """Describes how data types should be combined when they do not match. + """Describe how data types should be combined when they do not match. Args: list_of_dtypes: A list of pandas Series with the data types. column_names: The names of the columns that the data types map to. - Returns: + Returns + ------- A pandas Series containing the finalized data types. 
""" # Compute dtypes by getting collecting and combining all of the partitions. The @@ -716,13 +738,14 @@ def combine_dtypes(cls, list_of_dtypes, column_names): return dtypes def astype(self, col_dtypes): - """Converts columns dtypes to given dtypes. + """Convert the columns dtypes to given dtypes. Args: col_dtypes: Dictionary of {col: dtype,...} where col is the column name and dtype is a numpy dtype. - Returns: + Returns + ------- dataframe with updated dtypes. """ columns = col_dtypes.keys() @@ -774,7 +797,8 @@ def add_prefix(self, prefix, axis): prefix: The prefix to add. axis: The axis to update. - Returns: + Returns + ------- A new dataframe with the updated labels. """ new_labels = self.axes[axis].map(lambda x: str(prefix) + str(x)) @@ -792,7 +816,8 @@ def add_suffix(self, suffix, axis): suffix: The suffix to add. axis: The axis to update. - Returns: + Returns + ------- A new dataframe with the updated labels. """ new_labels = self.axes[axis].map(lambda x: str(x) + str(suffix)) @@ -806,9 +831,10 @@ def add_suffix(self, suffix, axis): # END Metadata modification methods def _numeric_columns(self, include_bool=True): - """Returns the numeric columns of the Manager. + """Return the numeric columns of the Manager. - Returns: + Returns + ------- List of index names. """ columns = [] @@ -945,7 +971,7 @@ def internal(block_idx, global_index): def _join_index_objects(self, axis, other_index, how, sort): """ - Joins a pair of index objects (columns or rows) by a given strategy. + Join the pair of index objects (columns or rows) by a given strategy. Unlike Index.join() in Pandas, if axis is 1, the sort is False, and how is "outer", the result will _not_ be sorted. @@ -994,11 +1020,15 @@ def _build_mapreduce_func(self, axis, func): Note: This should be used for any MapReduce style operation that results in a reduced data dimensionality (dataframe -> series). - Args: - axis: The axis along which to apply the function. - func: The function to apply. + Parameters + ---------- + axis: int + The axis along which to apply the function. + func: callable + The function to apply. - Returns: + Returns + ------- A function to be shipped to the partitions to be executed. """ @@ -1020,7 +1050,7 @@ def _map_reduce_func(df, *args, **kwargs): def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True): """ - Computes metadata for the result of reduce function. + Compute the metadata for the result of reduce function. Parameters ---------- @@ -1126,7 +1156,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): """Perform a function that maps across the entire dataset. - Pamareters + Parameters ---------- func : callable The function to apply. @@ -1136,6 +1166,7 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): type, and allows us to avoid (re)computing it. validate_index : bool, (default False) Is index validation required after performing `func` on partitions. + Returns ------- A new dataframe. @@ -1175,11 +1206,15 @@ def _fold(self, axis, func): Note: The data shape is not changed (length and width of the table). - Args: - axis: The axis to apply over. - func: The function to apply. + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply. - Returns: + Returns + ------- A new dataframe. 
""" new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1196,12 +1231,16 @@ def _fold(self, axis, func): def filter_full_axis(self, axis, func): """Filter data based on the function provided along an entire axis. - Args: - axis: The axis to filter over. - func: The function to use for the filter. This function should filter the + Parameters + ---------- + axis: int + The axis to filter over. + func: callable + The function to use for the filter. This function should filter the data itself. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1280,18 +1319,27 @@ def _apply_full_axis_select_indices( ): """Apply a function across an entire axis for a subset of the data. - Args: - axis: The axis to apply over. - func: The function to apply - apply_indices: The labels to apply over. - numeric_indices: The indices to apply over. - new_index: (optional) The index of the result. We may know this in advance, + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply + apply_indices: list-like + The labels to apply over. + numeric_indices: list-like + The indices to apply over. + new_index: list-like (optional) + The index of the result. We may know this in advance, and if not provided it must be computed. - new_columns: (optional) The columns of the result. We may know this in + new_columns: list-like (optional) + The columns of the result. We may know this in advance, and if not provided it must be computed. - keep_remaining: Whether or not to drop the data that is not computed over. + keep_remaining: boolean + Whether or not to drop the data that is not computed over. - Returns: + Returns + ------- A new dataframe. """ assert apply_indices is not None or numeric_indices is not None @@ -1332,7 +1380,8 @@ def _apply_select_indices( ): """Apply a function for a subset of the data. - Args: + Parameters + ---------- axis: The axis to apply over. func: The function to apply apply_indices: (optional) The labels to apply over. Must be given if axis is @@ -1349,7 +1398,8 @@ def _apply_select_indices( item_to_distribute: (optional) The item to split up so it can be applied over both axes. - Returns: + Returns + ------- A new dataframe. """ # TODO Infer columns and index from `keep_remaining` and `apply_indices` @@ -1458,7 +1508,7 @@ def broadcast_apply( def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): """ - Computes indices to broadcast `self` with considering of `indices` + Compute the indices to broadcast `self` with considering of `indices`. Parameters ---------- @@ -1508,8 +1558,7 @@ def broadcast_apply_select_indices( new_columns=None, ): """ - Applyies `func` to select indices at specified axis and broadcasts - partitions of `other` frame. + Apply `func` to select indices at specified axis and broadcasts partitions of `other` frame. Parameters ---------- @@ -1811,13 +1860,19 @@ def _binary_op(self, op, right_frame, join_type="outer"): def _concat(self, axis, others, how, sort): """Concatenate this dataframe with one or more others. - Args: - axis: The axis to concatenate over. - others: The list of dataframes to concatenate with. - how: The type of join to use for the axis. - sort: Whether or not to sort the result. + Parameters + ---------- + axis: int + The axis to concatenate over. + others: List of dataframes + The list of dataframes to concatenate with. + how: str + The type of join to use for the axis. + sort: boolean + Whether or not to sort the result. 
- Returns: + Returns + ------- A new dataframe. """ # Fast path for equivalent columns and partitioning @@ -1883,7 +1938,8 @@ def groupby_reduce( new_columns: (optional) The columns of the result. We may know this in advance, and if not provided it must be computed. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.groupby_reduce( @@ -1902,10 +1958,12 @@ def groupby_reduce( def from_pandas(cls, df): """Improve simple Pandas DataFrame to an advanced and superior Modin DataFrame. - Args: + Parameters + ---------- df: Pandas DataFrame object. - Returns: + Returns + ------- A new dataframe. """ new_index = df.index @@ -1961,9 +2019,10 @@ def _arrow_type_to_dtype(cls, arrow_type): return res def to_pandas(self): - """Converts Modin DataFrame to Pandas DataFrame. + """Convert a Modin DataFrame to Pandas DataFrame. - Returns: + Returns + ------- Pandas DataFrame. """ df = self._frame_mgr_cls.to_pandas(self._partitions) @@ -1985,7 +2044,7 @@ def to_pandas(self): def to_numpy(self, **kwargs): """ - Converts Modin DataFrame to a 2D NumPy array. + Convert a Modin DataFrame to a 2D NumPy array. Returns ------- @@ -1996,7 +2055,8 @@ def to_numpy(self, **kwargs): def transpose(self): """Transpose the index and columns of this dataframe. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.lazy_map_partitions( diff --git a/modin/engines/base/frame/partition.py b/modin/engines/base/frame/partition.py index 8854b346e77..6a3c9a49d8e 100644 --- a/modin/engines/base/frame/partition.py +++ b/modin/engines/base/frame/partition.py @@ -15,7 +15,8 @@ class BaseFramePartition(object): # pragma: no cover - """This abstract class holds the data and metadata for a single partition. + """An abstract class that holds the data and metadata for a single partition. + The methods required for implementing this abstract class are listed in the section immediately following this. @@ -36,7 +37,8 @@ def get(self): E.g. if you assign `x = BaseFramePartition.put(1)`, `x.get()` should always return 1. - Returns: + Returns + ------- The object that was `put`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -51,7 +53,8 @@ def apply(self, func, **kwargs): Args: func: The lambda to apply (may already be correctly formatted) - Returns: + Returns + ------- A new `BaseFramePartition` containing the object that has had `func` applied to it. """ @@ -74,7 +77,8 @@ def to_pandas(self): Note: If the underlying object is a Pandas DataFrame, this will likely only need to call `get` - Returns: + Returns + ------- A Pandas DataFrame. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -85,7 +89,8 @@ def to_numpy(self, **kwargs): Note: If the underlying object is a Pandas DataFrame, this will return a 2D NumPy array. - Returns: + Returns + ------- A NumPy array. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -97,19 +102,22 @@ def mask(self, row_indices, col_indices): row_indices: The indices for the rows to extract. col_indices: The indices for the columns to extract. - Returns: + Returns + ------- A `BaseFramePartition` object. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def put(cls, obj): - """A factory classmethod to format a given object. + """Format a given object. - Args: + Parameters + ---------- obj: An object. - Returns: + Returns + ------- A `BaseFramePartition` object. 
""" raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -126,25 +134,28 @@ def preprocess_func(cls, func): Args: func: The function to preprocess. - Returns: + Returns + ------- An object that can be accepted by `apply`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def length_extraction_fn(cls): - """The function to compute the length of the object in this partition. + """Compute the length of the object in this partition. - Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def width_extraction_fn(cls): - """The function to compute the width of the object in this partition. + """Compute the width of the object in this partition. - Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -153,6 +164,7 @@ def width_extraction_fn(cls): _width_cache = None def length(self): + """Return the length of partition.""" if self._length_cache is None: cls = type(self) func = cls.length_extraction_fn() @@ -161,6 +173,7 @@ def length(self): return self._length_cache def width(self): + """Return the width of partition.""" if self._width_cache is None: cls = type(self) func = cls.width_extraction_fn() @@ -170,9 +183,10 @@ def width(self): @classmethod def empty(cls): - """Create an empty partition + """Create an empty partition. - Returns; + Returns + ------- An empty partition """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index 5f23ffd98a0..b917ccedb13 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -20,8 +20,11 @@ class BaseFrameManager(object): - # Partition class is the class to use for storing each partition. It must - # extend the `BaseFramePartition` class. + """Partition class is the class to use for storing each partition. It must extend the `BaseFramePartition` class. + + It is the base class for managing the dataframe data layout and operators. + """ + _partition_class = None # Column partitions class is the class to use to create the column partitions. _column_partitions_class = None @@ -43,6 +46,7 @@ def preprocess_func(cls, map_func): map_func: The function to be preprocessed. Returns + ------- The preprocessed version of the `map_func` provided. Note: This does not require any specific format, only that the `BaseFramePartition.apply` method will recognize it (For the subclass @@ -54,28 +58,33 @@ def preprocess_func(cls, map_func): @classmethod def column_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects. + """List of `BaseFrameAxisPartition` objects. Note: Each value in this list will be an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. """ return [cls._column_partitions_class(col) for col in partitions.T] @classmethod def row_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects, represents column partitions. + """List of `BaseFrameAxisPartition` objects, represents column partitions. Note: Each value in this list will an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. 
""" return [cls._row_partition_class(row) for row in partitions] @classmethod def axis_partition(cls, partitions, axis): + """Logically partition along either the columns or the rows.""" return ( cls.column_partitions(partitions) if not axis @@ -84,6 +93,7 @@ def axis_partition(cls, partitions, axis): @classmethod def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): + """Groupby data using the map_func provided along the axis over the partitions then reduce using reduce_func.""" mapped_partitions = cls.broadcast_apply( axis, map_func, left=partitions, right=by, other_name="other" ) @@ -101,7 +111,7 @@ def broadcast_apply_select_indices( keep_remaining=False, ): """ - Broadcast the right partitions to left and apply a function to selected indices + Broadcast the right partitions to left and apply a function to selected indices. Note: Your internal function must take this kwargs: [`internal_indices`, `other`, `internal_other_indices`] to work correctly @@ -272,12 +282,15 @@ def broadcast_axis_partitions( @classmethod def map_partitions(cls, partitions, map_func): - """Applies `map_func` to every partition. + """Apply `map_func` to every partition. - Args: - map_func: The function to apply. + Parameters + ---------- + map_func: callable + The function to apply. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ preprocessed_map_func = cls.preprocess_func(map_func) @@ -290,6 +303,18 @@ def map_partitions(cls, partitions, map_func): @classmethod def lazy_map_partitions(cls, partitions, map_func): + """ + Apply `map_func` to every partition lazily. + + Parameters + ---------- + map_func: callable + The function to apply. + + Returns + ------- + A new BaseFrameManager object, the type of object that called this. + """ preprocessed_map_func = cls.preprocess_func(map_func) return np.array( [ @@ -308,7 +333,7 @@ def map_axis_partitions( lengths=None, ): """ - Applies `map_func` to every partition. + Apply `map_func` to every partition. Parameters ---------- @@ -345,7 +370,7 @@ def map_axis_partitions( @classmethod def simple_shuffle(cls, axis, partitions, map_func, lengths): """ - Shuffle data using `lengths` via `map_func` + Shuffle data using `lengths` via `map_func`. Parameters ---------- @@ -395,7 +420,8 @@ def concat(cls, axis, left_parts, right_parts): right_parts: the other blocks to be concatenated. This is a BaseFrameManager object. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if type(right_parts) is list: @@ -410,7 +436,7 @@ def concat(cls, axis, left_parts, right_parts): @classmethod def concatenate(cls, dfs): """ - Concatenate Pandas DataFrames with saving 'category' dtype + Concatenate Pandas DataFrames with saving 'category' dtype. Parameters ---------- @@ -435,7 +461,8 @@ def concatenate(cls, dfs): def to_pandas(cls, partitions): """Convert this object into a Pandas DataFrame from the partitions. 
- Returns: + Returns + ------- A Pandas DataFrame """ retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions] @@ -476,6 +503,7 @@ def to_numpy(cls, partitions, **kwargs): @classmethod def from_pandas(cls, df, return_dims=False): + """Return the partitions from Pandas DataFrame.""" num_splits = cls._compute_num_partitions() put_func = cls._partition_class.put row_chunksize, col_chunksize = compute_chunksize(df, num_splits) @@ -505,11 +533,12 @@ def from_pandas(cls, df, return_dims=False): @classmethod def from_arrow(cls, at, return_dims=False): + """Return the partitions from Apache Arrow (PyArrow).""" return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod def get_indices(cls, axis, partitions, index_func=None): - """This gets the internal indices stored in the partitions. + """Get the internal indices stored in the partitions. Note: These are the global indices of the object. This is mostly useful when you have deleted rows/columns internally, but do not know @@ -519,7 +548,8 @@ def get_indices(cls, axis, partitions, index_func=None): axis: This axis to extract the labels. (0 - index, 1 - columns). index_func: The function to be used to extract the function. - Returns: + Returns + ------- A Pandas Index object. """ ErrorMessage.catch_bugs_and_request_email(not callable(index_func)) @@ -541,10 +571,11 @@ def get_indices(cls, axis, partitions, index_func=None): @classmethod def _compute_num_partitions(cls): - """Currently, this method returns the default. In the future it will - estimate the optimal number of partitions. + """Retrieve the default number of partitions currently. Will estimate the optimal no. of partitions in future. - :return: + Returns + ------- + Number of partitions. """ from modin.pandas import DEFAULT_NPARTITIONS @@ -564,7 +595,7 @@ def _apply_func_to_list_of_partitions_broadcast( @classmethod def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): - """Applies a function to a list of remote partitions. + """Apply a function to a list of remote partitions. Note: The main use for this is to preprocess the func. @@ -572,7 +603,8 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): func: The func to apply partitions: The list of partitions - Returns: + Returns + ------- A list of BaseFramePartition objects. """ preprocessed_func = cls.preprocess_func(func) @@ -584,7 +616,7 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): def apply_func_to_select_indices( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to select indices. + """Apply a function to select indices. Note: Your internal function must take a kwarg `internal_indices` for this to work correctly. This prevents information leakage of the @@ -598,7 +630,8 @@ def apply_func_to_select_indices( Some operations may want to drop the remaining partitions and keep only the results. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if partitions.size == 0: @@ -685,7 +718,7 @@ def apply_func_to_select_indices( def apply_func_to_select_indices_along_full_axis( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to a select subset of full columns/rows. + """Apply a function to a select subset of full columns/rows. 
        Note: This should be used when you need to apply a function that relies
        on some global information for the entire column/row, but only need
        to apply a function to a subset.

        Important: For your func to operate directly on the indices provided,
        it must use `internal_indices` as a keyword argument.

-        Args:
-            axis: The axis to apply the function over (0 - rows, 1 - columns)
-            func: The function to apply.
-            indices: The global indices to apply the func to.
-            keep_remaining: Whether or not to keep the other partitions.
-                Some operations may want to drop the remaining partitions and
-                keep only the results.
+        Parameters
+        ----------
+        axis: int
+            The axis to apply the function over (0 - rows, 1 - columns)
+        func: callable
+            The function to apply.
+        indices: list-like
+            The global indices to apply the func to.
+        keep_remaining: boolean
+            Whether or not to keep the other partitions.
+            Some operations may want to drop the remaining partitions and
+            keep only the results.

-        Returns:
+        Returns
+        -------
            A new BaseFrameManager object, the type of object that called this.
        """
        if partitions.size == 0:
@@ -794,7 +833,7 @@ def apply_func_to_indices_both_axis(
        item_to_distribute=None,
    ):
        """
-        Apply a function to along both axis
+        Apply a function along both axes.

        Important: For your func to operate directly on the indices provided,
        it must use `row_internal_indices, col_internal_indices` as keyword

From 3e32d0273e56c192b104fb65ad8fcbcdc801647c Mon Sep 17 00:00:00 2001
From: "Abdulelah S. Al Mesfer" <28743265+abdulelahsm@users.noreply.github.com>
Date: Mon, 16 Nov 2020 01:18:38 +0300
Subject: [PATCH 35/42] DOCS-#2433: Updated README.md with modin_vs_dask.md doc
 (#2435)

Signed-off-by: Abdulelah S. Al Mesfer
---
 README.md             |  2 +-
 docs/modin_vs_dask.md | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 docs/modin_vs_dask.md

diff --git a/README.md b/README.md
index a8474c2e432..62551a367ba 100644
--- a/README.md
+++ b/README.md
@@ -180,7 +180,7 @@ and improve:
 ![Architecture](docs/img/modin_architecture.png)

 Visit the [Documentation](https://modin.readthedocs.io/en/latest/developer/architecture.html) for
-more information!
+more information, and check out [the difference between Modin and Dask!](https://github.com/modin-project/modin/tree/master/docs/modin_vs_dask.md)

 **`modin.pandas` is currently under active development. Requests and contributions are welcome!**

diff --git a/docs/modin_vs_dask.md b/docs/modin_vs_dask.md
new file mode 100644
index 00000000000..477dba9a887
--- /dev/null
+++ b/docs/modin_vs_dask.md
@@ -0,0 +1,32 @@
+# What is the difference between Dask DataFrame and Modin?
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: The projects are fundamentally different in their aims, so a fair comparison is challenging.**
+
+## API
+
+### Dask DataFrame
+
+Dask DataFrame does not scale the entire pandas API, and it isn't trying to. This is explained in their documentation [here](http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses).
+
+The Dask DataFrame API is also different from the pandas API in that it is lazy and needs `.compute()` to materialize the DataFrame. This makes the API less convenient but allows Dask to do certain query optimizations/rearrangements, which can give speedups in certain situations. We are planning to incorporate similar capabilities into Modin but hope we can do so without having to change the API.
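+
+A minimal sketch of that difference, assuming a `data.csv` file with `id` and `value` columns (placeholder names):
+
+```python
+import dask.dataframe as dd
+import modin.pandas as pd
+
+# Dask is lazy: this line only builds a task graph, nothing is read yet.
+lazy_mean = dd.read_csv("data.csv").groupby("id")["value"].mean()
+dask_result = lazy_mean.compute()  # materialization must be requested explicitly
+
+# Modin is eager and keeps the pandas API: no .compute() step.
+modin_result = pd.read_csv("data.csv").groupby("id")["value"].mean()
+```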
+We will outline plans for speeding up Modin in an upcoming blog post.
+
+### Modin
+
+Modin attempts to parallelize as much of the pandas API as possible. We have worked through a significant portion of the DataFrame API. It is intended to be used as a drop-in replacement for pandas, such that even if the API is not yet parallelized, it still defaults to pandas.
+
+## Architecture
+
+### Dask DataFrame
+
+Dask DataFrame has row-based partitioning, similar to Spark. This can be seen in their [documentation](http://docs.dask.org/en/latest/dataframe.html#design). They also have a custom index object for indexing into the object, which is not pandas compatible. Dask DataFrame seems to treat operations on the DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas API they have chosen to implement.
+
+### Modin
+
+Modin is more of a column-store, which we inherited from modern database systems. We laterally partition the columns for scalability (many systems, such as Google BigTable, already did this), so we can scale in both directions and have finer grained partitioning. This is explained at a high level in [Modin's documentation](https://modin.readthedocs.io/en/latest/architecture.html). Because we have this finer grained control over the partitioning, we can support a number of operations that are very challenging in MapReduce systems (e.g. transpose, median, quantile).
+
+## Modin aims
+
+In the long-term, Modin is planned to become a DataFrame library that supports the popular APIs (SQL, pandas, etc.) and runs on a variety of compute engines and backends. In fact, a group was able to contribute a dask.delayed backend to Modin in fewer than 200 lines of code ([PR](https://github.com/modin-project/modin/pull/281)).
+
+
+- Reference: [Query: What is the difference between Dask and Modin?
#515](https://github.com/modin-project/modin/issues/515) \ No newline at end of file From 5d3f6937d4cfc3f70998a77d8aef479fbd3808bf Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 18 Nov 2020 01:52:09 +0300 Subject: [PATCH 36/42] FIX-#2450: fix CI recipe (#2449) Signed-off-by: Dmitry Chigarev --- .github/workflows/ci.yml | 22 +++++++++++----------- .github/workflows/push.yml | 12 ++++++------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 062f32fe4b7..c60d1049eb9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: node-version: "10.x" - run: npm install --save-dev @commitlint/{config-conventional,cli} commitlint-plugin-jira-rules commitlint-config-jira - name: Add dependencies for commitlint action - run: echo "::set-env name=NODE_PATH::$GITHUB_WORKSPACE/node_modules" + run: echo "NODE_PATH=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_ENV - run: git remote add upstream https://github.com/modin-project/modin.git - run: git fetch upstream - run: npx commitlint --from upstream/master --to HEAD --verbose @@ -108,7 +108,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -142,7 +142,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -171,7 +171,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -212,7 +212,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -278,7 +278,7 @@ jobs: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - name: Setting up Modin environment - uses: goanpeca/setup-miniconda@v1.6.0 + uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin_on_omnisci python-version: 3.7.8 @@ -317,7 +317,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -385,7 +385,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -423,7 +423,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: 
conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -461,7 +461,7 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -546,7 +546,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 5c56af9fe5b..a5594a8af81 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -42,7 +42,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -83,7 +83,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -149,7 +149,7 @@ jobs: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - name: Setting up Modin environment - uses: goanpeca/setup-miniconda@v1.6.0 + uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin_on_omnisci python-version: 3.7.8 @@ -188,7 +188,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -260,7 +260,7 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -345,7 +345,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml From 54604f27f1dddf2c2c023c240f993e2fcf98cb0f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 19 Nov 2020 08:07:50 -0600 Subject: [PATCH 37/42] DOCS-#2437: Add documentation contrasting Modin and Dask (#2441) * Resolves #2437 Signed-off-by: Devin Petersohn --- docs/comparisons/dask.rst | 88 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/docs/comparisons/dask.rst b/docs/comparisons/dask.rst index aced4215900..82627916413 100644 --- a/docs/comparisons/dask.rst +++ b/docs/comparisons/dask.rst @@ -1,4 +1,90 @@ Modin vs. Dask Dataframe ======================== -Coming Soon... 
+Dask's DataFrame is effectively a meta-frame that partitions and schedules many smaller
+``pandas.DataFrame`` objects. The Dask DataFrame does not implement the entire pandas
+API, and it isn't trying to. This is explained in the `Dask DataFrame documentation`_.
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: the
+projects are fundamentally different in their aims, so a fair comparison is
+challenging.**
+
+API
+---
+The APIs of Modin and Dask differ in several ways, explained here.
+
+Dask DataFrame
+""""""""""""""
+
+Dask is currently missing multiple APIs from pandas that Modin has implemented. Of note:
+Dask does not implement ``iloc``, ``MultiIndex``, ``apply(axis=0)``, ``quantile``,
+``median``, and more. Some of these APIs cannot be implemented efficiently or at all
+given the architectural design tradeoffs made in Dask's implementation, and others
+simply require engineering effort. ``iloc``, for example, can be implemented, but it
+would be inefficient, and ``apply(axis=0)`` cannot be implemented at all in Dask's
+architecture.
+
+The Dask DataFrame API also differs from the pandas API in that it is lazy and needs
+explicit ``.compute()`` calls to materialize the DataFrame. This makes the API less
+convenient but allows Dask to do certain query optimizations/rearrangements, which can
+give speedups in certain situations. Several additional APIs exist in the Dask DataFrame
+API that expose internal state about how the data is chunked and other data layout
+details, along with ways to manipulate that state.
+
+Semantically, Dask sorts the ``index``, which does not allow for user-specified order.
+In Dask's case, this was done for optimization purposes, to speed up other computations
+which involve the row index.
+
+Modin
+"""""
+
+Modin is targeted toward parallelizing the entire pandas API, without exception.
+As the pandas API continues to evolve, so will Modin's pandas API. Modin is intended to
+be used as a drop-in replacement for pandas, such that even if an API is not yet
+parallelized, it still works by falling back to running pandas. One of the key features
+of being a drop-in replacement is that not only does it work with existing code: a user
+who wishes to go back to running pandas directly may do so at no cost. There is no
+lock-in: Modin notebooks can be converted to and from pandas as the user prefers.
+
+In the long term, Modin is planned to become a data science framework that supports all
+popular APIs (SQL, pandas, etc.) with the same underlying execution.
+
+Architecture
+------------
+
+The differences between Modin's and Dask's architectures are explained in this section.
+
+Dask DataFrame
+""""""""""""""
+
+Dask DataFrame uses row-based partitioning, similar to Spark. This can be seen in their
+`documentation`_. They also have a custom index object for indexing into the object,
+which is not pandas compatible. Dask DataFrame seems to treat operations on the
+DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas
+API they have chosen to implement, but it makes certain operations impossible. Dask
+DataFrame is also lazy and places a lot of partitioning responsibility on the user.
+
+Modin
+"""""
+
+Modin's partitioning is much more flexible, so the system can scale in both directions
+and have finer-grained partitioning. This is explained at a high level in `Modin's
+documentation`_. Because Modin has this finer-grained control over the partitioning, it
+can support a number of operations that are very challenging in MapReduce systems (e.g.
+transpose, median, quantile).
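+For illustration, here is a minimal sketch of such full-axis operations (the file name
+``example.csv`` is hypothetical):
+
+.. code-block:: python
+
+   import modin.pandas as pd
+
+   df = pd.read_csv("example.csv")
+   medians = df.median()  # full-axis reduction across row partitions
+   transposed = df.T      # requires flexible, two-dimensional partitioning
+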
+This flexibility in partitioning also gives Modin tremendous power to implement
+efficient straggler mitigation and improvements in utilization over the entire cluster.
+
+Modin is also architected to run on a variety of systems. The goal here is that users
+can take the same notebook to different clusters or different environments and it will
+still just work, running on whatever resources are available. Modin does support running
+on Dask's compute engine in addition to Ray. The architecture of Modin is extremely
+modular; this modularity is what enables us to add different execution engines or
+compile to different memory formats. Modin can run on a Dask cluster in the same way
+that Dask DataFrame can, but they will still be different in all of the ways described
+above.
+
+Modin's implementation is grounded in theory, which is what enables us to implement the
+entire pandas API.
+
+.. _Dask DataFrame documentation: http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses
+.. _documentation: http://docs.dask.org/en/latest/dataframe.html#design
+.. _Modin's documentation: https://modin.readthedocs.io/en/latest/developer/architecture.html

From 74acc1b05bd7a612d69d1d77c14c8b0c98eae4ba Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com>
Date: Fri, 20 Nov 2020 13:18:26 +0300
Subject: [PATCH 38/42] FEAT-#2444: add docker file for nyc on omnisci (#2445)

Signed-off-by: Anatoly Myachev
---
 .../taxi-on-omnisci/build-docker-image.sh     |  19 +++
 .../nyc-taxi-omnisci.dockerfile               |  53 +++++++
 .../taxi-on-omnisci/nyc-taxi-omnisci.py       | 108 ++++++++++++++++++
 3 files changed, 180 insertions(+)
 create mode 100644 examples/docker/taxi-on-omnisci/build-docker-image.sh
 create mode 100644 examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile
 create mode 100644 examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py

diff --git a/examples/docker/taxi-on-omnisci/build-docker-image.sh b/examples/docker/taxi-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..7395976a709
--- /dev/null
+++ b/examples/docker/taxi-on-omnisci/build-docker-image.sh
@@ -0,0 +1,19 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+cd "`dirname \"$0\"`"
+
+docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy .
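+# Note: the bare --build-arg flags above forward http_proxy/https_proxy from the
+# host environment into the image build, so the image can also be built behind a proxy.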
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm nyc-taxi-omnisci\n" diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile new file mode 100644 index 00000000000..ba4d6b8f3e1 --- /dev/null +++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile @@ -0,0 +1,53 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +FROM ubuntu:18.04 +ENV http_proxy ${http_proxy} +ENV https_proxy ${https_proxy} +ENV MODIN_BACKEND "omnisci" +ENV MODIN_EXPERIMENTAL "true" + +RUN apt-get update --yes \ + && apt-get install wget --yes && \ + rm -rf /var/lib/apt/lists/* + +ENV USER modin +ENV UID 1000 +ENV HOME /home/$USER + +RUN adduser --disabled-password \ + --gecos "Non-root user" \ + --uid $UID \ + --home $HOME \ + $USER + +ENV CONDA_DIR ${HOME}/miniconda + +SHELL ["/bin/bash", "--login", "-c"] + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ + bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ + "${CONDA_DIR}/bin/conda" init bash && \ + rm -f /tmp/miniconda3.sh && \ + echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" + +RUN conda update -n base -c defaults conda -y && \ + conda create -n modin --yes --no-default-packages && \ + conda activate modin && \ + conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" && \ + conda clean --all --yes + +COPY trips_xaa.csv "${HOME}/trips_xaa.csv" +COPY nyc-taxi-omnisci.py "${HOME}/nyc-taxi-omnisci.py" + +CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/nyc-taxi-omnisci.py"] diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py new file mode 100644 index 00000000000..535e93727f9 --- /dev/null +++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py @@ -0,0 +1,108 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
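+#
+# This script benchmarks four NYC taxi queries on Modin's OmniSci backend. The
+# backend evaluates lazily, so each step ends with a .shape call that forces
+# real execution before the time is reported.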
+ +import os +import time +import modin.pandas as pd +from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer + +def read(): + columns_names = [ + "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", + "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", + "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount", + "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type", + "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall", + "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid", + "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010", + "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma", + "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname", + "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode", + "dropoff_ntaname", "dropoff_puma", + ] + # use string instead of category + columns_types = [ + "int64", "string", "timestamp", "timestamp", "string", "int64", "float64", "float64", + "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "string", "float64", "string", "string", "string", "float64", + "int64", "float64", "int64", "int64", "float64", "float64", "float64", "float64", "string", "float64", + "float64", "string", "string", "string", "float64", "float64", "float64", "float64", "string", + "float64", "float64", "string", "string", "string", "float64", + ] + + dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} + all_but_dates = { + col: valtype for (col, valtype) in dtypes.items() if valtype not in ["timestamp"] + } + dates_only = [col for (col, valtype) in dtypes.items() if valtype in ["timestamp"]] + + df = pd.read_csv( + os.path.expanduser('~/trips_xaa.csv'), + names=columns_names, + dtype=all_but_dates, + parse_dates=dates_only, + ) + + df.shape # to trigger real execution + df._query_compiler._modin_frame._partitions[0][ + 0 + ].frame_id = OmnisciServer().put_arrow_to_omnisci( + df._query_compiler._modin_frame._partitions[0][0].get() + ) # to trigger real execution + return df + + +def q1_omnisci(df): + q1_pandas_output = df.groupby("cab_type").size() + q1_pandas_output.shape # to trigger real execution + return q1_pandas_output + +def q2_omnisci(df): + q2_pandas_output = df.groupby("passenger_count").agg({"total_amount": "mean"}) + q2_pandas_output.shape # to trigger real execution + return q2_pandas_output + +def q3_omnisci(df): + df["pickup_datetime"] = df["pickup_datetime"].dt.year + q3_pandas_output = df.groupby(["passenger_count", "pickup_datetime"]).size() + q3_pandas_output.shape # to trigger real execution + return q3_pandas_output + +def q4_omnisci(df): + df["pickup_datetime"] = df["pickup_datetime"].dt.year + df["trip_distance"] = df["trip_distance"].astype("int64") + q4_pandas_output = ( + df.groupby(["passenger_count", "pickup_datetime", "trip_distance"], sort=False) + .size() + .reset_index() + .sort_values(by=["pickup_datetime", 0], ignore_index=True, ascending=[True, False]) + ) + q4_pandas_output.shape # to trigger real execution + return q4_pandas_output + +def measure(name, func, *args, **kw): + t0 = time.time() + res = func(*args, **kw) + t1 = time.time() + print(f'{name}: {t1 - t0} sec') + return res + +def main(): + 
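+    # q1/q2 only read the frame, while q3_omnisci/q4_omnisci mutate
+    # 'pickup_datetime' and 'trip_distance' in place, hence df.copy() below.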
df = measure('Reading', read) + measure('Q1', q1_omnisci, df) + measure('Q2', q2_omnisci, df) + measure('Q3', q3_omnisci, df.copy()) + measure('Q4', q4_omnisci, df.copy()) + +if __name__ == '__main__': + main() From 03dbbef819d78ecc035be35a904df02977caa680 Mon Sep 17 00:00:00 2001 From: Abolfazl Shahbazi Date: Fri, 20 Nov 2020 02:27:13 -0800 Subject: [PATCH 39/42] FIX-#2458: fix 'psutil' install (#2452) Signed-off-by: Anatoly Myachev --- examples/docker/nyc-taxi.dockerfile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/docker/nyc-taxi.dockerfile b/examples/docker/nyc-taxi.dockerfile index f10e749a1f7..a4703745a92 100644 --- a/examples/docker/nyc-taxi.dockerfile +++ b/examples/docker/nyc-taxi.dockerfile @@ -12,11 +12,16 @@ # governing permissions and limitations under the License. FROM ubuntu:18.04 + +ARG PYTHON_VERSION=3.7 ENV http_proxy ${http_proxy} ENV https_proxy ${https_proxy} -RUN apt-get update --yes \ - && apt-get install wget --yes && \ +RUN apt-get update --yes && \ + apt-get install --yes --no-install-recommends --fix-missing \ + gcc \ + python${PYTHON_VERSION}-dev \ + wget && \ rm -rf /var/lib/apt/lists/* ENV USER modin @@ -33,7 +38,7 @@ ENV CONDA_DIR ${HOME}/miniconda SHELL ["/bin/bash", "--login", "-c"] -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ +RUN wget --quiet --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ "${CONDA_DIR}/bin/conda" init bash && \ rm -f /tmp/miniconda3.sh && \ @@ -45,7 +50,7 @@ RUN conda update -n base -c defaults conda -y && \ pip install --no-cache-dir modin[ray] && \ conda clean --all --yes -RUN wget https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv" +RUN wget --quiet --no-check-certificate https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv" COPY nyc-taxi.py "${HOME}/nyc-taxi.py" From 80125c14a47551d8cf024baf7b2d2bdd885be977 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com> Date: Fri, 20 Nov 2020 13:28:34 +0300 Subject: [PATCH 40/42] FIX-#2456: update taxi queries with .copy usage (#2457) Signed-off-by: Anatoly Myachev --- examples/docker/nyc-taxi.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/docker/nyc-taxi.py b/examples/docker/nyc-taxi.py index 43cb53e1852..753b857b52b 100644 --- a/examples/docker/nyc-taxi.py +++ b/examples/docker/nyc-taxi.py @@ -40,19 +40,14 @@ def q2(df): return df.groupby("passenger_count", as_index=False).mean()[["passenger_count", "total_amount"]] def q3(df): - transformed = pd.DataFrame({ - "passenger_count": df["passenger_count"], - "pickup_datetime": df["pickup_datetime"].dt.year, - }) - return transformed.groupby(["pickup_datetime", "passenger_count"]).agg({"passenger_count": ["count"]}) + df["pickup_datetime"] = df["pickup_datetime"].dt.year + return df.groupby(["pickup_datetime", "passenger_count"]).size().reset_index() + def q4(df): - transformed = pd.DataFrame({ - "passenger_count": df["passenger_count"], - "pickup_datetime": df["pickup_datetime"].dt.year, - "trip_distance": df["trip_distance"].astype("int64"), - }) - return transformed.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \ + df["pickup_datetime"] = df["pickup_datetime"].dt.year + df["trip_distance"] = df["trip_distance"].astype("int64") + return 
df.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \
+        .size().reset_index().sort_values(by=["pickup_datetime", 0], ascending=[True, False])
 
 def measure(name, func, *args, **kw):
@@ -66,8 +61,8 @@ def main():
     df = measure('Reading', read)
     measure('Q1', q1, df)
     measure('Q2', q2, df)
-    measure('Q3', q3, df)
-    measure('Q4', q4, df)
+    measure('Q3', q3, df.copy())
+    measure('Q4', q4, df.copy())
 
 if __name__ == '__main__':
     main()

From 41d31119d6e6d81c8d9ea520899ce988f1fa8093 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com>
Date: Fri, 20 Nov 2020 16:05:54 +0300
Subject: [PATCH 41/42] FEAT-#2447: add docker file for census on omnisci (#2448)

Also add instructions for building docker images

Signed-off-by: Anatoly Myachev
---
 .../census-on-omnisci/build-docker-image.sh   |  25 +++
 .../census-omnisci.dockerfile                 |  63 +++++++
 .../census-on-omnisci/census-omnisci.py       | 162 ++++++++++++++++++
 .../taxi-on-omnisci/build-docker-image.sh     |   5 +
 4 files changed, 255 insertions(+)
 create mode 100644 examples/docker/census-on-omnisci/build-docker-image.sh
 create mode 100644 examples/docker/census-on-omnisci/census-omnisci.dockerfile
 create mode 100644 examples/docker/census-on-omnisci/census-omnisci.py

diff --git a/examples/docker/census-on-omnisci/build-docker-image.sh b/examples/docker/census-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..f4dcb266365
--- /dev/null
+++ b/examples/docker/census-on-omnisci/build-docker-image.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+echo "Note: the user is responsible for preparing the dataset.
+The dataset must be named 'ipums_education2income_1970-2010.csv' and
+placed in the same folder as 'census-omnisci.dockerfile'. It can be downloaded from:
+'https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz'"
+
+cd "`dirname \"$0\"`"
+
+docker build -f census-omnisci.dockerfile -t census-omnisci --build-arg no_proxy \
+    --build-arg https_proxy --build-arg http_proxy --build-arg conda_extra_channel .
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm census-omnisci\n"
diff --git a/examples/docker/census-on-omnisci/census-omnisci.dockerfile b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
new file mode 100644
index 00000000000..98cfa0d5518
--- /dev/null
+++ b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
@@ -0,0 +1,63 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +FROM ubuntu:18.04 +ENV http_proxy ${http_proxy} +ENV https_proxy ${https_proxy} +ENV no_proxy ${no_proxy} +ENV MODIN_BACKEND "omnisci" +ENV MODIN_EXPERIMENTAL "true" + +ARG conda_extra_channel +ENV add_extra_channel=${conda_extra_channel:+"-c ${conda_extra_channel}"} + +RUN apt-get update --yes \ + && apt-get install wget --yes && \ + rm -rf /var/lib/apt/lists/* + +ENV USER modin +ENV UID 1000 +ENV HOME /home/$USER + +RUN adduser --disabled-password \ + --gecos "Non-root user" \ + --uid $UID \ + --home $HOME \ + $USER + +ENV CONDA_DIR ${HOME}/miniconda + +SHELL ["/bin/bash", "--login", "-c"] + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ + bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ + "${CONDA_DIR}/bin/conda" init bash && \ + rm -f /tmp/miniconda3.sh && \ + echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" + +RUN conda update -n base -c defaults conda -y && \ + conda create -n modin --yes --no-default-packages && \ + conda activate modin && \ + conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" + +RUN conda activate modin && \ + conda install -c intel/label/modin -c conda-forge -c intel ${add_extra_channel} \ + "daal4py>=2021.1" dpcpp_cpp_rt && \ + conda install -c conda-forge scikit-learn && \ + conda clean --all --yes + +COPY ipums_education2income_1970-2010.csv "${HOME}/ipums_education2income_1970-2010.csv" + +COPY census-omnisci.py "${HOME}/census-omnisci.py" + +CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/census-omnisci.py"] diff --git a/examples/docker/census-on-omnisci/census-omnisci.py b/examples/docker/census-on-omnisci/census-omnisci.py new file mode 100644 index 00000000000..48e946870b8 --- /dev/null +++ b/examples/docker/census-on-omnisci/census-omnisci.py @@ -0,0 +1,162 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
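+#
+# Census benchmark on Modin's OmniSci backend: an ETL stage prepares features,
+# then Ridge regression is trained via the daal4py-patched scikit-learn; the
+# .shape calls force the lazy backend to actually execute each stage.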
+ +import os +import time +import modin.pandas as pd +from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer + +from sklearn import config_context +import daal4py.sklearn as sklearn + +sklearn.patch_sklearn() +from sklearn.model_selection import train_test_split +import sklearn.linear_model as lm +import numpy as np + + +def read(): + columns_names = [ + "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ", "PERNUM", "PERWT", "SEX", + "AGE", "EDUC", "EDUCD", "INCTOT", "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2", + "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2", "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", + "EDUC_SP", "EDUC_MOM2", "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2", + "EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2", + ] + columns_types = [ + "int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64", "int64", "int64", + "int64", "int64", "int64", "int64", "int64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", + ] + dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} + + df = pd.read_csv( + os.path.expanduser('~/ipums_education2income_1970-2010.csv'), + names=columns_names, + dtype=dtypes, + skiprows=1, + ) + + df.shape # to trigger real execution + df._query_compiler._modin_frame._partitions[0][ + 0 + ].frame_id = OmnisciServer().put_arrow_to_omnisci( + df._query_compiler._modin_frame._partitions[0][0].get() + ) # to trigger real execution + return df + + +def etl(df): + keep_cols = [ + "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "PERNUM", "SEX", "AGE", + "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", + "INCTOT_MOM", "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD", + ] + df = df[keep_cols] + + df = df[df["INCTOT"] != 9999999] + df = df[df["EDUC"] != -1] + df = df[df["EDUCD"] != -1] + + df["INCTOT"] = df["INCTOT"] * df["CPI99"] + + for column in keep_cols: + df[column] = df[column].fillna(-1) + + df[column] = df[column].astype("float64") + + y = df["EDUC"] + X = df.drop(columns=["EDUC", "CPI99"]) + + # to trigger real execution + df.shape + y.shape + X.shape + + return (df, X, y) + + +def mse(y_test, y_pred): + return ((y_test - y_pred) ** 2).mean() + + +def cod(y_test, y_pred): + y_bar = y_test.mean() + total = ((y_test - y_bar) ** 2).sum() + residuals = ((y_test - y_pred) ** 2).sum() + return 1 - (residuals / total) + + +def ml(X, y, random_state, n_runs, test_size): + clf = lm.Ridge() + + X = np.ascontiguousarray(X, dtype=np.float64) + y = np.ascontiguousarray(y, dtype=np.float64) + + mse_values, cod_values = [], [] + ml_scores = {} + + print("ML runs: ", n_runs) + for i in range(n_runs): + (X_train, X_test, y_train, y_test) = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + random_state += 777 + + with config_context(assume_finite=True): + model = clf.fit(X_train, y_train) + + y_pred = model.predict(X_test) + + mse_values.append(mse(y_test, y_pred)) + cod_values.append(cod(y_test, y_pred)) + + ml_scores["mse_mean"] = sum(mse_values) / len(mse_values) + 
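+    # the *_dev values computed below are sample standard deviations (ddof=1)
+    # of the per-run scores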
ml_scores["cod_mean"] = sum(cod_values) / len(cod_values)
+    ml_scores["mse_dev"] = pow(
+        sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values])
+        / (len(mse_values) - 1),
+        0.5,
+    )
+    ml_scores["cod_dev"] = pow(
+        sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values])
+        / (len(cod_values) - 1),
+        0.5,
+    )
+
+    return ml_scores
+
+
+def measure(name, func, *args, **kw):
+    t0 = time.time()
+    res = func(*args, **kw)
+    t1 = time.time()
+    print(f'{name}: {t1 - t0} sec')
+    return res
+
+
+def main():
+    # ML specific
+    N_RUNS = 50
+    TEST_SIZE = 0.1
+    RANDOM_STATE = 777
+
+    df = measure('Reading', read)
+    _, X, y = measure('ETL', etl, df)
+    measure('ML', ml, X, y, random_state=RANDOM_STATE, n_runs=N_RUNS, test_size=TEST_SIZE)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/docker/taxi-on-omnisci/build-docker-image.sh b/examples/docker/taxi-on-omnisci/build-docker-image.sh
index 7395976a709..dcf2c395490 100644
--- a/examples/docker/taxi-on-omnisci/build-docker-image.sh
+++ b/examples/docker/taxi-on-omnisci/build-docker-image.sh
@@ -13,6 +13,11 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
+echo "Note: the user is responsible for preparing the dataset.
+The dataset must be named 'trips_xaa.csv' and placed in the same folder as 'nyc-taxi-omnisci.dockerfile'.
+It can be generated by following the instructions at the link:
+'https://github.com/toddwschneider/nyc-taxi-data#instructions'"
+
 cd "`dirname \"$0\"`"
 
 docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy .

From f7f1f7ac553dc3cc6595204fb7566db0e8f7f85f Mon Sep 17 00:00:00 2001
From: amyskov <55585026+amyskov@users.noreply.github.com>
Date: Mon, 23 Nov 2020 21:19:57 +0300
Subject: [PATCH 42/42] FIX-#2470: revert b867edf (#2471)

Signed-off-by: Alexander Myskov
---
 modin/engines/base/frame/data.py              |  8 +++-----
 modin/engines/base/frame/partition_manager.py | 10 +++-------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
index 9bae5a85c92..f18dab29560 100644
--- a/modin/engines/base/frame/data.py
+++ b/modin/engines/base/frame/data.py
@@ -777,9 +777,7 @@ def astype(self, col_dtypes):
         def astype_builder(df):
             return df.astype({k: v for k, v in col_dtypes.items() if k in df})
 
-        new_frame = self._frame_mgr_cls.lazy_map_partitions(
-            self._partitions, astype_builder
-        )
+        new_frame = self._frame_mgr_cls.map_partitions(self._partitions, astype_builder)
         return self.__constructor__(
             new_frame,
             self.index,
@@ -1145,7 +1143,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True):
         else:
             reduce_func = self._build_mapreduce_func(axis, reduce_func)
 
-        map_parts = self._frame_mgr_cls.lazy_map_partitions(self._partitions, map_func)
+        map_parts = self._frame_mgr_cls.map_partitions(self._partitions, map_func)
         reduce_parts = self._frame_mgr_cls.map_axis_partitions(
             axis, map_parts, reduce_func
         )
@@ -1171,7 +1169,7 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False):
         -------
         A new dataframe.
""" - new_partitions = self._frame_mgr_cls.lazy_map_partitions(self._partitions, func) + new_partitions = self._frame_mgr_cls.map_partitions(self._partitions, func) if dtypes == "copy": dtypes = self._dtypes elif dtypes is not None: diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index b917ccedb13..b2e403650b8 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -204,7 +204,7 @@ def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"): new_partitions = np.array( [ [ - part.add_to_apply_calls( + part.apply( apply_func, **{other_name: right[col_idx] if axis else right[row_idx]}, ) @@ -587,9 +587,7 @@ def _apply_func_to_list_of_partitions_broadcast( ): preprocessed_func = cls.preprocess_func(func) return [ - obj.add_to_apply_calls( - preprocessed_func, other=[o.get() for o in broadcasted], **kwargs - ) + obj.apply(preprocessed_func, other=[o.get() for o in broadcasted], **kwargs) for obj, broadcasted in zip(partitions, other.T) ] @@ -608,9 +606,7 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): A list of BaseFramePartition objects. """ preprocessed_func = cls.preprocess_func(func) - return [ - obj.add_to_apply_calls(preprocessed_func, **kwargs) for obj in partitions - ] + return [obj.apply(preprocessed_func, **kwargs) for obj in partitions] @classmethod def apply_func_to_select_indices(