diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 01e071a5c8d..0f9cc4ae33f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,6 @@ @@ -8,7 +8,7 @@ if you have questions about contributing. -- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/developer/contributing.html) +- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/CONTRIBUTING.html) - [ ] passes `flake8 modin` - [ ] passes `black --check modin` - [ ] signed commit with `git commit -s` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37a6acb3edc..c60d1049eb9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: node-version: "10.x" - run: npm install --save-dev @commitlint/{config-conventional,cli} commitlint-plugin-jira-rules commitlint-config-jira - name: Add dependencies for commitlint action - run: echo "::set-env name=NODE_PATH::$GITHUB_WORKSPACE/node_modules" + run: echo "NODE_PATH=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_ENV - run: git remote add upstream https://github.com/modin-project/modin.git - run: git fetch upstream - run: npx commitlint --from upstream/master --to HEAD --verbose @@ -51,6 +51,7 @@ jobs: - run: pydocstyle --convention=numpy --add-ignore=D101,D102 modin/pandas/series_utils.py - run: pydocstyle --convention=numpy --add-ignore=D103 modin/pandas/general.py - run: pydocstyle --convention=numpy modin/pandas/plotting.py modin/pandas/utils.py modin/pandas/iterator.py modin/pandas/indexing.py + - run: pydocstyle --convention=numpy --add-ignore=D100,D104 modin/engines/base/frame lint-flake8: name: lint (flake8) @@ -107,7 +108,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -141,7 +142,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -170,7 +171,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -189,6 +190,8 @@ jobs: run: python -m pytest modin/config/test - shell: bash -l {0} run: python -m pytest modin/test/test_envvar_catcher.py + - shell: bash -l {0} + run: python -m pytest modin/test/backends/pandas/test_internals.py test-defaults: needs: [lint-commit, lint-flake8, lint-black, test-api, test-headers] @@ -209,7 +212,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -275,7 +278,7 @@ jobs: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - name: Setting up Modin environment - uses: goanpeca/setup-miniconda@v1.6.0 + uses: conda-incubator/setup-miniconda@v2 with: activate-environment: 
modin_on_omnisci python-version: 3.7.8 @@ -314,7 +317,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -382,7 +385,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -420,7 +423,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -458,7 +461,7 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -543,7 +546,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 29dece692d2..a5594a8af81 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -29,6 +29,41 @@ jobs: architecture: "x64" - run: pip install "ray>=1.0.0" + test-internals: + needs: prepare-cache + runs-on: ubuntu-latest + name: test-internals + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Cache pip + uses: actions/cache@v1 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: modin + environment-file: environment.yml + python-version: 3.6 + channel-priority: strict + use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! 
+ - name: Conda environment + shell: bash -l {0} + run: | + conda info + conda list + - name: Internals tests + shell: bash -l {0} + run: python -m pytest modin/data_management/factories/test/test_dispatcher.py modin/experimental/cloud/test/test_cloud.py + - shell: bash -l {0} + run: python -m pytest modin/config/test + - shell: bash -l {0} + run: python -m pytest modin/test/test_envvar_catcher.py + - shell: bash -l {0} + run: python -m pytest modin/test/backends/pandas/test_internals.py + test-defaults: needs: prepare-cache runs-on: ubuntu-latest @@ -48,7 +83,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-3.6-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -114,7 +149,7 @@ jobs: path: ~/.cache/pip key: ${{ runner.os }}-python-3.7-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - name: Setting up Modin environment - uses: goanpeca/setup-miniconda@v1.6.0 + uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin_on_omnisci python-version: 3.7.8 @@ -153,7 +188,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -225,7 +260,7 @@ jobs: with: path: ~\AppData\Local\pip\Cache key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml @@ -310,7 +345,7 @@ jobs: with: path: ~/.cache/pip key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip-${{ github.run_id }}-${{ hashFiles('environment.yml') }} - - uses: goanpeca/setup-miniconda@v1.6.0 + - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: modin environment-file: environment.yml diff --git a/README.md b/README.md index aa2a1add823..62551a367ba 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - +

To use Modin, replace the pandas import:
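For context, the swap the README describes is a single line; a minimal sketch of the drop-in usage (the CSV filename below is just a placeholder):

```python
# import pandas as pd           # before
import modin.pandas as pd       # after: the rest of the code stays the same

df = pd.read_csv("example.csv")  # placeholder file; read in parallel on the configured engine
print(df.describe())
```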

@@ -179,8 +179,8 @@ and improve:
 ![Architecture](docs/img/modin_architecture.png)
 
-Visit the [Documentation](https://modin.readthedocs.io/en/latest/architecture.html) for
-more information!
+Visit the [Documentation](https://modin.readthedocs.io/en/latest/developer/architecture.html) for
+more information, and check out [the difference between Modin and Dask!](https://github.com/modin-project/modin/tree/master/docs/modin_vs_dask.md)
 
 **`modin.pandas` is currently under active development. Requests and contributions are welcome!**
 
diff --git a/docs/UsingSQLonRay/index.rst b/docs/UsingSQLonRay/index.rst
index 82f3fb7b2a0..ce9dcd2beaf 100644
--- a/docs/UsingSQLonRay/index.rst
+++ b/docs/UsingSQLonRay/index.rst
@@ -30,4 +30,4 @@ Modin has a query compiler that acts as an intermediate layer between the query
    0  1   2.0  A String of information            True
    1  6  17.0  A String of different information  False
 
-.. _architecture: https://modin.readthedocs.io/en/latest/architecture.html
+.. _architecture: https://modin.readthedocs.io/en/latest/developer/architecture.html
diff --git a/docs/comparisons/dask.rst b/docs/comparisons/dask.rst
new file mode 100644
index 00000000000..82627916413
--- /dev/null
+++ b/docs/comparisons/dask.rst
@@ -0,0 +1,90 @@
+Modin vs. Dask DataFrame
+========================
+
+Dask's DataFrame is effectively a meta-frame, partitioning and scheduling many smaller
+``pandas.DataFrame`` objects. The Dask DataFrame does not implement the entire pandas
+API, and it isn't trying to. See this explained in the `Dask DataFrame documentation`_.
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: The
+projects are fundamentally different in their aims, so a fair comparison is
+challenging.**
+
+API
+---
+The APIs of Modin and Dask are different in several ways, explained here.
+
+Dask DataFrame
+""""""""""""""
+
+Dask is currently missing multiple APIs from pandas that Modin has implemented. Of note:
+Dask does not implement ``iloc``, ``MultiIndex``, ``apply(axis=0)``, ``quantile``,
+``median``, and more. Some of these APIs cannot be implemented efficiently or at all
+given the architecture design tradeoffs made in Dask's implementation, and others simply
+require engineering effort. ``iloc``, for example, can be implemented, but it would be
+inefficient, and ``apply(axis=0)`` cannot be implemented at all in Dask's architecture.
+
+Dask DataFrame's API is also different from the pandas API in that it is lazy and needs
+``.compute()`` calls to materialize the DataFrame. This makes the API less convenient
+but allows Dask to do certain query optimizations/rearrangement, which can give speedups
+in certain situations. Several additional APIs exist in the Dask DataFrame API that
+expose internal state about how the data is chunked and other data layout details, and
+ways to manipulate that state.
+
+Semantically, Dask sorts the ``index``, which does not allow for user-specified order.
+In Dask's case, this was done for optimization purposes, to speed up other computations
+which involve the row index.
+
+Modin
+"""""
+
+Modin is targeted toward parallelizing the entire pandas API, without exception.
+As the pandas API continues to evolve, so will Modin's pandas API. Modin is intended to
+be used as a drop-in replacement for pandas, such that even if the API is not yet
+parallelized, it still works by falling back to running pandas. One of the key features
+of being a drop-in replacement is that not only does it work with existing code, but a
+user who wishes to go back to running pandas directly can do so at no cost. There's no
+lock-in: Modin notebooks can be converted to and from pandas as the user prefers.
+
+In the long term, Modin is planned to become a data science framework that supports all
+popular APIs (SQL, pandas, etc.) with the same underlying execution.
+
+Architecture
+------------
+
+The differences between Modin's and Dask's architectures are explained in this section.
+
+Dask DataFrame
+""""""""""""""
+
+Dask DataFrame uses row-based partitioning, similar to Spark. This can be seen in their
+`documentation`_. They also have a custom index object for indexing into the object,
+which is not pandas compatible. Dask DataFrame seems to treat operations on the
+DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas
+API they have chosen to implement, but makes certain operations impossible. Dask
+DataFrame is also lazy and places a lot of partitioning responsibility on the user.
+
+Modin
+"""""
+
+Modin's partitioning is much more flexible, so the system can scale in both directions
+and have finer-grained partitioning. This is explained at a high level in `Modin's
+documentation`_. Because we have this finer-grained control over the partitioning, we
+can support a number of operations that are very challenging in MapReduce systems (e.g.
+transpose, median, quantile). This flexibility in partitioning also gives Modin
+tremendous power to implement efficient straggler mitigation and improvements in
+utilization over the entire cluster.
+
+Modin is also architected to run on a variety of systems. The goal here is that users
+can take the same notebook to different clusters or different environments and it will
+still just work: run it on what you have! Modin does support running on Dask's compute
+engine in addition to Ray (a short engine-selection sketch follows below). The
+architecture of Modin is extremely modular; we are able to add different execution
+engines or compile to different memory formats because of this modularity. Modin can run
+on a Dask cluster in the same way that Dask DataFrame can, but they will still be
+different in all of the ways described above.
+
+Modin's implementation is grounded in theory, which is what enables us to implement the
+entire pandas API.
+
+.. _Dask DataFrame documentation: http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses
+.. _documentation: http://docs.dask.org/en/latest/dataframe.html#design
+.. _Modin's documentation: https://modin.readthedocs.io/en/latest/developer/architecture.html
diff --git a/docs/comparisons/index.rst b/docs/comparisons/index.rst
new file mode 100644
index 00000000000..40647d065d9
--- /dev/null
+++ b/docs/comparisons/index.rst
@@ -0,0 +1,4 @@
+How is Modin unique?
+====================
+
+Coming Soon...
diff --git a/docs/comparisons/pandas.rst b/docs/comparisons/pandas.rst
new file mode 100644
index 00000000000..dfbaf02aba3
--- /dev/null
+++ b/docs/comparisons/pandas.rst
@@ -0,0 +1,4 @@
+Modin vs. Pandas
+================
+
+Coming Soon...
diff --git a/docs/comparisons/spark.rst b/docs/comparisons/spark.rst
new file mode 100644
index 00000000000..bf60963f710
--- /dev/null
+++ b/docs/comparisons/spark.rst
@@ -0,0 +1,4 @@
+Modin vs. Koalas and Spark
+==========================
+
+Coming Soon...
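Picking up the engine point from the Dask comparison above, here is a minimal sketch of pointing Modin at the Dask engine instead of Ray; the dataset path is a placeholder, and the environment variable has to be set before `modin.pandas` is first imported:

```python
import os

# Select the execution engine before importing modin.pandas.
# Ray is the default; "dask" routes the same user code through Dask's scheduler.
os.environ["MODIN_ENGINE"] = "dask"

import modin.pandas as pd

df = pd.read_csv("example.csv")  # placeholder dataset
# median() is one of the operations called out above as hard to express
# in purely MapReduce-style systems.
print(df.median())
```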
diff --git a/docs/developer/contributing.rst b/docs/contributing.rst similarity index 89% rename from docs/developer/contributing.rst rename to docs/contributing.rst index 4ce4e3091bb..af8b2319ca9 100644 --- a/docs/developer/contributing.rst +++ b/docs/contributing.rst @@ -49,7 +49,6 @@ with this project or the open source license(s) involved." Signed-off-by: Awesome Developer -. Code without a proper signoff cannot be merged into the master branch. Note: You must use your real name (sorry, no pseudonyms or anonymous contributions.) @@ -88,7 +87,9 @@ To ensure that all commit messages in the master branch follow a specific format enforce that all commit messages must follow the following format: .. code-block:: bash - FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations + + FEAT-#9999: Add `DataFrame.rolling` functionality, to enable rolling window operations + The ``FEAT`` component represents the type of commit. This component of the commit message can be one of the following: @@ -113,10 +114,10 @@ dependencies for running the tests and formatting the code: .. code-block:: bash + conda env create --file environment.yml + # or pip install -r requirements.txt -For developments under Windows, dependencies can be found in 'env_windows.yml' file. - Code Formatting and Lint ------------------------ @@ -128,13 +129,13 @@ that you run the following from the project root: black modin/ We also use flake8_ to check linting errors. Running the following from the project root -will ensure that it passes the lint checks on Travis: +will ensure that it passes the lint checks on Github Actions: .. code-block:: bash flake8 . -We test that this has been run on our `Travis CI`_ test suite. If you do this and find +We test that this has been run on our `Github Actions`_ test suite. If you do this and find that the tests are still failing, try updating your version of black and flake8. Adding a test @@ -164,6 +165,26 @@ subset of the test suite. In order to run a specific test run: The entire test suite is automatically run for each pull request. +Building documentation +---------------------- + +To build the documentation, please follow the steps below from the project root: + +.. code-block:: bash + + cd docs + pip install -r requirements-doc.txt + sphinx-build -b html . build + +To visualize the documentation locally, run the following from `build` folder: + +.. code-block:: bash + + python -m http.server + # python -m http.server 1234 + +then open the browser at `0.0.0.0:` (e.g. `0.0.0.0:1234`). + Contributing a new execution framework or in-memory format ---------------------------------------------------------- @@ -181,6 +202,6 @@ More docs on this coming soon... .. _internal methods: .. _black: https://github.com/ambv/black .. _flake8: http://flake8.pycqa.org/en/latest/ -.. _Travis CI: https://travis-ci.org/ +.. _Github Actions: https://github.com/features/actions .. _testing: .. _developer mailing list: https://groups.google.com/forum/#!forum/modin-dev diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst index 934e9781053..c8ed4162a75 100644 --- a/docs/developer/architecture.rst +++ b/docs/developer/architecture.rst @@ -234,7 +234,7 @@ Supported Execution Frameworks and Memory Formats This is the list of execution frameworks and memory formats supported in Modin. If you would like to contribute a new execution framework or memory format, please see the -documentation page on Contributing_. +documentation page on :doc:`../contributing`. 
- `Pandas on Ray`_ - Uses the Ray_ execution framework.
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
new file mode 100644
index 00000000000..e7aba021fa3
--- /dev/null
+++ b/docs/examples/index.rst
@@ -0,0 +1,9 @@
+Examples
+========
+
+scikit-learn with LinearRegression
+----------------------------------
+Here is a Jupyter Notebook example which uses Modin with scikit-learn
+and linear regression: `sklearn LinearRegression`_.
+
+.. _sklearn LinearRegression: https://github.com/modin-project/modin/blob/master/examples/modin-scikit-learn-example.ipynb
diff --git a/docs/index.rst b/docs/index.rst
index 2f8e80822a5..9b69b53cfa5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -128,6 +128,19 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
    using_modin
    out_of_core
 
+.. toctree::
+   :caption: Examples
+
+   examples/index
+
+.. toctree::
+   :caption: How is Modin different from ...?
+
+   comparisons/index
+   comparisons/pandas
+   comparisons/dask
+   comparisons/spark
+
 .. toctree::
    :caption: Supported APIs
 
@@ -140,7 +153,7 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
 .. toctree::
    :caption: Developer Documentation
 
-   developer/contributing
+   contributing
    developer/architecture
 
 .. toctree::
@@ -151,12 +164,6 @@ nature, you get a fast DataFrame at 1MB and 1TB+.
    UsingPyarrowonRay/index
    UsingSQLonRay/index
 
-.. toctree::
-   :caption: Contributing to Modin
-
-   contributing
-   architecture
-
 .. toctree::
    :caption: Help
diff --git a/docs/modin_vs_dask.md b/docs/modin_vs_dask.md
new file mode 100644
index 00000000000..477dba9a887
--- /dev/null
+++ b/docs/modin_vs_dask.md
@@ -0,0 +1,32 @@
+# What is the difference between Dask DataFrame and Modin?
+
+**The TL;DR is that Modin's API is identical to pandas, whereas Dask's is not. Note: The projects are fundamentally different in their aims, so a fair comparison is challenging.**
+
+## API
+
+### Dask DataFrame
+
+Dask DataFrame does not scale the entire pandas API, and it isn't trying to. See this explained in their documentation [here](http://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses).
+
+Dask DataFrame's API is also different from the pandas API in that it is lazy and needs .compute() to materialize the DataFrame. This makes the API less convenient but allows Dask to do certain query optimizations/rearrangement, which can give speedups in certain situations. We are planning to incorporate similar capabilities into Modin but hope we can do so without having to change the API. We will outline plans for speeding up Modin in an upcoming blog post.
+
+### Modin
+
+Modin attempts to parallelize as much of the pandas API as is possible. We have worked through a significant portion of the DataFrame API. It is intended to be used as a drop-in replacement for pandas, such that even if the API is not yet parallelized, it still defaults to pandas.
+
+## Architecture
+
+### Dask DataFrame
+
+Dask DataFrame has row-based partitioning, similar to Spark. This can be seen in their [documentation](http://docs.dask.org/en/latest/dataframe.html#design). They also have a custom index object for indexing into the object, which is not pandas compatible. Dask DataFrame seems to treat operations on the DataFrame as MapReduce operations, which is a good paradigm for the subset of the pandas API they have chosen to implement.
+
+### Modin
+
+Modin is more of a column-store, which we inherited from modern database systems. We laterally partition the columns for scalability (many systems, such as Google BigTable, already did this), so we can scale in both directions and have finer-grained partitioning. This is explained at a high level in [Modin's documentation](https://modin.readthedocs.io/en/latest/architecture.html). Because we have this finer-grained control over the partitioning, we can support a number of operations that are very challenging in MapReduce systems (e.g. transpose, median, quantile).
+
+## Modin aims
+
+In the long term, Modin is planned to become a DataFrame library that supports the popular APIs (SQL, pandas, etc.) and runs on a variety of compute engines and backends. In fact, a group was already able to contribute a dask.delayed backend to Modin in <200 lines of code ([PR](https://github.com/modin-project/modin/pull/281)).
+
+
+- Reference: [Query: What is the difference between Dask and Modin? #515](https://github.com/modin-project/modin/issues/515)
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index d50ca19a1f8..802241f50b7 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,9 +2,9 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.1.3
+  - pandas==1.1.4
   - numpy
-  - pyarrow<0.17
+  - pyarrow>=1.0.0
   - dask[complete]>=2.12.0,<=2.19.0
   - distributed>=2.12.0,<=2.19.0
   - xarray
diff --git a/examples/docker/census-on-omnisci/build-docker-image.sh b/examples/docker/census-on-omnisci/build-docker-image.sh
new file mode 100644
index 00000000000..f4dcb266365
--- /dev/null
+++ b/examples/docker/census-on-omnisci/build-docker-image.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+echo "Note: a user is responsible for preparing the dataset.
+The dataset must be named 'ipums_education2income_1970-2010.csv' and
+be in the folder with 'census-omnisci.dockerfile'. It can be downloaded from:
+'https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz'"
+
+cd "`dirname \"$0\"`"
+
+docker build -f census-omnisci.dockerfile -t census-omnisci --build-arg no_proxy \
+    --build-arg https_proxy --build-arg http_proxy --build-arg conda_extra_channel .
+printf "\n\nTo run the benchmark execute:\n\tdocker run --rm census-omnisci\n"
diff --git a/examples/docker/census-on-omnisci/census-omnisci.dockerfile b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
new file mode 100644
index 00000000000..98cfa0d5518
--- /dev/null
+++ b/examples/docker/census-on-omnisci/census-omnisci.dockerfile
@@ -0,0 +1,63 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +FROM ubuntu:18.04 +ENV http_proxy ${http_proxy} +ENV https_proxy ${https_proxy} +ENV no_proxy ${no_proxy} +ENV MODIN_BACKEND "omnisci" +ENV MODIN_EXPERIMENTAL "true" + +ARG conda_extra_channel +ENV add_extra_channel=${conda_extra_channel:+"-c ${conda_extra_channel}"} + +RUN apt-get update --yes \ + && apt-get install wget --yes && \ + rm -rf /var/lib/apt/lists/* + +ENV USER modin +ENV UID 1000 +ENV HOME /home/$USER + +RUN adduser --disabled-password \ + --gecos "Non-root user" \ + --uid $UID \ + --home $HOME \ + $USER + +ENV CONDA_DIR ${HOME}/miniconda + +SHELL ["/bin/bash", "--login", "-c"] + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ + bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ + "${CONDA_DIR}/bin/conda" init bash && \ + rm -f /tmp/miniconda3.sh && \ + echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" + +RUN conda update -n base -c defaults conda -y && \ + conda create -n modin --yes --no-default-packages && \ + conda activate modin && \ + conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" + +RUN conda activate modin && \ + conda install -c intel/label/modin -c conda-forge -c intel ${add_extra_channel} \ + "daal4py>=2021.1" dpcpp_cpp_rt && \ + conda install -c conda-forge scikit-learn && \ + conda clean --all --yes + +COPY ipums_education2income_1970-2010.csv "${HOME}/ipums_education2income_1970-2010.csv" + +COPY census-omnisci.py "${HOME}/census-omnisci.py" + +CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/census-omnisci.py"] diff --git a/examples/docker/census-on-omnisci/census-omnisci.py b/examples/docker/census-on-omnisci/census-omnisci.py new file mode 100644 index 00000000000..48e946870b8 --- /dev/null +++ b/examples/docker/census-on-omnisci/census-omnisci.py @@ -0,0 +1,162 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import os +import time +import modin.pandas as pd +from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer + +from sklearn import config_context +import daal4py.sklearn as sklearn + +sklearn.patch_sklearn() +from sklearn.model_selection import train_test_split +import sklearn.linear_model as lm +import numpy as np + + +def read(): + columns_names = [ + "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ", "PERNUM", "PERWT", "SEX", + "AGE", "EDUC", "EDUCD", "INCTOT", "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2", + "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2", "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", + "EDUC_SP", "EDUC_MOM2", "EDUC_POP2", "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2", + "EDUCD_POP2", "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP", "INCTOT_MOM2", "INCTOT_POP2", + ] + columns_types = [ + "int64", "int64", "int64", "float64", "int64", "float64", "int64", "float64", "int64", "int64", + "int64", "int64", "int64", "int64", "int64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "float64", "float64", "float64", "float64", + ] + dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} + + df = pd.read_csv( + os.path.expanduser('~/ipums_education2income_1970-2010.csv'), + names=columns_names, + dtype=dtypes, + skiprows=1, + ) + + df.shape # to trigger real execution + df._query_compiler._modin_frame._partitions[0][ + 0 + ].frame_id = OmnisciServer().put_arrow_to_omnisci( + df._query_compiler._modin_frame._partitions[0][0].get() + ) # to trigger real execution + return df + + +def etl(df): + keep_cols = [ + "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "PERNUM", "SEX", "AGE", + "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", + "INCTOT_MOM", "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD", + ] + df = df[keep_cols] + + df = df[df["INCTOT"] != 9999999] + df = df[df["EDUC"] != -1] + df = df[df["EDUCD"] != -1] + + df["INCTOT"] = df["INCTOT"] * df["CPI99"] + + for column in keep_cols: + df[column] = df[column].fillna(-1) + + df[column] = df[column].astype("float64") + + y = df["EDUC"] + X = df.drop(columns=["EDUC", "CPI99"]) + + # to trigger real execution + df.shape + y.shape + X.shape + + return (df, X, y) + + +def mse(y_test, y_pred): + return ((y_test - y_pred) ** 2).mean() + + +def cod(y_test, y_pred): + y_bar = y_test.mean() + total = ((y_test - y_bar) ** 2).sum() + residuals = ((y_test - y_pred) ** 2).sum() + return 1 - (residuals / total) + + +def ml(X, y, random_state, n_runs, test_size): + clf = lm.Ridge() + + X = np.ascontiguousarray(X, dtype=np.float64) + y = np.ascontiguousarray(y, dtype=np.float64) + + mse_values, cod_values = [], [] + ml_scores = {} + + print("ML runs: ", n_runs) + for i in range(n_runs): + (X_train, X_test, y_train, y_test) = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + random_state += 777 + + with config_context(assume_finite=True): + model = clf.fit(X_train, y_train) + + y_pred = model.predict(X_test) + + mse_values.append(mse(y_test, y_pred)) + cod_values.append(cod(y_test, y_pred)) + + ml_scores["mse_mean"] = sum(mse_values) / len(mse_values) + 
ml_scores["cod_mean"] = sum(cod_values) / len(cod_values) + ml_scores["mse_dev"] = pow( + sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values]) + / (len(mse_values) - 1), + 0.5, + ) + ml_scores["cod_dev"] = pow( + sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values]) + / (len(cod_values) - 1), + 0.5, + ) + + return ml_scores + + +def measure(name, func, *args, **kw): + t0 = time.time() + res = func(*args, **kw) + t1 = time.time() + print(f'{name}: {t1 - t0} sec') + return res + + +def main(): + # ML specific + N_RUNS = 50 + TEST_SIZE = 0.1 + RANDOM_STATE = 777 + + df = measure('Reading', read) + _, X, y = measure('ETL', etl, df) + measure('ML', ml, X, y, random_state=RANDOM_STATE, n_runs=N_RUNS, test_size=TEST_SIZE) + + +if __name__ == '__main__': + main() diff --git a/examples/docker/nyc-taxi.dockerfile b/examples/docker/nyc-taxi.dockerfile index f10e749a1f7..a4703745a92 100644 --- a/examples/docker/nyc-taxi.dockerfile +++ b/examples/docker/nyc-taxi.dockerfile @@ -12,11 +12,16 @@ # governing permissions and limitations under the License. FROM ubuntu:18.04 + +ARG PYTHON_VERSION=3.7 ENV http_proxy ${http_proxy} ENV https_proxy ${https_proxy} -RUN apt-get update --yes \ - && apt-get install wget --yes && \ +RUN apt-get update --yes && \ + apt-get install --yes --no-install-recommends --fix-missing \ + gcc \ + python${PYTHON_VERSION}-dev \ + wget && \ rm -rf /var/lib/apt/lists/* ENV USER modin @@ -33,7 +38,7 @@ ENV CONDA_DIR ${HOME}/miniconda SHELL ["/bin/bash", "--login", "-c"] -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ +RUN wget --quiet --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ "${CONDA_DIR}/bin/conda" init bash && \ rm -f /tmp/miniconda3.sh && \ @@ -45,7 +50,7 @@ RUN conda update -n base -c defaults conda -y && \ pip install --no-cache-dir modin[ray] && \ conda clean --all --yes -RUN wget https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv" +RUN wget --quiet --no-check-certificate https://modin-datasets.s3.amazonaws.com/trips_data.csv -O "${HOME}/trips_data.csv" COPY nyc-taxi.py "${HOME}/nyc-taxi.py" diff --git a/examples/docker/nyc-taxi.py b/examples/docker/nyc-taxi.py index 43cb53e1852..753b857b52b 100644 --- a/examples/docker/nyc-taxi.py +++ b/examples/docker/nyc-taxi.py @@ -40,19 +40,14 @@ def q2(df): return df.groupby("passenger_count", as_index=False).mean()[["passenger_count", "total_amount"]] def q3(df): - transformed = pd.DataFrame({ - "passenger_count": df["passenger_count"], - "pickup_datetime": df["pickup_datetime"].dt.year, - }) - return transformed.groupby(["pickup_datetime", "passenger_count"]).agg({"passenger_count": ["count"]}) + df["pickup_datetime"] = df["pickup_datetime"].dt.year + return df.groupby(["pickup_datetime", "passenger_count"]).size().reset_index() + def q4(df): - transformed = pd.DataFrame({ - "passenger_count": df["passenger_count"], - "pickup_datetime": df["pickup_datetime"].dt.year, - "trip_distance": df["trip_distance"].astype("int64"), - }) - return transformed.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \ + df["pickup_datetime"] = df["pickup_datetime"].dt.year + df["trip_distance"] = df["trip_distance"].astype("int64") + return df.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \ .size().reset_index().sort_values(by=["pickup_datetime", 0], 
ascending=[True, False]) def measure(name, func, *args, **kw): @@ -66,8 +61,8 @@ def main(): df = measure('Reading', read) measure('Q1', q1, df) measure('Q2', q2, df) - measure('Q3', q3, df) - measure('Q4', q4, df) + measure('Q3', q3, df.copy()) + measure('Q4', q4, df.copy()) if __name__ == '__main__': main() diff --git a/examples/docker/taxi-on-omnisci/build-docker-image.sh b/examples/docker/taxi-on-omnisci/build-docker-image.sh new file mode 100644 index 00000000000..dcf2c395490 --- /dev/null +++ b/examples/docker/taxi-on-omnisci/build-docker-image.sh @@ -0,0 +1,24 @@ +#!/bin/bash -e + +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +echo "Note: a user is responsible for preparing the dataset. +The dataset must be named as 'trips_xaa.csv' and be in the folder with 'nyc-taxi-omnisci.dockerfile'. +It Can be generated by following the instructions on the link: +'https://github.com/toddwschneider/nyc-taxi-data#instructions'" + +cd "`dirname \"$0\"`" + +docker build -f nyc-taxi-omnisci.dockerfile -t nyc-taxi-omnisci --build-arg https_proxy --build-arg http_proxy . +printf "\n\nTo run the benchmark execute:\n\tdocker run --rm nyc-taxi-omnisci\n" diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile new file mode 100644 index 00000000000..ba4d6b8f3e1 --- /dev/null +++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.dockerfile @@ -0,0 +1,53 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +FROM ubuntu:18.04 +ENV http_proxy ${http_proxy} +ENV https_proxy ${https_proxy} +ENV MODIN_BACKEND "omnisci" +ENV MODIN_EXPERIMENTAL "true" + +RUN apt-get update --yes \ + && apt-get install wget --yes && \ + rm -rf /var/lib/apt/lists/* + +ENV USER modin +ENV UID 1000 +ENV HOME /home/$USER + +RUN adduser --disabled-password \ + --gecos "Non-root user" \ + --uid $UID \ + --home $HOME \ + $USER + +ENV CONDA_DIR ${HOME}/miniconda + +SHELL ["/bin/bash", "--login", "-c"] + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh && \ + bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u && \ + "${CONDA_DIR}/bin/conda" init bash && \ + rm -f /tmp/miniconda3.sh && \ + echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile" + +RUN conda update -n base -c defaults conda -y && \ + conda create -n modin --yes --no-default-packages && \ + conda activate modin && \ + conda install -c intel/label/modin -c conda-forge modin "ray>=1.0.0" && \ + conda clean --all --yes + +COPY trips_xaa.csv "${HOME}/trips_xaa.csv" +COPY nyc-taxi-omnisci.py "${HOME}/nyc-taxi-omnisci.py" + +CMD ["/bin/bash", "--login", "-c", "conda activate modin && python ${HOME}/nyc-taxi-omnisci.py"] diff --git a/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py new file mode 100644 index 00000000000..535e93727f9 --- /dev/null +++ b/examples/docker/taxi-on-omnisci/nyc-taxi-omnisci.py @@ -0,0 +1,108 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import os +import time +import modin.pandas as pd +from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer + +def read(): + columns_names = [ + "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag", + "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", + "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount", + "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type", + "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall", + "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid", + "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010", + "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma", + "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname", + "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode", + "dropoff_ntaname", "dropoff_puma", + ] + # use string instead of category + columns_types = [ + "int64", "string", "timestamp", "timestamp", "string", "int64", "float64", "float64", + "float64", "float64", "int64", "float64", "float64", "float64", "float64", "float64", "float64", + "float64", "float64", "float64", "string", "float64", "string", "string", "string", "float64", + "int64", "float64", "int64", "int64", "float64", "float64", "float64", "float64", "string", "float64", + "float64", "string", "string", "string", "float64", "float64", "float64", "float64", "string", + "float64", "float64", "string", "string", "string", "float64", + ] + + dtypes = {columns_names[i]: columns_types[i] for i in range(len(columns_names))} + all_but_dates = { + col: valtype for (col, valtype) in dtypes.items() if valtype not in ["timestamp"] + } + dates_only = [col for (col, valtype) in dtypes.items() if valtype in ["timestamp"]] + + df = pd.read_csv( + os.path.expanduser('~/trips_xaa.csv'), + names=columns_names, + dtype=all_but_dates, + parse_dates=dates_only, + ) + + df.shape # to trigger real execution + df._query_compiler._modin_frame._partitions[0][ + 0 + ].frame_id = OmnisciServer().put_arrow_to_omnisci( + df._query_compiler._modin_frame._partitions[0][0].get() + ) # to trigger real execution + return df + + +def q1_omnisci(df): + q1_pandas_output = df.groupby("cab_type").size() + q1_pandas_output.shape # to trigger real execution + return q1_pandas_output + +def q2_omnisci(df): + q2_pandas_output = df.groupby("passenger_count").agg({"total_amount": "mean"}) + q2_pandas_output.shape # to trigger real execution + return q2_pandas_output + +def q3_omnisci(df): + df["pickup_datetime"] = df["pickup_datetime"].dt.year + q3_pandas_output = df.groupby(["passenger_count", "pickup_datetime"]).size() + q3_pandas_output.shape # to trigger real execution + return q3_pandas_output + +def q4_omnisci(df): + df["pickup_datetime"] = df["pickup_datetime"].dt.year + df["trip_distance"] = df["trip_distance"].astype("int64") + q4_pandas_output = ( + df.groupby(["passenger_count", "pickup_datetime", "trip_distance"], sort=False) + .size() + .reset_index() + .sort_values(by=["pickup_datetime", 0], ignore_index=True, ascending=[True, False]) + ) + q4_pandas_output.shape # to trigger real execution + return q4_pandas_output + +def measure(name, func, *args, **kw): + t0 = time.time() + res = func(*args, **kw) + t1 = time.time() + print(f'{name}: {t1 - t0} sec') + return res + +def main(): + 
df = measure('Reading', read) + measure('Q1', q1_omnisci, df) + measure('Q2', q2_omnisci, df) + measure('Q3', q3_omnisci, df.copy()) + measure('Q4', q4_omnisci, df.copy()) + +if __name__ == '__main__': + main() diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index ceb582ae720..e9cc7067796 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -58,7 +58,8 @@ class BaseQueryCompiler(abc.ABC): @abc.abstractmethod def default_to_pandas(self, pandas_op, *args, **kwargs): - """Default to pandas behavior. + """ + Default to pandas behavior. Parameters ---------- @@ -136,10 +137,9 @@ def concat(df, axis, other, **kwargs): else: if isinstance(other, (list, np.ndarray)) and len(other) == 1: other = other[0] - how = kwargs.pop("join", None) ignore_index = kwargs.pop("ignore_index", None) - kwargs["how"] = how - result = df.join(other, **kwargs) + kwargs["how"] = kwargs.pop("join", None) + result = df.join(other, rsuffix="r_", **kwargs) if ignore_index: if axis == 0: result = result.reset_index(drop=True) @@ -1396,24 +1396,33 @@ def groupby_size( drop=drop, ) - def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + if is_multi_by: + if isinstance(by, type(self)) and len(by.columns) == 1: + by = by.columns[0] if drop else by.to_pandas().squeeze() + elif isinstance(by, type(self)): + by = list(by.columns) + else: + by = by.to_pandas().squeeze() if isinstance(by, type(self)) else by + return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)( self, by=by, + is_multi_by=is_multi_by, axis=axis, agg_func=agg_func, - groupby_args=groupby_args, - agg_args=agg_args, - drop=drop, - ) - - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - return GroupByDefault.register(pandas.core.groupby.DataFrameGroupBy.aggregate)( - self, - by=by, - func_dict=func_dict, - groupby_args=groupby_args, - agg_args=agg_args, + groupby_args=groupby_kwargs, + agg_args=agg_kwargs, drop=drop, ) @@ -1562,6 +1571,50 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self.columns, pandas.MultiIndex) + def get_index_name(self): + """ + Get index name. + + Returns + ------- + hashable + Index name, None for MultiIndex. + """ + return self.index.name + + def set_index_name(self, name): + """ + Set index name. + + Parameters + ---------- + name: hashable + New index name. + """ + self.index.name = name + + def get_index_names(self): + """ + Get index names. + + Returns + ------- + list + Index names. + """ + return self.index.names + + def set_index_names(self, names): + """ + Set index names. + + Parameters + ---------- + names: list + New index names. + """ + self.index.names = names + # DateTime methods dt_ceil = DateTimeDefault.register(pandas.Series.dt.ceil) diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 89014b197b8..4f35a3133ae 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -190,7 +190,8 @@ def __init__(self, modin_frame): self._modin_frame = modin_frame def default_to_pandas(self, pandas_op, *args, **kwargs): - """Default to pandas behavior. + """ + Default to pandas behavior. 
Parameters ---------- @@ -206,8 +207,8 @@ def default_to_pandas(self, pandas_op, *args, **kwargs): PandasQueryCompiler The result of the `pandas_op`, converted back to PandasQueryCompiler - Note - ---- + Notes + ----- This operation takes a distributed object and converts it directly to pandas. """ op_name = getattr(pandas_op, "__name__", str(pandas_op)) @@ -731,18 +732,21 @@ def reduce_func(df, *args, **kwargs): dropna = kwargs.get("dropna", True) try: - result = df.squeeze(axis=1).groupby(df.index, sort=False).sum() + result = ( + df.squeeze(axis=1) + .groupby(df.index, sort=False, dropna=dropna) + .sum() + ) # This will happen with Arrow buffer read-only errors. We don't want to copy # all the time, so this will try to fast-path the code first. except (ValueError): - result = df.copy().squeeze(axis=1).groupby(df.index, sort=False).sum() - - if not dropna and np.nan in df.index: - result = result.append( - pandas.Series( - [df.squeeze(axis=1).loc[[np.nan]].sum()], index=[np.nan] - ) + result = ( + df.copy() + .squeeze(axis=1) + .groupby(df.index, sort=False, dropna=dropna) + .sum() ) + if normalize: result = result / df.squeeze(axis=1).sum() @@ -2558,49 +2562,59 @@ def _callable_func(self, func, axis, *args, **kwargs): method="size", ) - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - """Apply aggregation functions to a grouped dataframe per-column. + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + if callable(agg_func): + agg_func = wrap_udf_function(agg_func) - Parameters - ---------- - by : PandasQueryCompiler - The column to group by - func_dict : dict of str, callable/string - The dictionary mapping of column to function - groupby_args : dict - The dictionary of keyword arguments for the group by. - agg_args : dict - The dictionary of keyword arguments for the aggregation functions - drop : bool - Whether or not to drop the column from the data. + if is_multi_by: + return super().groupby_agg( + by=by, + is_multi_by=is_multi_by, + axis=axis, + agg_func=agg_func, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + groupby_kwargs=groupby_kwargs, + drop=drop, + ) - Returns - ------- - PandasQueryCompiler - The result of the per-column aggregations on the grouped dataframe. - """ - return self.default_to_pandas( - lambda df: df.groupby(by=by, **groupby_args).agg(func_dict, **agg_args) - ) + by = by.to_pandas().squeeze() if isinstance(by, type(self)) else by - def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args, drop=False): - # since we're going to modify `groupby_args` dict in a `groupby_agg_builder`, + # since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`, # we want to copy it to not propagate these changes into source dict, in case # of unsuccessful end of function - groupby_args = groupby_args.copy() + groupby_kwargs = groupby_kwargs.copy() - as_index = groupby_args.get("as_index", True) + as_index = groupby_kwargs.get("as_index", True) def groupby_agg_builder(df): # Set `as_index` to True to track the metadata of the grouping object # It is used to make sure that between phases we are constructing the # right index and placing columns in the correct order. 
- groupby_args["as_index"] = True + groupby_kwargs["as_index"] = True def compute_groupby(df): - grouped_df = df.groupby(by=by, axis=axis, **groupby_args) + grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs) try: - result = agg_func(grouped_df, **agg_args) + if isinstance(agg_func, dict): + # Filter our keys that don't exist in this partition. This happens when some columns + # from this original dataframe didn't end up in every partition. + partition_dict = { + k: v for k, v in agg_func.items() if k in df.columns + } + result = grouped_df.agg(partition_dict) + else: + result = agg_func(grouped_df, **agg_kwargs) # This happens when the partition is filled with non-numeric data and a # numeric operation is done. We need to build the index here to avoid # issues with extracting the index. @@ -2626,15 +2640,17 @@ def compute_groupby(df): # determening type of raised exception by applying `aggfunc` # to empty DataFrame try: - agg_func( + pandas.DataFrame(index=[1], columns=[1]).agg(agg_func) if isinstance( + agg_func, dict + ) else agg_func( pandas.DataFrame(index=[1], columns=[1]).groupby(level=0), - **agg_args, + **agg_kwargs, ) except Exception as e: raise type(e)("No numeric types to aggregate.") # Reset `as_index` because it was edited inplace. - groupby_args["as_index"] = as_index + groupby_kwargs["as_index"] = as_index if as_index: return result else: diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 5dbf65f2070..129ec39bc30 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -196,6 +196,7 @@ class DoUseCalcite(EnvironmentVariable, type=bool): """ varname = "MODIN_USE_CALCITE" + default = True class TestDatasetSize(EnvironmentVariable, type=str): diff --git a/modin/data_management/functions/default_methods/groupby_default.py b/modin/data_management/functions/default_methods/groupby_default.py index 62bda259739..e6cd40675e7 100644 --- a/modin/data_management/functions/default_methods/groupby_default.py +++ b/modin/data_management/functions/default_methods/groupby_default.py @@ -61,22 +61,40 @@ def get_func(cls, grp, key, **kwargs): @classmethod def build_aggregate_method(cls, key): - def fn(df, by, groupby_args, agg_args, axis=0, drop=False, **kwargs): + def fn( + df, + by, + groupby_args, + agg_args, + axis=0, + is_multi_by=None, + drop=False, + **kwargs + ): by = cls.validate_by(by) - groupby_args = groupby_args.copy() - as_index = groupby_args.pop("as_index", True) - groupby_args["as_index"] = True + + if not is_multi_by: + groupby_args = groupby_args.copy() + as_index = groupby_args.pop("as_index", True) + groupby_args["as_index"] = True grp = df.groupby(by, axis=axis, **groupby_args) agg_func = cls.get_func(grp, key, **kwargs) - result = agg_func(grp, **agg_args) - - if as_index: - return result + result = ( + grp.agg(agg_func, **agg_args) + if isinstance(agg_func, dict) + else agg_func(grp, **agg_args) + ) + + if not is_multi_by: + if as_index: + return result + else: + if result.index.name is None or result.index.name in result.columns: + drop = False + return result.reset_index(drop=not drop) else: - if result.index.name is None or result.index.name in result.columns: - drop = False - return result.reset_index(drop=not drop) + return result return fn diff --git a/modin/data_management/utils.py b/modin/data_management/utils.py index 0d0a4aafa18..8a4058beb3d 100644 --- a/modin/data_management/utils.py +++ b/modin/data_management/utils.py @@ -82,18 +82,19 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None) 
list A list of Pandas DataFrames. """ - if num_splits == 1: - return result if length_list is not None: length_list.insert(0, 0) sums = np.cumsum(length_list) - if axis == 0: + if axis == 0 or isinstance(result, pandas.Series): return [result.iloc[sums[i] : sums[i + 1]] for i in range(len(sums) - 1)] else: return [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)] + + if num_splits == 1: + return [result] # We do this to restore block partitioning chunksize = compute_chunksize(result, num_splits, axis=axis) - if axis == 0: + if axis == 0 or isinstance(result, pandas.Series): return [ result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits) ] diff --git a/modin/engines/base/frame/axis_partition.py b/modin/engines/base/frame/axis_partition.py index aaacacce875..cf3c9bfa511 100644 --- a/modin/engines/base/frame/axis_partition.py +++ b/modin/engines/base/frame/axis_partition.py @@ -18,9 +18,9 @@ class BaseFrameAxisPartition(object): # pragma: no cover - """This abstract class represents the Parent class for any - `ColumnPartition` or `RowPartition` class. This class is intended to - simplify the way that operations are performed + """An abstract class that represents the Parent class for any `ColumnPartition` or `RowPartition` class. + + This class is intended to simplify the way that operations are performed. Note 0: The procedures that use this class and its methods assume that they have some global knowledge about the entire axis. This may @@ -46,7 +46,7 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies a function to a full axis. + """Apply a function to a full axis. Note: The procedures that invoke this method assume full axis knowledge. Implement this method accordingly. @@ -69,7 +69,8 @@ def apply( orientation (the lengths will remain the same). This is ignored between two axis partitions. - Returns: + Returns + ------- A list of `BaseFramePartition` objects. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -81,7 +82,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ raise NotImplementedError(NOT_IMPLMENTED_MESSAGE) @@ -91,16 +93,13 @@ def shuffle(self, func, lengths, **kwargs): partition_type = None def _wrap_partitions(self, partitions): - if isinstance(partitions, self.instance_type): - return [self.partition_type(partitions)] - else: - return [self.partition_type(obj) for obj in partitions] + return [self.partition_type(obj) for obj in partitions] class PandasFrameAxisPartition(BaseFrameAxisPartition): - """This abstract class is created to simplify and consolidate the code for - AxisPartitions that run pandas. Because much of the code is similar, this allows - us to reuse this code. + """An abstract class is created to simplify and consolidate the code for AxisPartitions that run pandas. + + Because much of the code is similar, this allows us to reuse this code. Subclasses must implement `list_of_blocks` which unwraps the `RemotePartition` objects and creates something interpretable as a pandas DataFrame. @@ -118,23 +117,28 @@ def apply( maintain_partitioning=True, **kwargs, ): - """Applies func to the object in the plasma store. + """Apply func to the object in the plasma store. See notes in Parent class about this method. - Args: - func: The function to apply. - num_splits: The number of times to split the result object. 
- other_axis_partition: Another `PandasOnRayFrameAxisPartition` object to apply to - func with this one. - maintain_partitioning: Whether or not to keep the partitioning in the same - orientation as it was previously. This is important because we may be - operating on an individual AxisPartition and not touching the rest. - In this case, we have to return the partitioning to its previous - orientation (the lengths will remain the same). This is ignored between - two axis partitions. + Parameters + ---------- + func: callable + The function to apply. + num_splits: int + The number of times to split the result object. + other_axis_partition: PandasOnRayFrameAxisPartition object + Another `PandasOnRayFrameAxisPartition` object to apply to func with this one. + maintain_partitioning: boolean + Whether or not to keep the partitioning in the same + orientation as it was previously. This is important because we may be + operating on an individual AxisPartition and not touching the rest. + In this case, we have to return the partitioning to its previous + orientation (the lengths will remain the same). This is ignored between + two axis partitions. - Returns: + Returns + ------- A list of `RayRemotePartition` objects. """ if num_splits is None: @@ -180,7 +184,8 @@ def shuffle(self, func, lengths, **kwargs): func: The function to apply before splitting. lengths: The list of partition lengths to split the result into. - Returns: + Returns + ------- A list of RemotePartition objects split by `lengths`. """ num_splits = len(lengths) @@ -207,7 +212,8 @@ def deploy_axis_func( If False, create a new partition layout. partitions: All partitions that make up the full axis (row or column) - Returns: + Returns + ------- A list of Pandas DataFrames. """ # Pop these off first because they aren't expected by the function. @@ -216,10 +222,6 @@ def deploy_axis_func( dataframe = pandas.concat(list(partitions), axis=axis, copy=False) result = func(dataframe, **kwargs) - if isinstance(result, pandas.Series): - if num_splits == 1: - return result - return [result] + [pandas.Series([]) for _ in range(num_splits - 1)] if manual_partition: # The split function is expecting a list diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index c09898f86d8..f18dab29560 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -25,13 +25,17 @@ class BasePandasFrame(object): + """An abstract class that represents the Parent class for any Pandas DataFrame class. + + This class is intended to simplify the way that operations are performed + """ _frame_mgr_cls = None _query_compiler_cls = PandasQueryCompiler @property def __constructor__(self): - """The constructor for this object. A convenience method""" + """Create a new instance of this object.""" return type(self) def __init__( @@ -87,7 +91,8 @@ def __init__( def _row_lengths(self): """Compute the row lengths if they are not cached. - Returns: + Returns + ------- A list of row lengths. """ if self._row_lengths_cache is None: @@ -103,7 +108,8 @@ def _row_lengths(self): def _column_widths(self): """Compute the column widths if they are not cached. - Returns: + Returns + ------- A list of column widths. 
""" if self._column_widths_cache is None: @@ -115,14 +121,15 @@ def _column_widths(self): @property def _axes_lengths(self): - """The row lengths, column widths that can be accessed with an `axis` integer.""" + """Row lengths, column widths that can be accessed with an `axis` integer.""" return [self._row_lengths, self._column_widths] @property def dtypes(self): """Compute the data types if they are not cached. - Returns: + Returns + ------- A pandas Series containing the data types for this dataframe. """ if self._dtypes is None: @@ -132,7 +139,8 @@ def dtypes(self): def _compute_dtypes(self): """Compute the dtypes via MapReduce. - Returns: + Returns + ------- The data types of this dataframe. """ @@ -154,13 +162,17 @@ def dtype_builder(df): _columns_cache = None def _validate_set_axis(self, new_labels, old_labels): - """Validates the index or columns replacement against the old labels. + """Validate the index or columns replacement against the old labels. - Args: - new_labels: The labels to replace with. - old_labels: The labels to replace. + Parameters + ---------- + new_labels: list-like + The labels to replace with. + old_labels: list-like + The labels to replace. - Returns: + Returns + ------- The validated labels. """ new_labels = ensure_index(new_labels) @@ -174,26 +186,30 @@ def _validate_set_axis(self, new_labels, old_labels): return new_labels def _get_index(self): - """Gets the index from the cache object. + """Get the index from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the row labels. """ return self._index_cache def _get_columns(self): - """Gets the columns from the cache object. + """Get the columns from the cache object. - Returns: + Returns + ------- A pandas.Index object containing the column labels. """ return self._columns_cache def _set_index(self, new_index): - """Replaces the current row labels with new labels. + """Replace the current row labels with new labels. - Args: - new_index: The replacement row labels. + Parameters + ---------- + new_index: list-like + The replacement row labels. """ if self._index_cache is None: self._index_cache = ensure_index(new_index) @@ -203,10 +219,12 @@ def _set_index(self, new_index): self._apply_index_objs(axis=0) def _set_columns(self, new_columns): - """Replaces the current column labels with new labels. + """Replace the current column labels with new labels. - Args: - new_columns: The replacement column labels. + Parameters + ---------- + new_columns: list-like + The replacement column labels. """ if self._columns_cache is None: self._columns_cache = ensure_index(new_columns) @@ -218,7 +236,7 @@ def _set_columns(self, new_columns): self._apply_index_objs(axis=1) def _set_axis(self, axis, new_axis, cache_only=False): - """Replaces the current labels at the specified axis with the new one + """Replace the current labels at the specified axis with the new one. Parameters ---------- @@ -246,12 +264,12 @@ def _set_axis(self, axis, new_axis, cache_only=False): @property def axes(self): - """The index, columns that can be accessed with an `axis` integer.""" + """Index, columns that can be accessed with an `axis` integer.""" return [self.index, self.columns] def _compute_axis_labels(self, axis: int, partitions=None): """ - Computes labels for specific `axis` + Compute the labels for specific `axis`. 
Parameters ---------- @@ -273,7 +291,7 @@ def _compute_axis_labels(self, axis: int, partitions=None): ) def _filter_empties(self): - """Removes empty partitions to avoid triggering excess computation.""" + """Remove empty partitions to avoid triggering excess computation.""" if len(self.axes[0]) == 0 or len(self.axes[1]) == 0: # This is the case for an empty frame. We don't want to completely remove # all metadata and partitions so for the moment, we won't prune if the frame @@ -296,7 +314,7 @@ def _filter_empties(self): def _validate_axis_equality(self, axis: int, force: bool = False): """ - Validates internal and external indices of modin_frame at the specified axis. + Validate internal and external indices of modin_frame at the specified axis. Parameters ---------- @@ -329,8 +347,9 @@ def _validate_axis_equality(self, axis: int, force: bool = False): def _validate_internal_indices(self, mode=None, **kwargs): """ - Validates and optionally updates internal and external indices - of modin_frame in specified mode. There is 4 modes supported: + Validate and optionally updates internal and external indices of modin_frame in specified mode. + + There are 4 modes supported: 1. "reduced" - validates on that axis where external indices is ["__reduced__"] for not force 2. "reduced+other" - validates on axis where external @@ -394,7 +413,8 @@ def _apply_index_objs(self, axis=None): Args: axis: The axis to apply to, None applies to both axes. - Returns: + Returns + ------- A new 2D array of partitions that have the index assignment added to the call queue. """ @@ -680,7 +700,8 @@ def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None): def copy(self): """Copy this object. - Returns: + Returns + ------- A copied version of this object. """ return self.__constructor__( @@ -694,13 +715,14 @@ def copy(self): @classmethod def combine_dtypes(cls, list_of_dtypes, column_names): - """Describes how data types should be combined when they do not match. + """Describe how data types should be combined when they do not match. Args: list_of_dtypes: A list of pandas Series with the data types. column_names: The names of the columns that the data types map to. - Returns: + Returns + ------- A pandas Series containing the finalized data types. """ # Compute dtypes by getting collecting and combining all of the partitions. The @@ -716,13 +738,14 @@ def combine_dtypes(cls, list_of_dtypes, column_names): return dtypes def astype(self, col_dtypes): - """Converts columns dtypes to given dtypes. + """Convert the columns dtypes to given dtypes. Args: col_dtypes: Dictionary of {col: dtype,...} where col is the column name and dtype is a numpy dtype. - Returns: + Returns + ------- dataframe with updated dtypes. """ columns = col_dtypes.keys() @@ -754,9 +777,7 @@ def astype(self, col_dtypes): def astype_builder(df): return df.astype({k: v for k, v in col_dtypes.items() if k in df}) - new_frame = self._frame_mgr_cls.lazy_map_partitions( - self._partitions, astype_builder - ) + new_frame = self._frame_mgr_cls.map_partitions(self._partitions, astype_builder) return self.__constructor__( new_frame, self.index, @@ -774,7 +795,8 @@ def add_prefix(self, prefix, axis): prefix: The prefix to add. axis: The axis to update. - Returns: + Returns + ------- A new dataframe with the updated labels. """ new_labels = self.axes[axis].map(lambda x: str(prefix) + str(x)) @@ -792,7 +814,8 @@ def add_suffix(self, suffix, axis): suffix: The suffix to add. axis: The axis to update. 
- Returns: + Returns + ------- A new dataframe with the updated labels. """ new_labels = self.axes[axis].map(lambda x: str(x) + str(suffix)) @@ -806,9 +829,10 @@ def add_suffix(self, suffix, axis): # END Metadata modification methods def _numeric_columns(self, include_bool=True): - """Returns the numeric columns of the Manager. + """Return the numeric columns of the Manager. - Returns: + Returns + ------- List of index names. """ columns = [] @@ -945,7 +969,7 @@ def internal(block_idx, global_index): def _join_index_objects(self, axis, other_index, how, sort): """ - Joins a pair of index objects (columns or rows) by a given strategy. + Join the pair of index objects (columns or rows) by a given strategy. Unlike Index.join() in Pandas, if axis is 1, the sort is False, and how is "outer", the result will _not_ be sorted. @@ -994,11 +1018,15 @@ def _build_mapreduce_func(self, axis, func): Note: This should be used for any MapReduce style operation that results in a reduced data dimensionality (dataframe -> series). - Args: - axis: The axis along which to apply the function. - func: The function to apply. + Parameters + ---------- + axis: int + The axis along which to apply the function. + func: callable + The function to apply. - Returns: + Returns + ------- A function to be shipped to the partitions to be executed. """ @@ -1020,7 +1048,7 @@ def _map_reduce_func(df, *args, **kwargs): def _compute_map_reduce_metadata(self, axis, new_parts, preserve_index=True): """ - Computes metadata for the result of reduce function. + Compute the metadata for the result of reduce function. Parameters ---------- @@ -1115,7 +1143,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): else: reduce_func = self._build_mapreduce_func(axis, reduce_func) - map_parts = self._frame_mgr_cls.lazy_map_partitions(self._partitions, map_func) + map_parts = self._frame_mgr_cls.map_partitions(self._partitions, map_func) reduce_parts = self._frame_mgr_cls.map_axis_partitions( axis, map_parts, reduce_func ) @@ -1126,7 +1154,7 @@ def _map_reduce(self, axis, map_func, reduce_func=None, preserve_index=True): def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): """Perform a function that maps across the entire dataset. - Pamareters + Parameters ---------- func : callable The function to apply. @@ -1136,11 +1164,12 @@ def _map(self, func, dtypes=None, validate_index=False, validate_columns=False): type, and allows us to avoid (re)computing it. validate_index : bool, (default False) Is index validation required after performing `func` on partitions. + Returns ------- A new dataframe. """ - new_partitions = self._frame_mgr_cls.lazy_map_partitions(self._partitions, func) + new_partitions = self._frame_mgr_cls.map_partitions(self._partitions, func) if dtypes == "copy": dtypes = self._dtypes elif dtypes is not None: @@ -1175,11 +1204,15 @@ def _fold(self, axis, func): Note: The data shape is not changed (length and width of the table). - Args: - axis: The axis to apply over. - func: The function to apply. + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1196,12 +1229,16 @@ def _fold(self, axis, func): def filter_full_axis(self, axis, func): """Filter data based on the function provided along an entire axis. - Args: - axis: The axis to filter over. - func: The function to use for the filter. 
This function should filter the + Parameters + ---------- + axis: int + The axis to filter over. + func: callable + The function to use for the filter. This function should filter the data itself. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.map_axis_partitions( @@ -1280,18 +1317,27 @@ def _apply_full_axis_select_indices( ): """Apply a function across an entire axis for a subset of the data. - Args: - axis: The axis to apply over. - func: The function to apply - apply_indices: The labels to apply over. - numeric_indices: The indices to apply over. - new_index: (optional) The index of the result. We may know this in advance, + Parameters + ---------- + axis: int + The axis to apply over. + func: callable + The function to apply + apply_indices: list-like + The labels to apply over. + numeric_indices: list-like + The indices to apply over. + new_index: list-like (optional) + The index of the result. We may know this in advance, and if not provided it must be computed. - new_columns: (optional) The columns of the result. We may know this in + new_columns: list-like (optional) + The columns of the result. We may know this in advance, and if not provided it must be computed. - keep_remaining: Whether or not to drop the data that is not computed over. + keep_remaining: boolean + Whether or not to drop the data that is not computed over. - Returns: + Returns + ------- A new dataframe. """ assert apply_indices is not None or numeric_indices is not None @@ -1332,7 +1378,8 @@ def _apply_select_indices( ): """Apply a function for a subset of the data. - Args: + Parameters + ---------- axis: The axis to apply over. func: The function to apply apply_indices: (optional) The labels to apply over. Must be given if axis is @@ -1349,7 +1396,8 @@ def _apply_select_indices( item_to_distribute: (optional) The item to split up so it can be applied over both axes. - Returns: + Returns + ------- A new dataframe. """ # TODO Infer columns and index from `keep_remaining` and `apply_indices` @@ -1458,7 +1506,7 @@ def broadcast_apply( def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): """ - Computes indices to broadcast `self` with considering of `indices` + Compute the indices to broadcast `self` with considering of `indices`. Parameters ---------- @@ -1508,8 +1556,7 @@ def broadcast_apply_select_indices( new_columns=None, ): """ - Applyies `func` to select indices at specified axis and broadcasts - partitions of `other` frame. + Apply `func` to select indices at specified axis and broadcasts partitions of `other` frame. Parameters ---------- @@ -1646,6 +1693,8 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): """ Copartition two dataframes. + Perform aligning of partitions, index and partition blocks. 
+ Parameters ---------- axis : 0 or 1 @@ -1694,6 +1743,7 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): [self._simple_shuffle(axis, o) for o in other], self.axes[axis].copy(), ) + index_other_obj = [o.axes[axis] for o in other] joined_index = self._join_index_objects(axis, index_other_obj, how, sort) # We have to set these because otherwise when we perform the functions it may @@ -1701,32 +1751,45 @@ def _copartition(self, axis, other, how, sort, force_repartition=False): left_old_idx = self.axes[axis] right_old_idxes = index_other_obj - is_avoid_reindex = len(joined_index) != len(joined_index.unique()) and axis == 0 + def make_map_func(): + if not joined_index.is_unique and axis == 0: + return lambda df: df + return lambda df: df.reindex(joined_index, axis=axis) + # Start with this and we'll repartition the first time, and then not again. - if ( - not is_aligning_applied - and not is_avoid_reindex - and (force_repartition or not left_old_idx.equals(joined_index)) + if is_aligning_applied or ( + not force_repartition and left_old_idx.equals(joined_index) ): + reindexed_self = self._partitions + else: reindexed_self = self._frame_mgr_cls.map_axis_partitions( - axis, self._partitions, lambda df: df.reindex(joined_index, axis=axis) + axis, + self._partitions, + make_map_func(), ) - else: - reindexed_self = self._partitions - reindexed_other_list = [] + def get_column_widths(partitions): + if len(partitions) > 0: + return [obj.width() for obj in partitions[0]] + + def get_row_lengths(partitions): + if len(partitions.T) > 0: + return [obj.length() for obj in partitions.T[0]] + + reindexed_other_list = [] for i in range(len(other)): - if ( - is_aligning_applied - or is_avoid_reindex - or (not force_repartition and right_old_idxes[i].equals(joined_index)) + if is_aligning_applied or ( + not force_repartition and right_old_idxes[i].equals(joined_index) ): reindexed_other = other[i]._partitions else: reindexed_other = other[i]._frame_mgr_cls.map_axis_partitions( axis, other[i]._partitions, - lambda df: df.reindex(joined_index, axis=axis), + make_map_func(), + lengths=get_row_lengths(reindexed_self) + if axis == 0 + else get_column_widths(reindexed_self), ) reindexed_other_list.append(reindexed_other) return reindexed_self, reindexed_other_list, joined_index @@ -1795,13 +1858,19 @@ def _binary_op(self, op, right_frame, join_type="outer"): def _concat(self, axis, others, how, sort): """Concatenate this dataframe with one or more others. - Args: - axis: The axis to concatenate over. - others: The list of dataframes to concatenate with. - how: The type of join to use for the axis. - sort: Whether or not to sort the result. + Parameters + ---------- + axis: int + The axis to concatenate over. + others: List of dataframes + The list of dataframes to concatenate with. + how: str + The type of join to use for the axis. + sort: boolean + Whether or not to sort the result. - Returns: + Returns + ------- A new dataframe. """ # Fast path for equivalent columns and partitioning @@ -1867,7 +1936,8 @@ def groupby_reduce( new_columns: (optional) The columns of the result. We may know this in advance, and if not provided it must be computed. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.groupby_reduce( @@ -1886,10 +1956,12 @@ def groupby_reduce( def from_pandas(cls, df): """Improve simple Pandas DataFrame to an advanced and superior Modin DataFrame. - Args: + Parameters + ---------- df: Pandas DataFrame object. 
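Note: the _copartition rework above builds a single map function per call: partitions are reindexed to the joined index unless that index is non-unique on axis 0. A standalone sketch of that alignment step with two hypothetical frames:

    import pandas

    left = pandas.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2])
    right = pandas.DataFrame({"b": [10, 20]}, index=[1, 2])
    joined_index = left.index.join(right.index, how="outer")

    def make_map_func(joined_index, axis=0):
        # Skip reindexing when the joined row index is not unique, as in the change above;
        # otherwise align each frame to the joined labels.
        if not joined_index.is_unique and axis == 0:
            return lambda df: df
        return lambda df: df.reindex(joined_index, axis=axis)

    map_func = make_map_func(joined_index)
    left_aligned, right_aligned = map_func(left), map_func(right)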
- Returns: + Returns + ------- A new dataframe. """ new_index = df.index @@ -1945,9 +2017,10 @@ def _arrow_type_to_dtype(cls, arrow_type): return res def to_pandas(self): - """Converts Modin DataFrame to Pandas DataFrame. + """Convert a Modin DataFrame to Pandas DataFrame. - Returns: + Returns + ------- Pandas DataFrame. """ df = self._frame_mgr_cls.to_pandas(self._partitions) @@ -1969,7 +2042,7 @@ def to_pandas(self): def to_numpy(self, **kwargs): """ - Converts Modin DataFrame to a 2D NumPy array. + Convert a Modin DataFrame to a 2D NumPy array. Returns ------- @@ -1980,7 +2053,8 @@ def to_numpy(self, **kwargs): def transpose(self): """Transpose the index and columns of this dataframe. - Returns: + Returns + ------- A new dataframe. """ new_partitions = self._frame_mgr_cls.lazy_map_partitions( diff --git a/modin/engines/base/frame/partition.py b/modin/engines/base/frame/partition.py index 8854b346e77..6a3c9a49d8e 100644 --- a/modin/engines/base/frame/partition.py +++ b/modin/engines/base/frame/partition.py @@ -15,7 +15,8 @@ class BaseFramePartition(object): # pragma: no cover - """This abstract class holds the data and metadata for a single partition. + """An abstract class that holds the data and metadata for a single partition. + The methods required for implementing this abstract class are listed in the section immediately following this. @@ -36,7 +37,8 @@ def get(self): E.g. if you assign `x = BaseFramePartition.put(1)`, `x.get()` should always return 1. - Returns: + Returns + ------- The object that was `put`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -51,7 +53,8 @@ def apply(self, func, **kwargs): Args: func: The lambda to apply (may already be correctly formatted) - Returns: + Returns + ------- A new `BaseFramePartition` containing the object that has had `func` applied to it. """ @@ -74,7 +77,8 @@ def to_pandas(self): Note: If the underlying object is a Pandas DataFrame, this will likely only need to call `get` - Returns: + Returns + ------- A Pandas DataFrame. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -85,7 +89,8 @@ def to_numpy(self, **kwargs): Note: If the underlying object is a Pandas DataFrame, this will return a 2D NumPy array. - Returns: + Returns + ------- A NumPy array. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -97,19 +102,22 @@ def mask(self, row_indices, col_indices): row_indices: The indices for the rows to extract. col_indices: The indices for the columns to extract. - Returns: + Returns + ------- A `BaseFramePartition` object. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def put(cls, obj): - """A factory classmethod to format a given object. + """Format a given object. - Args: + Parameters + ---------- obj: An object. - Returns: + Returns + ------- A `BaseFramePartition` object. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -126,25 +134,28 @@ def preprocess_func(cls, func): Args: func: The function to preprocess. - Returns: + Returns + ------- An object that can be accepted by `apply`. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def length_extraction_fn(cls): - """The function to compute the length of the object in this partition. + """Compute the length of the object in this partition. - Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod def width_extraction_fn(cls): - """The function to compute the width of the object in this partition. + """Compute the width of the object in this partition. 
- Returns: + Returns + ------- A callable function. """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @@ -153,6 +164,7 @@ def width_extraction_fn(cls): _width_cache = None def length(self): + """Return the length of partition.""" if self._length_cache is None: cls = type(self) func = cls.length_extraction_fn() @@ -161,6 +173,7 @@ def length(self): return self._length_cache def width(self): + """Return the width of partition.""" if self._width_cache is None: cls = type(self) func = cls.width_extraction_fn() @@ -170,9 +183,10 @@ def width(self): @classmethod def empty(cls): - """Create an empty partition + """Create an empty partition. - Returns; + Returns + ------- An empty partition """ raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) diff --git a/modin/engines/base/frame/partition_manager.py b/modin/engines/base/frame/partition_manager.py index ff00a3e8559..b2e403650b8 100644 --- a/modin/engines/base/frame/partition_manager.py +++ b/modin/engines/base/frame/partition_manager.py @@ -20,8 +20,11 @@ class BaseFrameManager(object): - # Partition class is the class to use for storing each partition. It must - # extend the `BaseFramePartition` class. + """Partition class is the class to use for storing each partition. It must extend the `BaseFramePartition` class. + + It is the base class for managing the dataframe data layout and operators. + """ + _partition_class = None # Column partitions class is the class to use to create the column partitions. _column_partitions_class = None @@ -43,6 +46,7 @@ def preprocess_func(cls, map_func): map_func: The function to be preprocessed. Returns + ------- The preprocessed version of the `map_func` provided. Note: This does not require any specific format, only that the `BaseFramePartition.apply` method will recognize it (For the subclass @@ -54,28 +58,33 @@ def preprocess_func(cls, map_func): @classmethod def column_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects. + """List of `BaseFrameAxisPartition` objects. Note: Each value in this list will be an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. """ return [cls._column_partitions_class(col) for col in partitions.T] @classmethod def row_partitions(cls, partitions): - """A list of `BaseFrameAxisPartition` objects, represents column partitions. + """List of `BaseFrameAxisPartition` objects, represents column partitions. Note: Each value in this list will an `BaseFrameAxisPartition` object. `BaseFrameAxisPartition` is located in `axis_partition.py`. - Returns a list of `BaseFrameAxisPartition` objects. + Returns + ------- + a list of `BaseFrameAxisPartition` objects. 
""" return [cls._row_partition_class(row) for row in partitions] @classmethod def axis_partition(cls, partitions, axis): + """Logically partition along either the columns or the rows.""" return ( cls.column_partitions(partitions) if not axis @@ -84,6 +93,7 @@ def axis_partition(cls, partitions, axis): @classmethod def groupby_reduce(cls, axis, partitions, by, map_func, reduce_func): + """Groupby data using the map_func provided along the axis over the partitions then reduce using reduce_func.""" mapped_partitions = cls.broadcast_apply( axis, map_func, left=partitions, right=by, other_name="other" ) @@ -101,7 +111,7 @@ def broadcast_apply_select_indices( keep_remaining=False, ): """ - Broadcast the right partitions to left and apply a function to selected indices + Broadcast the right partitions to left and apply a function to selected indices. Note: Your internal function must take this kwargs: [`internal_indices`, `other`, `internal_other_indices`] to work correctly @@ -194,7 +204,7 @@ def broadcast_apply(cls, axis, apply_func, left, right, other_name="r"): new_partitions = np.array( [ [ - part.add_to_apply_calls( + part.apply( apply_func, **{other_name: right[col_idx] if axis else right[row_idx]}, ) @@ -214,22 +224,25 @@ def broadcast_axis_partitions( left, right, keep_partitioning=False, + lengths=None, ): """ Broadcast the right partitions to left and apply a function along full axis. Parameters ---------- - axis : The axis to apply and broadcast over. - apply_func : The function to apply. - left : The left partitions. - right : The right partitions. - keep_partitioning : boolean. Default is False - The flag to keep partitions for Modin Frame. + axis : The axis to apply and broadcast over. + apply_func : The function to apply. + left : The left partitions. + right : The right partitions. + keep_partitioning : boolean. Default is False + The flag to keep partitions for Modin Frame. + lengths : list(int) + The list of lengths to shuffle the object. Returns ------- - A new `np.array` of partition objects. + A new `np.array` of partition objects. """ # Since we are already splitting the DataFrame back up after an # operation, we will just use this time to compute the number of @@ -245,12 +258,19 @@ def broadcast_axis_partitions( # may want to line to partitioning up with another BlockPartitions object. Since # we don't need to maintain the partitioning, this gives us the opportunity to # load-balance the data as well. + kw = { + "num_splits": num_splits, + "other_axis_partition": right_partitions, + } + if lengths: + kw["_lengths"] = lengths + kw["manual_partition"] = True + result_blocks = np.array( [ part.apply( preprocessed_map_func, - num_splits=num_splits, - other_axis_partition=right_partitions, + **kw, ) for part in left_partitions ] @@ -262,12 +282,15 @@ def broadcast_axis_partitions( @classmethod def map_partitions(cls, partitions, map_func): - """Applies `map_func` to every partition. + """Apply `map_func` to every partition. - Args: - map_func: The function to apply. + Parameters + ---------- + map_func: callable + The function to apply. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ preprocessed_map_func = cls.preprocess_func(map_func) @@ -280,6 +303,18 @@ def map_partitions(cls, partitions, map_func): @classmethod def lazy_map_partitions(cls, partitions, map_func): + """ + Apply `map_func` to every partition lazily. + + Parameters + ---------- + map_func: callable + The function to apply. 
+ + Returns + ------- + A new BaseFrameManager object, the type of object that called this. + """ preprocessed_map_func = cls.preprocess_func(map_func) return np.array( [ @@ -295,20 +330,23 @@ def map_axis_partitions( partitions, map_func, keep_partitioning=False, + lengths=None, ): """ - Applies `map_func` to every partition. + Apply `map_func` to every partition. Parameters ---------- - axis : 0 or 1 - The axis to perform the map across (0 - index, 1 - columns). - partitions : NumPy array - The partitions of Modin Frame. - map_func : callable - The function to apply. - keep_partitioning : boolean. Default is False - The flag to keep partitions for Modin Frame. + axis : 0 or 1 + The axis to perform the map across (0 - index, 1 - columns). + partitions : NumPy array + The partitions of Modin Frame. + map_func : callable + The function to apply. + keep_partitioning : bool. Default is False + The flag to keep partitions for Modin Frame. + lengths : list(int) + The list of lengths to shuffle the object. Returns ------- @@ -326,12 +364,13 @@ def map_axis_partitions( apply_func=map_func, keep_partitioning=keep_partitioning, right=None, + lengths=lengths, ) @classmethod def simple_shuffle(cls, axis, partitions, map_func, lengths): """ - Shuffle data using `lengths` via `map_func` + Shuffle data using `lengths` via `map_func`. Parameters ---------- @@ -381,7 +420,8 @@ def concat(cls, axis, left_parts, right_parts): right_parts: the other blocks to be concatenated. This is a BaseFrameManager object. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if type(right_parts) is list: @@ -396,7 +436,7 @@ def concat(cls, axis, left_parts, right_parts): @classmethod def concatenate(cls, dfs): """ - Concatenate Pandas DataFrames with saving 'category' dtype + Concatenate Pandas DataFrames with saving 'category' dtype. Parameters ---------- @@ -421,7 +461,8 @@ def concatenate(cls, dfs): def to_pandas(cls, partitions): """Convert this object into a Pandas DataFrame from the partitions. - Returns: + Returns + ------- A Pandas DataFrame """ retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions] @@ -462,6 +503,7 @@ def to_numpy(cls, partitions, **kwargs): @classmethod def from_pandas(cls, df, return_dims=False): + """Return the partitions from Pandas DataFrame.""" num_splits = cls._compute_num_partitions() put_func = cls._partition_class.put row_chunksize, col_chunksize = compute_chunksize(df, num_splits) @@ -491,11 +533,12 @@ def from_pandas(cls, df, return_dims=False): @classmethod def from_arrow(cls, at, return_dims=False): + """Return the partitions from Apache Arrow (PyArrow).""" return cls.from_pandas(at.to_pandas(), return_dims=return_dims) @classmethod def get_indices(cls, axis, partitions, index_func=None): - """This gets the internal indices stored in the partitions. + """Get the internal indices stored in the partitions. Note: These are the global indices of the object. This is mostly useful when you have deleted rows/columns internally, but do not know @@ -505,7 +548,8 @@ def get_indices(cls, axis, partitions, index_func=None): axis: This axis to extract the labels. (0 - index, 1 - columns). index_func: The function to be used to extract the function. - Returns: + Returns + ------- A Pandas Index object. 
""" ErrorMessage.catch_bugs_and_request_email(not callable(index_func)) @@ -527,10 +571,11 @@ def get_indices(cls, axis, partitions, index_func=None): @classmethod def _compute_num_partitions(cls): - """Currently, this method returns the default. In the future it will - estimate the optimal number of partitions. + """Retrieve the default number of partitions currently. Will estimate the optimal no. of partitions in future. - :return: + Returns + ------- + Number of partitions. """ from modin.pandas import DEFAULT_NPARTITIONS @@ -542,15 +587,13 @@ def _apply_func_to_list_of_partitions_broadcast( ): preprocessed_func = cls.preprocess_func(func) return [ - obj.add_to_apply_calls( - preprocessed_func, other=[o.get() for o in broadcasted], **kwargs - ) + obj.apply(preprocessed_func, other=[o.get() for o in broadcasted], **kwargs) for obj, broadcasted in zip(partitions, other.T) ] @classmethod def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): - """Applies a function to a list of remote partitions. + """Apply a function to a list of remote partitions. Note: The main use for this is to preprocess the func. @@ -558,19 +601,18 @@ def _apply_func_to_list_of_partitions(cls, func, partitions, **kwargs): func: The func to apply partitions: The list of partitions - Returns: + Returns + ------- A list of BaseFramePartition objects. """ preprocessed_func = cls.preprocess_func(func) - return [ - obj.add_to_apply_calls(preprocessed_func, **kwargs) for obj in partitions - ] + return [obj.apply(preprocessed_func, **kwargs) for obj in partitions] @classmethod def apply_func_to_select_indices( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to select indices. + """Apply a function to select indices. Note: Your internal function must take a kwarg `internal_indices` for this to work correctly. This prevents information leakage of the @@ -584,7 +626,8 @@ def apply_func_to_select_indices( Some operations may want to drop the remaining partitions and keep only the results. - Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if partitions.size == 0: @@ -671,7 +714,7 @@ def apply_func_to_select_indices( def apply_func_to_select_indices_along_full_axis( cls, axis, partitions, func, indices, keep_remaining=False ): - """Applies a function to a select subset of full columns/rows. + """Apply a function to a select subset of full columns/rows. Note: This should be used when you need to apply a function that relies on some global information for the entire column/row, but only need @@ -680,15 +723,21 @@ def apply_func_to_select_indices_along_full_axis( Important: For your func to operate directly on the indices provided, it must use `internal_indices` as a keyword argument. - Args: - axis: The axis to apply the function over (0 - rows, 1 - columns) - func: The function to apply. - indices: The global indices to apply the func to. - keep_remaining: Whether or not to keep the other partitions. - Some operations may want to drop the remaining partitions and - keep only the results. + Parameters + ---------- + axis: int + The axis to apply the function over (0 - rows, 1 - columns) + func: callable + The function to apply. + indices: list-like + The global indices to apply the func to. + keep_remaining: boolean + Whether or not to keep the other partitions. + Some operations may want to drop the remaining partitions and + keep only the results. 
- Returns: + Returns + ------- A new BaseFrameManager object, the type of object that called this. """ if partitions.size == 0: @@ -780,7 +829,7 @@ def apply_func_to_indices_both_axis( item_to_distribute=None, ): """ - Apply a function to along both axis + Apply a function to along both axis. Important: For your func to operate directly on the indices provided, it must use `row_internal_indices, col_internal_indices` as keyword diff --git a/modin/engines/base/io/column_stores/feather_reader.py b/modin/engines/base/io/column_stores/feather_reader.py index 7b311b40f7b..95738f54342 100644 --- a/modin/engines/base/io/column_stores/feather_reader.py +++ b/modin/engines/base/io/column_stores/feather_reader.py @@ -32,8 +32,7 @@ def _read(cls, path, columns=None, **kwargs): https://arrow.apache.org/docs/python/api.html#feather-format """ if columns is None: - from pyarrow.feather import FeatherReader + from pyarrow.feather import read_feather - fr = FeatherReader(path) - columns = [fr.get_column_name(i) for i in range(fr.num_columns)] - return cls.build_query_compiler(path, columns, use_threads=False) + df = read_feather(path) + return cls.build_query_compiler(path, df.columns, use_threads=False) diff --git a/modin/engines/base/io/file_reader.py b/modin/engines/base/io/file_reader.py index 8a8ea6bd1ef..879444881cf 100644 --- a/modin/engines/base/io/file_reader.py +++ b/modin/engines/base/io/file_reader.py @@ -137,10 +137,10 @@ def file_exists(cls, file_path): return os.path.exists(file_path) @classmethod - def deploy(cls, func, args, num_return_vals): + def deploy(cls, func, args, num_returns): raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) - def parse(self, func, args, num_return_vals): + def parse(self, func, args, num_returns): raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE) @classmethod diff --git a/modin/engines/base/io/text/csv_reader.py b/modin/engines/base/io/text/csv_reader.py index 85a844c26c5..f7faf438847 100644 --- a/modin/engines/base/io/text/csv_reader.py +++ b/modin/engines/base/io/text/csv_reader.py @@ -120,12 +120,6 @@ def _read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - cls.offset( - f, - nrows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions @@ -163,8 +157,9 @@ def _read(cls, filepath_or_buffer, **kwargs): splits = cls.partitioned_file( f, - nrows=nrows, num_partitions=num_partitions, + nrows=nrows, + skiprows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) diff --git a/modin/engines/base/io/text/fwf_reader.py b/modin/engines/base/io/text/fwf_reader.py index 7506ce448c2..e5c6bd36680 100644 --- a/modin/engines/base/io/text/fwf_reader.py +++ b/modin/engines/base/io/text/fwf_reader.py @@ -116,12 +116,6 @@ def read(cls, filepath_or_buffer, **kwargs): skiprows += header + 1 elif hasattr(header, "__iter__") and not isinstance(header, str): skiprows += max(header) + 1 - cls.offset( - f, - nrows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - ) if kwargs.get("encoding", None) is not None: partition_kwargs["skiprows"] = 1 # Launch tasks to read partitions @@ -159,8 +153,9 @@ def read(cls, filepath_or_buffer, **kwargs): splits = cls.partitioned_file( f, - nrows=nrows, num_partitions=num_partitions, + nrows=nrows, + skiprows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) diff --git 
a/modin/engines/base/io/text/text_file_reader.py b/modin/engines/base/io/text/text_file_reader.py index bc194cc2986..8b60c7e9866 100644 --- a/modin/engines/base/io/text/text_file_reader.py +++ b/modin/engines/base/io/text/text_file_reader.py @@ -56,96 +56,83 @@ def pathlib_or_pypath(cls, filepath_or_buffer): def offset( cls, f, - nrows=None, - skiprows=None, - chunk_size_bytes=None, - quotechar=b'"', - is_quoting=True, + offset_size: int, + quotechar: bytes = b'"', + is_quoting: bool = True, ): """ - Moves the file offset at the specified amount of bytes/rows. + Moves the file offset at the specified amount of bytes. Parameters ---------- - f: file object - nrows: int, number of rows to read. Optional, if not specified will only - consider `chunk_size_bytes` parameter. - chunk_size_bytes: int, Will read new rows while file pointer - is less than `chunk_size_bytes`. Optional, if not specified will only - consider `nrows` parameter. - skiprows: array or callable (optional), specifies rows to skip - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) + f: file object + offset_size: int + Number of bytes to read and ignore. + quotechar: bytes, default b'"' + Indicate quote in a file. + is_quoting: bool, default True + Whether or not to consider quotes. Returns ------- - bool: If file pointer reached the end of the file, but did not find + bool + If file pointer reached the end of the file, but did not find closing quote returns `False`. `True` in any other case. """ - assert ( - nrows is not None or chunk_size_bytes is not None - ), "`nrows` and `chunk_size_bytes` can't be None at the same time" - - if nrows is not None or skiprows is not None: - return cls._read_rows( - f, - nrows=nrows, - skiprows=skiprows, - quotechar=quotechar, - is_quoting=is_quoting, - max_bytes=chunk_size_bytes, - )[0] - - outside_quotes = True if is_quoting: - chunk = f.read(chunk_size_bytes) - line = f.readline() # Ensure we read up to a newline - # We need to ensure that one row isn't split across different partitions - outside_quotes = not ((chunk.count(quotechar) + line.count(quotechar)) % 2) - while not outside_quotes: - line = f.readline() - outside_quotes = line.count(quotechar) % 2 - if not line: - break + chunk = f.read(offset_size) + outside_quotes = not chunk.count(quotechar) % 2 else: - f.seek(chunk_size_bytes, os.SEEK_CUR) - f.readline() + f.seek(offset_size, os.SEEK_CUR) + outside_quotes = True + + # after we read `offset_size` bytes, we most likely break the line but + # the modin implementation doesn't work correctly in the case, so we must + # make sure that the line is read completely to the lineterminator, + # which is what the `_read_rows` does + outside_quotes, _ = cls._read_rows( + f, + nrows=1, + quotechar=quotechar, + is_quoting=is_quoting, + outside_quotes=outside_quotes, + ) + return outside_quotes @classmethod def partitioned_file( cls, f, - nrows=None, - skiprows=None, - num_partitions=None, - quotechar=b'"', - is_quoting=True, - from_begin=False, + num_partitions: int = None, + nrows: int = None, + skiprows: int = None, + quotechar: bytes = b'"', + is_quoting: bool = True, ): - """Computes chunk sizes in bytes for every partition. + """ + Compute chunk sizes in bytes for every partition. 
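Note: the rewritten offset above advances the file pointer by a fixed number of bytes and only tracks whether the quote count left it inside a quoted field; completing the broken last line is delegated to _read_rows. A simplified standalone sketch of the byte jump plus quote-parity check (the line-completion step is reduced here to a plain readline, which ignores quoting):

    import os

    def offset_sketch(f, offset_size, quotechar=b'"', is_quoting=True):
        if is_quoting:
            chunk = f.read(offset_size)
            outside_quotes = not chunk.count(quotechar) % 2
        else:
            f.seek(offset_size, os.SEEK_CUR)
            outside_quotes = True
        # Finish the (most likely) broken line so the next chunk starts at a line boundary.
        f.readline()
        return outside_quotes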
Parameters ---------- - f: file to be partitioned - nrows: int (optional), number of rows of file to read - skiprows: array or callable (optional), specifies rows to skip - num_partitions: int, for what number of partitions split a file. - Optional, if not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) - from_begin: bool, Whether or not to set the file pointer to the begining of the file - (optional, by default it's `False`) + f: file to be partitioned + num_partitions: int, optional + For what number of partitions split a file. + If not specified grabs the value from `modin.pandas.DEFAULT_NPARTITIONS` + nrows: int, optional + Number of rows of file to read. + skiprows: array or callable, optional + Specifies rows to skip. + quotechar: bytes, default b'"' + Indicate quote in a file. + is_quoting: bool, default True + Whether or not to consider quotes. Returns ------- - An array, where each element of array is a tuple of two ints: - beginning and the end offsets of the current chunk. + An array, where each element of array is a tuple of two ints: + beginning and the end offsets of the current chunk. """ if num_partitions is None: from modin.pandas import DEFAULT_NPARTITIONS @@ -153,46 +140,54 @@ def partitioned_file( num_partitions = DEFAULT_NPARTITIONS result = [] + file_size = cls.file_size(f) - old_position = f.tell() - if from_begin: - f.seek(0, os.SEEK_SET) - - current_start = f.tell() - total_bytes = cls.file_size(f) - - # if `nrows` are specified we want to use rows as a part measure - if nrows is not None: - chunk_size_bytes = None - rows_per_part = max(1, num_partitions, nrows // num_partitions) - else: - chunk_size_bytes = max(1, num_partitions, total_bytes // num_partitions) - rows_per_part = None - nrows = float("inf") - - rows_readed = 0 - while f.tell() < total_bytes and rows_readed < nrows: - if rows_per_part is not None and rows_readed + rows_per_part > nrows: - rows_per_part = nrows - rows_readed - - outside_quotes = cls.offset( + if skiprows: + outside_quotes, read_rows = cls._read_rows( f, - nrows=rows_per_part, - skiprows=skiprows, - chunk_size_bytes=chunk_size_bytes, + nrows=skiprows, quotechar=quotechar, is_quoting=is_quoting, ) - result.append((current_start, f.tell())) - current_start = f.tell() - if rows_per_part is not None: - rows_readed += rows_per_part - - if is_quoting and not outside_quotes: - warnings.warn("File has mismatched quotes") - - f.seek(old_position, os.SEEK_SET) + start = f.tell() + + if nrows: + read_rows_counter = 0 + partition_size = max(1, num_partitions, nrows // num_partitions) + while f.tell() < file_size and read_rows_counter < nrows: + if read_rows_counter + partition_size > nrows: + # it's possible only if is_quoting==True + partition_size = nrows - read_rows_counter + outside_quotes, read_rows = cls._read_rows( + f, + nrows=partition_size, + quotechar=quotechar, + is_quoting=is_quoting, + ) + result.append((start, f.tell())) + start = f.tell() + read_rows_counter += read_rows + + # add outside_quotes + if is_quoting and not outside_quotes: + warnings.warn("File has mismatched quotes") + else: + partition_size = max(1, num_partitions, file_size // num_partitions) + while f.tell() < file_size: + outside_quotes = cls.offset( + f, + offset_size=partition_size, + quotechar=quotechar, + is_quoting=is_quoting, + ) + + result.append((start, f.tell())) + 
start = f.tell() + + # add outside_quotes + if is_quoting and not outside_quotes: + warnings.warn("File has mismatched quotes") return result @@ -200,75 +195,48 @@ def partitioned_file( def _read_rows( cls, f, - nrows=None, - skiprows=None, - quotechar=b'"', - is_quoting=True, - max_bytes=None, + nrows: int, + quotechar: bytes = b'"', + is_quoting: bool = True, + outside_quotes: bool = True, ): """ - Moves the file offset at the specified amount of rows - Note: the difference between `offset` is that `_read_rows` is more - specific version of `offset` which is focused of reading **rows**. - In common case it's better to use `offset`. + Move the file offset at the specified amount of rows. Parameters ---------- - f: file object - nrows: int, number of rows to read. Optional, if not specified will only - consider `max_bytes` parameter. - skiprows: int, array or callable (optional), specifies rows to skip - quotechar: char that indicates quote in a file - (optional, by default it's '\"') - is_quoting: bool, Whether or not to consider quotes - (optional, by default it's `True`) - max_bytes: int, Will read new rows while file pointer - is less than `max_bytes`. Optional, if not specified will only - consider `nrows` parameter, if both not specified will read till - the end of the file. + f: file object + nrows: int + Number of rows to read. + quotechar: bytes, default b'"' + Indicate quote in a file. + is_quoting: bool, default True + Whether or not to consider quotes. + outside_quotes: bool, default True + Whether the file pointer is within quotes or not at the time this function is called. Returns ------- - tuple of bool and int, - bool: If file pointer reached the end of the file, but did not find + tuple of bool and int, + bool: If file pointer reached the end of the file, but did not find closing quote returns `False`. `True` in any other case. - int: Number of rows that was readed. + int: Number of rows that was read. """ - assert skiprows is None or isinstance( - skiprows, int - ), f"Skiprows as a {type(skiprows)} is not supported yet." 
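Note: partitioned_file above returns a list of (start, end) byte offsets rather than reading any data itself. A hypothetical consumer would hand each range to a worker that seeks to the chunk, reads just those bytes, and parses them, for example:

    import io
    import pandas

    def read_chunk(path, start, end, **read_csv_kwargs):
        # Parse only the bytes in [start, end); header handling is left to the caller.
        with open(path, "rb") as f:
            f.seek(start)
            chunk = f.read(end - start)
        return pandas.read_csv(io.BytesIO(chunk), **read_csv_kwargs)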
- - if nrows is None and max_bytes is None: - max_bytes = float("inf") - if nrows is not None and nrows <= 0: return True, 0 - # we need this condition to avoid unnecessary checks in `stop_condition` - # which executes in a huge for loop - if nrows is not None and max_bytes is None: - stop_condition = lambda rows_readed: rows_readed >= nrows # noqa (E731) - elif nrows is not None and max_bytes is not None: - stop_condition = ( - lambda rows_readed: f.tell() >= max_bytes or rows_readed >= nrows - ) # noqa (E731) - else: - stop_condition = lambda rows_readed: f.tell() >= max_bytes # noqa (E731) - - if max_bytes is not None: - max_bytes = max_bytes + f.tell() + rows_read = 0 - rows_readed = 0 - outside_quotes = True for line in f: if is_quoting and line.count(quotechar) % 2: outside_quotes = not outside_quotes if outside_quotes: - rows_readed += 1 - if stop_condition(rows_readed): + rows_read += 1 + if rows_read >= nrows: break + # case when EOF if not outside_quotes: - rows_readed += 1 + rows_read += 1 - return outside_quotes, rows_readed + return outside_quotes, rows_read diff --git a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py index b3f98a0fb65..dbd5538aaaf 100644 --- a/modin/engines/dask/pandas_on_dask/frame/axis_partition.py +++ b/modin/engines/dask/pandas_on_dask/frame/axis_partition.py @@ -43,13 +43,15 @@ def deploy_axis_func( *partitions, pure=False, ) - if num_splits == 1: - return axis_result + + lengths = kwargs.get("_lengths", None) + result_num_splits = len(lengths) if lengths else num_splits + # We have to do this to split it back up. It is already split, but we need to # get futures for each. return [ client.submit(lambda l: l[i], axis_result, pure=False) - for i in range(num_splits) + for i in range(result_num_splits) ] @classmethod @@ -68,8 +70,6 @@ def deploy_func_between_two_axis_partitions( *partitions, pure=False, ) - if num_splits == 1: - return axis_result # We have to do this to split it back up. It is already split, but we need to # get futures for each. 
return [ diff --git a/modin/engines/dask/task_wrapper.py b/modin/engines/dask/task_wrapper.py index 04e5ed2a3b9..af717625afe 100644 --- a/modin/engines/dask/task_wrapper.py +++ b/modin/engines/dask/task_wrapper.py @@ -16,12 +16,12 @@ class DaskTask: @classmethod - def deploy(cls, func, num_return_vals, kwargs): + def deploy(cls, func, num_returns, kwargs): client = _get_global_client() remote_task_future = client.submit(func, **kwargs) return [ client.submit(lambda l, i: l[i], remote_task_future, i) - for i in range(num_return_vals) + for i in range(num_returns) ] @classmethod diff --git a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py index 2099ea9fe93..9ec6620eae9 100644 --- a/modin/engines/ray/pandas_on_ray/frame/axis_partition.py +++ b/modin/engines/ray/pandas_on_ray/frame/axis_partition.py @@ -33,6 +33,7 @@ def __init__(self, list_of_blocks): def deploy_axis_func( cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions ): + lengths = kwargs.get("_lengths", None) return deploy_ray_func._remote( args=( PandasFrameAxisPartition.deploy_axis_func, @@ -43,7 +44,7 @@ def deploy_axis_func( maintain_partitioning, ) + tuple(partitions), - num_returns=num_splits * 3, + num_returns=num_splits * 3 if lengths is None else len(lengths) * 3, ) @classmethod diff --git a/modin/experimental/backends/omnisci/query_compiler.py b/modin/experimental/backends/omnisci/query_compiler.py index f04d3cbfd6b..35ce16e9917 100644 --- a/modin/experimental/backends/omnisci/query_compiler.py +++ b/modin/experimental/backends/omnisci/query_compiler.py @@ -262,30 +262,20 @@ def groupby_count(self, by, axis, groupby_args, map_args, **kwargs): ) return self.__constructor__(new_frame) - def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False): - """Apply aggregation functions to a grouped dataframe per-column. - - Parameters - ---------- - by : DFAlgQueryCompiler - The column to group by - func_dict : dict of str, callable/string - The dictionary mapping of column to function - groupby_args : dict - The dictionary of keyword arguments for the group by. - agg_args : dict - The dictionary of keyword arguments for the aggregation functions - drop : bool - Whether or not to drop the column from the data. - - Returns - ------- - DFAlgQueryCompiler - The result of the per-column aggregations on the grouped dataframe. 
- """ - # TODO: handle drop arg + def groupby_agg( + self, + by, + is_multi_by, + axis, + agg_func, + agg_args, + agg_kwargs, + groupby_kwargs, + drop=False, + ): + # TODO: handle `is_multi_by`, `agg_args`, `drop` args new_frame = self._modin_frame.groupby_agg( - by, 0, func_dict, groupby_args, **agg_args + by, axis, agg_func, groupby_kwargs, **agg_kwargs ) return self.__constructor__(new_frame) @@ -629,6 +619,18 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self.columns, pandas.MultiIndex) + def get_index_name(self): + return self._modin_frame.get_index_name() + + def set_index_name(self, name): + self._modin_frame = self._modin_frame.set_index_name(name) + + def get_index_names(self): + return self._modin_frame.get_index_names() + + def set_index_names(self, names): + self._modin_frame = self._modin_frame.set_index_names(names) + def free(self): return diff --git a/modin/experimental/cloud/ray-autoscaler.yml b/modin/experimental/cloud/ray-autoscaler.yml index 94bd63f3d96..640e6277457 100644 --- a/modin/experimental/cloud/ray-autoscaler.yml +++ b/modin/experimental/cloud/ray-autoscaler.yml @@ -155,7 +155,7 @@ head_start_ray_commands: echo 'export MEMORY_STORE_SIZE=$(awk "/MemFree/ { printf \"%d \\n\", \$2*1024*0.8}" /proc/meminfo)' >> ~/.bashrc echo 'export TMPDIR="$(dirname $(mktemp tmp.XXXXXXXXXX -ut))"' >> ~/.bashrc - ulimit -n 65536; ray start --head --num-redis-shards=1 --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR + ulimit -n 65536; ray start --head --redis-shard-ports=6380 --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=$MEMORY_STORE_SIZE --plasma-directory=$TMPDIR # Command to start ray on worker nodes. You don't need to change this. 
worker_start_ray_commands: diff --git a/modin/experimental/cloud/rayscale.py b/modin/experimental/cloud/rayscale.py index c77676616b9..a7cb7799691 100644 --- a/modin/experimental/cloud/rayscale.py +++ b/modin/experimental/cloud/rayscale.py @@ -21,12 +21,23 @@ import subprocess import yaml -from ray.autoscaler.commands import ( - create_or_update_cluster, - teardown_cluster, - get_head_node_ip, - _bootstrap_config, -) + +try: + # for ray>=1.0.1 + from ray.autoscaler.sdk import ( + create_or_update_cluster, + teardown_cluster, + get_head_node_ip, + bootstrap_config, + ) +except ModuleNotFoundError: + # for ray==1.0.0 + from ray.autoscaler.commands import ( + create_or_update_cluster, + teardown_cluster, + get_head_node_ip, + _bootstrap_config as bootstrap_config, + ) from .base import ( CannotSpawnCluster, @@ -140,7 +151,7 @@ def __make_config(self): res = self._update_conda_requirements(config["setup_commands"][0]) config["setup_commands"][0] = res - return _bootstrap_config(config) + return bootstrap_config(config) def _conda_requirements(self): import shlex @@ -197,15 +208,9 @@ def __do_spawn(self): try: create_or_update_cluster( self.config_file, - override_min_workers=None, - override_max_workers=None, no_restart=False, restart_only=False, - yes=True, - override_cluster_name=None, no_config_cache=False, - redirect_command_output=False, - use_login_shells=True, ) # need to re-load the config, as create_or_update_cluster() modifies it with open(self.config_file) as inp: @@ -220,13 +225,7 @@ def __do_spawn(self): def __do_destroy(self): try: - teardown_cluster( - self.config_file, - yes=True, - workers_only=False, - override_cluster_name=None, - keep_min_workers=0, - ) + teardown_cluster(self.config_file) self.ready = False self.config = None except BaseException as ex: @@ -244,7 +243,7 @@ def _get_connection_details(self) -> ConnectionDetails: return ConnectionDetails( user_name=self.config["auth"]["ssh_user"], key_file=self.config["auth"]["ssh_private_key"], - address=get_head_node_ip(self.config_file, override_cluster_name=None), + address=get_head_node_ip(self.config_file), ) def _get_main_python(self) -> str: @@ -262,6 +261,7 @@ def wrap_cmd(self, cmd: list): [ "bash", "-ic", - subprocess.list2cmdline(["conda", "run", "-n", "modin"] + cmd), + # workaround for https://github.com/conda/conda/issues/8385 + subprocess.list2cmdline(["conda", "activate", "modin", "&&"] + cmd), ] ) diff --git a/modin/experimental/cloud/test/test_cloud.py b/modin/experimental/cloud/test/test_cloud.py index a7e4c5b3c83..1d17f3ed746 100644 --- a/modin/experimental/cloud/test/test_cloud.py +++ b/modin/experimental/cloud/test/test_cloud.py @@ -15,20 +15,20 @@ import pytest from collections import namedtuple from inspect import signature -from modin.experimental.cloud.rayscale import RayCluster -from modin.experimental.cloud.cluster import Provider -from ray.autoscaler.commands import ( +from modin.experimental.cloud.rayscale import ( + RayCluster, create_or_update_cluster, teardown_cluster, get_head_node_ip, - _bootstrap_config, + bootstrap_config, ) +from modin.experimental.cloud.cluster import Provider @pytest.fixture def make_bootstrap_config_mock(): def bootstrap_config_mock(config, *args, **kwargs): - signature(_bootstrap_config).bind(config, *args, **kwargs) + signature(bootstrap_config).bind(config, *args, **kwargs) config["auth"]["ssh_user"] = "modin" config["auth"]["ssh_private_key"] = "X" * 20 return config @@ -59,7 +59,7 @@ def make_create_or_update_cluster_mock(): def 
make_ray_cluster(make_bootstrap_config_mock): def ray_cluster(conda_packages=None): with mock.patch( - "modin.experimental.cloud.rayscale._bootstrap_config", + "modin.experimental.cloud.rayscale.bootstrap_config", make_bootstrap_config_mock, ): ray_cluster = RayCluster( @@ -71,7 +71,7 @@ def ray_cluster(conda_packages=None): return ray_cluster -def test__bootstrap_config(make_ray_cluster): +def test_bootstrap_config(make_ray_cluster): make_ray_cluster() diff --git a/modin/experimental/engines/omnisci_on_ray/frame/data.py b/modin/experimental/engines/omnisci_on_ray/frame/data.py index fc5fd6627fe..42d59f70e11 100644 --- a/modin/experimental/engines/omnisci_on_ray/frame/data.py +++ b/modin/experimental/engines/omnisci_on_ray/frame/data.py @@ -1235,6 +1235,69 @@ def has_multiindex(self): return isinstance(self._index_cache, MultiIndex) return self._index_cols is not None and len(self._index_cols) > 1 + def get_index_name(self): + if self._index_cols is None: + return None + if len(self._index_cols) > 1: + return None + return self._index_cols[0] + + def set_index_name(self, name): + if self.has_multiindex(): + ErrorMessage.single_warning("Scalar name for MultiIndex is not supported!") + return self + + if self._index_cols is None and name is None: + return self + + names = self._mangle_index_names([name]) + exprs = OrderedDict() + if self._index_cols is None: + exprs[names[0]] = self.ref("__rowid__") + else: + exprs[names[0]] = self.ref(self._index_cols[0]) + + for col in self.columns: + exprs[col] = self.ref(col) + + return self.__constructor__( + columns=self.columns, + dtypes=self._dtypes_for_exprs(exprs), + op=TransformNode(self, exprs), + index_cols=names, + uses_rowid=self._index_cols is None, + force_execution_mode=self._force_execution_mode, + ) + + def get_index_names(self): + if self.has_multiindex(): + return self._index_cols.copy() + return [self.get_index_name()] + + def set_index_names(self, names): + if not self.has_multiindex(): + raise ValueError("Can set names for MultiIndex only") + + if len(names) != len(self._index_cols): + raise ValueError( + f"Unexpected names count: expected {len(self._index_cols)} got {len(names)}" + ) + + names = self._mangle_index_names(names) + exprs = OrderedDict() + for old, new in zip(self._index_cols, names): + exprs[new] = self.ref(old) + for col in self.columns: + exprs[col] = self.ref(col) + + return self.__constructor__( + columns=self.columns, + dtypes=self._dtypes_for_exprs(exprs), + op=TransformNode(self, exprs), + index_cols=names, + force_execution_mode=self._force_execution_mode, + ) + def to_pandas(self): self._execute() diff --git a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py index 22a471f3c7c..9bd59b0ee8a 100644 --- a/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py +++ b/modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py @@ -392,6 +392,30 @@ def applier(lib): eval_general(pd, pandas, applier) + def test_set_index_name(self): + index = pandas.Index.__new__(pandas.Index, data=[i for i in range(24)]) + + pandas_df = pandas.DataFrame(self.data, index=index) + pandas_df.index.name = "new_name" + modin_df = pd.DataFrame(self.data, index=index) + modin_df._query_compiler.set_index_name("new_name") + + df_equals(pandas_df, modin_df) + + def test_set_index_names(self): + index = pandas.MultiIndex.from_tuples( + [(i, j, k) for i in range(2) for j in range(3) for k in range(4)] + ) + + pandas_df = pandas.DataFrame(self.data, 
index=index) + pandas_df.index.names = ["new_name1", "new_name2", "new_name3"] + modin_df = pd.DataFrame(self.data, index=index) + modin_df._query_compiler.set_index_names( + ["new_name1", "new_name2", "new_name3"] + ) + + df_equals(pandas_df, modin_df) + class TestFillna: data = {"a": [1, 1, None], "b": [None, None, 2], "c": [3, None, None]} @@ -547,6 +571,17 @@ def groupby_count(df, cols, as_index, **kwargs): run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index) + @pytest.mark.xfail( + reason="Currently mean() passes a lambda into backend which cannot be executed on omnisci backend" + ) + @pytest.mark.parametrize("cols", cols_value) + @pytest.mark.parametrize("as_index", bool_arg_values) + def test_groupby_mean(self, cols, as_index): + def groupby_mean(df, cols, as_index, **kwargs): + return df.groupby(cols, as_index=as_index).mean() + + run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) + @pytest.mark.parametrize("cols", cols_value) @pytest.mark.parametrize("as_index", bool_arg_values) def test_groupby_proj_sum(self, cols, as_index): @@ -569,6 +604,17 @@ def groupby(df, **kwargs): run_and_compare(groupby, data=self.data) + @pytest.mark.xfail( + reason="Function specified as a string should be passed into backend API, but currently it is transformed into a lambda" + ) + @pytest.mark.parametrize("cols", cols_value) + @pytest.mark.parametrize("as_index", bool_arg_values) + def test_groupby_agg_mean(self, cols, as_index): + def groupby_mean(df, cols, as_index, **kwargs): + return df.groupby(cols, as_index=as_index).agg("mean") + + run_and_compare(groupby_mean, data=self.data, cols=cols, as_index=as_index) + taxi_data = { "a": [1, 1, 2, 2], "b": [11, 21, 12, 11], diff --git a/modin/experimental/engines/pandas_on_ray/io_exp.py b/modin/experimental/engines/pandas_on_ray/io_exp.py index c093e93708c..38b8170445f 100644 --- a/modin/experimental/engines/pandas_on_ray/io_exp.py +++ b/modin/experimental/engines/pandas_on_ray/io_exp.py @@ -148,7 +148,7 @@ def read_sql( columns, chunksize, ), - num_return_vals=num_splits + 1, + num_returns=num_splits + 1, ) partition_ids.append( [PandasOnRayFramePartition(obj) for obj in partition_id[:-1]] diff --git a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py index be82e790e7b..b7cdb2eaa94 100644 --- a/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py +++ b/modin/experimental/engines/pyarrow_on_ray/frame/axis_partition.py @@ -46,7 +46,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): for obj in deploy_ray_func_between_two_axis_partitions._remote( args=(self.axis, func, num_splits, len(self.list_of_blocks), kwargs) + tuple(self.list_of_blocks + other_axis_partition.list_of_blocks), - num_return_vals=num_splits, + num_returns=num_splits, ) ] @@ -54,7 +54,7 @@ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): args.extend(self.list_of_blocks) return [ PyarrowOnRayFramePartition(obj) - for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits) + for obj in deploy_ray_axis_func._remote(args, num_returns=num_splits) ] def shuffle(self, func, num_splits=None, **kwargs): @@ -74,7 +74,7 @@ def shuffle(self, func, num_splits=None, **kwargs): args.extend(self.list_of_blocks) return [ PyarrowOnRayFramePartition(obj) - for obj in deploy_ray_axis_func._remote(args, num_return_vals=num_splits) + for obj in deploy_ray_axis_func._remote(args, 
num_returns=num_splits) ] diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 5ddc9c33c07..7979784d023 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -13,7 +13,7 @@ import pandas -__pandas_version__ = "1.1.3" +__pandas_version__ = "1.1.4" if pandas.__version__ != __pandas_version__: import warnings diff --git a/modin/pandas/accessor.py b/modin/pandas/accessor.py new file mode 100644 index 00000000000..b4895b7eabc --- /dev/null +++ b/modin/pandas/accessor.py @@ -0,0 +1,111 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pandas +from pandas.core.arrays.sparse.dtype import SparseDtype + +from modin.utils import _inherit_docstrings + + +class BaseSparseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + def _default_to_pandas(self, op, *args, **kwargs): + return self._parent._default_to_pandas( + lambda parent: op(parent.sparse, *args, **kwargs) + ) + + +@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseFrameAccessor) +class SparseFrameAccessor(BaseSparseAccessor): + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @property + def density(self): + return self._parent._default_to_pandas(pandas.DataFrame.sparse).density + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + return cls._default_to_pandas( + pandas.DataFrame.sparse.from_spmatrix, data, index=index, columns=columns + ) + + def to_dense(self): + return self._default_to_pandas(pandas.DataFrame.sparse.to_dense) + + def to_coo(self): + return self._default_to_pandas(pandas.DataFrame.sparse.to_coo) + + +@_inherit_docstrings(pandas.core.arrays.sparse.accessor.SparseAccessor) +class SparseAccessor(BaseSparseAccessor): + def _validate(self, data): + if not isinstance(data.dtype, SparseDtype): + raise AttributeError(self._validation_msg) + + @property + def density(self): + return self._parent._default_to_pandas(pandas.Series.sparse).density + + @property + def fill_value(self): + return self._parent._default_to_pandas(pandas.Series.sparse).fill_value + + @property + def npoints(self): + return self._parent._default_to_pandas(pandas.Series.sparse).npoints + + @property + def sp_values(self): + return self._parent._default_to_pandas(pandas.Series.sparse).sp_values + + @classmethod + def from_coo(cls, A, dense_index=False): + return cls._default_to_pandas( + pandas.Series.sparse.from_coo, A, dense_index=dense_index + ) + + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + return self._default_to_pandas( + 
pandas.Series.sparse.to_coo, + row_levels=row_levels, + column_levels=column_levels, + sort_labels=sort_labels, + ) + + def to_dense(self): + return self._default_to_pandas(pandas.Series.sparse.to_dense) + + +@_inherit_docstrings(pandas.core.accessor.CachedAccessor) +class CachedAccessor: + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + return self._accessor + accessor_obj = self._accessor(obj) + object.__setattr__(obj, self._name, accessor_obj) + return accessor_obj diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 07bf7f0501b..e300ea15f96 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -488,9 +488,6 @@ def add(self, other, axis="columns", level=None, fill_value=None): ) def aggregate(self, func=None, axis=0, *args, **kwargs): - warnings.warn( - "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189." - ) axis = self._get_axis_number(axis) result = None @@ -686,9 +683,6 @@ def apply( args=(), **kwds, ): - warnings.warn( - "Modin index may not match pandas index due to pandas issue pandas-dev/pandas#36189." - ) axis = self._get_axis_number(axis) ErrorMessage.non_verified_udf() if isinstance(func, str): @@ -2113,6 +2107,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): new_frame.columns = self.columns.copy() return new_frame else: + if not isinstance(self, DataFrame): + raise ValueError( + f"No axis named {axis} for object type {type(self)}" + ) res_columns = self.columns from .general import concat diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 2d7ee67161e..9ffbb82ef55 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -54,6 +54,7 @@ from .series import Series from .base import BasePandasDataset, _ATTRS_NO_LOOKUP from .groupby import DataFrameGroupBy +from .accessor import CachedAccessor, SparseFrameAccessor @_inherit_docstrings(pandas.DataFrame, excluded=[pandas.DataFrame.__init__]) @@ -1594,9 +1595,7 @@ def set_index( if not inplace: return frame - @property - def sparse(self): - return self._default_to_pandas(pandas.DataFrame.sparse) + sparse = CachedAccessor("sparse", SparseFrameAccessor) def squeeze(self, axis=None): axis = self._get_axis_number(axis) if axis is not None else None @@ -1973,7 +1972,11 @@ def __setitem__(self, key, value): self._query_compiler = value._query_compiler.copy() else: self._create_or_update_from_compiler( - self._query_compiler.concat(1, value._query_compiler), + self._query_compiler.concat( + 1, + value._query_compiler, + join="left", + ), inplace=True, ) # Now that the data is appended, we need to update the column name for diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 6feed8ff48b..3329a0412c1 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -22,6 +22,7 @@ Manually add documentation for methods which are not presented in pandas. 
""" +import numpy as np import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like @@ -29,7 +30,8 @@ import pandas.core.common as com from modin.error_message import ErrorMessage -from modin.utils import _inherit_docstrings, wrap_udf_function, try_cast_to_pandas +from modin.utils import _inherit_docstrings, try_cast_to_pandas +from modin.backends.base.query_compiler import BaseQueryCompiler from modin.config import IsExperimental from .series import Series @@ -171,10 +173,64 @@ def idxmax(self): def ndim(self): return 2 # ndim is always 2 for DataFrames - def shift(self, periods=1, freq=None, axis=0): - return self._default_to_pandas( - lambda df: df.shift(periods=periods, freq=freq, axis=axis) - ) + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def _shift(periods, freq, axis, fill_value, is_set_nan_rows=True): + from .dataframe import DataFrame + + result = self._df.shift(periods, freq, axis, fill_value) + + if ( + is_set_nan_rows + and isinstance(self._by, BaseQueryCompiler) + and ( + # Check using `issubset` is effective only in case of MultiIndex + set(self._by.columns).issubset(list(self._df.columns)) + if isinstance(self._by.columns, pandas.MultiIndex) + else len( + self._by.columns.unique() + .sort_values() + .difference(self._df.columns.unique().sort_values()) + ) + == 0 + ) + and DataFrame(query_compiler=self._by.isna()).any(axis=None) + ): + mask_nan_rows = self._df[self._by.columns].isna() + if (isinstance(mask_nan_rows, DataFrame)) and len( + mask_nan_rows.columns + ) == 1: + mask_nan_rows = mask_nan_rows.squeeze(axis=1) + idx_nan_rows = mask_nan_rows[ + mask_nan_rows.any(axis=1) + if (isinstance(mask_nan_rows, DataFrame)) + else mask_nan_rows + ].index + result.loc[idx_nan_rows] = np.nan + return result + + if freq is None and axis == 1 and self._axis == 0: + result = _shift(periods, freq, axis, fill_value) + elif ( + freq is not None + and axis == 0 + and self._axis == 0 + and isinstance(self._by, BaseQueryCompiler) + ): + result = _shift(periods, freq, axis, fill_value, is_set_nan_rows=False) + new_idx_lvl_arrays = np.concatenate( + [self._df[self._by.columns].values.T, [list(result.index)]] + ) + result.index = pandas.MultiIndex.from_arrays( + new_idx_lvl_arrays, + names=[col_name for col_name in self._by.columns] + [result.index.name], + ) + result = result.dropna(subset=self._by.columns).sort_index() + else: + result = self._apply_agg_function( + lambda df: df.shift(periods, freq, axis, fill_value) + ) + result.index.name = None + return result def nth(self, n, dropna=None): return self._default_to_pandas(lambda df: df.nth(n, dropna=dropna)) @@ -301,6 +357,8 @@ def aggregate(self, func=None, *args, **kwargs): # This is not implemented in pandas, # so we throw a different message raise NotImplementedError("axis other than 0 is not supported") + + relabeling_required = False if isinstance(func, dict) or func is None: def _reconstruct_func(func, **kwargs): @@ -324,50 +382,32 @@ def _reconstruct_func(func, **kwargs): from pandas.core.base import SpecificationError raise SpecificationError("nested renamer is not supported") - if isinstance(self._by, type(self._query_compiler)): - by = list(self._by.columns) - else: - by = self._by - - subset_cols = list(func_dict.keys()) + ( - list(self._by.columns) - if isinstance(self._by, type(self._query_compiler)) - and all(c in self._df.columns for c in self._by.columns) - else [] - ) - result = type(self._df)( - query_compiler=self._df[subset_cols]._query_compiler.groupby_dict_agg( - 
by=by, - func_dict=func_dict, - groupby_args=self._kwargs, - agg_args=kwargs, - drop=self._drop, - ) - ) - - if relabeling_required: - result = result.iloc[:, order] - result.columns = new_columns - - return result - - if is_list_like(func): + func = func_dict + elif is_list_like(func): return self._default_to_pandas( lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), *args, **kwargs, ) - if isinstance(func, str): - agg_func = getattr(self, func, None) + elif isinstance(func, str): + # Using "getattr" here masks possible AttributeError which we throw + # in __getattr__, so we should call __getattr__ directly instead. + agg_func = self.__getattr__(func) if callable(agg_func): return agg_func(*args, **kwargs) - return self._apply_agg_function( - lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs), + + result = self._apply_agg_function( + func, drop=self._as_index, *args, **kwargs, ) + if relabeling_required: + result = result.iloc[:, order] + result.columns = new_columns + return result + agg = aggregate def last(self, **kwargs): @@ -832,37 +872,33 @@ def _apply_agg_function(self, f, drop=True, *args, **kwargs): ------- A new combined DataFrame with the result of all groups. """ - assert callable(f), "'{0}' object is not callable".format(type(f)) - - f = wrap_udf_function(f) - if self._is_multi_by: - return self._default_to_pandas(f, *args, **kwargs) - - if isinstance(self._by, type(self._query_compiler)): - by = self._by.to_pandas().squeeze() - else: - by = self._by + assert callable(f) or isinstance( + f, dict + ), "'{0}' object is not callable and not a dict".format(type(f)) # For aggregations, pandas behavior does this for the result. # For other operations it does not, so we wait until there is an aggregation to # actually perform this operation. 
- if self._idx_name is not None and drop and self._drop: + if not self._is_multi_by and self._idx_name is not None and drop and self._drop: groupby_qc = self._query_compiler.drop(columns=[self._idx_name]) else: groupby_qc = self._query_compiler + new_manager = groupby_qc.groupby_agg( - by=by, + by=self._by, + is_multi_by=self._is_multi_by, axis=self._axis, agg_func=f, - groupby_args=self._kwargs, - agg_args=kwargs, + agg_args=args, + agg_kwargs=kwargs, + groupby_kwargs=self._kwargs, drop=self._drop, ) if self._idx_name is not None and self._as_index: - new_manager.index.name = self._idx_name + new_manager.set_index_name(self._idx_name) result = type(self._df)(query_compiler=new_manager) - if result.index.name == "__reduced__": - result.index.name = None + if result._query_compiler.get_index_name() == "__reduced__": + result._query_compiler.set_index_name(None) if self._kwargs.get("squeeze", False): return result.squeeze() return result diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 6a1e11e4929..c3833cfe64a 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -41,6 +41,7 @@ from .base import BasePandasDataset, _ATTRS_NO_LOOKUP from .iterator import PartitionIterator from .utils import from_pandas, is_scalar +from .accessor import CachedAccessor, SparseAccessor @_inherit_docstrings(pandas.Series, excluded=[pandas.Series.__init__]) @@ -1187,9 +1188,7 @@ def sort_values( result._query_compiler, inplace=inplace ) - @property - def sparse(self): - return self._default_to_pandas(pandas.Series.sparse) + sparse = CachedAccessor("sparse", SparseAccessor) def squeeze(self, axis=None): if axis is not None: diff --git a/modin/pandas/test/conftest.py b/modin/pandas/test/conftest.py index 2cc83a8e068..5fa1eff9cc8 100644 --- a/modin/pandas/test/conftest.py +++ b/modin/pandas/test/conftest.py @@ -71,7 +71,7 @@ def pytest_configure(config): set_base_backend(BASE_BACKEND_NAME) else: partition, engine = backend.split("On") - modin.set_base_backend(engine=engine, partition=backend) + modin.set_backends(engine=engine, partition=partition) def pytest_runtest_call(item): diff --git a/modin/pandas/test/data/issue_621.csv b/modin/pandas/test/data/issue_621.csv deleted file mode 100644 index c0d924616ef..00000000000 --- a/modin/pandas/test/data/issue_621.csv +++ /dev/null @@ -1,10 +0,0 @@ -ins_74901673,task_LTg0MTUwNTA5Mjg4MDkwNjIzMA==,j_217,10,Terminated,673795,673797,m_2637,1,1,13,16,0.02,0.02 -ins_815802872,M1,j_1527,1,Terminated,158478,158520,m_3430,1,1,3,19,0.13,0.18 -ins_564677701,M1,j_2014,1,Terminated,372602,372616,m_1910,1,1,87,116,0.04,0.05 -ins_257566161,M1,j_2014,1,Terminated,372602,372615,m_2485,1,1,91,123,0.05,0.05 -ins_688679908,M1,j_2014,1,Terminated,372602,372615,m_993,1,1,93,141,0.05,0.05 -ins_929638393,M1,j_2014,1,Terminated,372603,372615,m_2808,1,1,100,137,0.05,0.05 -ins_1349024140,M1,j_2014,1,Terminated,372603,372617,m_3736,1,1,82,111,0.05,0.05 -ins_330247444,M1,j_2014,1,Terminated,372603,372617,m_1176,1,1,84,110,0.05,0.05 -ins_833551291,M1,j_2014,1,Terminated,372602,372614,m_2682,1,1,90,159,0.05,0.05 -ins_833550789,M1,j_2014,1,Terminated,372603,372619,m_3625,1,1,78,105,0.05,0.05 diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index 5346c0e8932..a4449798c12 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -135,7 +135,7 @@ def test_math_alias(math_op, alias): assert getattr(pd.DataFrame, math_op) == getattr(pd.DataFrame, alias) 
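With `_apply_agg_function` now forwarding dict functions, positional args, and kwargs straight into the backend's `groupby_agg` (the old `groupby_dict_agg` path is removed), dict aggregations such as the ones added to the groupby tests further down run through the query compiler instead of defaulting to pandas. A short, user-level sketch of the call pattern this covers, with illustrative data and column names:

import modin.pandas as pd

df = pd.DataFrame(
    {"col1": [1, 1, 2, 2], "col2": [3, 4, 5, 6], "col4": [7, 8, 9, 10]}
)
# a dict of per-column aggregations is handed to the backend as-is
result = df.groupby("col1").agg({"col2": "max", "col4": "sum"})
print(result)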
-@pytest.mark.parametrize("other", ["as_left", 4, 4.0]) +@pytest.mark.parametrize("other", ["as_left", 4, 4.0, "a"]) @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_comparison(data, op, other): @@ -145,20 +145,6 @@ def test_comparison(data, op, other): ) -@pytest.mark.xfail_backends( - ["BaseOnPython"], - reason="Test is failing because of mismathing of thrown exceptions. See pandas issue #36377", -) -@pytest.mark.parametrize("other", ["a"]) -@pytest.mark.parametrize("op", ["ge", "gt", "le", "lt", "eq", "ne"]) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_comparison_except(data, op, other): - eval_general( - *create_test_dfs(data), - lambda df: getattr(df, op)(other), - ) - - @pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_multi_level_comparison(data, op): diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index b8b39c203da..552a9fa7480 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -1151,6 +1151,13 @@ def test___bool__(data): eval_general(*create_test_dfs(data), lambda df: df.__bool__()) -@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) -def test_hasattr_sparse(data): - eval_general(*create_test_dfs(data), lambda df: hasattr(df, "sparse")) +@pytest.mark.parametrize( + "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] +) +def test_hasattr_sparse(is_sparse_data): + modin_df, pandas_df = ( + create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values())) + if is_sparse_data + else create_test_dfs(test_data["float_nan_data"]) + ) + eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 1d2d723da71..a2b38aa7d88 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -1161,6 +1161,13 @@ def test___setitem__(data): df_equals(modin_df, pandas_df) + # from issue #2390 + modin_df = pd.DataFrame({"a": [1, 2, 3]}) + pandas_df = pandas.DataFrame({"a": [1, 2, 3]}) + modin_df["b"] = pd.Series([4, 5, 6, 7, 8]) + pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8]) + df_equals(modin_df, pandas_df) + def test___setitem__with_mismatched_partitions(): fname = "200kx99.csv" diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py index 4b39cf7cd22..651feab1e40 100644 --- a/modin/pandas/test/dataframe/test_udf.py +++ b/modin/pandas/test/dataframe/test_udf.py @@ -49,16 +49,10 @@ ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply(axis, func, op): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. 
- try: - eval_general( - *create_test_dfs(test_data["float_nan_data"]), - lambda df: getattr(df, op)(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_dfs(test_data["float_nan_data"]), + lambda df: getattr(df, op)(func, axis), + ) @pytest.mark.parametrize("axis", ["rows", "columns"]) @@ -69,16 +63,10 @@ def test_agg_apply(axis, func, op): ) @pytest.mark.parametrize("op", ["agg", "apply"]) def test_agg_apply_axis_names(axis, func, op): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_dfs(test_data["int_data"]), - lambda df: getattr(df, op)(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_dfs(test_data["int_data"]), + lambda df: getattr(df, op)(func, axis), + ) def test_aggregate_alias(): diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index 319ae2bf505..abb907f639b 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -48,6 +48,7 @@ def test_top_level_api_equality(): "DEFAULT_NPARTITIONS", "iterator", "series", + "accessor", "base", "utils", "dataframe", diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 7a98c43313a..cbccb85f344 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -22,6 +22,7 @@ check_df_columns_have_nans, create_test_dfs, eval_general, + test_data, test_data_values, modin_df_almost_equals_pandas, ) @@ -104,6 +105,7 @@ def test_mixed_dtypes_groupby(as_index): modin_df_almost_equals_pandas, is_default=True, ) + eval_shift(modin_groupby, pandas_groupby) eval_mean(modin_groupby, pandas_groupby) eval_any(modin_groupby, pandas_groupby) eval_min(modin_groupby, pandas_groupby) @@ -148,7 +150,12 @@ def test_mixed_dtypes_groupby(as_index): eval_var(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) - agg_functions = ["min", "max"] + agg_functions = [ + "min", + "max", + {"col2": "sum"}, + {"col2": "max", "col4": "sum", "col5": "min"}, + ] for func in agg_functions: eval_agg(modin_groupby, pandas_groupby, func) eval_aggregate(modin_groupby, pandas_groupby, func) @@ -298,6 +305,7 @@ def maybe_get_columns(df, by): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( modin_groupby, @@ -437,6 +445,7 @@ def test_single_group_row_groupby(): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -476,7 +485,12 @@ def test_single_group_row_groupby(): eval_prod(modin_groupby, pandas_groupby) eval_std(modin_groupby, pandas_groupby) - agg_functions = ["min", "max"] + agg_functions = [ + "min", + "max", + {"col2": "sum"}, + {"col2": "max", "col4": "sum", "col5": "min"}, + ] for func in agg_functions: eval_agg(modin_groupby, pandas_groupby, func) eval_aggregate(modin_groupby, pandas_groupby, func) @@ -552,6 +566,7 @@ def test_large_row_groupby(is_by_category): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) 
eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -591,7 +606,7 @@ def test_large_row_groupby(is_by_category): # eval_prod(modin_groupby, pandas_groupby) causes overflows eval_std(modin_groupby, pandas_groupby) - agg_functions = ["min", "max"] + agg_functions = ["min", "max", {"A": "sum"}, {"A": "max", "B": "sum", "C": "min"}] for func in agg_functions: eval_agg(modin_groupby, pandas_groupby, func) eval_aggregate(modin_groupby, pandas_groupby, func) @@ -666,6 +681,7 @@ def test_simple_col_groupby(): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_skew(modin_groupby, pandas_groupby) eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True) eval_general( @@ -796,6 +812,7 @@ def test_series_groupby(by, as_index_series_or_dataframe): modin_groupby_equals_pandas(modin_groupby, pandas_groupby) eval_ngroups(modin_groupby, pandas_groupby) + eval_shift(modin_groupby, pandas_groupby) eval_general( modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True ) @@ -1069,7 +1086,26 @@ def eval_groups(modin_groupby, pandas_groupby): def eval_shift(modin_groupby, pandas_groupby): - assert modin_groupby.groups == pandas_groupby.groups + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(periods=0), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(periods=-3), + ) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(axis=1, fill_value=777), + ) def test_groupby_on_index_values_with_loop(): @@ -1122,23 +1158,68 @@ def test_groupby_multiindex(): df_equals(modin_df.groupby(by=by).count(), pandas_df.groupby(by=by).count()) -def test_agg_func_None_rename(): +@pytest.mark.parametrize("groupby_axis", [0, 1]) +@pytest.mark.parametrize("shift_axis", [0, 1]) +def test_shift_freq(groupby_axis, shift_axis): pandas_df = pandas.DataFrame( { - "col1": np.random.randint(0, 100, size=1000), - "col2": np.random.randint(0, 100, size=1000), - "col3": np.random.randint(0, 100, size=1000), - "col4": np.random.randint(0, 100, size=1000), - }, - index=["row{}".format(i) for i in range(1000)], + "col1": [1, 0, 2, 3], + "col2": [4, 5, np.NaN, 7], + "col3": [np.NaN, np.NaN, 12, 10], + "col4": [17, 13, 16, 15], + } ) modin_df = from_pandas(pandas_df) - modin_result = modin_df.groupby(["col1", "col2"]).agg( - max=("col3", np.max), min=("col3", np.min) + new_index = pandas.date_range("1/12/2020", periods=4, freq="S") + if groupby_axis == 0 and shift_axis == 0: + pandas_df.index = modin_df.index = new_index + by = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]] + else: + pandas_df.index = modin_df.index = new_index + pandas_df.columns = modin_df.columns = new_index + by = [[0, 1, 0, 2]] + + for _by in by: + pandas_groupby = pandas_df.groupby(by=_by, axis=groupby_axis) + modin_groupby = modin_df.groupby(by=_by, axis=groupby_axis) + eval_general( + modin_groupby, + pandas_groupby, + lambda groupby: groupby.shift(axis=shift_axis, freq="S"), + ) + + +@pytest.mark.parametrize( + "by_and_agg_dict", + [ + { + "by": [ + list(test_data["int_data"].keys())[0], + list(test_data["int_data"].keys())[1], + ], + "agg_dict": { + "max": (list(test_data["int_data"].keys())[2], np.max), + "min": (list(test_data["int_data"].keys())[2], np.min), + }, + }, + { + "by": ["col1"], + 
"agg_dict": { + "max": (list(test_data["int_data"].keys())[0], np.max), + "min": (list(test_data["int_data"].keys())[-1], np.min), + }, + }, + ], +) +def test_agg_func_None_rename(by_and_agg_dict): + modin_df, pandas_df = create_test_dfs(test_data["int_data"]) + + modin_result = modin_df.groupby(by_and_agg_dict["by"]).agg( + **by_and_agg_dict["agg_dict"] ) - pandas_result = pandas_df.groupby(["col1", "col2"]).agg( - max=("col3", np.max), min=("col3", np.min) + pandas_result = pandas_df.groupby(by_and_agg_dict["by"]).agg( + **by_and_agg_dict["agg_dict"] ) df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 019db34294a..65bd6d1a62e 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -38,9 +38,11 @@ get_random_string, insert_lines_to_csv, IO_OPS_DATA_DIR, + io_ops_bad_exc, + eval_io_from_str, ) -from modin.config import Engine, Backend +from modin.config import Engine, Backend, IsExperimental if Backend.get() == "Pandas": import modin.pandas as pd @@ -176,13 +178,12 @@ def _csv_file_maker( add_nan_lines=False, thousands_separator=None, decimal_separator=None, - lineterminator=None, comment_col_char=None, quoting=csv.QUOTE_MINIMAL, quotechar='"', doublequote=True, escapechar=None, - line_terminator=os.linesep, + line_terminator=None, ): if os.path.exists(filename) and not force: pass @@ -248,7 +249,7 @@ def _csv_file_maker( "delimiter": delimiter, "doublequote": doublequote, "escapechar": escapechar, - "lineterminator": line_terminator, + "lineterminator": line_terminator if line_terminator else os.linesep, "quotechar": quotechar, "quoting": quoting, } @@ -495,6 +496,10 @@ def teardown_fwf_file(): pass +@pytest.mark.skipif( + IsExperimental.get() and Backend.get() == "Pyarrow", + reason="Segmentation fault; see PR #2347 ffor details", +) class TestReadCSV: # delimiter tests @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"]) @@ -524,6 +529,152 @@ def test_read_csv_delimiters( **kwargs, ) + # Column and Index Locations and Names tests + @pytest.mark.xfail( + Engine.get() != "Python", + reason="many parameters combiantions fails: issue #2312, #2307", + ) + @pytest.mark.parametrize("header", ["infer", None, 0]) + @pytest.mark.parametrize("index_col", [None, "col1"]) + @pytest.mark.parametrize("prefix", [None, "_", "col"]) + @pytest.mark.parametrize( + "names", [None, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6", "c7"]] + ) + @pytest.mark.parametrize( + "usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]] + ) + @pytest.mark.parametrize("skip_blank_lines", [True, False]) + def test_read_csv_col_handling( + self, + make_csv_file, + request, + header, + index_col, + prefix, + names, + usecols, + skip_blank_lines, + ): + if request.config.getoption("--simulate-cloud").lower() != "off": + pytest.xfail( + "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" + ) + + kwargs = { + "header": header, + "index_col": index_col, + "prefix": prefix, + "names": names, + "usecols": usecols, + "skip_blank_lines": skip_blank_lines, + } + + unique_name = get_unique_filename("test_read_csv_col_handling", kwargs) + make_csv_file( + filename=unique_name, + add_blank_lines=True, + ) + eval_io( + filepath_or_buffer=unique_name, + fn_name="read_csv", + **kwargs, + ) + + @pytest.mark.xfail(reason="infinite recursion error - issue #2032") + @pytest.mark.parametrize( + "test_case", ["single_element", "single_column", "multiple_columns"] + ) + def test_read_csv_squeeze(self, test_case): + 
unique_filename = get_unique_filename("test_read_csv_squeeze") + + str_single_element = "1" + str_single_col = "1\n2\n3\n" + str_four_cols = "1, 2, 3, 4\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n" + case_to_data = { + "single_element": str_single_element, + "single_column": str_single_col, + "multiple_columns": str_four_cols, + } + + eval_io_from_str(case_to_data[test_case], unique_filename, squeeze=True) + eval_io_from_str( + case_to_data[test_case], unique_filename, header=None, squeeze=True + ) + + def test_read_csv_mangle_dupe_cols(self): + unique_filename = get_unique_filename("test_read_csv_mangle_dupe_cols") + str_non_unique_cols = "col,col,col,col\n" "5, 6, 7, 8\n" "9, 10, 11, 12\n" + eval_io_from_str(str_non_unique_cols, unique_filename, mangle_dupe_cols=True) + + # Datetime Handling tests + @pytest.mark.parametrize( + "parse_dates", + [ + True, + False, + ["col2"], + ["col2", "col4"], + [1, 3], + pytest.param( + {"foo": ["col2", "col4"]}, + marks=pytest.mark.xfail( + Engine.get() != "Python", + reason="Exception: Internal Error - issue #2073", + ), + ), + ], + ) + @pytest.mark.parametrize("infer_datetime_format", [True, False]) + @pytest.mark.parametrize("keep_date_col", [True, False]) + @pytest.mark.parametrize( + "date_parser", [None, lambda x: pd.datetime.strptime(x, "%Y-%m-%d")] + ) + @pytest.mark.parametrize("dayfirst", [True, False]) + @pytest.mark.parametrize("cache_dates", [True, False]) + def test_read_csv_datetime( + self, + make_csv_file, + request, + parse_dates, + infer_datetime_format, + keep_date_col, + date_parser, + dayfirst, + cache_dates, + ): + if request.config.getoption("--simulate-cloud").lower() != "off": + pytest.xfail( + "The reason of tests fail in `cloud` mode is unknown for now - issue #2340" + ) + + raising_exceptions = io_ops_bad_exc # default value + if isinstance(parse_dates, dict) and callable(date_parser): + # In this case raised TypeError: () takes 1 positional argument but 2 were given + raising_exceptions = list(io_ops_bad_exc) + raising_exceptions.remove(TypeError) + + kwargs = { + "parse_dates": parse_dates, + "infer_datetime_format": infer_datetime_format, + "keep_date_col": keep_date_col, + "date_parser": date_parser, + "dayfirst": dayfirst, + "cache_dates": cache_dates, + } + + unique_name = get_unique_filename("test_read_csv_datetime", kwargs) + make_csv_file( + filename=unique_name, + ) + + eval_io( + filepath_or_buffer=unique_name, + fn_name="read_csv", + check_kwargs_callable=not callable(date_parser), + raising_exceptions=raising_exceptions, + **kwargs, + ) + def test_from_parquet(make_parquet_file): make_parquet_file(SMALL_ROW_SIZE) @@ -1098,21 +1249,6 @@ def test_parse_dates_read_csv(): df_equals(modin_df, pandas_df) -@pytest.mark.parametrize( - "kwargs", - [ - {"header": None, "usecols": [0, 7]}, - {"usecols": [0, 7]}, - {"names": [0, 7], "usecols": [0, 7]}, - ], -) -def test_from_csv_with_args(kwargs): - file_name = "modin/pandas/test/data/issue_621.csv" - pandas_df = pandas.read_csv(file_name, **kwargs) - modin_df = pd.read_csv(file_name, **kwargs) - df_equals(modin_df, pandas_df) - - def test_from_table(make_csv_file): make_csv_file(delimiter="\t") @@ -1127,14 +1263,6 @@ def test_from_table(make_csv_file): df_equals(modin_df, pandas_df) -@pytest.mark.parametrize("usecols", [["a"], ["a", "b", "e"], [0, 1, 4]]) -def test_from_csv_with_usecols(usecols): - fname = "modin/pandas/test/data/test_usecols.csv" - pandas_df = pandas.read_csv(fname, usecols=usecols) - modin_df = pd.read_csv(fname, usecols=usecols) - df_equals(modin_df, pandas_df) 
- - @pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation") def test_from_csv_s3(make_csv_file): dataset_url = "s3://noaa-ghcn-pds/csv/1788.csv" diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index ec4aba6879a..1f40d7a590e 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -593,16 +593,10 @@ def test_add_suffix(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_agg(data, func): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -624,16 +618,10 @@ def test_agg_numeric(request, data, func): request.node.name, numeric_dfs ): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -656,16 +644,10 @@ def test_agg_numeric_except(request, data, func): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_aggregate(data, func): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.aggregate(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.aggregate(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -688,16 +670,10 @@ def test_aggregate_numeric(request, data, func): request.node.name, numeric_dfs ): axis = 0 - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.agg(func, axis), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.agg(func, axis), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -823,16 +799,10 @@ def test_append(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply(data, func): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. 
- try: - eval_general( - *create_test_series(data), - lambda df: df.apply(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.apply(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -871,16 +841,10 @@ def test_apply_external_lib(): @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_apply_numeric(request, data, func): if name_contains(request.node.name, numeric_dfs): - # AssertionError may be arisen in case of - # mismathing of index/columns in Modin and pandas. - # See details in pandas issue 36189. - try: - eval_general( - *create_test_series(data), - lambda df: df.apply(func), - ) - except AssertionError: - pass + eval_general( + *create_test_series(data), + lambda df: df.apply(func), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2903,6 +2867,7 @@ def test_shift(data): df_equals(modin_series.shift(fill_value=777), pandas_series.shift(fill_value=777)) df_equals(modin_series.shift(periods=7), pandas_series.shift(periods=7)) df_equals(modin_series.shift(periods=-3), pandas_series.shift(periods=-3)) + eval_general(modin_series, pandas_series, lambda df: df.shift(axis=1)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -3443,6 +3408,26 @@ def sort_index_for_equal_values(result, ascending): ) df_equals(modin_result, pandas_result) + # from issue #2365 + arr = np.random.rand(2 ** 6) + arr[::10] = np.nan + modin_series, pandas_series = create_test_series(arr) + modin_result = modin_series.value_counts(dropna=False, ascending=True) + pandas_result = sort_index_for_equal_values( + pandas_series.value_counts(dropna=False, ascending=True), True + ) + if get_current_backend() == "BaseOnPython": + modin_result = sort_index_for_equal_values(modin_result, ascending=True) + df_equals(modin_result, pandas_result) + + modin_result = modin_series.value_counts(dropna=False, ascending=False) + pandas_result = sort_index_for_equal_values( + pandas_series.value_counts(dropna=False, ascending=False), False + ) + if get_current_backend() == "BaseOnPython": + modin_result = sort_index_for_equal_values(modin_result, ascending=False) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_values(data): @@ -4397,17 +4382,18 @@ def test_encode(data, encoding_type): df_equals(modin_result, pandas_result) -@pytest.mark.parametrize("data", test_string_data_values, ids=test_string_data_keys) -def test_hasattr_sparse(data): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = hasattr(pandas_series, "sparse") - except Exception as e: - with pytest.raises(type(e)): - hasattr(modin_series, "sparse") - else: - modin_result = hasattr(modin_series, "sparse") - assert modin_result == pandas_result +@pytest.mark.parametrize( + "is_sparse_data", [True, False], ids=["is_sparse", "is_not_sparse"] +) +def test_hasattr_sparse(is_sparse_data): + modin_df, pandas_df = ( + create_test_series( + pandas.arrays.SparseArray(test_data["float_nan_data"].values()) + ) + if is_sparse_data + else create_test_series(test_data["float_nan_data"]) + ) + eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse")) @pytest.mark.parametrize( diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index baa2daa7ffe..bb4a0099e68 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -642,6 +642,7 @@ def eval_general( __inplace__=False, 
check_exception_type=True, raising_exceptions=None, + check_kwargs_callable=True, **kwargs, ): if raising_exceptions: @@ -670,7 +671,7 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}): return (md_result, pd_result) if not __inplace__ else (modin_df, pandas_df) for key, value in kwargs.items(): - if callable(value): + if check_kwargs_callable and callable(value): values = execute_callable(value) # that means, that callable raised an exception if values is None: @@ -696,6 +697,7 @@ def eval_io( cast_to_str=False, check_exception_type=True, raising_exceptions=io_ops_bad_exc, + check_kwargs_callable=True, *args, **kwargs, ): @@ -732,11 +734,37 @@ def applyier(module, *args, **kwargs): applyier, check_exception_type=check_exception_type, raising_exceptions=raising_exceptions, + check_kwargs_callable=check_kwargs_callable, *args, **kwargs, ) +def eval_io_from_str(csv_str: str, unique_filename: str, **kwargs): + """Evaluate I/O operation outputs equality check by using `csv_str` + data passed as python str (csv test file will be created from `csv_str`). + + Parameters + ---------- + csv_str: str + Test data for storing to csv file. + unique_filename: str + csv file name. + """ + try: + with open(unique_filename, "w") as f: + f.write(csv_str) + + eval_io( + filepath_or_buffer=unique_filename, + fn_name="read_csv", + **kwargs, + ) + + finally: + os.remove(unique_filename) + + def create_test_dfs(*args, **kwargs): return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs) diff --git a/modin/test/backends/pandas/test_internals.py b/modin/test/backends/pandas/test_internals.py new file mode 100644 index 00000000000..266c6d7ff8e --- /dev/null +++ b/modin/test/backends/pandas/test_internals.py @@ -0,0 +1,40 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import modin.pandas as pd + + +def test_aligning_blocks(): + # Test problem when modin frames have the same number of rows, but different + # blocks (partition.list_of_blocks). See #2322 for details + accm = pd.DataFrame(["-22\n"] * 162) + accm = accm.iloc[2:, :] + accm.reset_index(drop=True, inplace=True) + accm["T"] = pd.Series(["24.67\n"] * 145) + + # see #2322 for details + repr(accm) + + +def test_aligning_blocks_with_duplicated_index(): + # Same problem as in `test_aligning_blocks` but with duplicated values in index. 
+ data11 = [0, 1] + data12 = [2, 3] + + data21 = [0] + data22 = [1, 2, 3] + + df1 = pd.DataFrame(data11).append(pd.DataFrame(data12)) + df2 = pd.DataFrame(data21).append(pd.DataFrame(data22)) + + repr(df1 - df2) diff --git a/requirements.txt b/requirements.txt index 4b7640f6b3c..c31cab38814 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -pandas==1.1.3 +pandas==1.1.4 numpy -pyarrow<0.17 +pyarrow>=1.0.0 dask[complete]>=2.12.0,<=2.19.0 distributed>=2.12.0,<=2.19.0 ray>=1.0.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 26e17c64008..70c170d8408 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -3,7 +3,8 @@ channels: - intel/label/modin - conda-forge dependencies: - - pandas==1.1.3 + - pandas==1.1.4 + - pyarrow>=1.0.0 - numpy - pip - pytest>=6.0.1 diff --git a/setup.py b/setup.py index ec536b28dc6..b5bbd4a5cc8 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def is_pure(self): dask_deps = ["dask>=2.12.0,<=2.19.0", "distributed>=2.12.0,<=2.19.0"] -ray_deps = ["ray>=1.0.0", "pyarrow<0.17"] +ray_deps = ["ray>=1.0.0", "pyarrow==1.0"] remote_deps = ["rpyc==4.1.5", "cloudpickle==1.4.1", "boto3==1.4.8"] all_deps = dask_deps + ray_deps + remote_deps @@ -55,7 +55,7 @@ def is_pure(self): url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.1.3", "packaging"], + install_requires=["pandas==1.1.4", "packaging"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps,