From d0dbfd6eafab504c71aff902b2238764fb0389c1 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Tue, 27 Apr 2021 16:01:02 -0700 Subject: [PATCH 1/6] Prototype --- dev/requirements.txt | 10 ++++++++++ python/setup.py | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index f0bdc797b7033..807bd00da33f9 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -6,3 +6,13 @@ pydata_sphinx_theme ipython nbsphinx numpydoc + +# dependencies in pandas-on-spark. +pandas>=0.23.2 +pyarrow>=0.10 +numpy>=1.14,<1.20.0 + +# Optional dependencies in pandas-on-spark. +mlflow>=1.0 +plotly>=4.8 +matplotlib>=3.0.0,<3.3.0 diff --git a/python/setup.py b/python/setup.py index b32569b483731..9886faa181397 100755 --- a/python/setup.py +++ b/python/setup.py @@ -220,6 +220,13 @@ def run(self): 'pyspark.bin', 'pyspark.sbin', 'pyspark.jars', + 'pyspark.pandas', + 'pyspark.pandas.indexes', + 'pyspark.pandas.missing', + 'pyspark.pandas.plot', + 'pyspark.pandas.spark', + 'pyspark.pandas.typedef', + 'pyspark.pandas.usage_logging', 'pyspark.python.pyspark', 'pyspark.python.lib', 'pyspark.data', @@ -250,14 +257,22 @@ def run(self): license='http://www.apache.org/licenses/LICENSE-2.0', # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. - install_requires=['py4j==0.10.9.2'], + install_requires=[ + 'py4j==0.10.9.2' + 'pandas>=0.23.2', + 'pyarrow>=0.10', + 'numpy>=1.14,<1.20.0', + ], extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], 'sql': [ 'pandas>=%s' % _minimum_pandas_version, 'pyarrow>=%s' % _minimum_pyarrow_version, - ] + ], + 'pandas.mlflow': ['mlflow>=1.0'], + 'pandas.plotly': ['plotly>=4.8'], + 'pandas.matplotlib': ['matplotlib>=3.0.0,<3.3.0'], }, python_requires='>=3.6', classifiers=[ From 3e4754d1e16ffda188d29b70ab7ca1be6f29266d Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Tue, 27 Apr 2021 16:22:57 -0700 Subject: [PATCH 2/6] Fix --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 9886faa181397..163d6604937af 100755 --- a/python/setup.py +++ b/python/setup.py @@ -258,7 +258,7 @@ def run(self): # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. install_requires=[ - 'py4j==0.10.9.2' + 'py4j==0.10.9.2', 'pandas>=0.23.2', 'pyarrow>=0.10', 'numpy>=1.14,<1.20.0', From 28d1034f5534133e874199c498efb10ab53e58c7 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 29 Apr 2021 10:00:35 -0700 Subject: [PATCH 3/6] Adjust setup.py --- dev/requirements.txt | 2 +- python/setup.py | 18 +++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index 807bd00da33f9..ba6c7346ef105 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -7,7 +7,7 @@ ipython nbsphinx numpydoc -# dependencies in pandas-on-spark. +# Dependencies in pandas-on-spark. pandas>=0.23.2 pyarrow>=0.10 numpy>=1.14,<1.20.0 diff --git a/python/setup.py b/python/setup.py index 163d6604937af..fa8d832643b40 100755 --- a/python/setup.py +++ b/python/setup.py @@ -112,7 +112,6 @@ def _supports_symlinks(): # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst. _minimum_pandas_version = "0.23.2" -_minimum_pyarrow_version = "1.0.0" class InstallCommand(install): @@ -257,22 +256,19 @@ def run(self): license='http://www.apache.org/licenses/LICENSE-2.0', # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. - install_requires=[ - 'py4j==0.10.9.2', - 'pandas>=0.23.2', - 'pyarrow>=0.10', - 'numpy>=1.14,<1.20.0', - ], + install_requires=['py4j==0.10.9.2'], extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], 'sql': [ 'pandas>=%s' % _minimum_pandas_version, - 'pyarrow>=%s' % _minimum_pyarrow_version, + 'pyarrow>=1.0.0', + ], + 'pandas_on_spark': [ + 'pandas>=%s' % _minimum_pandas_version, + 'pyarrow>=0.10', + 'numpy>=1.14,<1.20.0', ], - 'pandas.mlflow': ['mlflow>=1.0'], - 'pandas.plotly': ['plotly>=4.8'], - 'pandas.matplotlib': ['matplotlib>=3.0.0,<3.3.0'], }, python_requires='>=3.6', classifiers=[ From 939ca2f44fe51392765a0c2092f1cb2d57d593c4 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 29 Apr 2021 10:12:42 -0700 Subject: [PATCH 4/6] + description to install.rst --- python/docs/source/getting_started/install.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index a14f2b8057981..c995604949906 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -159,6 +159,9 @@ Package Minimum supported version Note `NumPy` 1.7 Required for ML `pyarrow` 1.0.0 Optional for SQL `Py4J` 0.10.9.2 Required +`pandas` 0.23.2 Required for Pandas-on-spark +`pyarrow` 0.10 Required for Pandas-on-spark +`Numpy` 1.14(<1.20.0) Required for Pandas-on-spark ============= ========================= ================ Note that PySpark requires Java 8 or later with ``JAVA_HOME`` properly set. From cca7a4900923a3140e8ab121c29a41e16a82d4cc Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Fri, 30 Apr 2021 10:05:24 -0700 Subject: [PATCH 5/6] pyarrow lower bound; doc --- dev/requirements.txt | 10 ---------- python/docs/source/getting_started/install.rst | 6 +++--- python/setup.py | 5 +++-- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index ba6c7346ef105..f0bdc797b7033 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -6,13 +6,3 @@ pydata_sphinx_theme ipython nbsphinx numpydoc - -# Dependencies in pandas-on-spark. -pandas>=0.23.2 -pyarrow>=0.10 -numpy>=1.14,<1.20.0 - -# Optional dependencies in pandas-on-spark. -mlflow>=1.0 -plotly>=4.8 -matplotlib>=3.0.0,<3.3.0 diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index c995604949906..98856d1f3a3fc 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -159,9 +159,9 @@ Package Minimum supported version Note `NumPy` 1.7 Required for ML `pyarrow` 1.0.0 Optional for SQL `Py4J` 0.10.9.2 Required -`pandas` 0.23.2 Required for Pandas-on-spark -`pyarrow` 0.10 Required for Pandas-on-spark -`Numpy` 1.14(<1.20.0) Required for Pandas-on-spark +`pandas` 0.23.2 Required for pandas-on-Spark +`pyarrow` 1.0.0 Required for pandas-on-Spark +`Numpy` 1.14(<1.20.0) Required for pandas-on-Spark ============= ========================= ================ Note that PySpark requires Java 8 or later with ``JAVA_HOME`` properly set. diff --git a/python/setup.py b/python/setup.py index fa8d832643b40..5c4a1aeea083e 100755 --- a/python/setup.py +++ b/python/setup.py @@ -112,6 +112,7 @@ def _supports_symlinks(): # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications. # Also don't forget to update python/docs/source/getting_started/install.rst. _minimum_pandas_version = "0.23.2" +_minimum_pyarrow_version = "1.0.0" class InstallCommand(install): @@ -262,11 +263,11 @@ def run(self): 'mllib': ['numpy>=1.7'], 'sql': [ 'pandas>=%s' % _minimum_pandas_version, - 'pyarrow>=1.0.0', + 'pyarrow>=%s' % _minimum_pyarrow_version, ], 'pandas_on_spark': [ 'pandas>=%s' % _minimum_pandas_version, - 'pyarrow>=0.10', + 'pyarrow>=%s' % _minimum_pyarrow_version, 'numpy>=1.14,<1.20.0', ], }, From f8a08202ea7b5478a31e61468c6f83c24cb3e77c Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Mon, 3 May 2021 09:51:54 -0700 Subject: [PATCH 6/6] Format --- python/docs/source/getting_started/install.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 98856d1f3a3fc..7a1a3a98ee9f9 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -152,9 +152,9 @@ To install PySpark from source, refer to |building_spark|_. Dependencies ------------ -============= ========================= ================ +============= ========================= ============================ Package Minimum supported version Note -============= ========================= ================ +============= ========================= ============================ `pandas` 0.23.2 Optional for SQL `NumPy` 1.7 Required for ML `pyarrow` 1.0.0 Optional for SQL @@ -162,7 +162,7 @@ Package Minimum supported version Note `pandas` 0.23.2 Required for pandas-on-Spark `pyarrow` 1.0.0 Required for pandas-on-Spark `Numpy` 1.14(<1.20.0) Required for pandas-on-Spark -============= ========================= ================ +============= ========================= ============================ Note that PySpark requires Java 8 or later with ``JAVA_HOME`` properly set. If using JDK 11, set ``-Dio.netty.tryReflectionSetAccessible=true`` for Arrow related features and refer