[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark #32036

Status: Closed (wants to merge 1 commit)
1 change: 1 addition & 0 deletions dev/lint-python
@@ -17,6 +17,7 @@
#
# define test binaries + versions
FLAKE8_BUILD="flake8"
# TODO(SPARK-34943): minimum version should be 3.8+
MINIMUM_FLAKE8="3.5.0"
MYPY_BUILD="mypy"
PYCODESTYLE_BUILD="pycodestyle"
4 changes: 4 additions & 0 deletions python/mypy.ini
@@ -126,3 +126,7 @@ ignore_missing_imports = True

[mypy-psutil.*]
ignore_missing_imports = True

# TODO(SPARK-34941): Enable mypy for pandas-on-Spark
[mypy-pyspark.pandas.*]
ignore_errors = True
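
As an aside, a minimal illustration of what the ignore_errors = True stanza does (the module and function below are hypothetical, not part of this PR): any typing violation inside a module matching pyspark.pandas.* is silently skipped by mypy until SPARK-34941 removes this section.

# python/pyspark/pandas/_example.py -- hypothetical module, for illustration only
def add_one(x: int) -> int:
    # mypy would normally report: Unsupported operand types for + ("int" and "str"),
    # but the ignore_errors = True section above suppresses it for pyspark.pandas.*.
    return x + "1"  # intentionally ill-typed to show what gets ignored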
209 changes: 209 additions & 0 deletions python/pyspark/pandas/__init__.py
@@ -0,0 +1,209 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
from distutils.version import LooseVersion

from pyspark.pandas.version import __version__ # noqa: F401


def assert_python_version():
    import warnings

    major = 3
    minor = 5
    deprecated_version = (major, minor)
    min_supported_version = (major, minor + 1)

    if sys.version_info[:2] <= deprecated_version:
        warnings.warn(
            "Koalas support for Python {dep_ver} is deprecated and will be dropped in "
            "the future release. At that point, existing Python {dep_ver} workflows "
            "that use Koalas will continue to work without modification, but Python {dep_ver} "
            "users will no longer get access to the latest Koalas features and bugfixes. "
            "We recommend that you upgrade to Python {min_ver} or newer.".format(
                dep_ver=".".join(map(str, deprecated_version)),
                min_ver=".".join(map(str, min_supported_version)),
            ),
            FutureWarning,
        )


def assert_pyspark_version():
    import logging

    try:
        import pyspark
    except ImportError:
        raise ImportError(
            "Unable to import pyspark - consider doing a pip install with [spark] "
            "extra to install pyspark with pip"
        )
    else:
        pyspark_ver = getattr(pyspark, "__version__")
        if pyspark_ver is None or LooseVersion(pyspark_ver) < LooseVersion("2.4"):
            logging.warning(
                'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
                    pyspark_ver if pyspark_ver is not None else "<unknown version>"
                )
            )
Comment on lines +58 to +63

Member:
This looks like it checks the PySpark version from within Koalas. Do we still need to check the PySpark version now that Koalas is included in PySpark?

Contributor (Author):
Correct! We'll address all the code related to the existing third-party dependencies in SPARK-34887. Thanks for the comment!
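
For context on the exchange above, a minimal sketch (an assumption about the direction of the SPARK-34887 cleanup, not its actual content) of how assert_pyspark_version could be simplified once pyspark.pandas ships inside PySpark itself: importing the package already implies the host PySpark is present, so the probing and version comparison above become unnecessary.

import pyspark  # always importable here, since pyspark.pandas lives inside PySpark

def assert_pyspark_version():
    # Hypothetical no-op replacement: the package is, by construction, the same
    # version as the PySpark distribution it shipped with, so no runtime check is needed.
    assert hasattr(pyspark, "__version__")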



assert_python_version()
assert_pyspark_version()

import pyspark
import pyarrow

if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
    if (
        LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")
        and "ARROW_PRE_0_15_IPC_FORMAT" not in os.environ
    ):
        import logging

        logging.warning(
            "'ARROW_PRE_0_15_IPC_FORMAT' environment variable was not set. It is required to "
            "set this environment variable to '1' in both driver and executor sides if you use "
            "pyarrow>=0.15 and pyspark<3.0. "
            "Koalas will set it for you but it does not work if there is a Spark context already "
            "launched."
        )
        # This is required to support PyArrow 0.15 in PySpark versions lower than 3.0.
        # See SPARK-29367.
        os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
elif "ARROW_PRE_0_15_IPC_FORMAT" in os.environ:
    raise RuntimeError(
        "Please explicitly unset 'ARROW_PRE_0_15_IPC_FORMAT' environment variable in both "
        "driver and executor sides. It is required to set this environment variable only "
        "when you use pyarrow>=0.15 and pyspark<3.0."
    )

if (
    LooseVersion(pyarrow.__version__) >= LooseVersion("2.0.0")
    and "PYARROW_IGNORE_TIMEZONE" not in os.environ
):
    import logging

    logging.warning(
        "'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to "
        "set this environment variable to '1' in both driver and executor sides if you use "
        "pyarrow>=2.0.0. "
        "Koalas will set it for you but it does not work if there is a Spark context already "
        "launched."
    )
    os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from pyspark.pandas.frame import DataFrame
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.indexes.category import CategoricalIndex
from pyspark.pandas.indexes.datetimes import DatetimeIndex
from pyspark.pandas.indexes.multi import MultiIndex
from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
from pyspark.pandas.series import Series
from pyspark.pandas.groupby import NamedAgg

__all__ = [  # noqa: F405
    "read_csv",
    "read_parquet",
    "to_datetime",
    "date_range",
    "from_pandas",
    "get_dummies",
    "DataFrame",
    "Series",
    "Index",
    "MultiIndex",
    "Int64Index",
    "Float64Index",
    "CategoricalIndex",
    "DatetimeIndex",
    "sql",
    "range",
    "concat",
    "melt",
    "get_option",
    "set_option",
    "reset_option",
    "read_sql_table",
    "read_sql_query",
    "read_sql",
    "options",
    "option_context",
    "NamedAgg",
]


def _auto_patch_spark():
    import os
    import logging

    # Attach a usage logger.
    logger_module = os.getenv("KOALAS_USAGE_LOGGER", "")
    if logger_module != "":
        try:
            from pyspark.pandas import usage_logging

            usage_logging.attach(logger_module)
        except Exception as e:
            logger = logging.getLogger("pyspark.pandas.usage_logger")
            logger.warning(
                "Tried to attach usage logger `{}`, but an exception was raised: {}".format(
                    logger_module, str(e)
                )
            )

    # Autopatching is on by default.
    x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true")
    if x.lower() in ("true", "1", "enabled"):
        logger = logging.getLogger("spark")
        logger.info(
            "Patching spark automatically. You can disable it by setting "
            "SPARK_KOALAS_AUTOPATCH=false in your environment"
        )

        from pyspark.sql import dataframe as df

        df.DataFrame.to_koalas = DataFrame.to_koalas


def _auto_patch_pandas():
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)

        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)


_auto_patch_spark()
_auto_patch_pandas()

# Import after the usage logger is attached.
from pyspark.pandas.config import get_option, options, option_context, reset_option, set_option
from pyspark.pandas.namespace import * # F405
from pyspark.pandas.sql import sql
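
Finally, a short usage sketch of the ported package as exposed by this __init__.py, assuming a PySpark build that includes this PR (the column names and values below are made up for illustration):

import os

# As the module-level warning above explains, with pyarrow>=2.0.0 this variable must be
# set on both the driver and the executors before any Spark context is launched
# (for executors, e.g. via the spark.executorEnv.PYARROW_IGNORE_TIMEZONE configuration).
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

import pandas as pd
import pyspark.pandas as ps

# Create a pandas-on-Spark DataFrame directly, mirroring the pandas constructor.
psdf = ps.DataFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]})
print(psdf.head())

# from_pandas converts an existing pandas DataFrame into a distributed one.
psdf2 = ps.from_pandas(pd.DataFrame({"id": [4, 5]}))

# _auto_patch_spark() above also attaches to_koalas() to pyspark.sql.DataFrame
# (unless SPARK_KOALAS_AUTOPATCH is disabled), so an existing Spark DataFrame
# can be converted as well:
#     spark_df = psdf.to_spark()
#     psdf_again = spark_df.to_koalas()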