[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark #32036

Status: Closed (wants to merge 1 commit)
1 change: 1 addition & 0 deletions dev/lint-python
@@ -17,6 +17,7 @@
#
# define test binaries + versions
FLAKE8_BUILD="flake8"
# TODO(SPARK-34943): minimum version should be 3.8+
MINIMUM_FLAKE8="3.5.0"
MYPY_BUILD="mypy"
PYCODESTYLE_BUILD="pycodestyle"
4 changes: 4 additions & 0 deletions python/mypy.ini
@@ -126,3 +126,7 @@ ignore_missing_imports = True

[mypy-psutil.*]
ignore_missing_imports = True

# TODO(SPARK-34941): Enable mypy for pandas-on-Spark
[mypy-pyspark.pandas.*]
ignore_errors = True
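
As an aside, a minimal illustration of what the ignore_errors = True stanza does (the module and function below are hypothetical, not part of this PR): any typing violation inside a module matching pyspark.pandas.* is silently skipped by mypy until SPARK-34941 removes this section.

# python/pyspark/pandas/_example.py -- hypothetical module, for illustration only
def add_one(x: int) -> int:
    # mypy would normally report: Unsupported operand types for + ("int" and "str"),
    # but the ignore_errors = True section above suppresses it for pyspark.pandas.*.
    return x + "1"  # intentionally ill-typed to show what gets ignored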
209 changes: 209 additions & 0 deletions python/pyspark/pandas/__init__.py
@@ -0,0 +1,209 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
from distutils.version import LooseVersion

from pyspark.pandas.version import __version__ # noqa: F401


def assert_python_version():
    import warnings

    major = 3
    minor = 5
    deprecated_version = (major, minor)
    min_supported_version = (major, minor + 1)

    if sys.version_info[:2] <= deprecated_version:
        warnings.warn(
            "Koalas support for Python {dep_ver} is deprecated and will be dropped in "
            "the future release. At that point, existing Python {dep_ver} workflows "
            "that use Koalas will continue to work without modification, but Python {dep_ver} "
            "users will no longer get access to the latest Koalas features and bugfixes. "
            "We recommend that you upgrade to Python {min_ver} or newer.".format(
                dep_ver=".".join(map(str, deprecated_version)),
                min_ver=".".join(map(str, min_supported_version)),
            ),
            FutureWarning,
        )


def assert_pyspark_version():
    import logging

    try:
        import pyspark
    except ImportError:
        raise ImportError(
            "Unable to import pyspark - consider doing a pip install with [spark] "
            "extra to install pyspark with pip"
        )
    else:
        pyspark_ver = getattr(pyspark, "__version__")
        if pyspark_ver is None or LooseVersion(pyspark_ver) < LooseVersion("2.4"):
            logging.warning(
                'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
                    pyspark_ver if pyspark_ver is not None else "<unknown version>"
                )
            )
Comment on lines +58 to +63

Member:
This looks like it checks the PySpark version from within Koalas. Do we still need to check the PySpark version now that Koalas is included in PySpark?

Contributor (Author):
Correct! We'll address all the code related to the existing third-party dependencies in SPARK-34887. Thanks for the comment!
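
For context on the exchange above, a minimal sketch (an assumption about the direction of the SPARK-34887 cleanup, not its actual content) of how assert_pyspark_version could be simplified once pyspark.pandas ships inside PySpark itself: importing the package already implies the host PySpark is present, so the probing and version comparison above become unnecessary.

import pyspark  # always importable here, since pyspark.pandas lives inside PySpark

def assert_pyspark_version():
    # Hypothetical no-op replacement: the package is, by construction, the same
    # version as the PySpark distribution it shipped with, so no runtime check is needed.
    assert hasattr(pyspark, "__version__")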



assert_python_version()
assert_pyspark_version()

import pyspark
import pyarrow

if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
    if (
        LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")
        and "ARROW_PRE_0_15_IPC_FORMAT" not in os.environ
    ):
        import logging

        logging.warning(
            "'ARROW_PRE_0_15_IPC_FORMAT' environment variable was not set. It is required to "
            "set this environment variable to '1' in both driver and executor sides if you use "
            "pyarrow>=0.15 and pyspark<3.0. "
            "Koalas will set it for you but it does not work if there is a Spark context already "
            "launched."
        )
        # This is required to support PyArrow 0.15 in PySpark versions lower than 3.0.
        # See SPARK-29367.
        os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
elif "ARROW_PRE_0_15_IPC_FORMAT" in os.environ:
    raise RuntimeError(
        "Please explicitly unset 'ARROW_PRE_0_15_IPC_FORMAT' environment variable in both "
        "driver and executor sides. It is required to set this environment variable only "
        "when you use pyarrow>=0.15 and pyspark<3.0."
    )

if (
    LooseVersion(pyarrow.__version__) >= LooseVersion("2.0.0")
    and "PYARROW_IGNORE_TIMEZONE" not in os.environ
):
    import logging

    logging.warning(
        "'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to "
        "set this environment variable to '1' in both driver and executor sides if you use "
        "pyarrow>=2.0.0. "
        "Koalas will set it for you but it does not work if there is a Spark context already "
        "launched."
    )
    os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from pyspark.pandas.frame import DataFrame
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.indexes.category import CategoricalIndex
from pyspark.pandas.indexes.datetimes import DatetimeIndex
from pyspark.pandas.indexes.multi import MultiIndex
from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
from pyspark.pandas.series import Series
from pyspark.pandas.groupby import NamedAgg

__all__ = [  # noqa: F405
    "read_csv",
    "read_parquet",
    "to_datetime",
    "date_range",
    "from_pandas",
    "get_dummies",
    "DataFrame",
    "Series",
    "Index",
    "MultiIndex",
    "Int64Index",
    "Float64Index",
    "CategoricalIndex",
    "DatetimeIndex",
    "sql",
    "range",
    "concat",
    "melt",
    "get_option",
    "set_option",
    "reset_option",
    "read_sql_table",
    "read_sql_query",
    "read_sql",
    "options",
    "option_context",
    "NamedAgg",
]


def _auto_patch_spark():
    import os
    import logging

    # Attach a usage logger.
    logger_module = os.getenv("KOALAS_USAGE_LOGGER", "")
    if logger_module != "":
        try:
            from pyspark.pandas import usage_logging

            usage_logging.attach(logger_module)
        except Exception as e:
            logger = logging.getLogger("pyspark.pandas.usage_logger")
            logger.warning(
                "Tried to attach usage logger `{}`, but an exception was raised: {}".format(
                    logger_module, str(e)
                )
            )

    # Autopatching is on by default.
    x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true")
    if x.lower() in ("true", "1", "enabled"):
        logger = logging.getLogger("spark")
        logger.info(
            "Patching spark automatically. You can disable it by setting "
            "SPARK_KOALAS_AUTOPATCH=false in your environment"
        )

        from pyspark.sql import dataframe as df

        df.DataFrame.to_koalas = DataFrame.to_koalas


def _auto_patch_pandas():
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)

        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)


_auto_patch_spark()
_auto_patch_pandas()

# Import after the usage logger is attached.
from pyspark.pandas.config import get_option, options, option_context, reset_option, set_option
from pyspark.pandas.namespace import * # F405
from pyspark.pandas.sql import sql
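
Finally, a short usage sketch of the ported package as exposed by this __init__.py, assuming a PySpark build that includes this PR (the column names and values below are made up for illustration):

import os

# As the module-level warning above explains, with pyarrow>=2.0.0 this variable must be
# set on both the driver and the executors before any Spark context is launched
# (for executors, e.g. via the spark.executorEnv.PYARROW_IGNORE_TIMEZONE configuration).
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

import pandas as pd
import pyspark.pandas as ps

# Create a pandas-on-Spark DataFrame directly, mirroring the pandas constructor.
psdf = ps.DataFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]})
print(psdf.head())

# from_pandas converts an existing pandas DataFrame into a distributed one.
psdf2 = ps.from_pandas(pd.DataFrame({"id": [4, 5]}))

# _auto_patch_spark() above also attaches to_koalas() to pyspark.sql.DataFrame
# (unless SPARK_KOALAS_AUTOPATCH is disabled), so an existing Spark DataFrame
# can be converted as well:
#     spark_df = psdf.to_spark()
#     psdf_again = spark_df.to_koalas()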