Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an environment variable for handling fallback in cudf.pandas #15910

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
577cb93
Add a test
Matt711 May 24, 2024
925606d
Add a test
Matt711 May 28, 2024
ffd8ded
Add debug mode as an argument
Matt711 May 28, 2024
d7286e9
Add an env var
Matt711 May 28, 2024
b33f522
Change to xpd
Matt711 May 28, 2024
9dc9ba9
Add test with env var
Matt711 May 28, 2024
2e5c0da
Merge branch 'branch-24.08' into feature/combine-fast-and-slow-paths
Matt711 May 29, 2024
9699b31
Add argument for assert func, delete test with env var
Matt711 May 29, 2024
e421361
Address comments
Matt711 May 29, 2024
5b313ce
Address comments, refactor test
Matt711 May 30, 2024
a25a3e6
add kwargs to assert func
Matt711 May 30, 2024
ff3408d
Add pandas debug option in missing places
Matt711 May 30, 2024
8c618d7
Use env var instead option
Matt711 May 30, 2024
299fb8d
Merge branch 'branch-24.08' into feature/combine-fast-and-slow-paths
Matt711 May 30, 2024
70b335e
Add CUDF_ to env var, refactor test
Matt711 May 30, 2024
9350955
Merge branch 'branch-24.08' into feature/combine-fast-and-slow-paths
Matt711 May 30, 2024
dc4f5ae
Add two more tests
Matt711 May 31, 2024
7f8844d
Merge branch 'feature/combine-fast-and-slow-paths' of github.com:Matt…
Matt711 May 31, 2024
737a893
Change mocked function names
Matt711 May 31, 2024
a2638b9
Call undo
Matt711 May 31, 2024
3555fd9
Combine tests
Matt711 May 31, 2024
f1afdea
Use unittest.mock instead of monkeypatch
Matt711 May 31, 2024
2bf0b75
patch different funcs in each test
Matt711 May 31, 2024
11d8aba
type median in test 2
Matt711 May 31, 2024
af0d9ce
Use monkeypatch.setattr to undo the monkeypatches
Matt711 Jun 3, 2024
6c11c16
Add environment variable for handling fallback in cudf.pandas
Matt711 Jun 3, 2024
96e4170
Add exceptions for different types of fallback and a test for fallbac…
Matt711 Jun 4, 2024
0547bcb
Address comments
Matt711 Jun 14, 2024
474f44a
Fix merge conflicts
Matt711 Jun 14, 2024
b21128f
Fix more merge conflicts
Matt711 Jun 14, 2024
1c4c163
Switch to pd.mean in first test
Matt711 Jun 14, 2024
cdac928
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into feature/…
Matt711 Jun 18, 2024
657deb9
refactor warning classes
Matt711 Jun 26, 2024
1637d5f
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into feature/…
Matt711 Jun 26, 2024
fa4c331
Run ci with env var
Matt711 Jun 27, 2024
63996e7
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into feature/…
Matt711 Jun 28, 2024
7afab57
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into feature/…
Matt711 Jul 9, 2024
cd8b93e
run in ci
Matt711 Jul 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 90 additions & 6 deletions python/cudf/cudf/pandas/fast_slow_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,67 @@

import numpy as np

from rmm import RMMError

from ..options import _env_get_bool
from ..testing import assert_eq
from .annotation import nvtx


class CudfPandasWarning(UserWarning):
    """Base class for the cudf.pandas warning categories in this module.

    Both the debugging warnings (differing results, pandas errors, failed
    comparison) and the fallback warnings derive from this class, so a
    single warnings filter on it covers all of them.
    """


class ResultsDifferentWarning(CudfPandasWarning):
    """Warn that the results from cuDF and pandas were different."""


class PandasErrorWarning(CudfPandasWarning):
    """Warn that the result from pandas could not be computed."""


class DebuggingFailedWarning(CudfPandasWarning):
    """Warn that the cuDF-pandas debugging comparison itself failed."""


class FallbackWarning(CudfPandasWarning):
    """Base class for warnings emitted when falling back to the slow path."""


class OOMWarning(FallbackWarning):
    """Warn that cuDF raised a ``MemoryError`` or an ``rmm.RMMError``."""


class NotImplementedErrorWarning(FallbackWarning):
    """Warn that cuDF raised a ``NotImplementedError``."""


class AttributeErrorWarning(FallbackWarning):
    """Warn that cuDF raised an ``AttributeError``."""


class TypeErrorWarning(FallbackWarning):
    """Warn that cuDF raised a ``TypeError``."""


def call_operator(fn, args, kwargs):
    """Call ``fn`` with the given positional and keyword arguments.

    Parameters
    ----------
    fn : callable
        The function to invoke.
    args : iterable
        Positional arguments, unpacked into the call.
    kwargs : mapping
        Keyword arguments, unpacked into the call.

    Returns
    -------
    Whatever ``fn(*args, **kwargs)`` returns.
    """
    return fn(*args, **kwargs)

Expand Down Expand Up @@ -915,22 +971,50 @@ def _fast_slow_function_call(
except Exception as e:
warnings.warn(
"The result from pandas could not be computed. "
f"The exception was {e}."
f"The exception was {e}.",
PandasErrorWarning,
)
else:
try:
_assert_fast_slow_eq(result, slow_result)
_assert_fast_slow_eq(result, slow_result, **kwargs)
Matt711 marked this conversation as resolved.
Show resolved Hide resolved
except AssertionError as e:
warnings.warn(
"The results from cudf and pandas were different. "
f"The exception was {e}."
f"The exception was {e}.",
ResultsDifferentWarning,
)
except Exception as e:
warnings.warn(
"Pandas debugging mode failed. "
f"The exception was {e}."
"cuDF-Pandas debugging failed. "
f"The exception was {e}.",
DebuggingFailedWarning,
)
except Exception:
except Exception as e:
if _env_get_bool("CUDF_PANDAS_FALLBACK_DEBUGGING", False):
if isinstance(e, (RMMError, MemoryError)):
warnings.warn(
"Out of Memory Error. Falling back to the slow path. "
f"The exception was {e}.",
OOMWarning,
)
elif isinstance(e, NotImplementedError):
warnings.warn(
"NotImplementedError. Falling back to the slow path. "
f"The exception was {e}.",
NotImplementedErrorWarning,
)
elif isinstance(e, AttributeError):
warnings.warn(
"AttributeError. Falling back to the slow path. "
f"The exception was {e}.",
AttributeErrorWarning,
)
elif isinstance(e, TypeError):
warnings.warn(
"TypeError. Falling back to the slow path. "
f"The exception was {e}.",
TypeErrorWarning,
)
with nvtx.annotate(
"EXECUTE_SLOW",
color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#
# This script creates a `pandas-testing` directory if it doesn't exist

# Turn on cudf.pandas fallback debugging for everything run by this script.
export CUDF_PANDAS_FALLBACK_DEBUGGING=True

# Print the value so the setting is visible in the script output.
echo ${CUDF_PANDAS_FALLBACK_DEBUGGING}

set -euo pipefail

# Grab the Pandas source corresponding to the version
Expand Down
196 changes: 134 additions & 62 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,20 @@
from numba import NumbaDeprecationWarning
from pytz import utc

from rmm import RMMError

from cudf.pandas import LOADED, Profiler
from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object
from cudf.pandas.fast_slow_proxy import (
AttributeErrorWarning,
DebuggingFailedWarning,
NotImplementedErrorWarning,
OOMWarning,
PandasErrorWarning,
ResultsDifferentWarning,
TypeErrorWarning,
_Unusable,
is_proxy_object,
)

# These tests exercise the proxy layer, so refuse to run without it.
if not LOADED:
    raise ImportError("These tests must be run with cudf.pandas loaded")
Expand Down Expand Up @@ -1436,6 +1448,127 @@ def test_holidays_within_dates(holiday, start, expected):
) == [utc.localize(dt) for dt in expected]


def mock_mean_one(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and returns ``np.float64(1.0)``."""
    return np.float64(1.0)


def mock_mean_exception(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises a bare ``Exception``."""
    raise Exception()


def mock_mean_none(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and returns ``None``."""
    return None


def mock_mean_memory_error(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises ``MemoryError``."""
    raise MemoryError()


def mock_mean_rmm_error(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises ``RMMError(1, "error")``."""
    raise RMMError(1, "error")


def mock_mean_not_impl_error(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises ``NotImplementedError``."""
    raise NotImplementedError()


def mock_mean_attr_error(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises ``AttributeError``."""
    raise AttributeError()


def mock_mean_type_error(self, *args, **kwargs):
    """Mock ``mean`` that ignores its arguments and raises ``TypeError``."""
    raise TypeError()


@pytest.mark.parametrize(
    "mock_mean, warning, match_str, env_var, original_mean, proxy_attr",
    [
        (
            mock_mean_one,
            ResultsDifferentWarning,
            "The results from cudf and pandas were different.",
            "CUDF_PANDAS_DEBUGGING",
            pd.Series.mean,
            "_fsproxy_slow",
        ),
        (
            mock_mean_exception,
            PandasErrorWarning,
            "The result from pandas could not be computed.",
            "CUDF_PANDAS_DEBUGGING",
            pd.Series.mean,
            "_fsproxy_slow",
        ),
        (
            mock_mean_none,
            DebuggingFailedWarning,
            "cuDF-Pandas debugging failed.",
            "CUDF_PANDAS_DEBUGGING",
            pd.Series.mean,
            "_fsproxy_slow",
        ),
        (
            mock_mean_memory_error,
            OOMWarning,
            "Out of Memory Error.",
            "CUDF_PANDAS_FALLBACK_DEBUGGING",
            cudf.Series.mean,
            "_fsproxy_fast",
        ),
        (
            mock_mean_rmm_error,
            OOMWarning,
            "Out of Memory Error.",
            "CUDF_PANDAS_FALLBACK_DEBUGGING",
            cudf.Series.mean,
            "_fsproxy_fast",
        ),
        (
            mock_mean_not_impl_error,
            NotImplementedErrorWarning,
            "NotImplementedError.",
            "CUDF_PANDAS_FALLBACK_DEBUGGING",
            cudf.Series.mean,
            "_fsproxy_fast",
        ),
        (
            mock_mean_attr_error,
            AttributeErrorWarning,
            "AttributeError.",
            "CUDF_PANDAS_FALLBACK_DEBUGGING",
            cudf.Series.mean,
            "_fsproxy_fast",
        ),
        (
            mock_mean_type_error,
            TypeErrorWarning,
            "TypeError.",
            "CUDF_PANDAS_FALLBACK_DEBUGGING",
            cudf.Series.mean,
            "_fsproxy_fast",
        ),
    ],
)
def test_cudf_pandas_debugging(
    monkeypatch,
    mock_mean,
    warning,
    match_str,
    env_var,
    original_mean,
    proxy_attr,
):
    """Check that each debugging env var produces the expected warning.

    One side of the ``xpd.Series.mean`` proxy (``proxy_attr``) is replaced
    with ``mock_mean`` while ``env_var`` is set to ``"True"``. Calling
    ``mean`` must then emit ``warning`` with a message matching
    ``match_str`` and still return the correct value, 1.5.

    The first three cases patch the slow (pandas) side under
    ``CUDF_PANDAS_DEBUGGING``; the rest patch the fast (cuDF) side under
    ``CUDF_PANDAS_FALLBACK_DEBUGGING``.
    """
    with monkeypatch.context() as monkeycontext:
        # The attribute patch goes through the outer fixture on purpose;
        # only the env var is scoped to the context (see the note below).
        monkeypatch.setattr(xpd.Series.mean, proxy_attr, mock_mean)
        monkeycontext.setenv(env_var, "True")
        s = xpd.Series([1, 2])
        with pytest.warns(warning, match=match_str):
            assert s.mean() == 1.5

    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
    monkeypatch.setattr(xpd.Series.mean, proxy_attr, original_mean)


@pytest.mark.parametrize(
"env_value",
["", "cuda", "pool", "async", "managed", "managed_pool", "abc"],
Expand Down Expand Up @@ -1463,67 +1596,6 @@ def test_rmm_option_on_import(env_value):
assert sp_completed.returncode == 1


def test_cudf_pandas_debugging_different_results(monkeypatch):
    """Debug mode warns when the cuDF and pandas results disagree.

    The fast (cuDF) side of ``xpd.Series.mean`` is patched to return 1.0,
    so the proxy returns 1.0 and the comparison against pandas warns.
    """
    # Keep the real implementation so the patch can be undone by hand.
    cudf_mean = cudf.Series.mean

    def mock_mean_one(self, *args, **kwargs):
        return np.float64(1.0)

    with monkeypatch.context() as monkeycontext:
        monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", mock_mean_one)
        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
        s = xpd.Series([1, 2])
        with pytest.warns(
            UserWarning,
            match="The results from cudf and pandas were different.",
        ):
            assert s.mean() == 1.0
    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", cudf_mean)


def test_cudf_pandas_debugging_pandas_error(monkeypatch):
    """Debug mode warns when the pandas result cannot be computed.

    The slow (pandas) side of ``xpd.Series.mean`` is patched to raise, so
    the proxy still returns the cuDF result (1.5) and emits a warning.
    """
    # Keep the real implementation so the patch can be undone by hand.
    pd_mean = pd.Series.mean

    def mock_mean_exception(self, *args, **kwargs):
        raise Exception()

    with monkeypatch.context() as monkeycontext:
        monkeycontext.setattr(
            xpd.Series.mean, "_fsproxy_slow", mock_mean_exception
        )
        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
        # The series is built once; the original rebuilt an identical one
        # inside the ``pytest.warns`` block, leaving this one unused.
        s = xpd.Series([1, 2])
        with pytest.warns(
            UserWarning,
            match="The result from pandas could not be computed.",
        ):
            assert s.mean() == 1.5
    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean)


def test_cudf_pandas_debugging_failed(monkeypatch):
    """Debug mode warns when the result comparison itself fails.

    The slow (pandas) side of ``xpd.Series.mean`` is patched to return
    ``None``; the proxy still returns the cuDF result (1.5) and the test
    expects the "debugging mode failed" warning.
    """
    # Keep the real implementation so the patch can be undone by hand.
    pd_mean = pd.Series.mean

    def mock_mean_none(self, *args, **kwargs):
        return None

    with monkeypatch.context() as monkeycontext:
        monkeycontext.setattr(xpd.Series.mean, "_fsproxy_slow", mock_mean_none)
        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
        # The series is built once; the original rebuilt an identical one
        # inside the ``pytest.warns`` block, leaving this one unused.
        s = xpd.Series([1, 2])
        with pytest.warns(
            UserWarning,
            match="Pandas debugging mode failed.",
        ):
            assert s.mean() == 1.5
    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean)


def test_excelwriter_pathlike():
    """``pd.ExcelWriter`` instances must be recognised as ``os.PathLike``."""
    assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)

Expand Down
Loading