Skip to content

Commit

Permalink
Simplify preprocessing of arguments for DataFrame binops (#10563)
Browse files Browse the repository at this point in the history
This PR simplifies the preprocessing of the rhs of binary operations for DataFrame, streamlining it to a single path that can be more easily combined with that of other types in the future.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #10563
  • Loading branch information
vyasr authored Apr 12, 2022
1 parent 64a811e commit c9e16c7
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 76 deletions.
125 changes: 57 additions & 68 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import warnings
from collections import defaultdict
from collections.abc import Iterable, Sequence
from collections.abc import Iterable, Mapping, Sequence
from typing import (
Any,
Dict,
Expand Down Expand Up @@ -1854,86 +1854,75 @@ def _make_operands_and_index_for_binop(
],
Optional[BaseIndex],
]:
lhs, rhs = self, other

if _is_scalar_or_zero_d_array(rhs):
rhs = [rhs] * lhs._num_columns

# For columns that exist in rhs but not lhs, we swap the order so that
# we can always assume that left has a binary operator. This
# implementation assumes that binary operations between a column and
# NULL are always commutative, even for binops (like subtraction) that
# are normally anticommutative.
# TODO: The above should no longer be necessary once we switch to
# properly invoking the operator since we can then rely on reflection.
if isinstance(rhs, Sequence):
# TODO: Consider validating sequence length (pandas does).
operands = {
name: (left, right, reflect, fill_value)
for right, (name, left) in zip(rhs, lhs._data.items())
}
elif isinstance(rhs, DataFrame):
# Check built-in types first for speed.
if isinstance(other, (list, dict, Sequence, Mapping)):
warnings.warn(
"Binary operations between host objects such as "
f"{type(other)} and cudf.DataFrame are deprecated and will be "
"removed in a future release. Please convert it to a cudf "
"object before performing the operation.",
FutureWarning,
)
if len(other) != self._num_columns:
raise ValueError(
"Other is of the wrong length. Expected "
f"{self._num_columns}, got {len(other)}"
)

lhs, rhs = self._data, other
index = self._index
fill_requires_key = False
left_default: Any = False

if _is_scalar_or_zero_d_array(other):
rhs = {name: other for name in self._data}
elif isinstance(other, (list, Sequence)):
rhs = {name: o for (name, o) in zip(self._data, other)}
elif isinstance(other, Series):
rhs = dict(zip(other.index.values_host, other.values_host))
# For keys in right but not left, perform binops between NaN (not
# NULL!) and the right value (result is NaN).
left_default = as_column(np.nan, length=len(self))
elif isinstance(other, DataFrame):
if (
not can_reindex
and fn in cudf.utils.utils._EQUALITY_OPS
and (
not lhs._data.to_pandas_index().equals(
rhs._data.to_pandas_index()
not self.index.equals(other.index)
or not self._data.to_pandas_index().equals(
other._data.to_pandas_index()
)
or not lhs.index.equals(rhs.index)
)
):
raise ValueError(
"Can only compare identically-labeled DataFrame objects"
)
new_lhs, new_rhs = _align_indices(self, other)
index = new_lhs._index
lhs, rhs = new_lhs._data, new_rhs._data
fill_requires_key = True
# For DataFrame-DataFrame ops, always default to operating against
# the fill value.
left_default = fill_value

if not isinstance(rhs, (dict, Mapping)):
return NotImplemented, None

lhs, rhs = _align_indices(lhs, rhs)

operands = {
name: (
lcol,
rhs._data[name]
if name in rhs._data
else (fill_value or None),
reflect,
fill_value if name in rhs._data else None,
)
for name, lcol in lhs._data.items()
}
for name, col in rhs._data.items():
if name not in lhs._data:
operands[name] = (
col,
(fill_value or None),
not reflect,
None,
)
elif isinstance(rhs, Series):
# Note: This logic will need updating if any of the user-facing
# binop methods (e.g. DataFrame.add) ever support axis=0/rows.
right_dict = dict(zip(rhs.index.values_host, rhs.values_host))
left_cols = lhs._column_names
# mypy thinks lhs._column_names is a List rather than a Tuple, so
# we have to ignore the type check.
result_cols = left_cols + tuple( # type: ignore
col for col in right_dict if col not in left_cols
operands = {
k: (
v,
rhs.get(k, fill_value),
reflect,
fill_value if (not fill_requires_key or k in rhs) else None,
)
operands = {}
for col in result_cols:
if col in left_cols:
left = lhs._data[col]
right = right_dict[col] if col in right_dict else None
else:
# We match pandas semantics here by performing binops
# between a NaN (not NULL!) column and the actual values,
# which results in nans, the pandas output.
left = as_column(np.nan, length=lhs._num_rows)
right = right_dict[col]
operands[col] = (left, right, reflect, fill_value)
else:
return NotImplemented, None
for k, v in lhs.items()
}

return operands, lhs._index
if left_default is not False:
for k, v in rhs.items():
if k not in lhs:
operands[k] = (left_default, v, reflect, None)
return operands, index

@_cudf_nvtx_annotate
def update(
Expand Down
12 changes: 10 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2475,7 +2475,10 @@ def _colwise_binop(
) in operands.items():
output_mask = None
if fill_value is not None:
if isinstance(right_column, ColumnBase):
left_is_column = isinstance(left_column, ColumnBase)
right_is_column = isinstance(right_column, ColumnBase)

if left_is_column and right_is_column:
# If both columns are nullable, pandas semantics dictate
# that nulls that are present in both left_column and
# right_column are not filled.
Expand All @@ -2489,9 +2492,14 @@ def _colwise_binop(
left_column = left_column.fillna(fill_value)
elif right_column.nullable:
right_column = right_column.fillna(fill_value)
else:
elif left_is_column:
if left_column.nullable:
left_column = left_column.fillna(fill_value)
elif right_is_column:
if right_column.nullable:
right_column = right_column.fillna(fill_value)
else:
assert False, "At least one operand must be a column."

# TODO: Disable logical and binary operators between columns that
# are not numerical using the new binops mixin.
Expand Down
12 changes: 12 additions & 0 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from __future__ import annotations

import warnings
from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union

import cupy
Expand Down Expand Up @@ -337,6 +338,17 @@ def _make_operands_for_binop(
if isinstance(other, SingleColumnFrame):
other = other._column
elif not _is_scalar_or_zero_d_array(other):
if not hasattr(other, "__cuda_array_interface__"):
# TODO: When this deprecated behavior is removed, also change
# the above conditional to stop checking for pd.Series and
# pd.Index since we only need to support SingleColumnFrame.
warnings.warn(
f"Binary operations between host objects such as "
f"{type(other)} and {type(self)} are deprecated and will "
"be removed in a future release. Please convert it to a "
"cudf object before performing the operation.",
FutureWarning,
)
# Non-scalar right operands are valid iff they convert to columns.
try:
other = as_column(other)
Expand Down
81 changes: 75 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import re
import string
import textwrap
import warnings
from contextlib import contextmanager
from copy import copy

import cupy
Expand Down Expand Up @@ -2017,6 +2019,15 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
)


@contextmanager
def _hide_host_other_warning(other):
if isinstance(other, (dict, list)):
with pytest.warns(FutureWarning):
yield
else:
yield


@pytest.mark.parametrize(
"binop",
[
Expand All @@ -2034,12 +2045,70 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
operator.ne,
],
)
def test_binops_df(pdf, gdf, binop):
pdf = pdf + 1.0
gdf = gdf + 1.0
d = binop(pdf, pdf)
g = binop(gdf, gdf)
assert_eq(d, g)
@pytest.mark.parametrize(
"other",
[
1.0,
[1.0],
[1.0, 2.0],
[1.0, 2.0, 3.0],
{"x": 1.0},
{"x": 1.0, "y": 2.0},
{"x": 1.0, "y": 2.0, "z": 3.0},
{"x": 1.0, "z": 3.0},
pd.Series([1.0]),
pd.Series([1.0, 2.0]),
pd.Series([1.0, 2.0, 3.0]),
pd.Series([1.0], index=["x"]),
pd.Series([1.0, 2.0], index=["x", "y"]),
pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]),
pd.DataFrame({"x": [1.0]}),
pd.DataFrame({"x": [1.0], "y": [2.0]}),
pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}),
],
)
def test_binops_df(pdf, gdf, binop, other):
# Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997
pdf[pdf == 1.0] = 2
gdf[gdf == 1.0] = 2
try:
with warnings.catch_warnings(record=True) as w:
d = binop(pdf, other)
except Exception:
if isinstance(other, (pd.Series, pd.DataFrame)):
other = cudf.from_pandas(other)

# TODO: When we remove support for binary operations with lists and
# dicts, those cases should all be checked in a `pytest.raises` block
# that returns before we enter this try-except.
with _hide_host_other_warning(other):
assert_exceptions_equal(
lfunc=binop,
rfunc=binop,
lfunc_args_and_kwargs=([pdf, other], {}),
rfunc_args_and_kwargs=([gdf, other], {}),
compare_error_message=False,
)
else:
if isinstance(other, (pd.Series, pd.DataFrame)):
other = cudf.from_pandas(other)
with _hide_host_other_warning(other):
g = binop(gdf, other)
try:
assert_eq(d, g)
except AssertionError:
# Currently we will not match pandas for equality/inequality
# operators when there are columns that exist in a Series but not
# the DataFrame because pandas returns True/False values whereas we
# return NA. However, this reindexing is deprecated in pandas so we
# opt not to add support.
if w and "DataFrame vs Series comparisons is deprecated" in str(w):
pass


def test_binops_df_invalid(gdf):
with pytest.raises(TypeError):
gdf + np.array([1, 2])


@pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor])
Expand Down

0 comments on commit c9e16c7

Please sign in to comment.