Simplify preprocessing of arguments for DataFrame binops (#10563)

This PR simplifies the preprocessing of the rhs of binary operations for DataFrame, streamlining it to a single path that can be more easily combined with that of other types in the future. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #10563
rapidsai · Apr 12, 2022 · c9e16c7 · c9e16c7
1 parent 64a811e
commit c9e16c7
Show file tree

Hide file tree

Showing 4 changed files with 154 additions and 76 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -10,7 +10,7 @@
 import sys
 import warnings
 from collections import defaultdict
-from collections.abc import Iterable, Sequence
+from collections.abc import Iterable, Mapping, Sequence
 from typing import (
     Any,
     Dict,
@@ -1854,86 +1854,75 @@ def _make_operands_and_index_for_binop(
         ],
         Optional[BaseIndex],
     ]:
-        lhs, rhs = self, other
-
-        if _is_scalar_or_zero_d_array(rhs):
-            rhs = [rhs] * lhs._num_columns
-
-        # For columns that exist in rhs but not lhs, we swap the order so that
-        # we can always assume that left has a binary operator. This
-        # implementation assumes that binary operations between a column and
-        # NULL are always commutative, even for binops (like subtraction) that
-        # are normally anticommutative.
-        # TODO: The above should no longer be necessary once we switch to
-        # properly invoking the operator since we can then rely on reflection.
-        if isinstance(rhs, Sequence):
-            # TODO: Consider validating sequence length (pandas does).
-            operands = {
-                name: (left, right, reflect, fill_value)
-                for right, (name, left) in zip(rhs, lhs._data.items())
-            }
-        elif isinstance(rhs, DataFrame):
+        # Check built-in types first for speed.
+        if isinstance(other, (list, dict, Sequence, Mapping)):
+            warnings.warn(
+                "Binary operations between host objects such as "
+                f"{type(other)} and cudf.DataFrame are deprecated and will be "
+                "removed in a future release. Please convert it to a cudf "
+                "object before performing the operation.",
+                FutureWarning,
+            )
+            if len(other) != self._num_columns:
+                raise ValueError(
+                    "Other is of the wrong length. Expected "
+                    f"{self._num_columns}, got {len(other)}"
+                )
+
+        lhs, rhs = self._data, other
+        index = self._index
+        fill_requires_key = False
+        left_default: Any = False
+
+        if _is_scalar_or_zero_d_array(other):
+            rhs = {name: other for name in self._data}
+        elif isinstance(other, (list, Sequence)):
+            rhs = {name: o for (name, o) in zip(self._data, other)}
+        elif isinstance(other, Series):
+            rhs = dict(zip(other.index.values_host, other.values_host))
+            # For keys in right but not left, perform binops between NaN (not
+            # NULL!) and the right value (result is NaN).
+            left_default = as_column(np.nan, length=len(self))
+        elif isinstance(other, DataFrame):
             if (
                 not can_reindex
                 and fn in cudf.utils.utils._EQUALITY_OPS
                 and (
-                    not lhs._data.to_pandas_index().equals(
-                        rhs._data.to_pandas_index()
+                    not self.index.equals(other.index)
+                    or not self._data.to_pandas_index().equals(
+                        other._data.to_pandas_index()
                     )
-                    or not lhs.index.equals(rhs.index)
                 )
             ):
                 raise ValueError(
                     "Can only compare identically-labeled DataFrame objects"
                 )
+            new_lhs, new_rhs = _align_indices(self, other)
+            index = new_lhs._index
+            lhs, rhs = new_lhs._data, new_rhs._data
+            fill_requires_key = True
+            # For DataFrame-DataFrame ops, always default to operating against
+            # the fill value.
+            left_default = fill_value
+
+        if not isinstance(rhs, (dict, Mapping)):
+            return NotImplemented, None
 
-            lhs, rhs = _align_indices(lhs, rhs)
-
-            operands = {
-                name: (
-                    lcol,
-                    rhs._data[name]
-                    if name in rhs._data
-                    else (fill_value or None),
-                    reflect,
-                    fill_value if name in rhs._data else None,
-                )
-                for name, lcol in lhs._data.items()
-            }
-            for name, col in rhs._data.items():
-                if name not in lhs._data:
-                    operands[name] = (
-                        col,
-                        (fill_value or None),
-                        not reflect,
-                        None,
-                    )
-        elif isinstance(rhs, Series):
-            # Note: This logic will need updating if any of the user-facing
-            # binop methods (e.g. DataFrame.add) ever support axis=0/rows.
-            right_dict = dict(zip(rhs.index.values_host, rhs.values_host))
-            left_cols = lhs._column_names
-            # mypy thinks lhs._column_names is a List rather than a Tuple, so
-            # we have to ignore the type check.
-            result_cols = left_cols + tuple(  # type: ignore
-                col for col in right_dict if col not in left_cols
+        operands = {
+            k: (
+                v,
+                rhs.get(k, fill_value),
+                reflect,
+                fill_value if (not fill_requires_key or k in rhs) else None,
             )
-            operands = {}
-            for col in result_cols:
-                if col in left_cols:
-                    left = lhs._data[col]
-                    right = right_dict[col] if col in right_dict else None
-                else:
-                    # We match pandas semantics here by performing binops
-                    # between a NaN (not NULL!) column and the actual values,
-                    # which results in nans, the pandas output.
-                    left = as_column(np.nan, length=lhs._num_rows)
-                    right = right_dict[col]
-                operands[col] = (left, right, reflect, fill_value)
-        else:
-            return NotImplemented, None
+            for k, v in lhs.items()
+        }
 
-        return operands, lhs._index
+        if left_default is not False:
+            for k, v in rhs.items():
+                if k not in lhs:
+                    operands[k] = (left_default, v, reflect, None)
+        return operands, index
 
     @_cudf_nvtx_annotate
     def update(

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -2475,7 +2475,10 @@ def _colwise_binop(
         ) in operands.items():
             output_mask = None
             if fill_value is not None:
-                if isinstance(right_column, ColumnBase):
+                left_is_column = isinstance(left_column, ColumnBase)
+                right_is_column = isinstance(right_column, ColumnBase)
+
+                if left_is_column and right_is_column:
                     # If both columns are nullable, pandas semantics dictate
                     # that nulls that are present in both left_column and
                     # right_column are not filled.
@@ -2489,9 +2492,14 @@ def _colwise_binop(
                         left_column = left_column.fillna(fill_value)
                     elif right_column.nullable:
                         right_column = right_column.fillna(fill_value)
-                else:
+                elif left_is_column:
                     if left_column.nullable:
                         left_column = left_column.fillna(fill_value)
+                elif right_is_column:
+                    if right_column.nullable:
+                        right_column = right_column.fillna(fill_value)
+                else:
+                    assert False, "At least one operand must be a column."
 
             # TODO: Disable logical and binary operators between columns that
             # are not numerical using the new binops mixin.

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union
 
 import cupy
@@ -337,6 +338,17 @@ def _make_operands_for_binop(
         if isinstance(other, SingleColumnFrame):
             other = other._column
         elif not _is_scalar_or_zero_d_array(other):
+            if not hasattr(other, "__cuda_array_interface__"):
+                # TODO: When this deprecated behavior is removed, also change
+                # the above conditional to stop checking for pd.Series and
+                # pd.Index since we only need to support SingleColumnFrame.
+                warnings.warn(
+                    f"Binary operations between host objects such as "
+                    f"{type(other)} and {type(self)} are deprecated and will "
+                    "be removed in a future release. Please convert it to a "
+                    "cudf object before performing the operation.",
+                    FutureWarning,
+                )
             # Non-scalar right operands are valid iff they convert to columns.
             try:
                 other = as_column(other)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -8,6 +8,8 @@
 import re
 import string
 import textwrap
+import warnings
+from contextlib import contextmanager
 from copy import copy
 
 import cupy
@@ -2017,6 +2019,15 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
     )
 
 
+@contextmanager
+def _hide_host_other_warning(other):
+    if isinstance(other, (dict, list)):
+        with pytest.warns(FutureWarning):
+            yield
+    else:
+        yield
+
+
 @pytest.mark.parametrize(
     "binop",
     [
@@ -2034,12 +2045,70 @@ def test_dataframe_min_count_ops(data, ops, skipna, min_count):
         operator.ne,
     ],
 )
-def test_binops_df(pdf, gdf, binop):
-    pdf = pdf + 1.0
-    gdf = gdf + 1.0
-    d = binop(pdf, pdf)
-    g = binop(gdf, gdf)
-    assert_eq(d, g)
+@pytest.mark.parametrize(
+    "other",
+    [
+        1.0,
+        [1.0],
+        [1.0, 2.0],
+        [1.0, 2.0, 3.0],
+        {"x": 1.0},
+        {"x": 1.0, "y": 2.0},
+        {"x": 1.0, "y": 2.0, "z": 3.0},
+        {"x": 1.0, "z": 3.0},
+        pd.Series([1.0]),
+        pd.Series([1.0, 2.0]),
+        pd.Series([1.0, 2.0, 3.0]),
+        pd.Series([1.0], index=["x"]),
+        pd.Series([1.0, 2.0], index=["x", "y"]),
+        pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]),
+        pd.DataFrame({"x": [1.0]}),
+        pd.DataFrame({"x": [1.0], "y": [2.0]}),
+        pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}),
+    ],
+)
+def test_binops_df(pdf, gdf, binop, other):
+    # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997
+    pdf[pdf == 1.0] = 2
+    gdf[gdf == 1.0] = 2
+    try:
+        with warnings.catch_warnings(record=True) as w:
+            d = binop(pdf, other)
+    except Exception:
+        if isinstance(other, (pd.Series, pd.DataFrame)):
+            other = cudf.from_pandas(other)
+
+        # TODO: When we remove support for binary operations with lists and
+        # dicts, those cases should all be checked in a `pytest.raises` block
+        # that returns before we enter this try-except.
+        with _hide_host_other_warning(other):
+            assert_exceptions_equal(
+                lfunc=binop,
+                rfunc=binop,
+                lfunc_args_and_kwargs=([pdf, other], {}),
+                rfunc_args_and_kwargs=([gdf, other], {}),
+                compare_error_message=False,
+            )
+    else:
+        if isinstance(other, (pd.Series, pd.DataFrame)):
+            other = cudf.from_pandas(other)
+        with _hide_host_other_warning(other):
+            g = binop(gdf, other)
+        try:
+            assert_eq(d, g)
+        except AssertionError:
+            # Currently we will not match pandas for equality/inequality
+            # operators when there are columns that exist in a Series but not
+            # the DataFrame because pandas returns True/False values whereas we
+            # return NA. However, this reindexing is deprecated in pandas so we
+            # opt not to add support.
+            if w and "DataFrame vs Series comparisons is deprecated" in str(w):
+                pass
+
+
+def test_binops_df_invalid(gdf):
+    with pytest.raises(TypeError):
+        gdf + np.array([1, 2])
 
 
 @pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor])