Backport PR pandas-dev#46065: BUG: groupby().rolling(freq) with monotonic dates within groups
mroeschke authored and meeseeksmachine committed Feb 19, 2022
1 parent 226f1c2 commit 7e68fe6
Showing 5 changed files with 96 additions and 91 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.2.rst
@@ -23,7 +23,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
-
- Bug in :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`)
-

.. ---------------------------------------------------------------------------
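As context for the entry above, here is a minimal, illustrative reproduction (data invented for this note) of the scenario it describes: the ``on`` column is not monotonic overall, but is monotonic within each group, so the grouped frequency-window rolling should now succeed instead of raising.

```python
import pandas as pd

# Invented example: dates are sorted within each group ("a", "b"),
# but interleaving the groups makes the "t" column non-monotonic overall.
df = pd.DataFrame(
    {
        "g": ["a", "b", "a", "b"],
        "t": pd.to_datetime(["2022-01-01", "2022-03-01", "2022-01-02", "2022-03-02"]),
        "v": [1, 2, 3, 4],
    }
)

# Before this fix this raised ValueError ("t must be monotonic");
# with the fix it computes a per-group 2-day rolling sum.
print(df.groupby("g").rolling("2D", on="t")["v"].sum())
```
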
34 changes: 17 additions & 17 deletions pandas/core/window/rolling.py
@@ -837,7 +837,7 @@ def _gotitem(self, key, ndim, subset=None):
subset = self.obj.set_index(self._on)
return super()._gotitem(key, ndim, subset=subset)

-    def _validate_monotonic(self):
+    def _validate_datetimelike_monotonic(self):
"""
Validate that "on" is monotonic; already validated at a higher level.
"""
@@ -1687,7 +1687,7 @@ def _validate(self):
or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
) and isinstance(self.window, (str, BaseOffset, timedelta)):

-            self._validate_monotonic()
+            self._validate_datetimelike_monotonic()

# this will raise ValueError on non-fixed freqs
try:
@@ -1712,18 +1712,13 @@ def _validate(self):
elif not is_integer(self.window) or self.window < 0:
raise ValueError("window must be an integer 0 or greater")

-    def _validate_monotonic(self):
+    def _validate_datetimelike_monotonic(self):
         """
         Validate monotonic (increasing or decreasing).
         """
         if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
-            self._raise_monotonic_error()
-
-    def _raise_monotonic_error(self):
-        formatted = self.on
-        if self.on is None:
-            formatted = "index"
-        raise ValueError(f"{formatted} must be monotonic")
+            on = "index" if self.on is None else self.on
+            raise ValueError(f"{on} must be monotonic.")

@doc(
_shared_docs["aggregate"],
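For comparison, the non-grouped path above still requires the ``on`` column (or the index) to be globally monotonic when a frequency window is used. A small illustrative sketch (invented data; the expected message is paraphrased from the code above):

```python
import pandas as pd

# Invented Series whose DatetimeIndex is not sorted in either direction.
s = pd.Series(
    [1, 2, 3],
    index=pd.to_datetime(["2022-01-03", "2022-01-01", "2022-01-02"]),
)

try:
    s.rolling("2D").sum()  # frequency window on a non-monotonic index
except ValueError as err:
    print(err)  # expected along the lines of "index must be monotonic."
```
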
@@ -2631,12 +2626,17 @@ def _get_window_indexer(self) -> GroupbyIndexer:
)
return window_indexer

-    def _validate_monotonic(self):
+    def _validate_datetimelike_monotonic(self):
         """
-        Validate that on is monotonic;
+        Validate that each group in self._on is monotonic
         """
-        if (
-            not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing)
-            or self._on.hasnans
-        ):
-            self._raise_monotonic_error()
+        # GH 46061
+        on = "index" if self.on is None else self.on
+        if self._on.hasnans:
+            raise ValueError(f"{on} must not have any NaT values.")
+        for group_indices in self._grouper.indices.values():
+            group_on = self._on.take(group_indices)
+            if not (
+                group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
+            ):
+                raise ValueError(f"Each group within {on} must be monotonic.")
79 changes: 78 additions & 1 deletion pandas/tests/window/test_groupby.py
@@ -651,7 +651,7 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
)
if key == "index":
df = df.set_index("a")
-        with pytest.raises(ValueError, match=f"{key} must be monotonic"):
+        with pytest.raises(ValueError, match=f"{key} must not have any NaT values"):
df.groupby("c").rolling("60min", **rollings)

@pytest.mark.parametrize("group_keys", [True, False])
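The updated test above targets the NaT branch of the new validation; a hedged sketch (invented data) of the behaviour it expects:

```python
import pandas as pd

# Invented frame with a NaT in the "on" column used for the frequency window.
df = pd.DataFrame(
    {
        "c": [1, 1],
        "t": pd.to_datetime(["2022-01-01 09:00", None]),  # second value is NaT
        "v": [1.0, 2.0],
    }
)

try:
    df.groupby("c").rolling("60min", on="t")
except ValueError as err:
    print(err)  # expected along the lines of "t must not have any NaT values."
```
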
@@ -895,6 +895,83 @@ def test_nan_and_zero_endpoints(self):
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_non_monotonic(self):
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_datelike_on_monotonic_within_each_group(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909
# GH 46061: OK if the on is monotonic relative to each group

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
result = df.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_datelike_on_not_monotonic_within_each_group(self):
# GH 46061
df = DataFrame(
{
"A": [1] * 3 + [2] * 3,
"B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
"C": range(6),
}
)
with pytest.raises(ValueError, match="Each group within B must be monotonic."):
df.groupby("A").rolling("365D", on="B")


class TestExpanding:
def setup_method(self):
12 changes: 0 additions & 12 deletions pandas/tests/window/test_rolling.py
@@ -1419,18 +1419,6 @@ def test_groupby_rolling_nan_included():
tm.assert_frame_equal(result, expected)


def test_groupby_rolling_non_monotonic():
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
# GH#6929
60 changes: 0 additions & 60 deletions pandas/tests/window/test_timeseries_window.py
@@ -8,7 +8,6 @@
Series,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm

@@ -643,65 +642,6 @@ def agg_by_day(x):

tm.assert_frame_equal(result, expected)

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_non_monotonic_raises(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("A").rolling(
"4s", on="B"
).C.mean() # should raise for non-monotonic t series

df2 = df.sort_values("B")
result = df2.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_rolling_cov_offset(self):
# GH16058
