Fix cumulative count index behavior (#11188)

During #10889 I found that the result was wrong for `cumcount` in the case of more than one single partition. Digging I found that this was because cuDF python always resets the index of `cumcount` operations meaning the index of the reassembled result would be wrong. It also needs the temporary object it groups on to have to original objects index in order for the post-processing functions to correctly set the index. This PR fixes it as such and adds a test. example old behavior: ```python >>> import pandas as pd >>> import cudf >>> df = pd.DataFrame({ ... 'a':[1,2,3,4,5,6] ... }, index=[1,2,3,4,5,6] ... ) >>> df a 1 1 2 2 3 3 4 4 5 5 6 6 >>> df.groupby('a').cumcount() 1 0 2 0 3 0 4 0 5 0 6 0 dtype: int64 >>> cudf.from_pandas(df).groupby('a').cumcount() 0 0 1 0 2 0 3 0 4 0 5 0 dtype: int32 ``` Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: #11188
rapidsai · Jul 8, 2022 · d41a036 · d41a036
1 parent 5693f62
commit d41a036
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 6 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -201,11 +201,11 @@ def cumcount(self):
             cudf.Series(
                 cudf.core.column.column_empty(
                     len(self.obj), "int8", masked=False
-                )
+                ),
+                index=self.obj.index,
             )
             .groupby(self.grouping, sort=self._sort)
             .agg("cumcount")
-            .reset_index(drop=True)
         )
 
     def rank(
@@ -343,7 +343,6 @@ def agg(self, func):
 
         if not self._as_index:
             result = result.reset_index()
-
         if libgroupby._is_all_scan_aggregate(normalized_aggs):
             # Scan aggregations return rows in original index order
             return self._mimic_pandas_order(result)

diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1116,13 +1116,15 @@ def test_groupby_size():
     )
 
 
-def test_groupby_cumcount():
+@pytest.mark.parametrize("index", [None, [1, 2, 3, 4]])
+def test_groupby_cumcount(index):
     pdf = pd.DataFrame(
         {
             "a": [1, 1, 3, 4],
             "b": ["bob", "bob", "alice", "cooper"],
             "c": [1, 2, 3, 4],
-        }
+        },
+        index=index,
     )
     gdf = cudf.from_pandas(pdf)
 
@@ -1138,7 +1140,7 @@ def test_groupby_cumcount():
         check_dtype=False,
     )
 
-    sr = pd.Series(range(len(pdf)))
+    sr = pd.Series(range(len(pdf)), index=index)
     assert_groupby_results_equal(
         pdf.groupby(sr).cumcount(),
         gdf.groupby(sr).cumcount(),