Skip to content

Commit

Permalink
Merge pull request modin-project#45 from intel-go/izamyati/groupby_size_count
Browse files Browse the repository at this point in the history

Izamyati/groupby.size()/count()
  • Loading branch information
Garra1980 authored Jun 19, 2020
2 parents 3c7f79a + 6e394ac commit 236c2c6
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 37 deletions.
74 changes: 69 additions & 5 deletions modin/experimental/backends/omnisci/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,40 @@ def view(self, index=None, columns=None):
self._modin_frame.mask(row_numeric_idx=index, col_numeric_idx=columns)
)

def groupby_size(
    query_compiler, by, axis, groupby_args, map_args, **kwargs,
):
    """Perform a groupby size aggregation.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        The query compiler holding the frame to aggregate.
    by : BaseQueryCompiler
        The query compiler object to group by.
    axis : 0 or 1
        The axis to groupby. Must be 0 currently.
    groupby_args : dict
        The arguments for the groupby component. Must contain a
        "squeeze" key.
    map_args : dict
        The arguments for the map phase (unused here; kept so the
        signature matches the other groupby reductions).
    **kwargs : dict
        Additional arguments forwarded to the frame's ``groupby_agg``.

    Returns
    -------
    BaseQueryCompiler
        A new query compiler with the "size" aggregation applied,
        squeezed when ``groupby_args["squeeze"]`` is truthy.
    """
    new_frame = query_compiler._modin_frame.groupby_agg(
        by, axis, "size", groupby_args, **kwargs
    )
    new_qc = query_compiler.__constructor__(new_frame)
    # Mirror pandas squeezing of a single-column result into a Series.
    if groupby_args["squeeze"]:
        new_qc = new_qc.squeeze()
    return new_qc

def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
"""Groupby with sum aggregation.
Expand All @@ -104,8 +138,8 @@ def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
Returns
-------
PandasQueryCompiler
A new PandasQueryCompiler
QueryCompiler
A new QueryCompiler
"""
new_frame = query_compiler._modin_frame.groupby_agg(
by, axis, "sum", groupby_args, **kwargs
Expand All @@ -115,6 +149,38 @@ def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
new_qc = new_qc.squeeze()
return new_qc

def groupby_count(query_compiler, by, axis, groupby_args, map_args, **kwargs):
    """Perform a groupby count aggregation.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        The query compiler holding the frame to aggregate.
    by : BaseQueryCompiler
        The query compiler object to group by.
    axis : 0 or 1
        The axis to groupby. Must be 0 currently.
    groupby_args : dict
        The arguments for the groupby component. Must contain a
        "squeeze" key.
    map_args : dict
        The arguments for the map phase (unused here; kept so the
        signature matches the other groupby reductions).
    **kwargs : dict
        Additional arguments forwarded to the frame's ``groupby_agg``.

    Returns
    -------
    BaseQueryCompiler
        A new query compiler with the "count" aggregation applied,
        squeezed when ``groupby_args["squeeze"]`` is truthy.
    """
    new_frame = query_compiler._modin_frame.groupby_agg(
        by, axis, "count", groupby_args, **kwargs
    )
    new_qc = query_compiler.__constructor__(new_frame)
    # Mirror pandas squeezing of a single-column result into a Series.
    if groupby_args["squeeze"]:
        new_qc = new_qc.squeeze()
    return new_qc

def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
"""Apply aggregation functions to a grouped dataframe per-column.
Expand Down Expand Up @@ -385,12 +451,10 @@ def dtypes(self):
groupby_agg = DFAlgNotSupported("groupby_agg")
groupby_all = DFAlgNotSupported("groupby_all")
groupby_any = DFAlgNotSupported("groupby_any")
groupby_count = DFAlgNotSupported("groupby_count")
groupby_max = DFAlgNotSupported("groupby_max")
groupby_min = DFAlgNotSupported("groupby_min")
groupby_prod = DFAlgNotSupported("groupby_prod")
groupby_reduce = DFAlgNotSupported("groupby_reduce")
groupby_size = DFAlgNotSupported("groupby_size")
head = DFAlgNotSupported("head")
idxmax = DFAlgNotSupported("idxmax")
idxmin = DFAlgNotSupported("idxmin")
Expand Down Expand Up @@ -424,7 +488,7 @@ def dtypes(self):
rtruediv = DFAlgNotSupported("rtruediv")
skew = DFAlgNotSupported("skew")
series_update = DFAlgNotSupported("series_update")
series_view = DFAlgNotSupported("series_view")
series_view = DFAlgNotSupported("series_view")
sort_index = DFAlgNotSupported("sort_index")
std = DFAlgNotSupported("std")
sum = DFAlgNotSupported("sum")
Expand Down
88 changes: 56 additions & 32 deletions modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ def groupby_sum(df, cols, as_index, **kwargs):

run_and_compare(groupby_sum, data=self.data, cols=cols, as_index=as_index)

@pytest.mark.parametrize("cols", cols_value)
@pytest.mark.parametrize("as_index", bool_arg_values)
def test_groupby_count(self, cols, as_index):
    """Check groupby(...).count() matches between modin and pandas."""

    def groupby_count(df, cols, as_index, **kwargs):
        grouped = df.groupby(cols, as_index=as_index)
        return grouped.count()

    run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index)

@pytest.mark.parametrize("cols", cols_value)
@pytest.mark.parametrize("as_index", bool_arg_values)
def test_groupby_proj_sum(self, cols, as_index):
Expand All @@ -163,49 +171,70 @@ def test_groupby_agg_count(self):

df_equals(ref, exp)

def test_groupby_agg_size(self):
    """Check groupby('a').agg({'b': 'size'}) matches between modin and pandas."""
    expected = pd.DataFrame(self.data).groupby("a").agg({"b": "size"})

    md_result = mpd.DataFrame(self.data).groupby("a").agg({"b": "size"})
    actual = to_pandas(md_result)

    df_equals(expected, actual)

# Shared fixture for the taxi-query tests: two int key columns ("a", "b"),
# a datetime column "c" parsed from YYYYMMDD strings, and a float column "d".
taxi_data = {
    "a": [1, 1, 2, 2],
    "b": [11, 21, 12, 11],
    "c": pd.to_datetime(
        ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d"
    ),
    "d": [11.5, 21.2, 12.8, 13.4],
}

# TODO: emulate taxi queries with group by category types when we have loading
# using arrow
# Another way of doing taxi q1 is
# res = df.groupby("cab_type").size() - this should be tested later as well
def test_taxi_q1(self):
    """Taxi query 1: row count per group via groupby(...).size()."""
    df = pd.DataFrame(self.taxi_data)
    # TODO: For now we can't do such groupby by first column since modin uses
    # that column as the aggregation one by default. We don't support such
    # cases at the moment, this will be handled later
    # ref = df.groupby("a").size()
    ref = df.groupby("b").size()

    modin_df = mpd.DataFrame(self.taxi_data)
    # modin_df = modin_df.groupby("a").size()
    modin_df = modin_df.groupby("b").size()

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_taxi_q2(self):
    """Taxi query 2: mean of column 'b' grouped by 'a'."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.groupby("a").agg({"b": "mean"})

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.groupby("a").agg({"b": "mean"})

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

# Fixture: four rows with int key columns "a" and "b" and a datetime
# column "c" parsed from YYYYMMDD strings.
# NOTE(review): `taxi_data` above holds the same columns plus a float
# column "d", and the taxi tests reference `self.taxi_data` — confirm
# whether this fixture is still used or is a superseded leftover.
datetime_data = {
    "a": [1, 1, 2, 2],
    "b": [11, 21, 12, 11],
    "c": pd.to_datetime(
        ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d"
    ),
}

@pytest.mark.parametrize("as_index", bool_arg_values)
def test_taxi_q3(self, as_index):
    """Taxi query 3: size grouped by 'b' and the year of datetime column 'c'."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.groupby(["b", df["c"].dt.year], as_index=as_index).size()

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.groupby(
        ["b", modin_df["c"].dt.year], as_index=as_index
    ).size()

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_groupby_expr_col(self):
def groupby(df, **kwargs):
Expand All @@ -217,30 +246,25 @@ def groupby(df, **kwargs):
df = df.groupby(["id1", "id2"], as_index=False).agg({"b": "max"})
return df

run_and_compare(groupby, data=self.datetime_data)

astype_data = {
"a": [1, 1, 2],
"b": [11.5, 21.2, 12.8],
}
run_and_compare(groupby, data=self.taxi_data)

def test_series_astype(self):
    """Check Series.astype('int') on the float column matches pandas."""
    df = pd.DataFrame(self.taxi_data)
    ref = df["d"].astype("int")

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df["d"].astype("int")

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_df_astype(self):
    """Check DataFrame.astype with a per-column dtype mapping matches pandas."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.astype({"b": "float", "d": "int"})

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.astype({"b": "float", "d": "int"})

    exp = to_pandas(modin_df)

    df_equals(ref, exp)
Expand Down

0 comments on commit 236c2c6

Please sign in to comment.