Skip to content

Commit

Permalink
Merge pull request modin-project#45 from intel-go/izamyati/groupby_size_count
Browse files Browse the repository at this point in the history

Izamyati/groupby.size()/count()
  • Loading branch information
Garra1980 authored Jun 19, 2020
2 parents 3c7f79a + 6e394ac commit 236c2c6
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 37 deletions.
74 changes: 69 additions & 5 deletions modin/experimental/backends/omnisci/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,40 @@ def view(self, index=None, columns=None):
self._modin_frame.mask(row_numeric_idx=index, col_numeric_idx=columns)
)

def groupby_size(
    query_compiler, by, axis, groupby_args, map_args, **kwargs,
):
    """Perform a groupby size aggregation.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        The query compiler holding the frame to aggregate.
    by : BaseQueryCompiler
        The query compiler object to group by.
    axis : 0 or 1
        The axis to groupby. Must be 0 currently.
    groupby_args : dict
        The arguments for the groupby component. Must contain a
        "squeeze" key.
    map_args : dict
        The arguments for the map phase (unused here; kept so the
        signature matches the other groupby reductions).
    **kwargs : dict
        Additional arguments forwarded to the frame's ``groupby_agg``.

    Returns
    -------
    BaseQueryCompiler
        A new query compiler with the "size" aggregation applied,
        squeezed when ``groupby_args["squeeze"]`` is truthy.
    """
    new_frame = query_compiler._modin_frame.groupby_agg(
        by, axis, "size", groupby_args, **kwargs
    )
    new_qc = query_compiler.__constructor__(new_frame)
    # Mirror pandas squeezing of a single-column result into a Series.
    if groupby_args["squeeze"]:
        new_qc = new_qc.squeeze()
    return new_qc

def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
"""Groupby with sum aggregation.
Expand All @@ -104,8 +138,8 @@ def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
Returns
-------
PandasQueryCompiler
A new PandasQueryCompiler
QueryCompiler
A new QueryCompiler
"""
new_frame = query_compiler._modin_frame.groupby_agg(
by, axis, "sum", groupby_args, **kwargs
Expand All @@ -115,6 +149,38 @@ def groupby_sum(query_compiler, by, axis, groupby_args, map_args, **kwargs):
new_qc = new_qc.squeeze()
return new_qc

def groupby_count(query_compiler, by, axis, groupby_args, map_args, **kwargs):
    """Perform a groupby count aggregation.

    Parameters
    ----------
    query_compiler : BaseQueryCompiler
        The query compiler holding the frame to aggregate.
    by : BaseQueryCompiler
        The query compiler object to group by.
    axis : 0 or 1
        The axis to groupby. Must be 0 currently.
    groupby_args : dict
        The arguments for the groupby component. Must contain a
        "squeeze" key.
    map_args : dict
        The arguments for the map phase (unused here; kept so the
        signature matches the other groupby reductions).
    **kwargs : dict
        Additional arguments forwarded to the frame's ``groupby_agg``.

    Returns
    -------
    BaseQueryCompiler
        A new query compiler with the "count" aggregation applied,
        squeezed when ``groupby_args["squeeze"]`` is truthy.
    """
    new_frame = query_compiler._modin_frame.groupby_agg(
        by, axis, "count", groupby_args, **kwargs
    )
    new_qc = query_compiler.__constructor__(new_frame)
    # Mirror pandas squeezing of a single-column result into a Series.
    if groupby_args["squeeze"]:
        new_qc = new_qc.squeeze()
    return new_qc

def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
"""Apply aggregation functions to a grouped dataframe per-column.
Expand Down Expand Up @@ -385,12 +451,10 @@ def dtypes(self):
groupby_agg = DFAlgNotSupported("groupby_agg")
groupby_all = DFAlgNotSupported("groupby_all")
groupby_any = DFAlgNotSupported("groupby_any")
groupby_count = DFAlgNotSupported("groupby_count")
groupby_max = DFAlgNotSupported("groupby_max")
groupby_min = DFAlgNotSupported("groupby_min")
groupby_prod = DFAlgNotSupported("groupby_prod")
groupby_reduce = DFAlgNotSupported("groupby_reduce")
groupby_size = DFAlgNotSupported("groupby_size")
head = DFAlgNotSupported("head")
idxmax = DFAlgNotSupported("idxmax")
idxmin = DFAlgNotSupported("idxmin")
Expand Down Expand Up @@ -424,7 +488,7 @@ def dtypes(self):
rtruediv = DFAlgNotSupported("rtruediv")
skew = DFAlgNotSupported("skew")
series_update = DFAlgNotSupported("series_update")
series_view = DFAlgNotSupported("series_view")
series_view = DFAlgNotSupported("series_view")
sort_index = DFAlgNotSupported("sort_index")
std = DFAlgNotSupported("std")
sum = DFAlgNotSupported("sum")
Expand Down
88 changes: 56 additions & 32 deletions modin/experimental/engines/omnisci_on_ray/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,14 @@ def groupby_sum(df, cols, as_index, **kwargs):

run_and_compare(groupby_sum, data=self.data, cols=cols, as_index=as_index)

@pytest.mark.parametrize("cols", cols_value)
@pytest.mark.parametrize("as_index", bool_arg_values)
def test_groupby_count(self, cols, as_index):
    """Check groupby(...).count() matches between modin and pandas."""

    def groupby_count(df, cols, as_index, **kwargs):
        grouped = df.groupby(cols, as_index=as_index)
        return grouped.count()

    run_and_compare(groupby_count, data=self.data, cols=cols, as_index=as_index)

@pytest.mark.parametrize("cols", cols_value)
@pytest.mark.parametrize("as_index", bool_arg_values)
def test_groupby_proj_sum(self, cols, as_index):
Expand All @@ -163,49 +171,70 @@ def test_groupby_agg_count(self):

df_equals(ref, exp)

def test_groupby_agg_size(self):
    """Check groupby('a').agg({'b': 'size'}) matches between modin and pandas."""
    expected = pd.DataFrame(self.data).groupby("a").agg({"b": "size"})

    md_result = mpd.DataFrame(self.data).groupby("a").agg({"b": "size"})
    actual = to_pandas(md_result)

    df_equals(expected, actual)

# Shared fixture for the taxi-query tests: two int key columns ("a", "b"),
# a datetime column "c" parsed from YYYYMMDD strings, and a float column "d".
taxi_data = {
    "a": [1, 1, 2, 2],
    "b": [11, 21, 12, 11],
    "c": pd.to_datetime(
        ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d"
    ),
    "d": [11.5, 21.2, 12.8, 13.4],
}

# TODO: emulate taxi queries with group by category types when we have loading
# using arrow
# Another way of doing taxi q1 is
# res = df.groupby("cab_type").size() - this should be tested later as well
def test_taxi_q1(self):
    """Taxi query 1: row count per group via groupby(...).size()."""
    df = pd.DataFrame(self.taxi_data)
    # TODO: For now we can't do such groupby by first column since modin uses
    # that column as the aggregation one by default. We don't support such
    # cases at the moment, this will be handled later
    # ref = df.groupby("a").size()
    ref = df.groupby("b").size()

    modin_df = mpd.DataFrame(self.taxi_data)
    # modin_df = modin_df.groupby("a").size()
    modin_df = modin_df.groupby("b").size()

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_taxi_q2(self):
    """Taxi query 2: mean of column 'b' grouped by 'a'."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.groupby("a").agg({"b": "mean"})

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.groupby("a").agg({"b": "mean"})

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

# Fixture: four rows with int key columns "a" and "b" and a datetime
# column "c" parsed from YYYYMMDD strings.
# NOTE(review): `taxi_data` above holds the same columns plus a float
# column "d", and the taxi tests reference `self.taxi_data` — confirm
# whether this fixture is still used or is a superseded leftover.
datetime_data = {
    "a": [1, 1, 2, 2],
    "b": [11, 21, 12, 11],
    "c": pd.to_datetime(
        ["20190902", "20180913", "20190921", "20180903"], format="%Y%m%d"
    ),
}

@pytest.mark.parametrize("as_index", bool_arg_values)
def test_taxi_q3(self, as_index):
    """Taxi query 3: size grouped by 'b' and the year of datetime column 'c'."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.groupby(["b", df["c"].dt.year], as_index=as_index).size()

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.groupby(
        ["b", modin_df["c"].dt.year], as_index=as_index
    ).size()

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_groupby_expr_col(self):
def groupby(df, **kwargs):
Expand All @@ -217,30 +246,25 @@ def groupby(df, **kwargs):
df = df.groupby(["id1", "id2"], as_index=False).agg({"b": "max"})
return df

run_and_compare(groupby, data=self.datetime_data)

astype_data = {
"a": [1, 1, 2],
"b": [11.5, 21.2, 12.8],
}
run_and_compare(groupby, data=self.taxi_data)

def test_series_astype(self):
    """Check Series.astype('int') on the float column matches pandas."""
    df = pd.DataFrame(self.taxi_data)
    ref = df["d"].astype("int")

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df["d"].astype("int")

    exp = to_pandas(modin_df)

    df_equals(ref, exp)

def test_df_astype(self):
    """Check DataFrame.astype with a per-column dtype mapping matches pandas."""
    df = pd.DataFrame(self.taxi_data)
    ref = df.astype({"b": "float", "d": "int"})

    modin_df = mpd.DataFrame(self.taxi_data)
    modin_df = modin_df.astype({"b": "float", "d": "int"})

    exp = to_pandas(modin_df)

    df_equals(ref, exp)
Expand Down

0 comments on commit 236c2c6

Please sign in to comment.