Skip to content

Commit

Permalink
FIX-#2313: improved handling non-numeric types at 'mean' when 'axis=1' (
Browse files Browse the repository at this point in the history
#2535)

Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev authored Dec 15, 2020
1 parent 1a8cd0a commit 787d2b1
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 24 deletions.
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,6 @@ def time_nunique(self, impl, data_type, data_size, axis):

def time_apply(self, impl, data_type, data_size, axis):
self.df.apply(lambda df: df.sum(), axis=axis)

def time_mean(self, impl, data_type, data_size, axis):
self.df.mean(axis=axis)
42 changes: 20 additions & 22 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,33 +677,31 @@ def mean(self, axis, **kwargs):

skipna = kwargs.get("skipna", True)

def map_apply_fn(ser, **kwargs):
try:
sum_result = ser.sum(skipna=skipna)
count_result = ser.count()
except TypeError:
return None
else:
return (sum_result, count_result)

def reduce_apply_fn(ser, **kwargs):
sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
return sum_result / count_result
# TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
# Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get incorrect
# divisor inside the reduce function.
def map_fn(df, **kwargs):
result = pandas.DataFrame(
{
"sum": df.sum(axis=axis, skipna=skipna),
"count": df.count(axis=axis, numeric_only=True),
}
)
return result if axis else result.T

def reduce_fn(df, **kwargs):
df.dropna(axis=1, inplace=True, how="any")
return build_applyier(reduce_apply_fn, axis=axis)(df)

def build_applyier(func, **applyier_kwargs):
def applyier(df, **kwargs):
result = df.apply(func, **applyier_kwargs)
return result.set_axis(df.axes[axis ^ 1], axis=0)
sum_cols = df["sum"] if axis else df.loc["sum"]
count_cols = df["count"] if axis else df.loc["count"]

return applyier
if not isinstance(sum_cols, pandas.Series):
# If we got `NaN` as the result of the sum in any axis partition,
# then we must consider the whole sum as `NaN`, so setting `skipna=False`
sum_cols = sum_cols.sum(axis=axis, skipna=False)
count_cols = count_cols.sum(axis=axis, skipna=False)
return sum_cols / count_cols

return MapReduceFunction.register(
build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
map_fn,
reduce_fn,
preserve_index=(kwargs.get("numeric_only") is not None),
)(self, axis=axis, **kwargs)
Expand Down
2 changes: 0 additions & 2 deletions modin/pandas/test/dataframe/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,6 @@ def test_sum_single_column(data):
"numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
)
def test_reduction_specific(fn, numeric_only, axis):
if fn == "mean" and axis == 1:
pytest.skip("See issue #2313 for details")
eval_general(
*create_test_dfs(test_data_diff_dtype),
lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),
Expand Down

0 comments on commit 787d2b1

Please sign in to comment.