FIX-#2313: improved handling of non-numeric types in 'mean' when 'axis=1' (#2535)

Signed-off-by: Dmitry Chigarev <[email protected]>
modin-project · Dec 15, 2020 · 787d2b1 · 787d2b1
1 parent 1a8cd0a
commit 787d2b1
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 24 deletions.
diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py
@@ -260,3 +260,6 @@ def time_nunique(self, impl, data_type, data_size, axis):
 
     def time_apply(self, impl, data_type, data_size, axis):
         self.df.apply(lambda df: df.sum(), axis=axis)
+
+    def time_mean(self, impl, data_type, data_size, axis):
+        self.df.mean(axis=axis)
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
@@ -677,33 +677,31 @@ def mean(self, axis, **kwargs):
 
         skipna = kwargs.get("skipna", True)
 
-        def map_apply_fn(ser, **kwargs):
-            try:
-                sum_result = ser.sum(skipna=skipna)
-                count_result = ser.count()
-            except TypeError:
-                return None
-            else:
-                return (sum_result, count_result)
-
-        def reduce_apply_fn(ser, **kwargs):
-            sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
-            count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
-            return sum_result / count_result
+        # TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
+        # Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get an
+        # incorrect divisor inside the reduce function.
+        def map_fn(df, **kwargs):
+            result = pandas.DataFrame(
+                {
+                    "sum": df.sum(axis=axis, skipna=skipna),
+                    "count": df.count(axis=axis, numeric_only=True),
+                }
+            )
+            return result if axis else result.T
 
         def reduce_fn(df, **kwargs):
-            df.dropna(axis=1, inplace=True, how="any")
-            return build_applyier(reduce_apply_fn, axis=axis)(df)
-
-        def build_applyier(func, **applyier_kwargs):
-            def applyier(df, **kwargs):
-                result = df.apply(func, **applyier_kwargs)
-                return result.set_axis(df.axes[axis ^ 1], axis=0)
+            sum_cols = df["sum"] if axis else df.loc["sum"]
+            count_cols = df["count"] if axis else df.loc["count"]
 
-            return applyier
+            if not isinstance(sum_cols, pandas.Series):
+                # If we got `NaN` as the result of the sum in any axis partition,
+                # then we must consider the whole sum as `NaN`, so setting `skipna=False`
+                sum_cols = sum_cols.sum(axis=axis, skipna=False)
+                count_cols = count_cols.sum(axis=axis, skipna=False)
+            return sum_cols / count_cols
 
         return MapReduceFunction.register(
-            build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
+            map_fn,
             reduce_fn,
             preserve_index=(kwargs.get("numeric_only") is not None),
         )(self, axis=axis, **kwargs)

diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py
@@ -363,8 +363,6 @@ def test_sum_single_column(data):
     "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
 )
 def test_reduction_specific(fn, numeric_only, axis):
-    if fn == "mean" and axis == 1:
-        pytest.skip("See issue #2313 for details")
     eval_general(
         *create_test_dfs(test_data_diff_dtype),
         lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),