feature-engine · olikra · Jun 23, 2024 · Jun 23, 2024 · Jun 23, 2024 · Jun 23, 2024
diff --git a/feature_engine/creation/__init__.py b/feature_engine/creation/__init__.py
@@ -2,6 +2,8 @@
 The module creation includes classes to create new variables by combination of existing
 variables in the dataframe.
 """
+
+from .custom_functions import CustomFunctions
 from .cyclical_features import CyclicalFeatures
 from .decision_tree_features import DecisionTreeFeatures
 from .math_features import MathFeatures
@@ -12,4 +14,5 @@
     "MathFeatures",
     "RelativeFeatures",
     "CyclicalFeatures",
+    "CustomFunctions",
 ]
diff --git a/feature_engine/creation/custom_functions.py b/feature_engine/creation/custom_functions.py
@@ -0,0 +1,10 @@
+"""
+CustomFunctions is a wrapper, which allows MathFunctions to detect if a
+custom function should be processed via pandas.agg() or numpy.apply_over_axes()
+"""
+
+
+class CustomFunctions:
+
+    def __init__(self, scope_target):
+        self.scope_target = scope_target
diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py
@@ -1,5 +1,6 @@
 from typing import Any, List, Optional, Union
 
+import numpy as np
 import pandas as pd
 
 from feature_engine._docstrings.fit_attributes import (
@@ -68,6 +69,11 @@ class MathFeatures(BaseCreation):
         one name per function. If None, the transformer will assign arbitrary names,
         starting with the function and followed by the variables separated by _.
 
+    ddof: int, float, default = 1
+        Means Delta Degrees of Freedom. The divisor used in calculations is
+        N - ddof, where N represents the number of elements.
+        By default ddof is 1.
+
     {missing_values}
 
     {drop_original}
@@ -130,6 +136,54 @@ class MathFeatures(BaseCreation):
     0   1   4         2.5
     1   2   5         3.5
     2   3   6         4.5
+
+    We do not recommend using a custum function in combination with
+    the pandas.agg(). Due to performance issues it should be avoided.
+    For the sake of completeness a short example:
+
+    >>> import pandas as pd
+    >>> from feature_engine.creation import MathFeatures
+    >>>
+    >>> def customfunction_agg(series):
+    >>>    # pandas.agg calls the custom-function twice
+    >>>    # first with a non series type
+    >>>    # second with a series type -> we need the series type
+    >>>    if not isinstance(series, pd.Series):
+    >>>        raise ValueError("Only Series allowed")
+    >>>    result = series["Age"] + series["Marks"]
+    >>>    return result
+
+    >>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6]))
+    >>> mf = MathFeatures(variables = ["x1","x2"], func = "customfunction_agg")
+    >>> mf.fit(X)
+    >>> X = mf.transform(X))
+       x1  x2  customfunction_agg_x1_x2
+    0   1   4         5
+    1   2   5         7
+    2   3   6         9
+
+    We recommend the usage of custom functions via the numpy.apply_over_axes().
+    A custom function gets processed via numpy.apply_over_axes() when we extend
+    a provided wrapper class.
+
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> from feature_engine.creation import MathFeatures
+    >>> from feature_engine.creation.custom_functions import CustomFunctions
+    >>> class custom_function_1(CustomFunctions):
+    >>>    def domain_specific_custom_function_1(self, df, a):
+    >>>       result = np.sum(df, axis=1)
+    >>>       return result
+    >>> cufu = custom_function_1(scope_target="numpy")
+    >>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6]))
+    >>> mf = MathFeatures(variables = ["x1","x2"],
+    >>>                   func = [cufu.domain_specific_custom_function_1])
+    >>> mf.fit(X)
+    >>> X = mf.transform(X)
+        x1  x2        domain_specific_custom_function_1_x1_x2
+    0   1   4         5
+    1   2   5         7
+    2   3   6         9
     """
 
     def __init__(
@@ -139,8 +193,19 @@ def __init__(
         new_variables_names: Optional[List[str]] = None,
         missing_values: str = "raise",
         drop_original: bool = False,
+        ddof: Union[int, float] = 1,
     ) -> None:
 
+        # casting input parameter func to a list
+        function_input = []
+        if isinstance(func, str):
+            function_input.append(func)
+        elif isinstance(func, list):
+            function_input = func
+        else:
+            function_input = func
+        func = function_input
+
         if (
             not isinstance(variables, list)
             or not all(isinstance(var, (int, str)) for var in variables)
@@ -175,18 +240,16 @@ def __init__(
                         "The number of new feature names must coincide with the number "
                         "of functions."
                     )
-            else:
-                if len(new_variables_names) != 1:
-                    raise ValueError(
-                        "The number of new feature names must coincide with the number "
-                        "of functions."
-                    )
+
+        if not (isinstance(ddof, int) or isinstance(ddof, float)):
+            raise ValueError("The type of ddof has to be Integer or Float")
 
         super().__init__(missing_values, drop_original)
 
         self.variables = variables
         self.func = func
         self.new_variables_names = new_variables_names
+        self.ddof = ddof
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """
@@ -202,14 +265,117 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
             The input dataframe plus the new variables.
         """
+
+        def np_transform(np_df, new_variable_names, np_variables, np_functions):
+            np_result_df = pd.DataFrame()
+            for np_function_idx, np_function in enumerate(np_functions):
+                if callable(np_function):
+                    np_function_name = np_function.__name__
+                else:
+                    np_function_name = np_function
+
+                if np_function_name in ("sum"):
+                    result = np.nansum(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("mean"):
+                    result = np.nanmean(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("min",):
+                    result = np.nanmin(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("max",):
+                    result = np.nanmax(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("prod"):
+                    result = np.nanprod(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("median"):
+                    result = np.nanmedian(
+                        np_df[np_variables],
+                        axis=1,
+                    )
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("std"):
+                    result = np.nanstd(np_df[np_variables], axis=1, ddof=self.ddof)
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                elif np_function_name in ("var"):
+                    result = np.nanvar(np_df[np_variables], axis=1, ddof=self.ddof)
+                    np_result_df[new_variable_names[np_function_idx]] = pd.Series(
+                        result
+                    )
+                    continue
+
+                else:
+                    try:
+                        scope_target = np_function.__self__.scope_target
+                    except Exception:
+                        scope_target = "pandas"
+
+                    if scope_target == "numpy":
+                        result = np.apply_over_axes(np_function, np_df[np_variables], 1)
+                        np_result_df[new_variable_names[np_function_idx]] = (
+                            pd.DataFrame(
+                                data=result,
+                                columns=[new_variable_names[np_function_idx]],
+                            )
+                        )
+                    elif scope_target == "pandas":
+                        result = np_df[np_variables].agg(np_function, axis=1)
+                        np_result_df[new_variable_names[np_function_idx]] = result
+
+            return np_result_df
+
         X = self._check_transform_input_and_state(X)
 
         new_variable_names = self._get_new_features_name()
 
-        if len(new_variable_names) == 1:
-            X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1)
-        else:
-            X[new_variable_names] = X[self.variables].agg(self.func, axis=1)
+        X = pd.concat(
+            [X, np_transform(X, new_variable_names, self.variables, self.func)],
+            axis=1,
+        )
 
         if self.drop_original:
             X.drop(columns=self.variables, inplace=True)
@@ -226,14 +392,9 @@ def _get_new_features_name(self) -> List:
         else:
             varlist = [f"{var}" for var in self.variables_]
 
-            if isinstance(self.func, list):
-                functions = [
-                    fun if type(fun) is str else fun.__name__ for fun in self.func
-                ]
-                feature_names = [
-                    f"{function}_{'_'.join(varlist)}" for function in functions
-                ]
-            else:
-                feature_names = [f"{self.func}_{'_'.join(varlist)}"]
+            functions = [fun if type(fun) is str else fun.__name__ for fun in self.func]
+            feature_names = [
+                f"{function}_{'_'.join(varlist)}" for function in functions
+            ]
 
         return feature_names