Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Math features migrate to numpy #774

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions feature_engine/creation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
The module creation includes classes to create new variables by combination of existing
variables in the dataframe.
"""

from .custom_functions import CustomFunctions
from .cyclical_features import CyclicalFeatures
from .decision_tree_features import DecisionTreeFeatures
from .math_features import MathFeatures
Expand All @@ -12,4 +14,5 @@
"MathFeatures",
"RelativeFeatures",
"CyclicalFeatures",
"CustomFunctions",
]
10 changes: 10 additions & 0 deletions feature_engine/creation/custom_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
CustomFunctions is a wrapper, which allows MathFunctions to detect if a
custom function should be processed via pandas.agg() or numpy.apply_over_axes()
"""


class CustomFunctions:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this class? it seems to be implementing a very simple functionality. Can this not be a one liner in the main MathFeatures class?

Besides that, the class name does not immediately tell what the class is doing, we'd need docstrings and typehints.


def __init__(self, scope_target):
self.scope_target = scope_target
199 changes: 180 additions & 19 deletions feature_engine/creation/math_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any, List, Optional, Union

import numpy as np
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
Expand Down Expand Up @@ -68,6 +69,11 @@ class MathFeatures(BaseCreation):
one name per function. If None, the transformer will assign arbitrary names,
starting with the function and followed by the variables separated by _.

ddof: int, float, default = 1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can ddof ever be a float? the ddof are usually integers, no?

Means Delta Degrees of Freedom. The divisor used in calculations is
N - ddof, where N represents the number of elements.
By default ddof is 1.

{missing_values}

{drop_original}
Expand Down Expand Up @@ -130,6 +136,54 @@ class MathFeatures(BaseCreation):
0 1 4 2.5
1 2 5 3.5
2 3 6 4.5

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of this, should go in the user guide, not here. It is too much information for the class docstrings. But let's hold on with the change until we finalize the class update

We do not recommend using a custum function in combination with
the pandas.agg(). Due to performance issues it should be avoided.
For the sake of completeness a short example:

>>> import pandas as pd
>>> from feature_engine.creation import MathFeatures
>>>
>>> def customfunction_agg(series):
>>> # pandas.agg calls the custom-function twice
>>> # first with a non series type
>>> # second with a series type -> we need the series type
>>> if not isinstance(series, pd.Series):
>>> raise ValueError("Only Series allowed")
>>> result = series["Age"] + series["Marks"]
>>> return result

>>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6]))
>>> mf = MathFeatures(variables = ["x1","x2"], func = "customfunction_agg")
>>> mf.fit(X)
>>> X = mf.transform(X))
x1 x2 customfunction_agg_x1_x2
0 1 4 5
1 2 5 7
2 3 6 9

We recommend the usage of custom functions via the numpy.apply_over_axes().
A custom function gets processed via numpy.apply_over_axes() when we extend
a provided wrapper class.

>>> import pandas as pd
>>> import numpy as np
>>> from feature_engine.creation import MathFeatures
>>> from feature_engine.creation.custom_functions import CustomFunctions
>>> class custom_function_1(CustomFunctions):
>>> def domain_specific_custom_function_1(self, df, a):
>>> result = np.sum(df, axis=1)
>>> return result
>>> cufu = custom_function_1(scope_target="numpy")
>>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6]))
>>> mf = MathFeatures(variables = ["x1","x2"],
>>> func = [cufu.domain_specific_custom_function_1])
>>> mf.fit(X)
>>> X = mf.transform(X)
x1 x2 domain_specific_custom_function_1_x1_x2
0 1 4 5
1 2 5 7
2 3 6 9
"""

def __init__(
Expand All @@ -139,8 +193,19 @@ def __init__(
new_variables_names: Optional[List[str]] = None,
missing_values: str = "raise",
drop_original: bool = False,
ddof: Union[int, float] = 1,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ddof should be just int

) -> None:

# casting input parameter func to a list
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for compatibility with sklearn, we can't modify the parameters that the user enters at init. They need to remain the same. Anything that needs changing, needs to happen in the fit and be added as a new attribute followed by _ if necessary at all.

function_input = []
if isinstance(func, str):
function_input.append(func)
elif isinstance(func, list):
function_input = func
else:
function_input = func
func = function_input

if (
not isinstance(variables, list)
or not all(isinstance(var, (int, str)) for var in variables)
Expand Down Expand Up @@ -175,18 +240,16 @@ def __init__(
"The number of new feature names must coincide with the number "
"of functions."
)
else:
if len(new_variables_names) != 1:
raise ValueError(
"The number of new feature names must coincide with the number "
"of functions."
)

if not (isinstance(ddof, int) or isinstance(ddof, float)):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if not insinstance(ddof, (int, float)):
assuming that we allow it to take both values.

raise ValueError("The type of ddof has to be Integer or Float")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

error message needs to contain "Got {ddof} instead." to help user undestand why the error


super().__init__(missing_values, drop_original)

self.variables = variables
self.func = func
self.new_variables_names = new_variables_names
self.ddof = ddof

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Expand All @@ -202,14 +265,117 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
The input dataframe plus the new variables.
"""

def np_transform(np_df, new_variable_names, np_variables, np_functions):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need typehint and better function names. What is np_transform supposed to do? not immediately obvious from the name. I'd also not define a method within a method. Let's define it outside preceded by _

np_result_df = pd.DataFrame()
for np_function_idx, np_function in enumerate(np_functions):
if callable(np_function):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it is a callable, can we not apply it straightaway? is not getting the name and then the if loop adding complexity?

np_function_name = np_function.__name__
else:
np_function_name = np_function

if np_function_name in ("sum"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think asserting equality is faster than scanning something in set, is that the case? did you check?

result = np.nansum(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need continue here? so that it reads the following elif? should we not just use if then?


elif np_function_name in ("mean"):
result = np.nanmean(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("min",):
result = np.nanmin(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("max",):
result = np.nanmax(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("prod"):
result = np.nanprod(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("median"):
result = np.nanmedian(
np_df[np_variables],
axis=1,
)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("std"):
result = np.nanstd(np_df[np_variables], axis=1, ddof=self.ddof)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

elif np_function_name in ("var"):
result = np.nanvar(np_df[np_variables], axis=1, ddof=self.ddof)
np_result_df[new_variable_names[np_function_idx]] = pd.Series(
result
)
continue

else:
try:
scope_target = np_function.__self__.scope_target
except Exception:
scope_target = "pandas"

if scope_target == "numpy":
result = np.apply_over_axes(np_function, np_df[np_variables], 1)
np_result_df[new_variable_names[np_function_idx]] = (
pd.DataFrame(
data=result,
columns=[new_variable_names[np_function_idx]],
)
)
elif scope_target == "pandas":
result = np_df[np_variables].agg(np_function, axis=1)
np_result_df[new_variable_names[np_function_idx]] = result

return np_result_df

X = self._check_transform_input_and_state(X)

new_variable_names = self._get_new_features_name()

if len(new_variable_names) == 1:
X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1)
else:
X[new_variable_names] = X[self.variables].agg(self.func, axis=1)
X = pd.concat(
[X, np_transform(X, new_variable_names, self.variables, self.func)],
axis=1,
)

if self.drop_original:
X.drop(columns=self.variables, inplace=True)
Expand All @@ -226,14 +392,9 @@ def _get_new_features_name(self) -> List:
else:
varlist = [f"{var}" for var in self.variables_]

if isinstance(self.func, list):
functions = [
fun if type(fun) is str else fun.__name__ for fun in self.func
]
feature_names = [
f"{function}_{'_'.join(varlist)}" for function in functions
]
else:
feature_names = [f"{self.func}_{'_'.join(varlist)}"]
functions = [fun if type(fun) is str else fun.__name__ for fun in self.func]
feature_names = [
f"{function}_{'_'.join(varlist)}" for function in functions
]

return feature_names
Loading