-
-
Notifications
You must be signed in to change notification settings - Fork 310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Math features migrate to numpy #774
base: main
Are you sure you want to change the base?
Changes from all commits
22b03cf
43ac1e1
bce30b8
3e1893c
f60867a
4fde225
5310466
df9012f
be0e8bd
2152ab8
ebba2c4
47fa4cb
2bfbc59
2b40910
d77d5ff
2f4f6ab
923c2d4
2c92d43
9c7cf82
63f26c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
""" | ||
CustomFunctions is a wrapper, which allows MathFunctions to detect if a | ||
custom function should be processed via pandas.agg() or numpy.apply_over_axes() | ||
""" | ||
|
||
|
||
class CustomFunctions: | ||
|
||
def __init__(self, scope_target): | ||
self.scope_target = scope_target |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from typing import Any, List, Optional, Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from feature_engine._docstrings.fit_attributes import ( | ||
|
@@ -68,6 +69,11 @@ class MathFeatures(BaseCreation): | |
one name per function. If None, the transformer will assign arbitrary names, | ||
starting with the function and followed by the variables separated by _. | ||
|
||
ddof: int, float, default = 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can ddof ever be a float? the ddof are usually integers, no? |
||
Means Delta Degrees of Freedom. The divisor used in calculations is | ||
N - ddof, where N represents the number of elements. | ||
By default ddof is 1. | ||
|
||
{missing_values} | ||
|
||
{drop_original} | ||
|
@@ -130,6 +136,54 @@ class MathFeatures(BaseCreation): | |
0 1 4 2.5 | ||
1 2 5 3.5 | ||
2 3 6 4.5 | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of this, should go in the user guide, not here. It is too much information for the class docstrings. But let's hold on with the change until we finalize the class update |
||
We do not recommend using a custum function in combination with | ||
the pandas.agg(). Due to performance issues it should be avoided. | ||
For the sake of completeness a short example: | ||
|
||
>>> import pandas as pd | ||
>>> from feature_engine.creation import MathFeatures | ||
>>> | ||
>>> def customfunction_agg(series): | ||
>>> # pandas.agg calls the custom-function twice | ||
>>> # first with a non series type | ||
>>> # second with a series type -> we need the series type | ||
>>> if not isinstance(series, pd.Series): | ||
>>> raise ValueError("Only Series allowed") | ||
>>> result = series["Age"] + series["Marks"] | ||
>>> return result | ||
|
||
>>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6])) | ||
>>> mf = MathFeatures(variables = ["x1","x2"], func = "customfunction_agg") | ||
>>> mf.fit(X) | ||
>>> X = mf.transform(X)) | ||
x1 x2 customfunction_agg_x1_x2 | ||
0 1 4 5 | ||
1 2 5 7 | ||
2 3 6 9 | ||
|
||
We recommend the usage of custom functions via the numpy.apply_over_axes(). | ||
A custom function gets processed via numpy.apply_over_axes() when we extend | ||
a provided wrapper class. | ||
|
||
>>> import pandas as pd | ||
>>> import numpy as np | ||
>>> from feature_engine.creation import MathFeatures | ||
>>> from feature_engine.creation.custom_functions import CustomFunctions | ||
>>> class custom_function_1(CustomFunctions): | ||
>>> def domain_specific_custom_function_1(self, df, a): | ||
>>> result = np.sum(df, axis=1) | ||
>>> return result | ||
>>> cufu = custom_function_1(scope_target="numpy") | ||
>>> X = pd.DataFrame(dict(x1 = [1,2,3], x2 = [4,5,6])) | ||
>>> mf = MathFeatures(variables = ["x1","x2"], | ||
>>> func = [cufu.domain_specific_custom_function_1]) | ||
>>> mf.fit(X) | ||
>>> X = mf.transform(X) | ||
x1 x2 domain_specific_custom_function_1_x1_x2 | ||
0 1 4 5 | ||
1 2 5 7 | ||
2 3 6 9 | ||
""" | ||
|
||
def __init__( | ||
|
@@ -139,8 +193,19 @@ def __init__( | |
new_variables_names: Optional[List[str]] = None, | ||
missing_values: str = "raise", | ||
drop_original: bool = False, | ||
ddof: Union[int, float] = 1, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think ddof should be just int |
||
) -> None: | ||
|
||
# casting input parameter func to a list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for compatibility with sklearn, we can't modify the parameters that the user enters at init. They need to remain the same. Anything that needs changing, needs to happen in the fit and be added as a new attribute followed by _ if necessary at all. |
||
function_input = [] | ||
if isinstance(func, str): | ||
function_input.append(func) | ||
elif isinstance(func, list): | ||
function_input = func | ||
else: | ||
function_input = func | ||
func = function_input | ||
|
||
if ( | ||
not isinstance(variables, list) | ||
or not all(isinstance(var, (int, str)) for var in variables) | ||
|
@@ -175,18 +240,16 @@ def __init__( | |
"The number of new feature names must coincide with the number " | ||
"of functions." | ||
) | ||
else: | ||
if len(new_variables_names) != 1: | ||
raise ValueError( | ||
"The number of new feature names must coincide with the number " | ||
"of functions." | ||
) | ||
|
||
if not (isinstance(ddof, int) or isinstance(ddof, float)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if not insinstance(ddof, (int, float)): |
||
raise ValueError("The type of ddof has to be Integer or Float") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. error message needs to contain "Got {ddof} instead." to help user undestand why the error |
||
|
||
super().__init__(missing_values, drop_original) | ||
|
||
self.variables = variables | ||
self.func = func | ||
self.new_variables_names = new_variables_names | ||
self.ddof = ddof | ||
|
||
def transform(self, X: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
|
@@ -202,14 +265,117 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: | |
X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations] | ||
The input dataframe plus the new variables. | ||
""" | ||
|
||
def np_transform(np_df, new_variable_names, np_variables, np_functions): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need typehint and better function names. What is np_transform supposed to do? not immediately obvious from the name. I'd also not define a method within a method. Let's define it outside preceded by _ |
||
np_result_df = pd.DataFrame() | ||
for np_function_idx, np_function in enumerate(np_functions): | ||
if callable(np_function): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if it is a callable, can we not apply it straightaway? is not getting the name and then the if loop adding complexity? |
||
np_function_name = np_function.__name__ | ||
else: | ||
np_function_name = np_function | ||
|
||
if np_function_name in ("sum"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think asserting equality is faster than scanning something in set, is that the case? did you check? |
||
result = np.nansum( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need continue here? so that it reads the following elif? should we not just use if then? |
||
|
||
elif np_function_name in ("mean"): | ||
result = np.nanmean( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("min",): | ||
result = np.nanmin( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("max",): | ||
result = np.nanmax( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("prod"): | ||
result = np.nanprod( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("median"): | ||
result = np.nanmedian( | ||
np_df[np_variables], | ||
axis=1, | ||
) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("std"): | ||
result = np.nanstd(np_df[np_variables], axis=1, ddof=self.ddof) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
elif np_function_name in ("var"): | ||
result = np.nanvar(np_df[np_variables], axis=1, ddof=self.ddof) | ||
np_result_df[new_variable_names[np_function_idx]] = pd.Series( | ||
result | ||
) | ||
continue | ||
|
||
else: | ||
try: | ||
scope_target = np_function.__self__.scope_target | ||
except Exception: | ||
scope_target = "pandas" | ||
|
||
if scope_target == "numpy": | ||
result = np.apply_over_axes(np_function, np_df[np_variables], 1) | ||
np_result_df[new_variable_names[np_function_idx]] = ( | ||
pd.DataFrame( | ||
data=result, | ||
columns=[new_variable_names[np_function_idx]], | ||
) | ||
) | ||
elif scope_target == "pandas": | ||
result = np_df[np_variables].agg(np_function, axis=1) | ||
np_result_df[new_variable_names[np_function_idx]] = result | ||
|
||
return np_result_df | ||
|
||
X = self._check_transform_input_and_state(X) | ||
|
||
new_variable_names = self._get_new_features_name() | ||
|
||
if len(new_variable_names) == 1: | ||
X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1) | ||
else: | ||
X[new_variable_names] = X[self.variables].agg(self.func, axis=1) | ||
X = pd.concat( | ||
[X, np_transform(X, new_variable_names, self.variables, self.func)], | ||
axis=1, | ||
) | ||
|
||
if self.drop_original: | ||
X.drop(columns=self.variables, inplace=True) | ||
|
@@ -226,14 +392,9 @@ def _get_new_features_name(self) -> List: | |
else: | ||
varlist = [f"{var}" for var in self.variables_] | ||
|
||
if isinstance(self.func, list): | ||
functions = [ | ||
fun if type(fun) is str else fun.__name__ for fun in self.func | ||
] | ||
feature_names = [ | ||
f"{function}_{'_'.join(varlist)}" for function in functions | ||
] | ||
else: | ||
feature_names = [f"{self.func}_{'_'.join(varlist)}"] | ||
functions = [fun if type(fun) is str else fun.__name__ for fun in self.func] | ||
feature_names = [ | ||
f"{function}_{'_'.join(varlist)}" for function in functions | ||
] | ||
|
||
return feature_names |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need this class? it seems to be implementing a very simple functionality. Can this not be a one liner in the main MathFeatures class?
Besides that, the class name does not immediately tell what the class is doing, we'd need docstrings and typehints.