diff --git a/feature_engine/selection/shuffle_features.py b/feature_engine/selection/shuffle_features.py index 1dcee62a2..69db54d7b 100644 --- a/feature_engine/selection/shuffle_features.py +++ b/feature_engine/selection/shuffle_features.py @@ -5,7 +5,7 @@ from sklearn.base import is_classifier from sklearn.metrics import get_scorer from sklearn.model_selection import check_cv, cross_validate -from sklearn.utils.validation import check_random_state +from sklearn.utils.validation import check_random_state, _check_sample_weight from feature_engine._docstrings.fit_attributes import ( _feature_names_in_docstring, @@ -185,7 +185,12 @@ def __init__( self.cv = cv self.random_state = random_state - def fit(self, X: pd.DataFrame, y: pd.Series): + def fit( + self, + X: pd.DataFrame, + y: pd.Series, + sample_weight: Union[np.ndarray, pd.Series, List] = None, + ): """ Find the important features. @@ -193,8 +198,12 @@ def fit(self, X: pd.DataFrame, y: pd.Series): ---------- X: pandas dataframe of shape = [n_samples, n_features] The input dataframe. + y: array-like of shape (n_samples) Target variable. Required to train the estimator. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted.
""" X, y = check_X_y(X, y) @@ -203,6 +212,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): X = X.reset_index(drop=True) y = y.reset_index(drop=True) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + # If required exclude variables that are not in the input dataframe self._confirm_variables(X) @@ -220,6 +232,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): cv=self.cv, return_estimator=True, scoring=self.scoring, + fit_params={"sample_weight": sample_weight}, ) # store initial model performance diff --git a/feature_engine/tags.py b/feature_engine/tags.py index fa070a331..6dc3647b7 100644 --- a/feature_engine/tags.py +++ b/feature_engine/tags.py @@ -14,6 +14,7 @@ def _return_tags(): # The test aims to check that the check_X_y function from sklearn is # working, but we do not use that check, because we work with dfs. "check_transformer_data_not_an_array": "Ok to fail", + "check_sample_weights_not_an_array": "Ok to fail", # TODO: we probably need the test below!! "check_methods_sample_order_invariance": "Test does not work on dataframes", # TODO: we probably need the test below!! diff --git a/tests/test_selection/test_shuffle_features.py b/tests/test_selection/test_shuffle_features.py index 61e1d279e..6cafd677b 100644 --- a/tests/test_selection/test_shuffle_features.py +++ b/tests/test_selection/test_shuffle_features.py @@ -134,3 +134,21 @@ def test_automatic_variable_selection(df_test): ] # test transform output pd.testing.assert_frame_equal(sel.transform(X), Xtransformed) + + +def test_sample_weights(): + X = pd.DataFrame( + dict( + x1=[1000, 2000, 1000, 1000, 2000, 3000], + x2=[1000, 2000, 1000, 1000, 2000, 3000], + ) + ) + y = pd.Series([1, 0, 0, 1, 1, 0]) + + sbs = SelectByShuffling( + RandomForestClassifier(random_state=42), cv=2, random_state=42 + ) + + sample_weight = [1000, 2000, 1000, 1000, 2000, 3000] + sbs.fit_transform(X, y, sample_weight=sample_weight) + assert sbs.initial_model_performance_ == 0.125