diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
index b2ced0895..7c46e5192 100644
--- a/openstef/feature_engineering/missing_values_transformer.py
+++ b/openstef/feature_engineering/missing_values_transformer.py
@@ -27,6 +27,7 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: str = None,
         fill_value: Union[str, int, float] = None,
+        no_fill_future_values_features: List[str] = None,
     ):
         """Initialize missing values handler.
 
@@ -37,11 +38,14 @@ def __init__(
                 Can be one of "mean", "median", "most_frequent", "constant" or None.
             fill_value: When strategy == "constant", fill_value is used to replace all
                 occurrences of missing_values.
-
+            no_fill_future_values_features: The features for which it does not make sense
+                to fill future values. Rows that contain trailing null values for these
+                features will be removed from the data.
         """
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy
         self.fill_value = fill_value
+        self.no_fill_future_values_features = no_fill_future_values_features or []
         self.is_fitted_ = False
 
         # Build the proper imputation transformer
@@ -57,6 +61,11 @@ def __init__(
         ).set_output(transform="pandas")
         self.imputer_._validate_params()
 
+    @staticmethod
+    def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
+        """Determine rows with trailing null values in a DataFrame."""
+        return ~x.bfill().isnull().any(axis="columns")
+
     def fit(self, x, y=None):
         """Fit the imputer on the input data."""
         _ = check_array(x, force_all_finite="allow-nan")
@@ -69,9 +78,17 @@ def fit(self, x, y=None):
         # Remove always null columns
         is_column_null = x.isnull().all(axis="index")
         self.non_null_feature_names = list(x.columns[~is_column_null])
+        x = x[self.non_null_feature_names]
+
+        # Remove trailing null rows for features that should
+        # not be imputed in the future
+        trailing_null_rows = self._determine_trailing_null_rows(
+            x[self.no_fill_future_values_features]
+        )
+        x = x.loc[trailing_null_rows]
 
         # Imputers do not support labels
-        self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
+        self.imputer_.fit(X=x, y=None)
         self.is_fitted_ = True
 
     def transform(self, x) -> pd.DataFrame:
@@ -83,9 +100,11 @@ def transform(self, x) -> pd.DataFrame:
 
         x = x[self.non_null_feature_names]
 
-        return self.imputer_.transform(x)
+        transformed = self.imputer_.transform(x)
 
-    def fit_transform(self, x, y=None):
+        return transformed
+
+    def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
         """Fit the imputer on the input data and transform it.
 
         Returns:
@@ -93,7 +112,25 @@
 
         """
         self.fit(x, y)
-        return self.transform(x)
+
+        if not isinstance(x, pd.DataFrame):
+            x = pd.DataFrame(np.asarray(x))
+
+        x = x[self.non_null_feature_names]
+
+        # Remove trailing null rows for features that should
+        # not be imputed in the future
+        non_trailing_null_rows = self._determine_trailing_null_rows(
+            x[self.no_fill_future_values_features]
+        )
+        x = x.loc[non_trailing_null_rows]
+
+        x = self.transform(x)
+
+        if y is not None:
+            y = y.loc[non_trailing_null_rows]
+
+        return x, y
 
     @classmethod
     def _identity(cls, x):
diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
index 837c6052a..bd2448889 100644
--- a/openstef/model/model_creator.py
+++ b/openstef/model/model_creator.py
@@ -116,6 +116,7 @@
         "missing_values",
         "imputation_strategy",
         "fill_value",
+        "no_fill_future_values_features",
     ],
     MLModelType.ARIMA: [
         "backtest_max_horizon",
diff --git a/openstef/model/regressors/flatliner.py b/openstef/model/regressors/flatliner.py
index 764773d52..995052bbf 100644
--- a/openstef/model/regressors/flatliner.py
+++ b/openstef/model/regressors/flatliner.py
@@ -2,18 +2,13 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 import re
-from typing import Dict, Union, Set, Optional, List
+from typing import List
 
 import numpy as np
 import pandas as pd
 from sklearn.base import RegressorMixin
-from sklearn.linear_model import QuantileRegressor
-from sklearn.preprocessing import MinMaxScaler
 from sklearn.utils.validation import check_is_fitted
 
-from openstef.feature_engineering.missing_values_transformer import (
-    MissingValuesTransformer,
-)
 from openstef.model.regressors.regressor import OpenstfRegressor
 
 
diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
index 5e64fa4b3..2c8ead73a 100644
--- a/openstef/model/regressors/linear_quantile.py
+++ b/openstef/model/regressors/linear_quantile.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 import re
-from typing import Dict, Union, Set, Optional
+from typing import Dict, Union, Set, Optional, List
 
 import numpy as np
 import pandas as pd
@@ -47,6 +47,7 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: Optional[str] = "mean",
         fill_value: Union[str, int, float] = None,
+        no_fill_future_values_features: List[str] = None,
     ):
         """Initialize LinearQuantileOpenstfRegressor.
 
@@ -69,6 +70,9 @@ def __init__(
             missing_values: Value to be considered as missing value
             imputation_strategy: Imputation strategy
             fill_value: Fill value
+            no_fill_future_values_features: The features for which it does not make sense
+                to fill future values. Rows that contain trailing null values for these
+                features will be removed from the data.
 
         """
         super().__init__()
@@ -86,6 +90,7 @@ def __init__(
             missing_values=missing_values,
             imputation_strategy=imputation_strategy,
             fill_value=fill_value,
+            no_fill_future_values_features=no_fill_future_values_features,
         )
         self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
         self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
@@ -165,7 +170,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
         x = self._remove_ignored_features(x)
 
         # Fix nan columns
-        x = self.imputer_.fit_transform(x)
+        x, y = self.imputer_.fit_transform(x, y)
         if x.isna().any().any():
             raise ValueError(
                 "There are nan values in the input data. Set "
diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py
index 6a50157e7..ac93a30f3 100644
--- a/test/unit/feature_engineering/test_missing_values_transformer.py
+++ b/test/unit/feature_engineering/test_missing_values_transformer.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: MPL-2.0
 from test.unit.utils.base import BaseTestCase
 
-import unittest
 import pandas as pd
 import numpy as np
 from sklearn.exceptions import NotFittedError
@@ -15,41 +14,83 @@
 class MissingValuesTransformerTests(BaseTestCase):
     def setUp(self):
         self.data = pd.DataFrame(
-            {"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
+            {
+                "A": [np.nan, 2, 3, 4],
+                "B": [3, np.nan, 4, 5],
+                "C": [3, 4, 5, np.nan],
+                "D": [np.nan, np.nan, np.nan, np.nan],
+            },
+            index=[0, 1, 1, 2],
         )
 
     def test_imputation_with_mean_strategy_fills_missing_values(self):
         transformer = MissingValuesTransformer(imputation_strategy="mean")
-        transformed = transformer.fit_transform(self.data)
+        transformed, _ = transformer.fit_transform(self.data)
         self.assertEqual(transformed.isnull().sum().sum(), 0)
-        self.assertAlmostEqual(transformed.loc[1, "A"], 2)
-        self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)
+        self.assertAlmostEqual(transformed.iloc[0]["A"], 3)
+        self.assertAlmostEqual(transformed.iloc[1]["B"], 4)
 
     def test_imputation_with_constant_strategy_fills_missing_values(self):
         transformer = MissingValuesTransformer(
             imputation_strategy="constant", fill_value=0
         )
-        transformed = transformer.fit_transform(self.data)
+        transformed, _ = transformer.fit_transform(self.data)
        self.assertEqual(transformed.isnull().sum().sum(), 0)
-        self.assertEqual(transformed.loc[1, "A"], 0)
-        self.assertEqual(transformed.loc[2, "B"], 0)
+        self.assertEqual(transformed.iloc[0]["A"], 0)
+        self.assertEqual(transformed.iloc[1]["B"], 0)
 
     def test_columns_always_null_are_removed(self):
         transformer = MissingValuesTransformer()
         transformer.fit(self.data)
-        self.assertNotIn("C", transformer.non_null_feature_names)
+        self.assertNotIn("D", transformer.non_null_feature_names)
+
+    def test_determining_non_trailing_null_rows(self):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        transformer.fit(self.data)
+        non_trailing_null_rows = transformer._determine_trailing_null_rows(
+            self.data[transformer.non_null_feature_names]
+        )
+        pd.testing.assert_series_equal(
+            non_trailing_null_rows,
+            pd.Series([True, True, True, False], index=[0, 1, 1, 2]),
+        )
+
+    def test_fitting_with_labels_removes_rows_with_trailing_nulls(self):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        _, y_transformed = transformer.fit_transform(
+            self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index)
+        )
+        self.assertEqual(y_transformed.tolist(), [1, 2, 3])
 
     def test_non_dataframe_input_is_converted_and_processed(self):
         transformer = MissingValuesTransformer(imputation_strategy="mean")
-        array = np.array([[1, np.nan], [np.nan, 2]])
-        transformed = transformer.fit_transform(array)
+        array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]])
+        transformed, _ = transformer.fit_transform(array)
         self.assertIsInstance(transformed, pd.DataFrame)
         self.assertEqual(transformed.isnull().sum().sum(), 0)
+        self.assertEqual(transformed.shape, (2, 2))
 
-    def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
+    def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self):
         transformer = MissingValuesTransformer()
-        transformed = transformer.fit_transform(self.data)
-        pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))
+        transformed, _ = transformer.fit_transform(self.data)
+        pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"]))
+
+    def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows(
+        self,
+    ):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        transformer.fit(self.data)
+        new_data = pd.DataFrame(
+            {
+                "A": [1, 2, 3, 4],
+                "B": [1, 2, 3, 4],
+                "C": [1, 2, 3, 4],
+                "D": [1, 2, 3, 4],
+            },
+            index=[0, 1, 1, 2],
+        )
+        transformed = transformer.transform(new_data)
+        pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"]))
 
     def test_calling_transform_before_fit_raises_error(self):
         transformer = MissingValuesTransformer()
diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py
index b6ed316a1..50ea32bbc 100644
--- a/test/unit/model/regressors/test_linear_quantile.py
+++ b/test/unit/model/regressors/test_linear_quantile.py
@@ -55,11 +55,14 @@ def test_imputer(self):
         # Arrange
         n_sample = train_input.shape[0]
         X = train_input.iloc[:, 1:].copy(deep=True)
-        sp = np.ones(n_sample)
-        sp[-1] = np.nan
-        X["Sparse"] = sp
+        X["sparse"] = np.ones(n_sample)
+        X.loc[X.index[-2], "sparse"] = np.nan
+        X["sparse_2"] = np.ones(n_sample)
+        X.loc[X.index[-1], "sparse_2"] = np.nan
         model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None)
-        model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean")
+        model2 = LinearQuantileOpenstfRegressor(
+            imputation_strategy="mean", no_fill_future_values_features=["sparse_2"]
+        )
 
         # Act
         # Model should give error if nan values are present.
@@ -75,6 +78,10 @@
         X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns)
         self.assertTrue((model2.predict(X_) == model2.predict(X)).all())
 
+        # check if last row is removed because of trailing null values
+        X_transformed, _ = model2.imputer_.fit_transform(X)
+        self.assertEqual(X_transformed.shape[0], n_sample - 1)
+
     def test_value_error_raised(self):
         # Check if Value Error is raised when 0.5 is not in the requested quantiles list
         with self.assertRaises(ValueError):
diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py
index f4b5b7491..e68923c0f 100644
--- a/test/unit/pipeline/test_pipeline_train_model.py
+++ b/test/unit/pipeline/test_pipeline_train_model.py
@@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self):
         but it can/should include predictors (e.g. weather data)
 
         """
-        # Select 50 data points to speedup test
-        train_input = self.train_input.iloc[:50, :]
         # Remove modeltypes which are optional, and add a dummy regressor
         for model_type in list(MLModelType) + [__name__ + ".DummyRegressor"]:
             with self.subTest(model_type=model_type):
@@ -136,7 +134,9 @@
                     model_type.value if hasattr(model_type, "value") else model_type
                 )
                 model_specs = self.model_specs
-                train_input = self.train_input
+
+                # Select 150 data points to speedup test
+                train_input = self.train_input.iloc[:150, :]
 
                 # Use default parameters
                 model_specs.hyper_params = {}
@@ -155,7 +155,6 @@
                         function=split_dummy_arima,
                         arguments={},
                     )
-                    train_input = self.train_input[:150]
 
                 model, report, modelspecs, _ = train_model_pipeline_core(
                     pj=pj, model_specs=model_specs, input_data=train_input
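
Usage note (illustrative, not part of the patch): a minimal sketch of the new no_fill_future_values_features behaviour, based on the transformer changes and tests above. The column names and values are made up for the example; the import path and the (x, y) return value of fit_transform come from the diff itself.

# Sketch: trailing NaNs in a "no fill future values" feature drop the affected rows.
import numpy as np
import pandas as pd

from openstef.feature_engineering.missing_values_transformer import (
    MissingValuesTransformer,
)

# "radiation" is a hypothetical feature whose future values should not be imputed.
data = pd.DataFrame(
    {
        "load": [1.0, 2.0, 3.0, 4.0],
        "radiation": [10.0, 11.0, 12.0, np.nan],  # trailing null value
    }
)
y = pd.Series([1.0, 2.0, 3.0, 4.0], index=data.index)

transformer = MissingValuesTransformer(
    imputation_strategy="mean",
    no_fill_future_values_features=["radiation"],
)

# fit_transform now returns both x and y; the last row is dropped because
# "radiation" ends in a trailing null, mirroring the new unit tests.
x_out, y_out = transformer.fit_transform(data, y)
print(x_out.shape)      # (3, 2)
print(y_out.tolist())   # [1.0, 2.0, 3.0]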