[fix] disable imputation on future data (#562)

* postprocess imputation df by putting back trailing nan's Signed-off-by: lschilders <[email protected]> * add unit test test_no_imputation_for_future_data Signed-off-by: lschilders <[email protected]> * fix unit test test_linear_quantile Signed-off-by: lschilders <[email protected]> * fix imports in flatliner.py Signed-off-by: lschilders <[email protected]> * remove columns with future nan values Signed-off-by: lschilders <[email protected]> * implement non_trailing_null_rows in missing_values_transformer and add unit tests Signed-off-by: lschilders <[email protected]> * also transform labels y with trailing null rows Signed-off-by: lschilders <[email protected]> * test in test_linear_quantile for trailing null Signed-off-by: lschilders <[email protected]> * Format Python code with Black Signed-off-by: black <[email protected]> * merge Black formatting in branch Signed-off-by: lschilders <[email protected]> * remove assert in fit_transform missing_values_transformer Signed-off-by: lschilders <[email protected]> * only train on subset of data in test_pipeline_train_model Signed-off-by: lschilders <[email protected]> * adapt transform(x) to not remove non_trailing_nulls and index with DataFrame rather than list of index Signed-off-by: lschilders <[email protected]> * test for duplicate indices Signed-off-by: lschilders <[email protected]> * remove non trailing null rows in fit_transform and check in test_imputer of test_linear_quantile Signed-off-by: lschilders <[email protected]> * add test for calling transform separately Signed-off-by: lschilders <[email protected]> * refactored MissingValuesTransformer with private static method _determine_trailing_null_rows Signed-off-by: lschilders <[email protected]> * add whitelist for no_fill_future_values_features Signed-off-by: lschilders <[email protected]> * immutable default value for no_fill_future_values_features * Update openstef/feature_engineering/missing_values_transformer.py Co-authored-by: Egor Dmitriev <[email protected]> Signed-off-by: Lars Schilders <[email protected]> * add no_fill_future_values_features to model_creator Signed-off-by: lschilders <[email protected]> --------- Signed-off-by: lschilders <[email protected]> Signed-off-by: black <[email protected]> Signed-off-by: Lars Schilders <[email protected]> Co-authored-by: black <[email protected]> Co-authored-by: Egor Dmitriev <[email protected]>
OpenSTEF · Oct 9, 2024 · 430fa9e · 430fa9e
1 parent 75ffb01
commit 430fa9e
Show file tree

Hide file tree

Showing 7 changed files with 120 additions and 35 deletions.
diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
@@ -27,6 +27,7 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: str = None,
         fill_value: Union[str, int, float] = None,
+        no_fill_future_values_features: List[str] = None,
     ):
         """Initialize missing values handler.
 
@@ -37,11 +38,14 @@ def __init__(
                 Can be one of "mean", "median", "most_frequent", "constant" or None.
             fill_value: When strategy == "constant", fill_value is used to replace all
                 occurrences of missing_values.
-
+            no_fill_future_values_features: The features for which it does not make sense
+                to fill future values. Rows that contain trailing null values for these
+                features will be removed from the data.
         """
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy
         self.fill_value = fill_value
+        self.no_fill_future_values_features = no_fill_future_values_features or []
         self.is_fitted_ = False
 
         # Build the proper imputation transformer
@@ -57,6 +61,11 @@ def __init__(
             ).set_output(transform="pandas")
             self.imputer_._validate_params()
 
+    @staticmethod
+    def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
+        """Determine rows with trailing null values in a DataFrame."""
+        return ~x.bfill().isnull().any(axis="columns")
+
     def fit(self, x, y=None):
         """Fit the imputer on the input data."""
         _ = check_array(x, force_all_finite="allow-nan")
@@ -69,9 +78,17 @@ def fit(self, x, y=None):
         # Remove always null columns
         is_column_null = x.isnull().all(axis="index")
         self.non_null_feature_names = list(x.columns[~is_column_null])
+        x = x[self.non_null_feature_names]
+
+        # Remove trailing null rows for features that should
+        # not be imputed in the future
+        trailing_null_rows = self._determine_trailing_null_rows(
+            x[self.no_fill_future_values_features]
+        )
+        x = x.loc[trailing_null_rows]
 
         # Imputers do not support labels
-        self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
+        self.imputer_.fit(X=x, y=None)
         self.is_fitted_ = True
 
     def transform(self, x) -> pd.DataFrame:
@@ -83,17 +100,37 @@ def transform(self, x) -> pd.DataFrame:
 
         x = x[self.non_null_feature_names]
 
-        return self.imputer_.transform(x)
+        transformed = self.imputer_.transform(x)
 
-    def fit_transform(self, x, y=None):
+        return transformed
+
+    def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
         """Fit the imputer on the input data and transform it.
 
         Returns:
             The data with missing values imputed.
 
         """
         self.fit(x, y)
-        return self.transform(x)
+
+        if not isinstance(x, pd.DataFrame):
+            x = pd.DataFrame(np.asarray(x))
+
+        x = x[self.non_null_feature_names]
+
+        # Remove trailing null rows for features that should
+        # not be imputed in the future
+        non_trailing_null_rows = self._determine_trailing_null_rows(
+            x[self.no_fill_future_values_features]
+        )
+        x = x.loc[non_trailing_null_rows]
+
+        x = self.transform(x)
+
+        if y is not None:
+            y = y.loc[non_trailing_null_rows]
+
+        return x, y
 
     @classmethod
     def _identity(cls, x):

diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
@@ -116,6 +116,7 @@
         "missing_values",
         "imputation_strategy",
         "fill_value",
+        "no_fill_future_values_features",
     ],
     ModelType.ARIMA: [
         "backtest_max_horizon",

diff --git a/openstef/model/regressors/flatliner.py b/openstef/model/regressors/flatliner.py
@@ -2,18 +2,13 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 import re
-from typing import Dict, Union, Set, Optional, List
+from typing import List
 
 import numpy as np
 import pandas as pd
 from sklearn.base import RegressorMixin
-from sklearn.linear_model import QuantileRegressor
-from sklearn.preprocessing import MinMaxScaler
 from sklearn.utils.validation import check_is_fitted
 
-from openstef.feature_engineering.missing_values_transformer import (
-    MissingValuesTransformer,
-)
 from openstef.model.regressors.regressor import OpenstfRegressor
 
 

diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 import re
-from typing import Dict, Union, Set, Optional
+from typing import Dict, Union, Set, Optional, List
 
 import numpy as np
 import pandas as pd
@@ -47,6 +47,7 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: Optional[str] = "mean",
         fill_value: Union[str, int, float] = None,
+        no_fill_future_values_features: List[str] = None,
     ):
         """Initialize LinearQuantileOpenstfRegressor.
 
@@ -69,6 +70,9 @@ def __init__(
             missing_values: Value to be considered as missing value
             imputation_strategy: Imputation strategy
             fill_value: Fill value
+            no_fill_future_values_features: The features for which it does not make sense
+                to fill future values. Rows that contain trailing null values for these
+                features will be removed from the data.
 
         """
         super().__init__()
@@ -86,6 +90,7 @@ def __init__(
             missing_values=missing_values,
             imputation_strategy=imputation_strategy,
             fill_value=fill_value,
+            no_fill_future_values_features=no_fill_future_values_features,
         )
         self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
         self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
@@ -165,7 +170,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
         x = self._remove_ignored_features(x)
 
         # Fix nan columns
-        x = self.imputer_.fit_transform(x)
+        x, y = self.imputer_.fit_transform(x, y)
         if x.isna().any().any():
             raise ValueError(
                 "There are nan values in the input data. Set "

diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: MPL-2.0
 from test.unit.utils.base import BaseTestCase
 
-import unittest
 import pandas as pd
 import numpy as np
 from sklearn.exceptions import NotFittedError
@@ -15,41 +14,83 @@
 class MissingValuesTransformerTests(BaseTestCase):
     def setUp(self):
         self.data = pd.DataFrame(
-            {"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
+            {
+                "A": [np.nan, 2, 3, 4],
+                "B": [3, np.nan, 4, 5],
+                "C": [3, 4, 5, np.nan],
+                "D": [np.nan, np.nan, np.nan, np.nan],
+            },
+            index=[0, 1, 1, 2],
         )
 
     def test_imputation_with_mean_strategy_fills_missing_values(self):
         transformer = MissingValuesTransformer(imputation_strategy="mean")
-        transformed = transformer.fit_transform(self.data)
+        transformed, _ = transformer.fit_transform(self.data)
         self.assertEqual(transformed.isnull().sum().sum(), 0)
-        self.assertAlmostEqual(transformed.loc[1, "A"], 2)
-        self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)
+        self.assertAlmostEqual(transformed.iloc[0]["A"], 3)
+        self.assertAlmostEqual(transformed.iloc[1]["B"], 4)
 
     def test_imputation_with_constant_strategy_fills_missing_values(self):
         transformer = MissingValuesTransformer(
             imputation_strategy="constant", fill_value=0
         )
-        transformed = transformer.fit_transform(self.data)
+        transformed, _ = transformer.fit_transform(self.data)
         self.assertEqual(transformed.isnull().sum().sum(), 0)
-        self.assertEqual(transformed.loc[1, "A"], 0)
-        self.assertEqual(transformed.loc[2, "B"], 0)
+        self.assertEqual(transformed.iloc[0]["A"], 0)
+        self.assertEqual(transformed.iloc[1]["B"], 0)
 
     def test_columns_always_null_are_removed(self):
         transformer = MissingValuesTransformer()
         transformer.fit(self.data)
-        self.assertNotIn("C", transformer.non_null_feature_names)
+        self.assertNotIn("D", transformer.non_null_feature_names)
+
+    def test_determining_non_trailing_null_rows(self):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        transformer.fit(self.data)
+        non_trailing_null_rows = transformer._determine_trailing_null_rows(
+            self.data[transformer.non_null_feature_names]
+        )
+        pd.testing.assert_series_equal(
+            non_trailing_null_rows,
+            pd.Series([True, True, True, False], index=[0, 1, 1, 2]),
+        )
+
+    def test_fitting_with_labels_removes_rows_with_trailing_nulls(self):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        _, y_transformed = transformer.fit_transform(
+            self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index)
+        )
+        self.assertEqual(y_transformed.tolist(), [1, 2, 3])
 
     def test_non_dataframe_input_is_converted_and_processed(self):
         transformer = MissingValuesTransformer(imputation_strategy="mean")
-        array = np.array([[1, np.nan], [np.nan, 2]])
-        transformed = transformer.fit_transform(array)
+        array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]])
+        transformed, _ = transformer.fit_transform(array)
         self.assertIsInstance(transformed, pd.DataFrame)
         self.assertEqual(transformed.isnull().sum().sum(), 0)
+        self.assertEqual(transformed.shape, (2, 2))
 
-    def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
+    def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self):
         transformer = MissingValuesTransformer()
-        transformed = transformer.fit_transform(self.data)
-        pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))
+        transformed, _ = transformer.fit_transform(self.data)
+        pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"]))
+
+    def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows(
+        self,
+    ):
+        transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
+        transformer.fit(self.data)
+        new_data = pd.DataFrame(
+            {
+                "A": [1, 2, 3, 4],
+                "B": [1, 2, 3, 4],
+                "C": [1, 2, 3, 4],
+                "D": [1, 2, 3, 4],
+            },
+            index=[0, 1, 1, 2],
+        )
+        transformed = transformer.transform(new_data)
+        pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"]))
 
     def test_calling_transform_before_fit_raises_error(self):
         transformer = MissingValuesTransformer()

diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py
@@ -55,11 +55,14 @@ def test_imputer(self):
         # Arrange
         n_sample = train_input.shape[0]
         X = train_input.iloc[:, 1:].copy(deep=True)
-        sp = np.ones(n_sample)
-        sp[-1] = np.nan
-        X["Sparse"] = sp
+        X["sparse"] = np.ones(n_sample)
+        X.loc[X.index[-2], "sparse"] = np.nan
+        X["sparse_2"] = np.ones(n_sample)
+        X.loc[X.index[-1], "sparse_2"] = np.nan
         model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None)
-        model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean")
+        model2 = LinearQuantileOpenstfRegressor(
+            imputation_strategy="mean", no_fill_future_values_features=["sparse_2"]
+        )
 
         # Act
         # Model should give error if nan values are present.
@@ -75,6 +78,10 @@ def test_imputer(self):
         X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns)
         self.assertTrue((model2.predict(X_) == model2.predict(X)).all())
 
+        # check if last row is removed because of trailing null values
+        X_transformed, _ = model2.imputer_.fit_transform(X)
+        self.assertEqual(X_transformed.shape[0], n_sample - 1)
+
     def test_value_error_raised(self):
         # Check if Value Error is raised when 0.5 is not in the requested quantiles list
         with self.assertRaises(ValueError):

diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py
@@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self):
         but it can/should include predictors (e.g. weather data)
 
         """
-        # Select 50 data points to speedup test
-        train_input = self.train_input.iloc[:50, :]
         # Remove modeltypes which are optional, and add a dummy regressor
         for model_type in list(ModelType) + [__name__ + ".DummyRegressor"]:
             with self.subTest(model_type=model_type):
@@ -136,7 +134,9 @@ def test_train_model_pipeline_core_happy_flow(self):
                     model_type.value if hasattr(model_type, "value") else model_type
                 )
                 model_specs = self.model_specs
-                train_input = self.train_input
+
+                # Select 150 data points to speedup test
+                train_input = self.train_input.iloc[:150, :]
 
                 # Use default parameters
                 model_specs.hyper_params = {}
@@ -155,7 +155,6 @@ def test_train_model_pipeline_core_happy_flow(self):
                         function=split_dummy_arima,
                         arguments={},
                     )
-                    train_input = self.train_input[:150]
 
                 model, report, modelspecs, _ = train_model_pipeline_core(
                     pj=pj, model_specs=model_specs, input_data=train_input