From cfa4cd3474beb07d7a399014373667c16dc04551 Mon Sep 17 00:00:00 2001 From: lschilders Date: Fri, 27 Sep 2024 14:28:36 +0200 Subject: [PATCH 01/21] postprocess imputation df by putting back trailing nan's Signed-off-by: lschilders --- .../feature_engineering/missing_values_transformer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index b2ced0895..5b7303d71 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -83,7 +83,13 @@ def transform(self, x) -> pd.DataFrame: x = x[self.non_null_feature_names] - return self.imputer_.transform(x) + transformed = self.imputer_.transform(x) + + # Do not impute for trailing missing values + trailing_nans = x.bfill().isna().to_numpy() + transformed = transformed.where(~trailing_nans, np.nan) + + return transformed def fit_transform(self, x, y=None): """Fit the imputer on the input data and transform it. From ca5f858acd986ec1eceb20e106f4f7b004a79d6f Mon Sep 17 00:00:00 2001 From: lschilders Date: Fri, 27 Sep 2024 14:29:13 +0200 Subject: [PATCH 02/21] add unit test test_no_imputation_for_future_data Signed-off-by: lschilders --- .../test_missing_values_transformer.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 6a50157e7..b4e7e6376 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -15,41 +15,46 @@ class MissingValuesTransformerTests(BaseTestCase): def setUp(self): self.data = pd.DataFrame( - {"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]} + {"A": [np.nan, 2, 3], "B": [1, np.nan, 3], "C": [4, 5, np.nan], "D": [np.nan, np.nan, np.nan]} ) def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") transformed = transformer.fit_transform(self.data) - self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertAlmostEqual(transformed.loc[1, "A"], 2) - self.assertAlmostEqual(transformed.loc[2, "B"], 4.5) + self.assertEqual(transformed[["A", "B"]].isnull().sum().sum(), 0) + self.assertAlmostEqual(transformed.loc[0, "A"], 2.5) + self.assertAlmostEqual(transformed.loc[1, "B"], 2) def test_imputation_with_constant_strategy_fills_missing_values(self): transformer = MissingValuesTransformer( imputation_strategy="constant", fill_value=0 ) transformed = transformer.fit_transform(self.data) - self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertEqual(transformed.loc[1, "A"], 0) - self.assertEqual(transformed.loc[2, "B"], 0) + self.assertEqual(transformed[["A", "B"]].isnull().sum().sum(), 0) + self.assertEqual(transformed.loc[0, "A"], 0) + self.assertEqual(transformed.loc[1, "B"], 0) def test_columns_always_null_are_removed(self): transformer = MissingValuesTransformer() transformer.fit(self.data) - self.assertNotIn("C", transformer.non_null_feature_names) + self.assertNotIn("D", transformer.non_null_feature_names) def test_non_dataframe_input_is_converted_and_processed(self): transformer = MissingValuesTransformer(imputation_strategy="mean") - array = np.array([[1, np.nan], [np.nan, 2]]) + array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]]) transformed = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) - self.assertEqual(transformed.isnull().sum().sum(), 0) + self.assertEqual(transformed.isnull().sum().sum(), 1) def test_fitting_transformer_without_strategy_keeps_data_unchanged(self): transformer = MissingValuesTransformer() transformed = transformer.fit_transform(self.data) - pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"])) + pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"])) + + def test_no_imputation_for_future_data(self): + transformer = MissingValuesTransformer(imputation_strategy="mean") + transformed = transformer.fit_transform(self.data) + self.assertIsNAN(transformed.loc[2, "C"]) def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() From 19f5104575a77ff0832e6101b4a09f15f43b42b9 Mon Sep 17 00:00:00 2001 From: lschilders Date: Fri, 27 Sep 2024 14:53:08 +0200 Subject: [PATCH 03/21] fix unit test test_linear_quantile Signed-off-by: lschilders --- test/unit/model/regressors/test_linear_quantile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index b6ed316a1..6579770fa 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -56,7 +56,7 @@ def test_imputer(self): n_sample = train_input.shape[0] X = train_input.iloc[:, 1:].copy(deep=True) sp = np.ones(n_sample) - sp[-1] = np.nan + sp[-2] = np.nan X["Sparse"] = sp model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None) model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean") From 6d07d842fc02b7697523c37c0c94e4ea763c33e7 Mon Sep 17 00:00:00 2001 From: lschilders Date: Fri, 27 Sep 2024 15:36:55 +0200 Subject: [PATCH 04/21] fix imports in flatliner.py Signed-off-by: lschilders --- openstef/model/regressors/flatliner.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/openstef/model/regressors/flatliner.py b/openstef/model/regressors/flatliner.py index 764773d52..995052bbf 100644 --- a/openstef/model/regressors/flatliner.py +++ b/openstef/model/regressors/flatliner.py @@ -2,18 +2,13 @@ # # SPDX-License-Identifier: MPL-2.0 import re -from typing import Dict, Union, Set, Optional, List +from typing import List import numpy as np import pandas as pd from sklearn.base import RegressorMixin -from sklearn.linear_model import QuantileRegressor -from sklearn.preprocessing import MinMaxScaler from sklearn.utils.validation import check_is_fitted -from openstef.feature_engineering.missing_values_transformer import ( - MissingValuesTransformer, -) from openstef.model.regressors.regressor import OpenstfRegressor From 8a96d251b184c00b866110c356d3b74349f62f98 Mon Sep 17 00:00:00 2001 From: lschilders Date: Fri, 27 Sep 2024 16:52:41 +0200 Subject: [PATCH 05/21] remove columns with future nan values Signed-off-by: lschilders --- .../missing_values_transformer.py | 7 ++----- openstef/model/regressors/linear_quantile.py | 2 +- .../test_missing_values_transformer.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 5b7303d71..bec517b7b 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -68,7 +68,8 @@ def fit(self, x, y=None): # Remove always null columns is_column_null = x.isnull().all(axis="index") - self.non_null_feature_names = list(x.columns[~is_column_null]) + trailing_null_columns = x.bfill().isnull().any(axis="index") + self.non_null_feature_names = list(x.columns[~(is_column_null | trailing_null_columns)]) # Imputers do not support labels self.imputer_.fit(X=x[self.non_null_feature_names], y=None) @@ -85,10 +86,6 @@ def transform(self, x) -> pd.DataFrame: transformed = self.imputer_.transform(x) - # Do not impute for trailing missing values - trailing_nans = x.bfill().isna().to_numpy() - transformed = transformed.where(~trailing_nans, np.nan) - return transformed def fit_transform(self, x, y=None): diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 5e64fa4b3..b6d9c1e43 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -166,7 +166,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: # Fix nan columns x = self.imputer_.fit_transform(x) - if x.isna().any().any(): + if (x.isna() & ~x.bfill().isna()).any().any(): raise ValueError( "There are nan values in the input data. Set " "imputation_strategy to solve them." diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index b4e7e6376..4dc45e2e5 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -21,7 +21,7 @@ def setUp(self): def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") transformed = transformer.fit_transform(self.data) - self.assertEqual(transformed[["A", "B"]].isnull().sum().sum(), 0) + self.assertEqual(transformed.isnull().sum().sum(), 0) self.assertAlmostEqual(transformed.loc[0, "A"], 2.5) self.assertAlmostEqual(transformed.loc[1, "B"], 2) @@ -30,7 +30,7 @@ def test_imputation_with_constant_strategy_fills_missing_values(self): imputation_strategy="constant", fill_value=0 ) transformed = transformer.fit_transform(self.data) - self.assertEqual(transformed[["A", "B"]].isnull().sum().sum(), 0) + self.assertEqual(transformed.isnull().sum().sum(), 0) self.assertEqual(transformed.loc[0, "A"], 0) self.assertEqual(transformed.loc[1, "B"], 0) @@ -39,22 +39,22 @@ def test_columns_always_null_are_removed(self): transformer.fit(self.data) self.assertNotIn("D", transformer.non_null_feature_names) + def test_columns_with_missing_values_at_end_are_removed(self): + transformer = MissingValuesTransformer() + transformer.fit(self.data) + self.assertNotIn("C", transformer.non_null_feature_names) + def test_non_dataframe_input_is_converted_and_processed(self): transformer = MissingValuesTransformer(imputation_strategy="mean") array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]]) transformed = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) - self.assertEqual(transformed.isnull().sum().sum(), 1) + self.assertEqual(transformed.isnull().sum().sum(), 0) def test_fitting_transformer_without_strategy_keeps_data_unchanged(self): transformer = MissingValuesTransformer() transformed = transformer.fit_transform(self.data) - pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"])) - - def test_no_imputation_for_future_data(self): - transformer = MissingValuesTransformer(imputation_strategy="mean") - transformed = transformer.fit_transform(self.data) - self.assertIsNAN(transformed.loc[2, "C"]) + pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C", "D"])) def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() From 0cba05961f8730058ea425c6be6b9ddb20c28f20 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 10:14:09 +0200 Subject: [PATCH 06/21] implement non_trailing_null_rows in missing_values_transformer and add unit tests Signed-off-by: lschilders --- .../missing_values_transformer.py | 13 ++++++++++--- .../test_missing_values_transformer.py | 11 ++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index bec517b7b..f323566d0 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -21,6 +21,7 @@ class MissingValuesTransformer: _n_in_features: Optional[int] = None non_null_feature_names: List[str] = None + non_trailing_null_rows: List[int] = None def __init__( self, @@ -68,11 +69,16 @@ def fit(self, x, y=None): # Remove always null columns is_column_null = x.isnull().all(axis="index") - trailing_null_columns = x.bfill().isnull().any(axis="index") - self.non_null_feature_names = list(x.columns[~(is_column_null | trailing_null_columns)]) + self.non_null_feature_names = list(x.columns[~is_column_null]) + x = x[self.non_null_feature_names] + + # Remove rows with trailing null values + row_has_trailing_null = x.bfill().isnull().any(axis="columns") + self.non_trailing_null_rows = list(x.index[~row_has_trailing_null]) + x = x.loc[self.non_trailing_null_rows] # Imputers do not support labels - self.imputer_.fit(X=x[self.non_null_feature_names], y=None) + self.imputer_.fit(X=x, y=None) self.is_fitted_ = True def transform(self, x) -> pd.DataFrame: @@ -83,6 +89,7 @@ def transform(self, x) -> pd.DataFrame: x = pd.DataFrame(np.asarray(x)) x = x[self.non_null_feature_names] + x = x.loc[self.non_trailing_null_rows] transformed = self.imputer_.transform(x) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 4dc45e2e5..60504f50e 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -15,7 +15,7 @@ class MissingValuesTransformerTests(BaseTestCase): def setUp(self): self.data = pd.DataFrame( - {"A": [np.nan, 2, 3], "B": [1, np.nan, 3], "C": [4, 5, np.nan], "D": [np.nan, np.nan, np.nan]} + {"A": [np.nan, 2, 3, 4], "B": [1, np.nan, 3, 4], "C": [3, 4, 5, np.nan], "D": [np.nan, np.nan, np.nan, np.nan]} ) def test_imputation_with_mean_strategy_fills_missing_values(self): @@ -39,10 +39,10 @@ def test_columns_always_null_are_removed(self): transformer.fit(self.data) self.assertNotIn("D", transformer.non_null_feature_names) - def test_columns_with_missing_values_at_end_are_removed(self): + def test_rows_with_missing_values_at_end_are_removed(self): transformer = MissingValuesTransformer() transformer.fit(self.data) - self.assertNotIn("C", transformer.non_null_feature_names) + self.assertEqual(transformer.non_trailing_null_rows, [0, 1, 2]) def test_non_dataframe_input_is_converted_and_processed(self): transformer = MissingValuesTransformer(imputation_strategy="mean") @@ -50,11 +50,12 @@ def test_non_dataframe_input_is_converted_and_processed(self): transformed = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) self.assertEqual(transformed.isnull().sum().sum(), 0) + self.assertEqual(transformed.shape, (1, 1)) - def test_fitting_transformer_without_strategy_keeps_data_unchanged(self): + def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() transformed = transformer.fit_transform(self.data) - pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C", "D"])) + pd.testing.assert_frame_equal(transformed, self.data.drop(index=3, columns=["D"])) def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() From 521d77f02686c1462d9d165365e5b8369a4a7123 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 10:33:02 +0200 Subject: [PATCH 07/21] also transform labels y with trailing null rows Signed-off-by: lschilders --- openstef/feature_engineering/missing_values_transformer.py | 6 +++++- openstef/model/regressors/linear_quantile.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index f323566d0..4320d305f 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -103,7 +103,11 @@ def fit_transform(self, x, y=None): """ self.fit(x, y) - return self.transform(x) + + if y is not None: + y = y.loc[self.non_trailing_null_rows] + + return self.transform(x), y @classmethod def _identity(cls, x): diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index b6d9c1e43..b0c9af91a 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -165,8 +165,8 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin: x = self._remove_ignored_features(x) # Fix nan columns - x = self.imputer_.fit_transform(x) - if (x.isna() & ~x.bfill().isna()).any().any(): + x, y = self.imputer_.fit_transform(x, y) + if x.isna().any().any(): raise ValueError( "There are nan values in the input data. Set " "imputation_strategy to solve them." From 0956e653a0a27da79516e3fede71f0de54daf9a6 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 10:48:09 +0200 Subject: [PATCH 08/21] test in test_linear_quantile for trailing null Signed-off-by: lschilders --- test/unit/model/regressors/test_linear_quantile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 6579770fa..93d37618a 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -55,9 +55,10 @@ def test_imputer(self): # Arrange n_sample = train_input.shape[0] X = train_input.iloc[:, 1:].copy(deep=True) - sp = np.ones(n_sample) - sp[-2] = np.nan - X["Sparse"] = sp + X["sparse"] = np.ones(n_sample) + X.loc[X.index[-2], "sparse"] = np.nan + X["sparse_2"] = np.ones(n_sample) + X.loc[X.index[-1], "sparse_2"] = np.nan model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None) model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean") @@ -75,6 +76,9 @@ def test_imputer(self): X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns) self.assertTrue((model2.predict(X_) == model2.predict(X)).all()) + # check if last row is removed because of trailing null values + self.assertEqual(X_.shape[0], n_sample - 1) + def test_value_error_raised(self): # Check if Value Error is raised when 0.5 is not in the requested quantiles list with self.assertRaises(ValueError): From 7e5a77151c87db45537ace44e3f8fe12e133ab5a Mon Sep 17 00:00:00 2001 From: black Date: Wed, 2 Oct 2024 09:22:56 +0000 Subject: [PATCH 09/21] Format Python code with Black Signed-off-by: black --- .../test_missing_values_transformer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 60504f50e..a72f6972d 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -15,7 +15,12 @@ class MissingValuesTransformerTests(BaseTestCase): def setUp(self): self.data = pd.DataFrame( - {"A": [np.nan, 2, 3, 4], "B": [1, np.nan, 3, 4], "C": [3, 4, 5, np.nan], "D": [np.nan, np.nan, np.nan, np.nan]} + { + "A": [np.nan, 2, 3, 4], + "B": [1, np.nan, 3, 4], + "C": [3, 4, 5, np.nan], + "D": [np.nan, np.nan, np.nan, np.nan], + } ) def test_imputation_with_mean_strategy_fills_missing_values(self): @@ -55,7 +60,9 @@ def test_non_dataframe_input_is_converted_and_processed(self): def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() transformed = transformer.fit_transform(self.data) - pd.testing.assert_frame_equal(transformed, self.data.drop(index=3, columns=["D"])) + pd.testing.assert_frame_equal( + transformed, self.data.drop(index=3, columns=["D"]) + ) def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() From 5a7537df3114741c0ebfafd4d8f13a8b8b35104f Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 13:42:05 +0200 Subject: [PATCH 10/21] merge Black formatting in branch Signed-off-by: lschilders --- .../missing_values_transformer.py | 6 ++++-- .../test_missing_values_transformer.py | 16 +++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 4320d305f..f0e55498d 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -95,7 +95,7 @@ def transform(self, x) -> pd.DataFrame: return transformed - def fit_transform(self, x, y=None): + def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: """Fit the imputer on the input data and transform it. Returns: @@ -107,7 +107,9 @@ def fit_transform(self, x, y=None): if y is not None: y = y.loc[self.non_trailing_null_rows] - return self.transform(x), y + x = self.transform(x) + assert x.shape[0] == y.shape[0], "Number of rows in x and y should be equal." + return x, y @classmethod def _identity(cls, x): diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index a72f6972d..86f3c554b 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: MPL-2.0 from test.unit.utils.base import BaseTestCase -import unittest import pandas as pd import numpy as np from sklearn.exceptions import NotFittedError @@ -25,7 +24,7 @@ def setUp(self): def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") - transformed = transformer.fit_transform(self.data) + transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) self.assertAlmostEqual(transformed.loc[0, "A"], 2.5) self.assertAlmostEqual(transformed.loc[1, "B"], 2) @@ -34,7 +33,7 @@ def test_imputation_with_constant_strategy_fills_missing_values(self): transformer = MissingValuesTransformer( imputation_strategy="constant", fill_value=0 ) - transformed = transformer.fit_transform(self.data) + transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) self.assertEqual(transformed.loc[0, "A"], 0) self.assertEqual(transformed.loc[1, "B"], 0) @@ -49,17 +48,24 @@ def test_rows_with_missing_values_at_end_are_removed(self): transformer.fit(self.data) self.assertEqual(transformer.non_trailing_null_rows, [0, 1, 2]) + def test_fitting_with_labels_removes_rows_with_trailing_nulls(self): + transformer = MissingValuesTransformer() + _, y_transformed = transformer.fit_transform( + self.data, y=pd.Series([1, 2, 3, 4]) + ) + self.assertEqual(y_transformed.tolist(), [1, 2, 3]) + def test_non_dataframe_input_is_converted_and_processed(self): transformer = MissingValuesTransformer(imputation_strategy="mean") array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]]) - transformed = transformer.fit_transform(array) + transformed, _ = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) self.assertEqual(transformed.isnull().sum().sum(), 0) self.assertEqual(transformed.shape, (1, 1)) def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() - transformed = transformer.fit_transform(self.data) + transformed, _ = transformer.fit_transform(self.data) pd.testing.assert_frame_equal( transformed, self.data.drop(index=3, columns=["D"]) ) From 2db9afade4728c674e3dc31647fc87cc9b2b4cb2 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 14:22:54 +0200 Subject: [PATCH 11/21] remove assert in fit_transform missing_values_transformer Signed-off-by: lschilders --- openstef/feature_engineering/missing_values_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index f0e55498d..15d23d681 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -108,7 +108,6 @@ def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: y = y.loc[self.non_trailing_null_rows] x = self.transform(x) - assert x.shape[0] == y.shape[0], "Number of rows in x and y should be equal." return x, y @classmethod From a04700e54fe2c2d81f909212ab0843dbf411c498 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 16:11:59 +0200 Subject: [PATCH 12/21] only train on subset of data in test_pipeline_train_model Signed-off-by: lschilders --- test/unit/pipeline/test_pipeline_train_model.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/unit/pipeline/test_pipeline_train_model.py b/test/unit/pipeline/test_pipeline_train_model.py index f4b5b7491..e68923c0f 100644 --- a/test/unit/pipeline/test_pipeline_train_model.py +++ b/test/unit/pipeline/test_pipeline_train_model.py @@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self): but it can/should include predictors (e.g. weather data) """ - # Select 50 data points to speedup test - train_input = self.train_input.iloc[:50, :] # Remove modeltypes which are optional, and add a dummy regressor for model_type in list(MLModelType) + [__name__ + ".DummyRegressor"]: with self.subTest(model_type=model_type): @@ -136,7 +134,9 @@ def test_train_model_pipeline_core_happy_flow(self): model_type.value if hasattr(model_type, "value") else model_type ) model_specs = self.model_specs - train_input = self.train_input + + # Select 150 data points to speedup test + train_input = self.train_input.iloc[:150, :] # Use default parameters model_specs.hyper_params = {} @@ -155,7 +155,6 @@ def test_train_model_pipeline_core_happy_flow(self): function=split_dummy_arima, arguments={}, ) - train_input = self.train_input[:150] model, report, modelspecs, _ = train_model_pipeline_core( pj=pj, model_specs=model_specs, input_data=train_input From f73541e908d1075a0d1db4373f5089af03e70d0d Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 16:13:41 +0200 Subject: [PATCH 13/21] adapt transform(x) to not remove non_trailing_nulls and index with DataFrame rather than list of index Signed-off-by: lschilders --- openstef/feature_engineering/missing_values_transformer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 15d23d681..6dccdd3cb 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -21,7 +21,7 @@ class MissingValuesTransformer: _n_in_features: Optional[int] = None non_null_feature_names: List[str] = None - non_trailing_null_rows: List[int] = None + non_trailing_null_rows: pd.Series = None def __init__( self, @@ -73,8 +73,7 @@ def fit(self, x, y=None): x = x[self.non_null_feature_names] # Remove rows with trailing null values - row_has_trailing_null = x.bfill().isnull().any(axis="columns") - self.non_trailing_null_rows = list(x.index[~row_has_trailing_null]) + self.non_trailing_null_rows = ~x.bfill().isnull().any(axis="columns") x = x.loc[self.non_trailing_null_rows] # Imputers do not support labels @@ -89,7 +88,6 @@ def transform(self, x) -> pd.DataFrame: x = pd.DataFrame(np.asarray(x)) x = x[self.non_null_feature_names] - x = x.loc[self.non_trailing_null_rows] transformed = self.imputer_.transform(x) From ee53800b08fbc0c0e5606ed0a6bee4f6697a0c99 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 16:14:10 +0200 Subject: [PATCH 14/21] test for duplicate indices Signed-off-by: lschilders --- .../test_missing_values_transformer.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 86f3c554b..7947116f0 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -19,15 +19,16 @@ def setUp(self): "B": [1, np.nan, 3, 4], "C": [3, 4, 5, np.nan], "D": [np.nan, np.nan, np.nan, np.nan], - } + }, + index=[0, 1, 1, 2], ) def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertAlmostEqual(transformed.loc[0, "A"], 2.5) - self.assertAlmostEqual(transformed.loc[1, "B"], 2) + self.assertAlmostEqual(transformed.iloc[0]["A"], 2.5) + self.assertAlmostEqual(transformed.iloc[1]["B"], 2) def test_imputation_with_constant_strategy_fills_missing_values(self): transformer = MissingValuesTransformer( @@ -35,23 +36,26 @@ def test_imputation_with_constant_strategy_fills_missing_values(self): ) transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertEqual(transformed.loc[0, "A"], 0) - self.assertEqual(transformed.loc[1, "B"], 0) + self.assertEqual(transformed.iloc[0]["A"], 0) + self.assertEqual(transformed.iloc[1]["B"], 0) def test_columns_always_null_are_removed(self): transformer = MissingValuesTransformer() transformer.fit(self.data) self.assertNotIn("D", transformer.non_null_feature_names) - def test_rows_with_missing_values_at_end_are_removed(self): + def test_determining_non_trailing_null_rows(self): transformer = MissingValuesTransformer() transformer.fit(self.data) - self.assertEqual(transformer.non_trailing_null_rows, [0, 1, 2]) + pd.testing.assert_series_equal( + transformer.non_trailing_null_rows, + pd.Series([True, True, True, False], index=[0, 1, 1, 2]), + ) def test_fitting_with_labels_removes_rows_with_trailing_nulls(self): transformer = MissingValuesTransformer() _, y_transformed = transformer.fit_transform( - self.data, y=pd.Series([1, 2, 3, 4]) + self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index) ) self.assertEqual(y_transformed.tolist(), [1, 2, 3]) @@ -67,7 +71,7 @@ def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() transformed, _ = transformer.fit_transform(self.data) pd.testing.assert_frame_equal( - transformed, self.data.drop(index=3, columns=["D"]) + transformed, self.data.drop(index=2, columns=["D"]) ) def test_calling_transform_before_fit_raises_error(self): From 52017ee06f4838a2984e183fbbed919494ef36f4 Mon Sep 17 00:00:00 2001 From: lschilders Date: Wed, 2 Oct 2024 16:52:05 +0200 Subject: [PATCH 15/21] remove non trailing null rows in fit_transform and check in test_imputer of test_linear_quantile Signed-off-by: lschilders --- openstef/feature_engineering/missing_values_transformer.py | 6 ++++++ test/unit/model/regressors/test_linear_quantile.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 6dccdd3cb..509dfe9bc 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -58,6 +58,11 @@ def __init__( ).set_output(transform="pandas") self.imputer_._validate_params() + def remove_trailing_null_rows(self, df: pd.DataFrame) -> pd.DataFrame: + """Remove rows with trailing null values in a DataFrame.""" + self.non_trailing_null_rows = ~df.bfill().isnull().any(axis="columns") + return df.loc[self.non_trailing_null_rows] + def fit(self, x, y=None): """Fit the imputer on the input data.""" _ = check_array(x, force_all_finite="allow-nan") @@ -106,6 +111,7 @@ def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: y = y.loc[self.non_trailing_null_rows] x = self.transform(x) + x = x.loc[self.non_trailing_null_rows] return x, y @classmethod diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 93d37618a..36d21b683 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -77,7 +77,8 @@ def test_imputer(self): self.assertTrue((model2.predict(X_) == model2.predict(X)).all()) # check if last row is removed because of trailing null values - self.assertEqual(X_.shape[0], n_sample - 1) + X_transformed, _ = model2.imputer_.fit_transform(X) + self.assertEqual(X_transformed.shape[0], n_sample - 1) def test_value_error_raised(self): # Check if Value Error is raised when 0.5 is not in the requested quantiles list From 24590d75de1888220095d49ee41131be41c681f3 Mon Sep 17 00:00:00 2001 From: lschilders Date: Mon, 7 Oct 2024 09:16:39 +0200 Subject: [PATCH 16/21] add test for calling transform separately Signed-off-by: lschilders --- .../test_missing_values_transformer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 7947116f0..214f3d71e 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -74,6 +74,23 @@ def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformed, self.data.drop(index=2, columns=["D"]) ) + def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows( + self, + ): + transformer = MissingValuesTransformer() + transformer.fit(self.data) + new_data = pd.DataFrame( + { + "A": [1, 2, 3, 4], + "B": [1, 2, 3, 4], + "C": [1, 2, 3, 4], + "D": [1, 2, 3, 4], + }, + index=[0, 1, 1, 2], + ) + transformed = transformer.transform(new_data) + pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"])) + def test_calling_transform_before_fit_raises_error(self): transformer = MissingValuesTransformer() with self.assertRaises(NotFittedError): From e66afe8e108391b146ff8c7caaeeb4ac32487e87 Mon Sep 17 00:00:00 2001 From: lschilders Date: Mon, 7 Oct 2024 12:10:43 +0200 Subject: [PATCH 17/21] refactored MissingValuesTransformer with private static method _determine_trailing_null_rows Signed-off-by: lschilders --- .../missing_values_transformer.py | 30 ++++++++++++------- .../test_missing_values_transformer.py | 5 +++- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 509dfe9bc..9aeed43e9 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -21,7 +21,6 @@ class MissingValuesTransformer: _n_in_features: Optional[int] = None non_null_feature_names: List[str] = None - non_trailing_null_rows: pd.Series = None def __init__( self, @@ -58,10 +57,10 @@ def __init__( ).set_output(transform="pandas") self.imputer_._validate_params() - def remove_trailing_null_rows(self, df: pd.DataFrame) -> pd.DataFrame: - """Remove rows with trailing null values in a DataFrame.""" - self.non_trailing_null_rows = ~df.bfill().isnull().any(axis="columns") - return df.loc[self.non_trailing_null_rows] + @staticmethod + def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series: + """Determine rows with trailing null values in a DataFrame.""" + return ~x.bfill().isnull().any(axis="columns") def fit(self, x, y=None): """Fit the imputer on the input data.""" @@ -77,9 +76,9 @@ def fit(self, x, y=None): self.non_null_feature_names = list(x.columns[~is_column_null]) x = x[self.non_null_feature_names] - # Remove rows with trailing null values - self.non_trailing_null_rows = ~x.bfill().isnull().any(axis="columns") - x = x.loc[self.non_trailing_null_rows] + # Remove trailing null rows + trailing_null_rows = self._determine_trailing_null_rows(x) + x = x.loc[trailing_null_rows] # Imputers do not support labels self.imputer_.fit(X=x, y=None) @@ -107,11 +106,20 @@ def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: """ self.fit(x, y) - if y is not None: - y = y.loc[self.non_trailing_null_rows] + if not isinstance(x, pd.DataFrame): + x = pd.DataFrame(np.asarray(x)) + + x = x[self.non_null_feature_names] + + non_trailing_null_rows = self._determine_trailing_null_rows(x) + + x = x.loc[non_trailing_null_rows] x = self.transform(x) - x = x.loc[self.non_trailing_null_rows] + + if y is not None: + y = y.loc[non_trailing_null_rows] + return x, y @classmethod diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index 214f3d71e..af2125e58 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -47,8 +47,11 @@ def test_columns_always_null_are_removed(self): def test_determining_non_trailing_null_rows(self): transformer = MissingValuesTransformer() transformer.fit(self.data) + non_trailing_null_rows = transformer._determine_trailing_null_rows( + self.data[transformer.non_null_feature_names] + ) pd.testing.assert_series_equal( - transformer.non_trailing_null_rows, + non_trailing_null_rows, pd.Series([True, True, True, False], index=[0, 1, 1, 2]), ) From c9b73dc9384bc6e1a71f6e8bbda14031b01ce1e7 Mon Sep 17 00:00:00 2001 From: lschilders Date: Mon, 7 Oct 2024 13:42:12 +0200 Subject: [PATCH 18/21] add whitelist for no_fill_future_values_features Signed-off-by: lschilders --- .../missing_values_transformer.py | 20 ++++++++++++++----- openstef/model/regressors/linear_quantile.py | 7 ++++++- .../test_missing_values_transformer.py | 18 ++++++++--------- .../model/regressors/test_linear_quantile.py | 4 +++- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 9aeed43e9..5d2e3dc74 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -27,6 +27,7 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: str = None, fill_value: Union[str, int, float] = None, + no_fill_future_values_features: List[str] = [], ): """Initialize missing values handler. @@ -37,11 +38,14 @@ def __init__( Can be one of "mean", "median", "most_frequent", "constant" or None. fill_value: When strategy == "constant", fill_value is used to replace all occurrences of missing_values. - + no_fill_future_values_features: The features for which it does not make sense + to fill future values. Rows that contain trailing null values for these + features will be removed from the data. """ self.missing_values = missing_values self.imputation_strategy = imputation_strategy self.fill_value = fill_value + self.no_fill_future_values_features = no_fill_future_values_features self.is_fitted_ = False # Build the proper imputation transformer @@ -76,8 +80,11 @@ def fit(self, x, y=None): self.non_null_feature_names = list(x.columns[~is_column_null]) x = x[self.non_null_feature_names] - # Remove trailing null rows - trailing_null_rows = self._determine_trailing_null_rows(x) + # Remove trailing null rows for features that should + # not be imputed in the future + trailing_null_rows = self._determine_trailing_null_rows( + x[self.no_fill_future_values_features] + ) x = x.loc[trailing_null_rows] # Imputers do not support labels @@ -111,8 +118,11 @@ def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]: x = x[self.non_null_feature_names] - non_trailing_null_rows = self._determine_trailing_null_rows(x) - + # Remove trailing null rows for features that should + # not be imputed in the future + non_trailing_null_rows = self._determine_trailing_null_rows( + x[self.no_fill_future_values_features] + ) x = x.loc[non_trailing_null_rows] x = self.transform(x) diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index b0c9af91a..37d77e371 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: MPL-2.0 import re -from typing import Dict, Union, Set, Optional +from typing import Dict, Union, Set, Optional, List import numpy as np import pandas as pd @@ -47,6 +47,7 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: Optional[str] = "mean", fill_value: Union[str, int, float] = None, + no_fill_future_values_features: List[str] = [], ): """Initialize LinearQuantileOpenstfRegressor. @@ -69,6 +70,9 @@ def __init__( missing_values: Value to be considered as missing value imputation_strategy: Imputation strategy fill_value: Fill value + no_fill_future_values_features: The features for which it does not make sense + to fill future values. Rows that contain trailing null values for these + features will be removed from the data. """ super().__init__() @@ -86,6 +90,7 @@ def __init__( missing_values=missing_values, imputation_strategy=imputation_strategy, fill_value=fill_value, + no_fill_future_values_features=no_fill_future_values_features, ) self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1)) self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1)) diff --git a/test/unit/feature_engineering/test_missing_values_transformer.py b/test/unit/feature_engineering/test_missing_values_transformer.py index af2125e58..ac93a30f3 100644 --- a/test/unit/feature_engineering/test_missing_values_transformer.py +++ b/test/unit/feature_engineering/test_missing_values_transformer.py @@ -16,7 +16,7 @@ def setUp(self): self.data = pd.DataFrame( { "A": [np.nan, 2, 3, 4], - "B": [1, np.nan, 3, 4], + "B": [3, np.nan, 4, 5], "C": [3, 4, 5, np.nan], "D": [np.nan, np.nan, np.nan, np.nan], }, @@ -27,8 +27,8 @@ def test_imputation_with_mean_strategy_fills_missing_values(self): transformer = MissingValuesTransformer(imputation_strategy="mean") transformed, _ = transformer.fit_transform(self.data) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertAlmostEqual(transformed.iloc[0]["A"], 2.5) - self.assertAlmostEqual(transformed.iloc[1]["B"], 2) + self.assertAlmostEqual(transformed.iloc[0]["A"], 3) + self.assertAlmostEqual(transformed.iloc[1]["B"], 4) def test_imputation_with_constant_strategy_fills_missing_values(self): transformer = MissingValuesTransformer( @@ -45,7 +45,7 @@ def test_columns_always_null_are_removed(self): self.assertNotIn("D", transformer.non_null_feature_names) def test_determining_non_trailing_null_rows(self): - transformer = MissingValuesTransformer() + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) transformer.fit(self.data) non_trailing_null_rows = transformer._determine_trailing_null_rows( self.data[transformer.non_null_feature_names] @@ -56,7 +56,7 @@ def test_determining_non_trailing_null_rows(self): ) def test_fitting_with_labels_removes_rows_with_trailing_nulls(self): - transformer = MissingValuesTransformer() + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) _, y_transformed = transformer.fit_transform( self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index) ) @@ -68,19 +68,17 @@ def test_non_dataframe_input_is_converted_and_processed(self): transformed, _ = transformer.fit_transform(array) self.assertIsInstance(transformed, pd.DataFrame) self.assertEqual(transformed.isnull().sum().sum(), 0) - self.assertEqual(transformed.shape, (1, 1)) + self.assertEqual(transformed.shape, (2, 2)) def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self): transformer = MissingValuesTransformer() transformed, _ = transformer.fit_transform(self.data) - pd.testing.assert_frame_equal( - transformed, self.data.drop(index=2, columns=["D"]) - ) + pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"])) def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows( self, ): - transformer = MissingValuesTransformer() + transformer = MissingValuesTransformer(no_fill_future_values_features=["C"]) transformer.fit(self.data) new_data = pd.DataFrame( { diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py index 36d21b683..50ea32bbc 100644 --- a/test/unit/model/regressors/test_linear_quantile.py +++ b/test/unit/model/regressors/test_linear_quantile.py @@ -60,7 +60,9 @@ def test_imputer(self): X["sparse_2"] = np.ones(n_sample) X.loc[X.index[-1], "sparse_2"] = np.nan model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None) - model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean") + model2 = LinearQuantileOpenstfRegressor( + imputation_strategy="mean", no_fill_future_values_features=["sparse_2"] + ) # Act # Model should give error if nan values are present. From 7f710a7612b608cdecb83d34fb4996891f43b821 Mon Sep 17 00:00:00 2001 From: lschilders Date: Mon, 7 Oct 2024 14:01:01 +0200 Subject: [PATCH 19/21] immutable default value for no_fill_future_values_features --- openstef/feature_engineering/missing_values_transformer.py | 4 +++- openstef/model/regressors/linear_quantile.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 5d2e3dc74..33ae3f994 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -27,7 +27,7 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: str = None, fill_value: Union[str, int, float] = None, - no_fill_future_values_features: List[str] = [], + no_fill_future_values_features: List[str] = None, ): """Initialize missing values handler. @@ -45,6 +45,8 @@ def __init__( self.missing_values = missing_values self.imputation_strategy = imputation_strategy self.fill_value = fill_value + if no_fill_future_values_features is None: + no_fill_future_values_features = [] self.no_fill_future_values_features = no_fill_future_values_features self.is_fitted_ = False diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py index 37d77e371..2c8ead73a 100644 --- a/openstef/model/regressors/linear_quantile.py +++ b/openstef/model/regressors/linear_quantile.py @@ -47,7 +47,7 @@ def __init__( missing_values: Union[int, float, str, None] = np.nan, imputation_strategy: Optional[str] = "mean", fill_value: Union[str, int, float] = None, - no_fill_future_values_features: List[str] = [], + no_fill_future_values_features: List[str] = None, ): """Initialize LinearQuantileOpenstfRegressor. From 4203538cab56a92cf32e1f8483315733531e8d97 Mon Sep 17 00:00:00 2001 From: Lars Schilders <123180911+lschilders@users.noreply.github.com> Date: Tue, 8 Oct 2024 09:23:19 +0200 Subject: [PATCH 20/21] Update openstef/feature_engineering/missing_values_transformer.py Co-authored-by: Egor Dmitriev Signed-off-by: Lars Schilders <123180911+lschilders@users.noreply.github.com> --- openstef/feature_engineering/missing_values_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py index 33ae3f994..7c46e5192 100644 --- a/openstef/feature_engineering/missing_values_transformer.py +++ b/openstef/feature_engineering/missing_values_transformer.py @@ -45,9 +45,7 @@ def __init__( self.missing_values = missing_values self.imputation_strategy = imputation_strategy self.fill_value = fill_value - if no_fill_future_values_features is None: - no_fill_future_values_features = [] - self.no_fill_future_values_features = no_fill_future_values_features + self.no_fill_future_values_features = no_fill_future_values_features or [] self.is_fitted_ = False # Build the proper imputation transformer From 5f83b6210819ea9f9460d1225afbcd35f47884ef Mon Sep 17 00:00:00 2001 From: lschilders Date: Tue, 8 Oct 2024 09:24:40 +0200 Subject: [PATCH 21/21] add no_fill_future_values_features to model_creator Signed-off-by: lschilders --- openstef/model/model_creator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py index 837c6052a..bd2448889 100644 --- a/openstef/model/model_creator.py +++ b/openstef/model/model_creator.py @@ -116,6 +116,7 @@ "missing_values", "imputation_strategy", "fill_value", + "no_fill_future_values_features", ], MLModelType.ARIMA: [ "backtest_max_horizon",