Skip to content

Commit

Permalink
[fix] disable imputation on future data (#562)
Browse files Browse the repository at this point in the history
* postprocess imputation df by putting back trailing NaNs

Signed-off-by: lschilders <[email protected]>

* add unit test test_no_imputation_for_future_data

Signed-off-by: lschilders <[email protected]>

* fix unit test test_linear_quantile

Signed-off-by: lschilders <[email protected]>

* fix imports in flatliner.py

Signed-off-by: lschilders <[email protected]>

* remove columns with future nan values

Signed-off-by: lschilders <[email protected]>

* implement non_trailing_null_rows in missing_values_transformer and add unit tests

Signed-off-by: lschilders <[email protected]>

* also transform labels y with trailing null rows

Signed-off-by: lschilders <[email protected]>

* test in test_linear_quantile for trailing null

Signed-off-by: lschilders <[email protected]>

* Format Python code with Black

Signed-off-by: black <[email protected]>

* merge Black formatting in branch

Signed-off-by: lschilders <[email protected]>

* remove assert in fit_transform missing_values_transformer

Signed-off-by: lschilders <[email protected]>

* only train on subset of data in test_pipeline_train_model

Signed-off-by: lschilders <[email protected]>

* adapt transform(x) to not remove non_trailing_nulls and index with DataFrame rather than list of index

Signed-off-by: lschilders <[email protected]>

* test for duplicate indices

Signed-off-by: lschilders <[email protected]>

* remove non trailing null rows in fit_transform and check in test_imputer of test_linear_quantile

Signed-off-by: lschilders <[email protected]>

* add test for calling transform separately

Signed-off-by: lschilders <[email protected]>

* refactored MissingValuesTransformer with private static method _determine_trailing_null_rows

Signed-off-by: lschilders <[email protected]>

* add whitelist for no_fill_future_values_features

Signed-off-by: lschilders <[email protected]>

* immutable default value for no_fill_future_values_features

* Update openstef/feature_engineering/missing_values_transformer.py

Co-authored-by: Egor Dmitriev <[email protected]>
Signed-off-by: Lars Schilders <[email protected]>

* add no_fill_future_values_features to model_creator

Signed-off-by: lschilders <[email protected]>

---------

Signed-off-by: lschilders <[email protected]>
Signed-off-by: black <[email protected]>
Signed-off-by: Lars Schilders <[email protected]>
Co-authored-by: black <[email protected]>
Co-authored-by: Egor Dmitriev <[email protected]>
  • Loading branch information
3 people authored Oct 9, 2024
1 parent 75ffb01 commit 430fa9e
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 35 deletions.
47 changes: 42 additions & 5 deletions openstef/feature_engineering/missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(
missing_values: Union[int, float, str, None] = np.nan,
imputation_strategy: str = None,
fill_value: Union[str, int, float] = None,
no_fill_future_values_features: List[str] = None,
):
"""Initialize missing values handler.
Expand All @@ -37,11 +38,14 @@ def __init__(
Can be one of "mean", "median", "most_frequent", "constant" or None.
fill_value: When strategy == "constant", fill_value is used to replace all
occurrences of missing_values.
no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.
"""
self.missing_values = missing_values
self.imputation_strategy = imputation_strategy
self.fill_value = fill_value
self.no_fill_future_values_features = no_fill_future_values_features or []
self.is_fitted_ = False

# Build the proper imputation transformer
Expand All @@ -57,6 +61,11 @@ def __init__(
).set_output(transform="pandas")
self.imputer_._validate_params()

@staticmethod
def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
    """Build the mask of rows that are NOT part of a trailing run of nulls.

    Despite the method name, the returned mask marks the rows to keep: it
    is meant to be used directly as ``x.loc[mask]``.

    Args:
        x: The columns to inspect. Row order is taken as-is; the index is
            not sorted and may contain duplicate labels.

    Returns:
        A boolean Series sharing the index of ``x``. A row is ``False`` when
        at least one column is null in that row and in every later row, and
        ``True`` otherwise. When ``x`` has no columns, every row is ``True``.
    """
    # Back-filling closes every gap that is followed by a valid value, so the
    # only nulls left afterwards are the ones at the tail of a column.
    return x.bfill().notna().all(axis="columns")

def fit(self, x, y=None):
"""Fit the imputer on the input data."""
_ = check_array(x, force_all_finite="allow-nan")
Expand All @@ -69,9 +78,17 @@ def fit(self, x, y=None):
# Remove always null columns
is_column_null = x.isnull().all(axis="index")
self.non_null_feature_names = list(x.columns[~is_column_null])
x = x[self.non_null_feature_names]

# Remove trailing null rows for features that should
# not be imputed in the future
trailing_null_rows = self._determine_trailing_null_rows(
x[self.no_fill_future_values_features]
)
x = x.loc[trailing_null_rows]

# Imputers do not support labels
self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
self.imputer_.fit(X=x, y=None)
self.is_fitted_ = True

def transform(self, x) -> pd.DataFrame:
Expand All @@ -83,17 +100,37 @@ def transform(self, x) -> pd.DataFrame:

x = x[self.non_null_feature_names]

return self.imputer_.transform(x)
transformed = self.imputer_.transform(x)

def fit_transform(self, x, y=None):
return transformed

def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
"""Fit the imputer on the input data and transform it.
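Returns:
A tuple ``(x, y)``: the imputed data without the rows that hold trailing
null values for ``no_fill_future_values_features``, and the labels
restricted to the same rows (``None`` when no labels were passed).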
"""
self.fit(x, y)
return self.transform(x)

if not isinstance(x, pd.DataFrame):
x = pd.DataFrame(np.asarray(x))

x = x[self.non_null_feature_names]

# Remove trailing null rows for features that should
# not be imputed in the future
non_trailing_null_rows = self._determine_trailing_null_rows(
x[self.no_fill_future_values_features]
)
x = x.loc[non_trailing_null_rows]

x = self.transform(x)

if y is not None:
y = y.loc[non_trailing_null_rows]

return x, y

@classmethod
def _identity(cls, x):
Expand Down
1 change: 1 addition & 0 deletions openstef/model/model_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
"missing_values",
"imputation_strategy",
"fill_value",
"no_fill_future_values_features",
],
ModelType.ARIMA: [
"backtest_max_horizon",
Expand Down
7 changes: 1 addition & 6 deletions openstef/model/regressors/flatliner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,13 @@
#
# SPDX-License-Identifier: MPL-2.0
import re
from typing import Dict, Union, Set, Optional, List
from typing import List

import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.linear_model import QuantileRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted

from openstef.feature_engineering.missing_values_transformer import (
MissingValuesTransformer,
)
from openstef.model.regressors.regressor import OpenstfRegressor


Expand Down
9 changes: 7 additions & 2 deletions openstef/model/regressors/linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: MPL-2.0
import re
from typing import Dict, Union, Set, Optional
from typing import Dict, Union, Set, Optional, List

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -47,6 +47,7 @@ def __init__(
missing_values: Union[int, float, str, None] = np.nan,
imputation_strategy: Optional[str] = "mean",
fill_value: Union[str, int, float] = None,
no_fill_future_values_features: List[str] = None,
):
"""Initialize LinearQuantileOpenstfRegressor.
Expand All @@ -69,6 +70,9 @@ def __init__(
missing_values: Value to be considered as missing value
imputation_strategy: Imputation strategy
fill_value: Fill value
no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.
"""
super().__init__()
Expand All @@ -86,6 +90,7 @@ def __init__(
missing_values=missing_values,
imputation_strategy=imputation_strategy,
fill_value=fill_value,
no_fill_future_values_features=no_fill_future_values_features,
)
self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
Expand Down Expand Up @@ -165,7 +170,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
x = self._remove_ignored_features(x)

# Fix nan columns
x = self.imputer_.fit_transform(x)
x, y = self.imputer_.fit_transform(x, y)
if x.isna().any().any():
raise ValueError(
"There are nan values in the input data. Set "
Expand Down
69 changes: 55 additions & 14 deletions test/unit/feature_engineering/test_missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: MPL-2.0
from test.unit.utils.base import BaseTestCase

import unittest
import pandas as pd
import numpy as np
from sklearn.exceptions import NotFittedError
Expand All @@ -15,41 +14,83 @@
class MissingValuesTransformerTests(BaseTestCase):
def setUp(self):
self.data = pd.DataFrame(
{"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
{
"A": [np.nan, 2, 3, 4],
"B": [3, np.nan, 4, 5],
"C": [3, 4, 5, np.nan],
"D": [np.nan, np.nan, np.nan, np.nan],
},
index=[0, 1, 1, 2],
)

def test_imputation_with_mean_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
transformed = transformer.fit_transform(self.data)
transformed, _ = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertAlmostEqual(transformed.loc[1, "A"], 2)
self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)
self.assertAlmostEqual(transformed.iloc[0]["A"], 3)
self.assertAlmostEqual(transformed.iloc[1]["B"], 4)

def test_imputation_with_constant_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(
imputation_strategy="constant", fill_value=0
)
transformed = transformer.fit_transform(self.data)
transformed, _ = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertEqual(transformed.loc[1, "A"], 0)
self.assertEqual(transformed.loc[2, "B"], 0)
self.assertEqual(transformed.iloc[0]["A"], 0)
self.assertEqual(transformed.iloc[1]["B"], 0)

def test_columns_always_null_are_removed(self):
transformer = MissingValuesTransformer()
transformer.fit(self.data)
self.assertNotIn("C", transformer.non_null_feature_names)
self.assertNotIn("D", transformer.non_null_feature_names)

def test_determining_non_trailing_null_rows(self):
    """Check the keep-mask returned by ``_determine_trailing_null_rows``.

    In the ``setUp`` frame, column "C" is the only non-dropped column that
    ends in NaN, so the mask is expected to be ``True`` for the first three
    rows and ``False`` for the last one.
    """
    transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
    transformer.fit(self.data)
    # The helper is applied to every column that survived the always-null
    # filter in ``fit``, not only to "C".
    non_trailing_null_rows = transformer._determine_trailing_null_rows(
        self.data[transformer.non_null_feature_names]
    )
    pd.testing.assert_series_equal(
        non_trailing_null_rows,
        # The expected index repeats the label 1, matching the fixture index.
        pd.Series([True, True, True, False], index=[0, 1, 1, 2]),
    )

def test_fitting_with_labels_removes_rows_with_trailing_nulls(self):
    """Check that ``fit_transform`` drops the label of a trailing-null row.

    Column "C" ends in NaN in the ``setUp`` frame, so the fourth label is
    expected to be removed together with that row.
    """
    transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
    # Only the returned labels are inspected; the transformed frame is ignored.
    _, y_transformed = transformer.fit_transform(
        self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index)
    )
    self.assertEqual(y_transformed.tolist(), [1, 2, 3])

def test_non_dataframe_input_is_converted_and_processed(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
array = np.array([[1, np.nan], [np.nan, 2]])
transformed = transformer.fit_transform(array)
array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]])
transformed, _ = transformer.fit_transform(array)
self.assertIsInstance(transformed, pd.DataFrame)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertEqual(transformed.shape, (2, 2))

def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self):
transformer = MissingValuesTransformer()
transformed = transformer.fit_transform(self.data)
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))
transformed, _ = transformer.fit_transform(self.data)
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"]))

def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows(
    self,
):
    """Check that ``transform`` on a fitted transformer keeps every row.

    The transformer is fitted on the ``setUp`` frame, where column "D" is
    entirely null, and then applied to a new frame. The result is expected
    to equal the new frame without column "D".

    NOTE(review): ``new_data`` contains no null values at all, so this
    verifies the column drop and that rows are left untouched; it does not
    exercise an input that actually has trailing nulls in "C".
    """
    transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
    transformer.fit(self.data)
    new_data = pd.DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": [1, 2, 3, 4],
            "C": [1, 2, 3, 4],
            "D": [1, 2, 3, 4],
        },
        # Same duplicate-label index as the fixture used for fitting.
        index=[0, 1, 1, 2],
    )
    transformed = transformer.transform(new_data)
    pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"]))

def test_calling_transform_before_fit_raises_error(self):
transformer = MissingValuesTransformer()
Expand Down
15 changes: 11 additions & 4 deletions test/unit/model/regressors/test_linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ def test_imputer(self):
# Arrange
n_sample = train_input.shape[0]
X = train_input.iloc[:, 1:].copy(deep=True)
sp = np.ones(n_sample)
sp[-1] = np.nan
X["Sparse"] = sp
X["sparse"] = np.ones(n_sample)
X.loc[X.index[-2], "sparse"] = np.nan
X["sparse_2"] = np.ones(n_sample)
X.loc[X.index[-1], "sparse_2"] = np.nan
model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None)
model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean")
model2 = LinearQuantileOpenstfRegressor(
imputation_strategy="mean", no_fill_future_values_features=["sparse_2"]
)

# Act
# Model should give error if nan values are present.
Expand All @@ -75,6 +78,10 @@ def test_imputer(self):
X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns)
self.assertTrue((model2.predict(X_) == model2.predict(X)).all())

# check if last row is removed because of trailing null values
X_transformed, _ = model2.imputer_.fit_transform(X)
self.assertEqual(X_transformed.shape[0], n_sample - 1)

def test_value_error_raised(self):
# Check if Value Error is raised when 0.5 is not in the requested quantiles list
with self.assertRaises(ValueError):
Expand Down
7 changes: 3 additions & 4 deletions test/unit/pipeline/test_pipeline_train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self):
but it can/should include predictors (e.g. weather data)
"""
# Select 50 data points to speedup test
train_input = self.train_input.iloc[:50, :]
# Remove modeltypes which are optional, and add a dummy regressor
for model_type in list(ModelType) + [__name__ + ".DummyRegressor"]:
with self.subTest(model_type=model_type):
Expand All @@ -136,7 +134,9 @@ def test_train_model_pipeline_core_happy_flow(self):
model_type.value if hasattr(model_type, "value") else model_type
)
model_specs = self.model_specs
train_input = self.train_input

# Select 150 data points to speedup test
train_input = self.train_input.iloc[:150, :]

# Use default parameters
model_specs.hyper_params = {}
Expand All @@ -155,7 +155,6 @@ def test_train_model_pipeline_core_happy_flow(self):
function=split_dummy_arima,
arguments={},
)
train_input = self.train_input[:150]

model, report, modelspecs, _ = train_model_pipeline_core(
pj=pj, model_specs=model_specs, input_data=train_input
Expand Down

0 comments on commit 430fa9e

Please sign in to comment.