Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix] disable imputation on future data #562

Merged
merged 22 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
cfa4cd3
postprocess imputation df by putting back trailing nan's
lschilders Sep 27, 2024
ca5f858
add unit test test_no_imputation_for_future_data
lschilders Sep 27, 2024
19f5104
fix unit test test_linear_quantile
lschilders Sep 27, 2024
6d07d84
fix imports in flatliner.py
lschilders Sep 27, 2024
8a96d25
remove columns with future nan values
lschilders Sep 27, 2024
0cba059
implement non_trailing_null_rows in missing_values_transformer and ad…
lschilders Oct 2, 2024
521d77f
also transform labels y with trailing null rows
lschilders Oct 2, 2024
0956e65
test in test_linear_quantile for trailing null
lschilders Oct 2, 2024
7e5a771
Format Python code with Black
actions-user Oct 2, 2024
5a7537d
merge Black formatting in branch
lschilders Oct 2, 2024
4c65890
Merge branch 'main' into fix/disable_imputation_on_future_data
lschilders Oct 2, 2024
2db9afa
remove assert in fit_transform missing_values_transformer
lschilders Oct 2, 2024
a04700e
only train on subset of data in test_pipeline_train_model
lschilders Oct 2, 2024
f73541e
adapt transform(x) to not remove non_trailing_nulls and index with Da…
lschilders Oct 2, 2024
ee53800
test for duplicate indices
lschilders Oct 2, 2024
52017ee
remove non trailing null rows in fit_transform and check in test_impu…
lschilders Oct 2, 2024
24590d7
add test for calling transform separately
lschilders Oct 7, 2024
e66afe8
refactored MissingValuesTransformer with private static method _deter…
lschilders Oct 7, 2024
c9b73dc
add whitelist for no_fill_future_values_features
lschilders Oct 7, 2024
7f710a7
immutable default value for no_fill_future_values_features
lschilders Oct 7, 2024
4203538
Update openstef/feature_engineering/missing_values_transformer.py
lschilders Oct 8, 2024
5f83b62
add no_fill_future_values_features to model_creator
lschilders Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 44 additions & 5 deletions openstef/feature_engineering/missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(
missing_values: Union[int, float, str, None] = np.nan,
imputation_strategy: str = None,
fill_value: Union[str, int, float] = None,
no_fill_future_values_features: List[str] = None,
):
"""Initialize missing values handler.

Expand All @@ -37,11 +38,16 @@ def __init__(
Can be one of "mean", "median", "most_frequent", "constant" or None.
fill_value: When strategy == "constant", fill_value is used to replace all
occurrences of missing_values.

no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.
"""
self.missing_values = missing_values
self.imputation_strategy = imputation_strategy
self.fill_value = fill_value
if no_fill_future_values_features is None:
no_fill_future_values_features = []
self.no_fill_future_values_features = no_fill_future_values_features
lschilders marked this conversation as resolved.
Show resolved Hide resolved
self.is_fitted_ = False

# Build the proper imputation transformer
Expand All @@ -57,6 +63,11 @@ def __init__(
).set_output(transform="pandas")
self.imputer_._validate_params()

@staticmethod
def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
"""Build a boolean mask of the rows that are NOT trailing null rows.

A row counts as a trailing null row when at least one column of ``x`` is
null in that row and has no later non-null value to backward-fill from.
Despite the method name, the returned Series (aligned with the index of
``x``) is True for rows to keep and False for trailing null rows, so it
can be passed straight to ``x.loc[...]``.
"""
return ~x.bfill().isnull().any(axis="columns")

def fit(self, x, y=None):
"""Fit the imputer on the input data."""
_ = check_array(x, force_all_finite="allow-nan")
Expand All @@ -69,9 +80,17 @@ def fit(self, x, y=None):
# Remove always null columns
is_column_null = x.isnull().all(axis="index")
self.non_null_feature_names = list(x.columns[~is_column_null])
x = x[self.non_null_feature_names]

# Remove trailing null rows for features that should
# not be imputed in the future
trailing_null_rows = self._determine_trailing_null_rows(
x[self.no_fill_future_values_features]
)
x = x.loc[trailing_null_rows]

# Imputers do not support labels
self.imputer_.fit(X=x[self.non_null_feature_names], y=None)
self.imputer_.fit(X=x, y=None)
self.is_fitted_ = True

def transform(self, x) -> pd.DataFrame:
Expand All @@ -83,17 +102,37 @@ def transform(self, x) -> pd.DataFrame:

x = x[self.non_null_feature_names]

return self.imputer_.transform(x)
transformed = self.imputer_.transform(x)

def fit_transform(self, x, y=None):
return transformed

def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
"""Fit the imputer on the input data and transform it.

Returns:
The data with missing values imputed.

"""
self.fit(x, y)
return self.transform(x)

if not isinstance(x, pd.DataFrame):
x = pd.DataFrame(np.asarray(x))

x = x[self.non_null_feature_names]

# Remove trailing null rows for features that should
# not be imputed in the future
non_trailing_null_rows = self._determine_trailing_null_rows(
x[self.no_fill_future_values_features]
)
x = x.loc[non_trailing_null_rows]

x = self.transform(x)

if y is not None:
y = y.loc[non_trailing_null_rows]

return x, y

@classmethod
def _identity(cls, x):
Expand Down
7 changes: 1 addition & 6 deletions openstef/model/regressors/flatliner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,13 @@
#
# SPDX-License-Identifier: MPL-2.0
import re
from typing import Dict, Union, Set, Optional, List
from typing import List

import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.linear_model import QuantileRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted

from openstef.feature_engineering.missing_values_transformer import (
MissingValuesTransformer,
)
from openstef.model.regressors.regressor import OpenstfRegressor


Expand Down
9 changes: 7 additions & 2 deletions openstef/model/regressors/linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: MPL-2.0
import re
from typing import Dict, Union, Set, Optional
from typing import Dict, Union, Set, Optional, List

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -47,6 +47,7 @@ def __init__(
missing_values: Union[int, float, str, None] = np.nan,
imputation_strategy: Optional[str] = "mean",
fill_value: Union[str, int, float] = None,
no_fill_future_values_features: List[str] = None,
):
"""Initialize LinearQuantileOpenstfRegressor.

Expand All @@ -69,6 +70,9 @@ def __init__(
missing_values: Value to be considered as missing value
imputation_strategy: Imputation strategy
fill_value: Fill value
no_fill_future_values_features: The features for which it does not make sense
to fill future values. Rows that contain trailing null values for these
features will be removed from the data.

"""
super().__init__()
Expand All @@ -86,6 +90,7 @@ def __init__(
missing_values=missing_values,
imputation_strategy=imputation_strategy,
fill_value=fill_value,
no_fill_future_values_features=no_fill_future_values_features,
)
self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
Expand Down Expand Up @@ -165,7 +170,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
x = self._remove_ignored_features(x)

# Fix nan columns
x = self.imputer_.fit_transform(x)
x, y = self.imputer_.fit_transform(x, y)
if x.isna().any().any():
raise ValueError(
"There are nan values in the input data. Set "
Expand Down
69 changes: 55 additions & 14 deletions test/unit/feature_engineering/test_missing_values_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: MPL-2.0
from test.unit.utils.base import BaseTestCase

import unittest
import pandas as pd
import numpy as np
from sklearn.exceptions import NotFittedError
Expand All @@ -15,41 +14,83 @@
class MissingValuesTransformerTests(BaseTestCase):
def setUp(self):
self.data = pd.DataFrame(
{"A": [1, np.nan, 3], "B": [4, 5, np.nan], "C": [np.nan, np.nan, np.nan]}
{
"A": [np.nan, 2, 3, 4],
"B": [3, np.nan, 4, 5],
"C": [3, 4, 5, np.nan],
"D": [np.nan, np.nan, np.nan, np.nan],
},
index=[0, 1, 1, 2],
)

def test_imputation_with_mean_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
transformed = transformer.fit_transform(self.data)
transformed, _ = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertAlmostEqual(transformed.loc[1, "A"], 2)
self.assertAlmostEqual(transformed.loc[2, "B"], 4.5)
self.assertAlmostEqual(transformed.iloc[0]["A"], 3)
self.assertAlmostEqual(transformed.iloc[1]["B"], 4)

def test_imputation_with_constant_strategy_fills_missing_values(self):
transformer = MissingValuesTransformer(
imputation_strategy="constant", fill_value=0
)
transformed = transformer.fit_transform(self.data)
transformed, _ = transformer.fit_transform(self.data)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertEqual(transformed.loc[1, "A"], 0)
self.assertEqual(transformed.loc[2, "B"], 0)
self.assertEqual(transformed.iloc[0]["A"], 0)
self.assertEqual(transformed.iloc[1]["B"], 0)

def test_columns_always_null_are_removed(self):
transformer = MissingValuesTransformer()
transformer.fit(self.data)
self.assertNotIn("C", transformer.non_null_feature_names)
self.assertNotIn("D", transformer.non_null_feature_names)

def test_determining_non_trailing_null_rows(self):
# The helper is static and is called here on every non-null column, so the
# whitelist passed to the constructor does not influence the mask itself.
transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
transformer.fit(self.data)
non_trailing_null_rows = transformer._determine_trailing_null_rows(
self.data[transformer.non_null_feature_names]
)
# In the fixture only column "C" ends with a null; the nulls in "A" and "B"
# have later values to backward-fill from, so only the last row is False.
pd.testing.assert_series_equal(
non_trailing_null_rows,
pd.Series([True, True, True, False], index=[0, 1, 1, 2]),
)

def test_fitting_with_labels_removes_rows_with_trailing_nulls(self):
# fit_transform applies the same row mask to y as to x, so the label that
# belongs to the last row (trailing null in "C") must be dropped as well.
transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
_, y_transformed = transformer.fit_transform(
self.data, y=pd.Series([1, 2, 3, 4], index=self.data.index)
)
self.assertEqual(y_transformed.tolist(), [1, 2, 3])

def test_non_dataframe_input_is_converted_and_processed(self):
transformer = MissingValuesTransformer(imputation_strategy="mean")
array = np.array([[1, np.nan], [np.nan, 2]])
transformed = transformer.fit_transform(array)
array = np.array([[1, np.nan, np.nan], [np.nan, 2, np.nan]])
transformed, _ = transformer.fit_transform(array)
self.assertIsInstance(transformed, pd.DataFrame)
self.assertEqual(transformed.isnull().sum().sum(), 0)
self.assertEqual(transformed.shape, (2, 2))

def test_fitting_transformer_without_strategy_keeps_data_unchanged(self):
def test_fitting_transformer_without_strategy_keeps_valid_data_unchanged(self):
transformer = MissingValuesTransformer()
transformed = transformer.fit_transform(self.data)
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["C"]))
transformed, _ = transformer.fit_transform(self.data)
pd.testing.assert_frame_equal(transformed, self.data.drop(columns=["D"]))

def test_call_transform_on_fitted_transformer_does_not_remove_trailing_null_rows(
self,
):
transformer = MissingValuesTransformer(no_fill_future_values_features=["C"])
transformer.fit(self.data)
new_data = pd.DataFrame(
{
"A": [1, 2, 3, 4],
"B": [1, 2, 3, 4],
"C": [1, 2, 3, 4],
"D": [1, 2, 3, 4],
},
index=[0, 1, 1, 2],
)
transformed = transformer.transform(new_data)
pd.testing.assert_frame_equal(transformed, new_data.drop(columns=["D"]))

def test_calling_transform_before_fit_raises_error(self):
transformer = MissingValuesTransformer()
Expand Down
15 changes: 11 additions & 4 deletions test/unit/model/regressors/test_linear_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ def test_imputer(self):
# Arrange
n_sample = train_input.shape[0]
X = train_input.iloc[:, 1:].copy(deep=True)
sp = np.ones(n_sample)
sp[-1] = np.nan
X["Sparse"] = sp
X["sparse"] = np.ones(n_sample)
X.loc[X.index[-2], "sparse"] = np.nan
X["sparse_2"] = np.ones(n_sample)
X.loc[X.index[-1], "sparse_2"] = np.nan
model1 = LinearQuantileOpenstfRegressor(imputation_strategy=None)
model2 = LinearQuantileOpenstfRegressor(imputation_strategy="mean")
model2 = LinearQuantileOpenstfRegressor(
imputation_strategy="mean", no_fill_future_values_features=["sparse_2"]
)

# Act
# Model should give error if nan values are present.
Expand All @@ -75,6 +78,10 @@ def test_imputer(self):
X_ = pd.DataFrame(model2.imputer_.transform(X), columns=X.columns)
self.assertTrue((model2.predict(X_) == model2.predict(X)).all())

# check if last row is removed because of trailing null values
X_transformed, _ = model2.imputer_.fit_transform(X)
self.assertEqual(X_transformed.shape[0], n_sample - 1)

def test_value_error_raised(self):
# Check if Value Error is raised when 0.5 is not in the requested quantiles list
with self.assertRaises(ValueError):
Expand Down
7 changes: 3 additions & 4 deletions test/unit/pipeline/test_pipeline_train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@ def test_train_model_pipeline_core_happy_flow(self):
but it can/should include predictors (e.g. weather data)

"""
# Select 50 data points to speedup test
train_input = self.train_input.iloc[:50, :]
# Remove modeltypes which are optional, and add a dummy regressor
for model_type in list(MLModelType) + [__name__ + ".DummyRegressor"]:
with self.subTest(model_type=model_type):
Expand All @@ -136,7 +134,9 @@ def test_train_model_pipeline_core_happy_flow(self):
model_type.value if hasattr(model_type, "value") else model_type
)
model_specs = self.model_specs
train_input = self.train_input

# Select 150 data points to speedup test
train_input = self.train_input.iloc[:150, :]

# Use default parameters
model_specs.hyper_params = {}
Expand All @@ -155,7 +155,6 @@ def test_train_model_pipeline_core_happy_flow(self):
function=split_dummy_arima,
arguments={},
)
train_input = self.train_input[:150]

model, report, modelspecs, _ = train_model_pipeline_core(
pj=pj, model_specs=model_specs, input_data=train_input
Expand Down
Loading