From 08c9965f9c9afca3b5718ace12f469eb7dadee5e Mon Sep 17 00:00:00 2001
From: Egor Dmitriev
Date: Fri, 11 Oct 2024 15:31:17 +0200
Subject: [PATCH] feature(KTP-1279): Changed linear model scaling and improved
 sample weighting feature. (#565)

* feature(KTP-1279): Changed feature scaling in linear model. Added exponential sample weighting in linear model.

Signed-off-by: Egor Dmitriev

* feature(KTP-1279): Added test to change linear model parameters.

Signed-off-by: Egor Dmitriev

* style: Code style fixes.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* feature(KTP-1279): Added additional test condition for linear model params.

Signed-off-by: Egor Dmitriev

* style: Code style fixes.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* feature(KTP-1279): Added additional test condition for linear model params.

Signed-off-by: Egor Dmitriev

* feature(KTP-1279): Added weight floor. Added documentation for sample weight calculation.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* Merge branch 'main' into feature/KTP-1279-linear-sample-weight

Signed-off-by: Clara De Smet

* Format Python code with Black

Signed-off-by: black

* Fixed linter suggestion

Signed-off-by: Clara De Smet

* Added documentation

* Format Python code with Black

Signed-off-by: black

* Bumped version of black formatting

* Updated documentation

* Format Python code with Black

Signed-off-by: black

* Removed blank line

* Format Python code with Black

Signed-off-by: black

* Reformatting docs

* Reformatting docs

---------

Signed-off-by: Egor Dmitriev
Signed-off-by: black
Signed-off-by: Clara De Smet
Signed-off-by: Clara De Smet <157587243+clara-de-smet@users.noreply.github.com>
Co-authored-by: black
Co-authored-by: Clara De Smet
Co-authored-by: Clara De Smet <157587243+clara-de-smet@users.noreply.github.com>
---
 .pre-commit-config.yaml                          |  2 +-
 README.md                                        |  2 +-
 .../missing_values_transformer.py                |  1 +
 openstef/model/model_creator.py                  |  3 +
 openstef/model/regressors/linear_quantile.py     | 56 +++++++++++++++++--
 .../model/regressors/test_linear_quantile.py     | 20 +++++++
 6 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 195389c2..807f4479 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
   - repo: https://github.com/ambv/black
-    rev: 22.1.0
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3.11
diff --git a/README.md b/README.md
index c41a8e0b..cabf6699 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ OpenSTEF is a Python package designed for generating short-term forecasts in the
 pip install openstef
 ```
 
-### Remark regarding installation within a **conda environment on Windows**:
+### Remark regarding installation within a **conda environment on Windows**
 A version of the pywin32 package will be installed as a secondary dependency along with the installation of the openstef package. Since conda relies on an old version of pywin32, the new installation can break conda's functionality.
 The following command can solve this issue:
 ```shell
diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
index 7c46e519..faa3b2fc 100644
--- a/openstef/feature_engineering/missing_values_transformer.py
+++ b/openstef/feature_engineering/missing_values_transformer.py
@@ -41,6 +41,7 @@ def __init__(
             no_fill_future_values_features: The features for which it does not make
                 sense to fill future values. Rows that contain trailing null values
                 for these features will be removed from the data.
+
         """
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy
diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
index e15b225b..32c10886 100644
--- a/openstef/model/model_creator.py
+++ b/openstef/model/model_creator.py
@@ -116,6 +116,9 @@
         "missing_values",
         "imputation_strategy",
         "fill_value",
+        "weight_scale_percentile",
+        "weight_exponent",
+        "weight_floor",
         "no_fill_future_values_features",
     ],
     ModelType.ARIMA: [
diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
index 2c8ead73..aa4fc57d 100644
--- a/openstef/model/regressors/linear_quantile.py
+++ b/openstef/model/regressors/linear_quantile.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from sklearn.base import RegressorMixin
 from sklearn.linear_model import QuantileRegressor
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_is_fitted
 
 from openstef.feature_engineering.missing_values_transformer import (
@@ -25,8 +25,8 @@ class LinearQuantileOpenstfRegressor(OpenstfRegressor, RegressorMixin):
     solver: str
 
     imputer_: MissingValuesTransformer
-    x_scaler_: MinMaxScaler
-    y_scaler_: MinMaxScaler
+    x_scaler_: StandardScaler
+    y_scaler_: StandardScaler
     models_: Dict[float, QuantileRegressor]
 
     is_fitted_: bool = False
@@ -47,6 +47,9 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: Optional[str] = "mean",
         fill_value: Union[str, int, float] = None,
+        weight_scale_percentile: int = 95,
+        weight_exponent: float = 1,
+        weight_floor: float = 0.1,
         no_fill_future_values_features: List[str] = None,
     ):
         """Initialize LinearQuantileOpenstfRegressor.
@@ -70,6 +73,9 @@ def __init__(
             missing_values: Value to be considered as missing value
             imputation_strategy: Imputation strategy
             fill_value: Fill value
+            weight_scale_percentile: Percentile used in scaling of the samples
+            weight_exponent: Exponent used in sample weighing
+            weight_floor: Minimum weight for samples
             no_fill_future_values_features: The features for which it does not make
                 sense to fill future values. Rows that contain trailing null values
                 for these features will be removed from the data.
@@ -86,14 +92,17 @@ def __init__(
         self.quantiles = quantiles
         self.alpha = alpha
         self.solver = solver
+        self.weight_scale_percentile = weight_scale_percentile
+        self.weight_exponent = weight_exponent
+        self.weight_floor = weight_floor
         self.imputer_ = MissingValuesTransformer(
             missing_values=missing_values,
             imputation_strategy=imputation_strategy,
             fill_value=fill_value,
             no_fill_future_values_features=no_fill_future_values_features,
         )
-        self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
-        self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
+        self.x_scaler_ = StandardScaler()
+        self.y_scaler_ = StandardScaler()
         self.models_ = {
             quantile: QuantileRegressor(alpha=alpha, quantile=quantile, solver=solver)
             for quantile in quantiles
@@ -182,7 +191,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
         y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0]
 
         # Add more focus on extreme / peak values
-        sample_weight = np.abs(y_scaled)
+        sample_weight = self._calculate_sample_weights(y.values.squeeze())
 
         # Fit quantile regressors
         for quantile in self.quantiles:
@@ -196,6 +205,33 @@
 
         return self
 
+    def _calculate_sample_weights(self, y: np.array):
+        """Calculate sample weights based on the y values of arbitrary scale.
+
+        The resulting weights are in the range [0,1] and are used to put more emphasis
+        on certain samples. The sample weighting function does:
+
+        * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will
+          be within this range. Rest is outside.
+        * Calculate the weight by taking the exponent of scaled data.
+            * exponent=0: Results in uniform weights for all samples.
+            * exponent=1: Results in linearly increasing weights for samples that are
+              closer to the extremes.
+            * exponent>1: Results in exponentially increasing weights for samples that
+              are closer to the extremes.
+        * Clip the data to [0, 1] range with weight_floor as the minimum weight.
+            * Weight floor is used to make sure that all the samples are considered.
+
+        """
+        return np.clip(
+            _weight_exp(
+                _scale_percentile(y, percentile=self.weight_scale_percentile),
+                exponent=self.weight_exponent,
+            ),
+            a_min=self.weight_floor,
+            a_max=1,
+        )
+
     def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
         """Makes a prediction for a desired quantile.
@@ -250,3 +286,11 @@ def _get_param_names(cls):
 
     def __sklearn_is_fitted__(self) -> bool:
         return self.is_fitted_
+
+
+def _scale_percentile(x: np.ndarray, percentile: int = 95):
+    return np.abs(x / np.percentile(np.abs(x), percentile))
+
+
+def _weight_exp(x: np.ndarray, exponent: float = 1):
+    return np.abs(x) ** exponent
diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py
index 50ea32bb..08be9975 100644
--- a/test/unit/model/regressors/test_linear_quantile.py
+++ b/test/unit/model/regressors/test_linear_quantile.py
@@ -10,6 +10,7 @@
 from sklearn.utils.estimator_checks import check_estimator
 
 from openstef.feature_engineering.apply_features import apply_features
+from openstef.model.model_creator import ModelCreator
 from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor
 from test.unit.utils.base import BaseTestCase
 from test.unit.utils.data import TestData
@@ -151,3 +152,22 @@ def test_ignore_features(self):
         self.assertNotIn("E1B_AMI_I", input_data_filtered.columns)
         self.assertNotIn("E4A_I", input_data_filtered.columns)
         self.assertIn("load", input_data_filtered.columns)
+
+    def test_create_model(self):
+        # Arrange
+        kwargs = {
+            "weight_scale_percentile": 50,
+            "weight_exponent": 2,
+        }
+
+        # Act
+        model = ModelCreator.create_model(
+            model_type="linear_quantile",
+            quantiles=[0.5],
+            **kwargs,
+        )
+
+        # Assert
+        self.assertIsInstance(model, LinearQuantileOpenstfRegressor)
+        self.assertEqual(model.weight_scale_percentile, 50)
+        self.assertEqual(model.weight_exponent, 2)
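The weighting scheme this patch introduces in `_calculate_sample_weights` can be tried out in isolation. The sketch below is a minimal standalone version, assuming only numpy is available; the helper names mirror `_scale_percentile` and `_weight_exp` from the diff, and the example target series is made up purely for illustration.

```python
import numpy as np


def scale_percentile(x: np.ndarray, percentile: int = 95) -> np.ndarray:
    # Rescale so that `percentile`% of |x| falls within [0, 1]; the remaining
    # extreme values end up above 1 before clipping.
    return np.abs(x / np.percentile(np.abs(x), percentile))


def weight_exp(x: np.ndarray, exponent: float = 1) -> np.ndarray:
    # exponent=0 gives uniform weights, exponent=1 linearly increasing weights,
    # exponent>1 emphasizes samples close to the extremes even more.
    return np.abs(x) ** exponent


def calculate_sample_weights(
    y: np.ndarray,
    weight_scale_percentile: int = 95,
    weight_exponent: float = 1,
    weight_floor: float = 0.1,
) -> np.ndarray:
    # Clip to [weight_floor, 1] so every sample keeps a non-zero weight.
    return np.clip(
        weight_exp(
            scale_percentile(y, percentile=weight_scale_percentile),
            exponent=weight_exponent,
        ),
        a_min=weight_floor,
        a_max=1,
    )


if __name__ == "__main__":
    # Illustrative load-like target: mostly moderate values with a few peaks.
    y = np.array([0.2, -0.1, 0.5, 3.0, -4.5, 0.0, 1.2])
    print(calculate_sample_weights(y, weight_scale_percentile=95, weight_exponent=2))
    # The largest |y| values get the highest weights (clipped at 1), while the
    # small values fall back to the 0.1 floor instead of being ignored.
```

With `weight_exponent=2` the peak values dominate the fit of the per-quantile regressors, while `weight_floor=0.1` keeps the remaining samples from dropping out of the loss entirely, as described in the docstring added by the patch.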