feature(KTP-1279): Changed linear model scaling and improved sample w…

…eighting feature. (#565) * feature(KTP-1279): Changed feature scaling in linear model. Added exponential sample weighting in linear model. Signed-off-by: Egor Dmitriev <[email protected]> * feature(KTP-1279): Added test to change linear model parameters. Signed-off-by: Egor Dmitriev <[email protected]> * style: Code style fixes. Signed-off-by: Egor Dmitriev <[email protected]> * Format Python code with Black Signed-off-by: black <[email protected]> * feature(KTP-1279): Added additional test condition for linear model params. Signed-off-by: Egor Dmitriev <[email protected]> * style: Code style fixes. Signed-off-by: Egor Dmitriev <[email protected]> * Format Python code with Black Signed-off-by: black <[email protected]> * feature(KTP-1279): Added additional test condition for linear model params. Signed-off-by: Egor Dmitriev <[email protected]> * feature(KTP-1279): Added weight floor. Added documentation for sample weight calculation. Signed-off-by: Egor Dmitriev <[email protected]> * Format Python code with Black Signed-off-by: black <[email protected]> * Merge branch 'main' into feature/KTP-1279-linear-sample-weight Signed-off-by: Clara De Smet <[email protected]> * Format Python code with Black Signed-off-by: black <[email protected]> * Fixed linter suggestion Signed-off-by: Clara De Smet <[email protected]> * Added documentation * Format Python code with Black Signed-off-by: black <[email protected]> * Bumped version of black formatting * Updated documentation * Format Python code with Black Signed-off-by: black <[email protected]> * Removed blank line * Format Python code with Black Signed-off-by: black <[email protected]> * Reformatting docs * Reformatting docs --------- Signed-off-by: Egor Dmitriev <[email protected]> Signed-off-by: black <[email protected]> Signed-off-by: Clara De Smet <[email protected]> Signed-off-by: Clara De Smet <[email protected]> Co-authored-by: black <[email protected]> Co-authored-by: Clara De Smet <[email protected]> 
Co-authored-by: Clara De Smet <[email protected]>
OpenSTEF · Oct 11, 2024 · 08c9965 · 08c9965
1 parent 21692ad
commit 08c9965
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 8 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@
 
 repos:
 -   repo: https://github.com/ambv/black
-    rev: 22.1.0
+    rev: 24.3.0
     hooks:
     - id: black
       language_version: python3.11

diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ OpenSTEF is a Python package designed for generating short-term forecasts in the
 pip install openstef
 ```
 
-### Remark regarding installation within a **conda environment on Windows**:
+### Remark regarding installation within a **conda environment on Windows**
 
 A version of the pywin32 package will be installed as a secondary dependency along with the installation of the openstef package. Since conda relies on an old version of pywin32, the new installation can break conda's functionality. The following command can solve this issue:
 ```shell

diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
@@ -41,6 +41,7 @@ def __init__(
             no_fill_future_values_features: The features for which it does not make sense
                 to fill future values. Rows that contain trailing null values for these
                 features will be removed from the data.
+
         """
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy

diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
@@ -116,6 +116,9 @@
         "missing_values",
         "imputation_strategy",
         "fill_value",
+        "weight_scale_percentile",
+        "weight_exponent",
+        "weight_floor",
         "no_fill_future_values_features",
     ],
     ModelType.ARIMA: [

diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from sklearn.base import RegressorMixin
 from sklearn.linear_model import QuantileRegressor
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_is_fitted
 
 from openstef.feature_engineering.missing_values_transformer import (
@@ -25,8 +25,8 @@ class LinearQuantileOpenstfRegressor(OpenstfRegressor, RegressorMixin):
     solver: str
 
     imputer_: MissingValuesTransformer
-    x_scaler_: MinMaxScaler
-    y_scaler_: MinMaxScaler
+    x_scaler_: StandardScaler
+    y_scaler_: StandardScaler
     models_: Dict[float, QuantileRegressor]
 
     is_fitted_: bool = False
@@ -47,6 +47,9 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: Optional[str] = "mean",
         fill_value: Union[str, int, float] = None,
+        weight_scale_percentile: int = 95,
+        weight_exponent: float = 1,
+        weight_floor: float = 0.1,
         no_fill_future_values_features: List[str] = None,
     ):
         """Initialize LinearQuantileOpenstfRegressor.
@@ -70,6 +73,9 @@ def __init__(
             missing_values: Value to be considered as missing value
             imputation_strategy: Imputation strategy
             fill_value: Fill value
+            weight_scale_percentile: Percentile of the absolute target values used to
+                scale the samples before weighting
+            weight_exponent: Exponent applied to the scaled target values to obtain
+                the sample weights
+            weight_floor: Minimum weight for samples
             no_fill_future_values_features: The features for which it does not make sense
                 to fill future values. Rows that contain trailing null values for these
                 features will be removed from the data.
@@ -86,14 +92,17 @@ def __init__(
         self.quantiles = quantiles
         self.alpha = alpha
         self.solver = solver
+        self.weight_scale_percentile = weight_scale_percentile
+        self.weight_exponent = weight_exponent
+        self.weight_floor = weight_floor
         self.imputer_ = MissingValuesTransformer(
             missing_values=missing_values,
             imputation_strategy=imputation_strategy,
             fill_value=fill_value,
             no_fill_future_values_features=no_fill_future_values_features,
         )
-        self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
-        self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
+        self.x_scaler_ = StandardScaler()
+        self.y_scaler_ = StandardScaler()
         self.models_ = {
             quantile: QuantileRegressor(alpha=alpha, quantile=quantile, solver=solver)
             for quantile in quantiles
@@ -182,7 +191,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
         y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0]
 
         # Add more focus on extreme / peak values
-        sample_weight = np.abs(y_scaled)
+        sample_weight = self._calculate_sample_weights(y.values.squeeze())
 
         # Fit quantile regressors
         for quantile in self.quantiles:
@@ -196,6 +205,33 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
 
         return self
 
+    def _calculate_sample_weights(self, y: np.array):
+        """Calculate sample weights based on the y values of arbitrary scale.
+
+        The resulting weights are in the range [weight_floor, 1] and are used to put
+        more emphasis on samples with a large absolute target value. The sample
+        weighting function does:
+
+        * Rescale the absolute values of y by dividing them by their
+          ``weight_scale_percentile``-th percentile. Roughly that percentage of the
+          samples then lies within [0, 1]; the rest is above 1.
+        * Calculate the weight by raising the scaled data to the power
+          ``weight_exponent``.
+          * exponent=0: Results in uniform weights for all samples.
+          * exponent=1: Results in weights that increase linearly with the absolute
+            value of y.
+          * exponent>1: Results in weights that increase polynomially, putting even
+            more emphasis on samples that are closer to the extremes.
+        * Clip the weights to the [weight_floor, 1] range.
+          * Weight floor is used to make sure that all the samples are considered.
+
+        NOTE(review): if the selected percentile of abs(y) is 0 (e.g. y is all
+        zeros) the rescaling divides by zero; zero-valued samples then get a NaN
+        weight, which the clipping does not remove. Confirm callers never pass such
+        a target.
+
+        Args:
+            y: Target values (presumably a 1-dimensional array; verify against
+                the caller).
+
+        Returns:
+            Sample weights computed element-wise from y.
+
+        """
+        return np.clip(
+            _weight_exp(
+                _scale_percentile(y, percentile=self.weight_scale_percentile),
+                exponent=self.weight_exponent,
+            ),
+            a_min=self.weight_floor,
+            a_max=1,
+        )
+
     def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
         """Makes a prediction for a desired quantile.
 
@@ -250,3 +286,11 @@ def _get_param_names(cls):
 
     def __sklearn_is_fitted__(self) -> bool:
         return self.is_fitted_
+
+
+def _scale_percentile(x: np.ndarray, percentile: int = 95):
+    """Scale x by the given percentile of its absolute values.
+
+    Divides x by the ``percentile``-th percentile of abs(x) and returns the absolute
+    value of the result, so the output is non-negative and equals 1 for values whose
+    magnitude matches that percentile.
+
+    NOTE(review): there is no guard for a percentile value of 0 (e.g. when x is all
+    zeros); the division then yields NaN/inf.
+
+    Args:
+        x: Values to scale.
+        percentile: Percentile of abs(x) used as the scaling denominator.
+
+    Returns:
+        The scaled, non-negative values.
+
+    """
+    return np.abs(x / np.percentile(np.abs(x), percentile))
+
+
+def _weight_exp(x: np.ndarray, exponent: float = 1):
+    """Return the absolute value of x raised to the power ``exponent``.
+
+    Args:
+        x: Input values.
+        exponent: Power applied to abs(x).
+
+    Returns:
+        abs(x) ** exponent, computed element-wise.
+
+    """
+    return np.abs(x) ** exponent
diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py
@@ -10,6 +10,7 @@
 from sklearn.utils.estimator_checks import check_estimator
 
 from openstef.feature_engineering.apply_features import apply_features
+from openstef.model.model_creator import ModelCreator
 from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor
 from test.unit.utils.base import BaseTestCase
 from test.unit.utils.data import TestData
@@ -151,3 +152,22 @@ def test_ignore_features(self):
         self.assertNotIn("E1B_AMI_I", input_data_filtered.columns)
         self.assertNotIn("E4A_I", input_data_filtered.columns)
         self.assertIn("load", input_data_filtered.columns)
+
+    def test_create_model(self):
+        """Check that ModelCreator passes the sample weight parameters to the model.
+
+        Only weight_scale_percentile and weight_exponent are set and asserted here;
+        weight_floor is not covered by this test.
+        """
+        # Arrange
+        kwargs = {
+            "weight_scale_percentile": 50,
+            "weight_exponent": 2,
+        }
+
+        # Act
+        model = ModelCreator.create_model(
+            model_type="linear_quantile",
+            quantiles=[0.5],
+            **kwargs,
+        )
+
+        # Assert
+        self.assertIsInstance(model, LinearQuantileOpenstfRegressor)
+        self.assertEqual(model.weight_scale_percentile, 50)
+        self.assertEqual(model.weight_exponent, 2)