From 08c9965f9c9afca3b5718ace12f469eb7dadee5e Mon Sep 17 00:00:00 2001
From: Egor Dmitriev
Date: Fri, 11 Oct 2024 15:31:17 +0200
Subject: [PATCH] feature(KTP-1279): Changed linear model scaling and improved
 sample weighting feature. (#565)

* feature(KTP-1279): Changed feature scaling in linear model. Added exponential sample weighting in linear model.

Signed-off-by: Egor Dmitriev

* feature(KTP-1279): Added test to change linear model parameters.

Signed-off-by: Egor Dmitriev

* style: Code style fixes.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* feature(KTP-1279): Added additional test condition for linear model params.

Signed-off-by: Egor Dmitriev

* style: Code style fixes.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* feature(KTP-1279): Added additional test condition for linear model params.

Signed-off-by: Egor Dmitriev

* feature(KTP-1279): Added weight floor. Added documentation for sample weight calculation.

Signed-off-by: Egor Dmitriev

* Format Python code with Black

Signed-off-by: black

* Merge branch 'main' into feature/KTP-1279-linear-sample-weight

Signed-off-by: Clara De Smet

* Format Python code with Black

Signed-off-by: black

* Fixed linter suggestion

Signed-off-by: Clara De Smet

* Added documentation

* Format Python code with Black

Signed-off-by: black

* Bumped version of black formatting

* Updated documentation

* Format Python code with Black

Signed-off-by: black

* Removed blank line

* Format Python code with Black

Signed-off-by: black

* Reformatting docs

* Reformatting docs

---------

Signed-off-by: Egor Dmitriev
Signed-off-by: black
Signed-off-by: Clara De Smet
Signed-off-by: Clara De Smet <157587243+clara-de-smet@users.noreply.github.com>
Co-authored-by: black
Co-authored-by: Clara De Smet
Co-authored-by: Clara De Smet <157587243+clara-de-smet@users.noreply.github.com>
---
 .pre-commit-config.yaml                          |  2 +-
 README.md                                        |  2 +-
 .../missing_values_transformer.py                |  1 +
 openstef/model/model_creator.py                  |  3 +
 openstef/model/regressors/linear_quantile.py     | 56 +++++++++++++++++--
 .../model/regressors/test_linear_quantile.py     | 20 +++++++
 6 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 195389c2..807f4479 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
   - repo: https://github.com/ambv/black
-    rev: 22.1.0
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3.11
diff --git a/README.md b/README.md
index c41a8e0b..cabf6699 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ OpenSTEF is a Python package designed for generating short-term forecasts in the
 pip install openstef
 ```
 
-### Remark regarding installation within a **conda environment on Windows**:
+### Remark regarding installation within a **conda environment on Windows**
 A version of the pywin32 package will be installed as a secondary dependency along with the installation of the openstef package. Since conda relies on an old version of pywin32, the new installation can break conda's functionality.
 The following command can solve this issue:
 ```shell
diff --git a/openstef/feature_engineering/missing_values_transformer.py b/openstef/feature_engineering/missing_values_transformer.py
index 7c46e519..faa3b2fc 100644
--- a/openstef/feature_engineering/missing_values_transformer.py
+++ b/openstef/feature_engineering/missing_values_transformer.py
@@ -41,6 +41,7 @@ def __init__(
             no_fill_future_values_features: The features for which it does not make
                 sense to fill future values. Rows that contain trailing null values
                 for these features will be removed from the data.
+
         """
         self.missing_values = missing_values
         self.imputation_strategy = imputation_strategy
diff --git a/openstef/model/model_creator.py b/openstef/model/model_creator.py
index e15b225b..32c10886 100644
--- a/openstef/model/model_creator.py
+++ b/openstef/model/model_creator.py
@@ -116,6 +116,9 @@
         "missing_values",
         "imputation_strategy",
         "fill_value",
+        "weight_scale_percentile",
+        "weight_exponent",
+        "weight_floor",
         "no_fill_future_values_features",
     ],
     ModelType.ARIMA: [
diff --git a/openstef/model/regressors/linear_quantile.py b/openstef/model/regressors/linear_quantile.py
index 2c8ead73..aa4fc57d 100644
--- a/openstef/model/regressors/linear_quantile.py
+++ b/openstef/model/regressors/linear_quantile.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from sklearn.base import RegressorMixin
 from sklearn.linear_model import QuantileRegressor
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_is_fitted
 
 from openstef.feature_engineering.missing_values_transformer import (
@@ -25,8 +25,8 @@ class LinearQuantileOpenstfRegressor(OpenstfRegressor, RegressorMixin):
     solver: str
 
     imputer_: MissingValuesTransformer
-    x_scaler_: MinMaxScaler
-    y_scaler_: MinMaxScaler
+    x_scaler_: StandardScaler
+    y_scaler_: StandardScaler
     models_: Dict[float, QuantileRegressor]
 
     is_fitted_: bool = False
@@ -47,6 +47,9 @@ def __init__(
         missing_values: Union[int, float, str, None] = np.nan,
         imputation_strategy: Optional[str] = "mean",
         fill_value: Union[str, int, float] = None,
+        weight_scale_percentile: int = 95,
+        weight_exponent: float = 1,
+        weight_floor: float = 0.1,
         no_fill_future_values_features: List[str] = None,
     ):
         """Initialize LinearQuantileOpenstfRegressor.
@@ -70,6 +73,9 @@ def __init__(
             missing_values: Value to be considered as missing value
             imputation_strategy: Imputation strategy
             fill_value: Fill value
+            weight_scale_percentile: Percentile used in scaling of the samples
+            weight_exponent: Exponent used in sample weighing
+            weight_floor: Minimum weight for samples
             no_fill_future_values_features: The features for which it does not make
                 sense to fill future values. Rows that contain trailing null values
                 for these features will be removed from the data.
@@ -86,14 +92,17 @@ def __init__(
         self.quantiles = quantiles
         self.alpha = alpha
         self.solver = solver
+        self.weight_scale_percentile = weight_scale_percentile
+        self.weight_exponent = weight_exponent
+        self.weight_floor = weight_floor
         self.imputer_ = MissingValuesTransformer(
             missing_values=missing_values,
             imputation_strategy=imputation_strategy,
             fill_value=fill_value,
             no_fill_future_values_features=no_fill_future_values_features,
         )
-        self.x_scaler_ = MinMaxScaler(feature_range=(-1, 1))
-        self.y_scaler_ = MinMaxScaler(feature_range=(-1, 1))
+        self.x_scaler_ = StandardScaler()
+        self.y_scaler_ = StandardScaler()
         self.models_ = {
             quantile: QuantileRegressor(alpha=alpha, quantile=quantile, solver=solver)
             for quantile in quantiles
@@ -182,7 +191,7 @@ def fit(self, x: pd.DataFrame, y: pd.Series, **kwargs) -> RegressorMixin:
         y_scaled = self.y_scaler_.fit_transform(y.to_frame())[:, 0]
 
         # Add more focus on extreme / peak values
-        sample_weight = np.abs(y_scaled)
+        sample_weight = self._calculate_sample_weights(y.values.squeeze())
 
         # Fit quantile regressors
         for quantile in self.quantiles:
@@ -196,6 +205,33 @@
 
         return self
 
+    def _calculate_sample_weights(self, y: np.array):
+        """Calculate sample weights based on the y values of arbitrary scale.
+
+        The resulting weights are in the range [0,1] and are used to put more emphasis
+        on certain samples. The sample weighting function does:
+
+        * Rescale data to a [-1, 1] range using quantile scaling. 90% of the data will
+          be within this range. Rest is outside.
+        * Calculate the weight by taking the exponent of scaled data.
+            * exponent=0: Results in uniform weights for all samples.
+            * exponent=1: Results in linearly increasing weights for samples that are
+              closer to the extremes.
+            * exponent>1: Results in exponentially increasing weights for samples that
+              are closer to the extremes.
+        * Clip the data to [0, 1] range with weight_floor as the minimum weight.
+            * Weight floor is used to make sure that all the samples are considered.
+
+        """
+        return np.clip(
+            _weight_exp(
+                _scale_percentile(y, percentile=self.weight_scale_percentile),
+                exponent=self.weight_exponent,
+            ),
+            a_min=self.weight_floor,
+            a_max=1,
+        )
+
     def predict(self, x: pd.DataFrame, quantile: float = 0.5, **kwargs) -> np.array:
         """Makes a prediction for a desired quantile.
@@ -250,3 +286,11 @@ def _get_param_names(cls):
 
     def __sklearn_is_fitted__(self) -> bool:
         return self.is_fitted_
+
+
+def _scale_percentile(x: np.ndarray, percentile: int = 95):
+    return np.abs(x / np.percentile(np.abs(x), percentile))
+
+
+def _weight_exp(x: np.ndarray, exponent: float = 1):
+    return np.abs(x) ** exponent
diff --git a/test/unit/model/regressors/test_linear_quantile.py b/test/unit/model/regressors/test_linear_quantile.py
index 50ea32bb..08be9975 100644
--- a/test/unit/model/regressors/test_linear_quantile.py
+++ b/test/unit/model/regressors/test_linear_quantile.py
@@ -10,6 +10,7 @@
 from sklearn.utils.estimator_checks import check_estimator
 
 from openstef.feature_engineering.apply_features import apply_features
+from openstef.model.model_creator import ModelCreator
 from openstef.model.regressors.linear_quantile import LinearQuantileOpenstfRegressor
 from test.unit.utils.base import BaseTestCase
 from test.unit.utils.data import TestData
@@ -151,3 +152,22 @@ def test_ignore_features(self):
         self.assertNotIn("E1B_AMI_I", input_data_filtered.columns)
         self.assertNotIn("E4A_I", input_data_filtered.columns)
         self.assertIn("load", input_data_filtered.columns)
+
+    def test_create_model(self):
+        # Arrange
+        kwargs = {
+            "weight_scale_percentile": 50,
+            "weight_exponent": 2,
+        }
+
+        # Act
+        model = ModelCreator.create_model(
+            model_type="linear_quantile",
+            quantiles=[0.5],
+            **kwargs,
+        )
+
+        # Assert
+        self.assertIsInstance(model, LinearQuantileOpenstfRegressor)
+        self.assertEqual(model.weight_scale_percentile, 50)
+        self.assertEqual(model.weight_exponent, 2)
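The weighting scheme this patch introduces in `_calculate_sample_weights` can be tried out in isolation. The sketch below is a minimal standalone version, assuming only numpy is available; the helper names mirror `_scale_percentile` and `_weight_exp` from the diff, and the example target series is made up purely for illustration.

```python
import numpy as np


def scale_percentile(x: np.ndarray, percentile: int = 95) -> np.ndarray:
    # Rescale so that `percentile`% of |x| falls within [0, 1]; the remaining
    # extreme values end up above 1 before clipping.
    return np.abs(x / np.percentile(np.abs(x), percentile))


def weight_exp(x: np.ndarray, exponent: float = 1) -> np.ndarray:
    # exponent=0 gives uniform weights, exponent=1 linearly increasing weights,
    # exponent>1 emphasizes samples close to the extremes even more.
    return np.abs(x) ** exponent


def calculate_sample_weights(
    y: np.ndarray,
    weight_scale_percentile: int = 95,
    weight_exponent: float = 1,
    weight_floor: float = 0.1,
) -> np.ndarray:
    # Clip to [weight_floor, 1] so every sample keeps a non-zero weight.
    return np.clip(
        weight_exp(
            scale_percentile(y, percentile=weight_scale_percentile),
            exponent=weight_exponent,
        ),
        a_min=weight_floor,
        a_max=1,
    )


if __name__ == "__main__":
    # Illustrative load-like target: mostly moderate values with a few peaks.
    y = np.array([0.2, -0.1, 0.5, 3.0, -4.5, 0.0, 1.2])
    print(calculate_sample_weights(y, weight_scale_percentile=95, weight_exponent=2))
    # The largest |y| values get the highest weights (clipped at 1), while the
    # small values fall back to the 0.1 floor instead of being ignored.
```

With `weight_exponent=2` the peak values dominate the fit of the per-quantile regressors, while `weight_floor=0.1` keeps the remaining samples from dropping out of the loss entirely, as described in the docstring added by the patch.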