Skip to content

Add default params_to_tune for decomposition transforms #1243

Merged
merged 7 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add default `params_to_tune` for `TimeSeriesImputerTransform` ([#1232](https://github.com/tinkoff-ai/etna/pull/1232))
- Add default `params_to_tune` for `DifferencingTransform`, `MedianTransform`, `MaxTransform`, `MinTransform`, `QuantileTransform`, `StdTransform`, `MeanTransform`, `MADTransform`, `MinMaxDifferenceTransform`, `SumTransform`, `BoxCoxTransform`, `YeoJohnsonTransform`, `MaxAbsScalerTransform`, `MinMaxScalerTransform`, `RobustScalerTransform` and `StandardScalerTransform` ([#1233](https://github.com/tinkoff-ai/etna/pull/1233))
- Add default `params_to_tune` for `LabelEncoderTransform` ([#1242](https://github.com/tinkoff-ai/etna/pull/1242))
- Add default `params_to_tune` for `ChangePointsSegmentationTransform`, `ChangePointsTrendTransform`, `ChangePointsLevelTransform`, `TrendTransform`, `LinearTrendTransform`, `TheilSenTrendTransform` and `STLTransform` ([#1243](https://github.com/tinkoff-ai/etna/pull/1243))
### Fixed
- Fix bug in `GaleShapleyFeatureSelectionTransform` with wrong number of remaining features ([#1110](https://github.com/tinkoff-ai/etna/pull/1110))
- `ProphetModel` fails with additional seasonality set ([#1157](https://github.com/tinkoff-ai/etna/pull/1157))
Expand Down
70 changes: 58 additions & 12 deletions etna/transforms/decomposition/change_points_based/detrend.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import Dict
from typing import Optional

import numpy as np
import pandas as pd
from ruptures.detection import Binseg
from sklearn.linear_model import LinearRegression

from etna import SETTINGS
from etna.transforms.decomposition.change_points_based.base import ReversibleChangePointsTransform
from etna.transforms.decomposition.change_points_based.base import _OneSegmentChangePointsTransform
from etna.transforms.decomposition.change_points_based.change_points_models import BaseChangePointsModelAdapter
Expand All @@ -13,6 +15,11 @@
from etna.transforms.decomposition.change_points_based.per_interval_models import SklearnRegressionPerIntervalModel
from etna.transforms.utils import match_target_quantiles

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


class _OneSegmentChangePointsTrendTransform(_OneSegmentChangePointsTransform):
"""_OneSegmentChangePointsTransform subtracts multiple linear trend from series."""
Expand All @@ -38,14 +45,29 @@ def _apply_inverse_transformation(self, df: pd.DataFrame, transformed_series: pd


class ChangePointsTrendTransform(ReversibleChangePointsTransform):
"""ChangePointsTrendTransform uses :py:class:`ruptures.detection.Binseg` model as a change point detection model.
"""Transform that makes a detrending of change-point intervals.

This class differs from :py:class:`~etna.transforms.decomposition.change_points_based.level.ChangePointsLevelTransform`
only by default values for ``change_points_model`` and ``per_interval_model``.

Transform divides each segment into intervals using ``change_points_model``.
Then a separate model is fitted on each interval using ``per_interval_model``.
Values predicted by the model are subtracted from each interval.

Evaluated function can be linear, mean, median, etc. Look at the signature to find out which models can be used.

Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""

_default_change_points_model = RupturesChangePointsModel(
change_points_model=Binseg(model="ar"),
n_bkps=5,
)
_default_per_interval_model = SklearnRegressionPerIntervalModel(model=LinearRegression())

def __init__(
self,
in_column: str,
Expand All @@ -59,24 +81,21 @@ def __init__(
in_column:
name of column to apply transform to
change_points_model:
model to get trend change points
model to get trend change points,
by default :py:class:`ruptures.detection.Binseg` in a wrapper with ``n_bkps=5`` is used
per_interval_model:
model to process intervals of segment
model to process intervals of segment,
by default :py:class:`sklearn.linear_model.LinearRegression` in a wrapper is used
"""
self.in_column = in_column

self.change_points_model = (
change_points_model
if change_points_model is not None
else RupturesChangePointsModel(
change_points_model=Binseg(model="ar"),
n_bkps=5,
)
change_points_model if change_points_model is not None else self._default_change_points_model
)
self.per_interval_model = (
per_interval_model
if per_interval_model is not None
else SklearnRegressionPerIntervalModel(model=LinearRegression())
per_interval_model if per_interval_model is not None else self._default_per_interval_model
)

super().__init__(
transform=_OneSegmentChangePointsTrendTransform(
in_column=self.in_column,
Expand All @@ -85,3 +104,30 @@ def __init__(
),
required_features=[in_column],
)

@property
def _is_change_points_model_default(self) -> bool:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add that case to to_dict. Maybe in another task I guess.

# it can't see the difference between Binseg(model="ar") and Binseg(model="l1")
return self.change_points_model.to_dict() == self._default_change_points_model.to_dict()

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
    """Get default grid for tuning hyperparameters.

    The grid is non-empty only if ``change_points_model`` is equal to the default one. In that case
    it tunes ``change_points_model.change_points_model.model`` and ``change_points_model.n_bkps``.
    Other parameters are expected to be set by the user.

    Returns
    -------
    :
        Grid to tune.
    """
    # A custom change points model is not tuned: its parameters are unknown here.
    if not self._is_change_points_model_default:
        return {}

    cost_model_choices = ["l1", "l2", "normal", "rbf", "cosine", "linear", "clinear", "ar", "mahalanobis", "rank"]
    grid = {}
    grid["change_points_model.change_points_model.model"] = CategoricalDistribution(cost_model_choices)
    grid["change_points_model.n_bkps"] = IntUniformDistribution(low=5, high=30)
    return grid
70 changes: 60 additions & 10 deletions etna/transforms/decomposition/change_points_based/level.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Dict
from typing import Optional

from ruptures import Binseg

from etna import SETTINGS
from etna.transforms.decomposition.change_points_based.base import BaseChangePointsModelAdapter
from etna.transforms.decomposition.change_points_based.base import ReversibleChangePointsTransform
from etna.transforms.decomposition.change_points_based.change_points_models.ruptures_based import (
Expand All @@ -11,6 +13,11 @@
from etna.transforms.decomposition.change_points_based.per_interval_models import MeanPerIntervalModel
from etna.transforms.decomposition.change_points_based.per_interval_models import StatisticsPerIntervalModel

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


class _OneSegmentChangePointsLevelTransform(_OneSegmentChangePointsTrendTransform):
def __init__(
Expand All @@ -36,14 +43,29 @@ def __init__(


class ChangePointsLevelTransform(ReversibleChangePointsTransform):
"""ChangePointsLevelTransform uses :py:class:`ruptures.detection.Binseg` model as a change point detection model.
"""Transform that makes a detrending of change-point intervals.

This class differs from :py:class:`~etna.transforms.decomposition.change_points_based.detrend.ChangePointsTrendTransform`
only by default values for ``change_points_model`` and ``per_interval_model``.

Transform divides each segment into intervals using ``change_points_model``.
Then a separate model is fitted on each interval using ``per_interval_model``.
Values predicted by the model are subtracted from each interval.

Evaluated function can be linear, mean, median, etc. Look at the signature to find out which models can be used.

Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""

# Keep the "l2" cost: it is the value this transform used as its inline default before the
# default model was moved to a class attribute, so existing pipelines keep their behaviour.
_default_change_points_model = RupturesChangePointsModel(
    change_points_model=Binseg(model="l2"),
    n_bkps=5,
)
_default_per_interval_model = MeanPerIntervalModel()

def __init__(
self,
in_column: str,
Expand All @@ -57,20 +79,21 @@ def __init__(
in_column:
name of column to apply transform to
change_points_model:
model to get trend change points
model to get trend change points,
by default :py:class:`ruptures.detection.Binseg` in a wrapper with ``n_bkps=5`` is used
per_interval_model:
model to process intervals of segment
model to process intervals of segment,
by default mean value is used to evaluate the interval
"""
self.in_column = in_column

self.change_points_model = (
change_points_model
if change_points_model is not None
else RupturesChangePointsModel(
change_points_model=Binseg(model="l2"),
n_bkps=5,
)
change_points_model if change_points_model is not None else self._default_change_points_model
)
self.per_interval_model = (
per_interval_model if per_interval_model is not None else self._default_per_interval_model
)
self.per_interval_model = per_interval_model if per_interval_model is not None else MeanPerIntervalModel()

super().__init__(
transform=_OneSegmentChangePointsLevelTransform(
in_column=self.in_column,
Expand All @@ -79,3 +102,30 @@ def __init__(
),
required_features=[in_column],
)

@property
def _is_change_points_model_default(self) -> bool:
    """Check whether ``change_points_model`` matches the default one by comparing ``to_dict()`` dumps."""
    # NOTE: the comparison can't see the difference between Binseg(model="ar") and Binseg(model="l1")
    current_dump = self.change_points_model.to_dict()
    default_dump = self._default_change_points_model.to_dict()
    return current_dump == default_dump

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
    """Get default grid for tuning hyperparameters.

    The grid is non-empty only if ``change_points_model`` is equal to the default one. In that case
    it tunes ``change_points_model.change_points_model.model`` and ``change_points_model.n_bkps``.
    Other parameters are expected to be set by the user.

    Returns
    -------
    :
        Grid to tune.
    """
    # A custom change points model is not tuned: its parameters are unknown here.
    if not self._is_change_points_model_default:
        return {}

    cost_model_choices = ["l1", "l2", "normal", "rbf", "cosine", "linear", "clinear", "ar", "mahalanobis", "rank"]
    grid = {}
    grid["change_points_model.change_points_model.model"] = CategoricalDistribution(cost_model_choices)
    grid["change_points_model.n_bkps"] = IntUniformDistribution(low=5, high=30)
    return grid
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class PerIntervalModel(BaseMixin, ABC):
"""Class to handle intervals in change point based transforms.

PerIntervalModel is a class to process intervals between change points
in `~etna.transforms.decomposition.change_points_based` transforms.
in :py:mod:`~etna.transforms.decomposition.change_points_based` transforms.
"""

@abstractmethod
Expand Down
66 changes: 59 additions & 7 deletions etna/transforms/decomposition/change_points_based/segmentation.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
from typing import Dict
from typing import Optional

import numpy as np
import pandas as pd
from ruptures import Binseg

from etna import SETTINGS
from etna.transforms.decomposition.change_points_based.base import IrreversibleChangePointsTransform
from etna.transforms.decomposition.change_points_based.base import _OneSegmentChangePointsTransform
from etna.transforms.decomposition.change_points_based.change_points_models import BaseChangePointsModelAdapter
from etna.transforms.decomposition.change_points_based.change_points_models.ruptures_based import (
RupturesChangePointsModel,
)
from etna.transforms.decomposition.change_points_based.per_interval_models import ConstantPerIntervalModel

if SETTINGS.auto_required:
from optuna.distributions import BaseDistribution
from optuna.distributions import CategoricalDistribution
from optuna.distributions import IntUniformDistribution


class _OneSegmentChangePointsSegmentationTransform(_OneSegmentChangePointsTransform):
"""_OneSegmentChangePointsSegmentationTransform make label encoder to change points."""
Expand All @@ -20,7 +31,7 @@ def __init__(self, in_column: str, out_column: str, change_points_model: BaseCha
name of column to apply transform to
out_column:
result column name. If not given use ``self.__repr__()``
change_point_model:
change_points_model:
model to get change points
"""
self.out_column = out_column
Expand All @@ -46,18 +57,27 @@ def _apply_inverse_transformation(self, df: pd.DataFrame, transformed_series: pd


class ChangePointsSegmentationTransform(IrreversibleChangePointsTransform):
"""ChangePointsSegmentationTransform make label encoder to change points.
"""Transform that makes label encoding of change-point intervals.

Transform divides each segment into intervals using ``change_points_model``.
Each interval is enumerated based on its index from the start of the segment.
New column is created with number of interval for each timestamp.

Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""

_default_change_points_model = RupturesChangePointsModel(
change_points_model=Binseg(model="ar"),
n_bkps=5,
)

def __init__(
self,
in_column: str,
change_points_model: BaseChangePointsModelAdapter,
change_points_model: BaseChangePointsModelAdapter = None,
out_column: Optional[str] = None,
):
"""Init ChangePointsSegmentationTransform.
Expand All @@ -66,19 +86,51 @@ def __init__(
----------
in_column:
name of column to fit change point model
change_points_model:
model to get change points,
by default :py:class:`ruptures.detection.Binseg` in a wrapper with ``n_bkps=5`` is used
out_column:
result column name. If not given use ``self.__repr__()``
change_points_model:
model to get change points
"""
self.in_column = in_column
self.out_column = out_column if out_column is not None else self.__repr__()
self.change_points_model = change_points_model

self.change_points_model = (
change_points_model if change_points_model is not None else self._default_change_points_model
)

super().__init__(
transform=_OneSegmentChangePointsSegmentationTransform(
in_column=self.in_column,
out_column=self.out_column,
change_points_model=self.change_points_model,
out_column=self.out_column,
),
required_features=[in_column],
)

@property
def _is_change_points_model_default(self) -> bool:
    """Check whether ``change_points_model`` matches the default one by comparing ``to_dict()`` dumps."""
    # NOTE: the comparison can't see the difference between Binseg(model="ar") and Binseg(model="l1")
    current_dump = self.change_points_model.to_dict()
    default_dump = self._default_change_points_model.to_dict()
    return current_dump == default_dump

def params_to_tune(self) -> Dict[str, "BaseDistribution"]:
    """Get default grid for tuning hyperparameters.

    The grid is non-empty only if ``change_points_model`` is equal to the default one. In that case
    it tunes ``change_points_model.change_points_model.model`` and ``change_points_model.n_bkps``.
    Other parameters are expected to be set by the user.

    Returns
    -------
    :
        Grid to tune.
    """
    # A custom change points model is not tuned: its parameters are unknown here.
    if not self._is_change_points_model_default:
        return {}

    cost_model_choices = ["l1", "l2", "normal", "rbf", "cosine", "linear", "clinear", "ar", "mahalanobis", "rank"]
    grid = {}
    grid["change_points_model.change_points_model.model"] = CategoricalDistribution(cost_model_choices)
    grid["change_points_model.n_bkps"] = IntUniformDistribution(low=5, high=30)
    return grid
Loading