From 76d5e0ee84d6cf542e123ec16efe1b1f4704e5e2 Mon Sep 17 00:00:00 2001
From: alex-hse-repository <55380696+alex-hse-repository@users.noreply.github.com>
Date: Wed, 29 Dec 2021 10:45:46 +0300
Subject: [PATCH] Add regressors updating in transform loops (#374)

---
Review notes (this section is ignored by `git am`):
- The line structure of the patch was restored; indentation is inferred from
  the Python syntax and blank lines from the hunk line counts.
- `TSDataset._update_regressors` got a docstring, a deterministic (sorted)
  order of the added regressors, builtin `any` instead of `np.any` over a
  list, one shared code path for explicit out columns, and no duplicates on
  append. The hunk header and the diffstat were updated accordingly.
- The post-image blob id in the tsdataset.py `index` line was not recomputed.

 etna/datasets/tsdataset.py                    | 70 +++++++++++++++
 tests/test_datasets/test_dataset.py           | 88 +++++++++++++++++++
 .../test_trend_transform.py                   |  7 +-
 3 files changed, 160 insertions(+), 5 deletions(-)

diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py
index 7bdf3a169..6d1d9e77e 100644
--- a/etna/datasets/tsdataset.py
+++ b/etna/datasets/tsdataset.py
@@ -7,6 +7,7 @@
 from typing import List
 from typing import Optional
 from typing import Sequence
+from typing import Set
 from typing import Tuple
 from typing import Union

@@ -134,7 +135,10 @@ def transform(self, transforms: Sequence["Transform"]):
         self.transforms = transforms
         for transform in self.transforms:
             tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
+            columns_before = set(self.columns.get_level_values("feature"))
             self.df = transform.transform(self.df)
+            columns_after = set(self.columns.get_level_values("feature"))
+            self._update_regressors(transform=transform, columns_before=columns_before, columns_after=columns_after)

     def fit_transform(self, transforms: Sequence["Transform"]):
         """Fit and apply given transforms to the data."""
@@ -142,7 +146,73 @@ def fit_transform(self, transforms: Sequence["Transform"]):
         self.transforms = transforms
         for transform in self.transforms:
             tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
+            columns_before = set(self.columns.get_level_values("feature"))
             self.df = transform.fit_transform(self.df)
+            columns_after = set(self.columns.get_level_values("feature"))
+            self._update_regressors(transform=transform, columns_before=columns_before, columns_after=columns_after)
+
+    def _update_regressors(self, transform: "Transform", columns_before: Set[str], columns_after: Set[str]):
+        """Add the regressors created by ``transform`` to the list of dataset regressors.
+
+        Parameters
+        ----------
+        transform:
+            transform that has just been applied to the dataset
+        columns_before:
+            feature names of the dataset before the transform was applied
+        columns_after:
+            feature names of the dataset after the transform was applied
+
+        Raises
+        ------
+        ValueError:
+            if new columns appeared, but the transform is not FutureMixin and has no ``in_column`` attribute
+        """
+        # NOTE(review): local import, presumably to avoid a circular import -- confirm
+        from etna.transforms.base import FutureMixin
+
+        # Sorted to make the order of the added regressors independent of set iteration order
+        unseen_columns = sorted(columns_after - columns_before)
+        if not unseen_columns:
+            return
+
+        if isinstance(transform, FutureMixin):
+            # Every column created by a FutureMixin transform is a regressor
+            new_regressors = unseen_columns
+
+        elif hasattr(transform, "in_column"):
+            # Only the columns created from regressors are regressors
+            in_columns = transform.in_column if isinstance(transform.in_column, list) else [transform.in_column]  # type: ignore
+            if getattr(transform, "out_columns", None) is not None:
+                # User defined out_columns in sklearn
+                # TODO: remove this case after fixing the out_column attribute in SklearnTransform
+                out_columns = transform.out_columns  # type: ignore
+            elif getattr(transform, "out_column", None) is not None:
+                # User defined out_columns
+                out_columns = transform.out_column if isinstance(transform.out_column, list) else [transform.out_column]  # type: ignore
+            else:
+                out_columns = None
+
+            if out_columns is not None:
+                # Explicit out columns are matched with in columns by position
+                regressors_in_column_ids = [i for i, in_column in enumerate(in_columns) if in_column in self.regressors]
+                new_regressors = [out_columns[i] for i in regressors_in_column_ids]
+            else:
+                # Default out_columns: keep the new columns whose name contains a regressor in_column name
+                regressors_in_column = [in_column for in_column in in_columns if in_column in self.regressors]
+                new_regressors = [
+                    out_column
+                    for out_column in unseen_columns
+                    if any(regressor in out_column for regressor in regressors_in_column)
+                ]
+
+        else:
+            raise ValueError("Transform is not FutureMixin and does not have in_column attribute!")
+
+        # Do not register the same regressor twice
+        for regressor in new_regressors:
+            if regressor not in self._regressors:
+                self._regressors.append(regressor)

     def __repr__(self):
         return self.df.__repr__()
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index fe85407cf..fcce642a1 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from typing import List
 from typing import Tuple

@@ -9,6 +10,10 @@
 from etna.datasets import generate_ar_df
 from etna.datasets.tsdataset import TSDataset
 from etna.transforms import TimeSeriesImputerTransform
+from etna.transforms import AddConstTransform
+from etna.transforms import LagTransform
+from etna.transforms import MaxAbsScalerTransform
+from etna.transforms import SegmentEncoderTransform


 @pytest.fixture()
@@ -598,3 +603,86 @@ def test_describe(df_and_regressors):
     assert np.all(description["num_exogs"] == 2)
     assert np.all(description["num_regressors"] == 2)
     assert np.all(description["freq"] == "D")
+
+@pytest.fixture()
+def ts_with_regressors(df_and_regressors):
+    df, df_exog, regressors = df_and_regressors
+    ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future="all")
+    return ts
+
+
+def _test_update_regressors_transform(ts, transforms, expected_regressors):
+    fitted_transforms = [transform.fit(ts.df) for transform in transforms]
+    ts.transform(fitted_transforms)
+    regressors = ts.regressors
+    assert sorted(regressors) == sorted(expected_regressors)
+
+
+def _test_update_regressors_fit_transform(ts, transforms, expected_regressors):
+    ts.fit_transform(transforms)
+    regressors = ts.regressors
+    assert sorted(regressors) == sorted(expected_regressors)
+
+
+@pytest.mark.parametrize(
+    "transforms, expected_regressors",
+    (
+        ([SegmentEncoderTransform()], ["regressor_1", "regressor_2", "regressor_segment_code"]),
+        (
+            [LagTransform(in_column="target", lags=[1, 2], out_column="regressor_lag")],
+            ["regressor_1", "regressor_2", "regressor_lag_1", "regressor_lag_2"],
+        ),
+    ),
+)
+def test_update_regressors_with_futuremixin_transform(ts_with_regressors, transforms, expected_regressors):
+    _test_update_regressors_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
+    _test_update_regressors_fit_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
+
+
+@pytest.mark.parametrize(
+    "transforms, expected_regressors",
+    (
+        (
+            [MaxAbsScalerTransform(in_column="regressor_1", inplace=False, out_column="scaled")],
+            ["regressor_1", "regressor_2", "scaled_regressor_1"],
+        ),
+        (
+            [MaxAbsScalerTransform(in_column=["regressor_1", "regressor_2"], inplace=False, out_column=None)],
+            [
+                "regressor_1",
+                "regressor_2",
+                MaxAbsScalerTransform(in_column=["regressor_1"], inplace=False, out_column=None).__repr__(),
+                MaxAbsScalerTransform(in_column=["regressor_2"], inplace=False, out_column=None).__repr__(),
+            ],
+        ),
+        (
+            [
+                AddConstTransform(
+                    in_column="regressor_1", value=2, inplace=False, out_column="regressor_add_constant_regressor_1"
+                )
+            ],
+            ["regressor_1", "regressor_2", "regressor_add_constant_regressor_1"],
+        ),
+    ),
+)
+def test_update_regressors_with_regressor_in_column(ts_with_regressors, transforms, expected_regressors):
+    _test_update_regressors_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
+    _test_update_regressors_fit_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
+
+
+@pytest.mark.parametrize(
+    "transforms, expected_regressors",
+    (
+        (
+            [MaxAbsScalerTransform(in_column="target", inplace=False, out_column="scaled_target")],
+            ["regressor_1", "regressor_2"],
+        ),
+        (
+            [AddConstTransform(in_column="target", value=2, inplace=False, out_column="add_constant_target")],
+            ["regressor_1", "regressor_2"],
+        ),
+    ),
+)
+def test_update_regressors_not_add_not_regressors(ts_with_regressors, transforms, expected_regressors):
+    _test_update_regressors_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
+    _test_update_regressors_fit_transform(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
diff --git a/tests/test_transforms/test_decomposition/test_trend_transform.py b/tests/test_transforms/test_decomposition/test_trend_transform.py
index 2839d2f96..d09c5abc9 100644
--- a/tests/test_transforms/test_decomposition/test_trend_transform.py
+++ b/tests/test_transforms/test_decomposition/test_trend_transform.py
@@ -9,7 +9,6 @@
 from etna.datasets.tsdataset import TSDataset
 from etna.transforms.decomposition import TrendTransform
 from etna.transforms.decomposition.trend import _OneSegmentTrendTransform
-from etna.transforms.decomposition.trend import _TrendTransform

 DEFAULT_SEGMENT = "segment_1"

@@ -61,9 +60,8 @@ def test_fit_transform_many_segments(example_tsds: TSDataset) -> None:
     """
     out_column = "regressor_result"
     example_tsds_original = deepcopy(example_tsds)
-    trend_transform = _TrendTransform(
+    trend_transform = TrendTransform(
         in_column="target",
-        change_point_model=Binseg(),
         detrend_model=LinearRegression(),
         n_bkps=5,
         out_column=out_column,
@@ -82,9 +80,8 @@ def test_inverse_transform_many_segments(example_tsds: TSDataset) -> None:
     """
     Test that inverse_transform interface works correctly for many segment.
     """
-    trend_transform = _TrendTransform(
+    trend_transform = TrendTransform(
         in_column="target",
-        change_point_model=Binseg(),
         detrend_model=LinearRegression(),
         n_bkps=5,
         out_column="test",