Skip to content

Add regressors updating in transform loops #374

Merged
merged 11 commits into from
Dec 29, 2021
50 changes: 50 additions & 0 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import List
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import Union

Expand Down Expand Up @@ -132,15 +133,64 @@ def transform(self, transforms: Sequence["Transform"]):
self.transforms = transforms
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
columns_before = set(self.columns.get_level_values("feature"))
self.df = transform.transform(self.df)
columns_after = set(self.columns.get_level_values("feature"))
self._update_regressors(transform=transform, columns_before=columns_before, columns_after=columns_after)

def fit_transform(self, transforms: Sequence["Transform"]):
    """Fit every given transform on the dataset and apply it, one after another.

    The dataset endings are checked first and ``transforms`` is stored in
    ``self.transforms``. For each transform, ``self.df`` is replaced with the
    result of ``transform.fit_transform(self.df)`` and the feature-level
    column names seen before and after are handed to ``_update_regressors``.
    """
    self._check_endings()
    self.transforms = transforms

    def _feature_names():
        # Snapshot of the current "feature" level of the dataset columns.
        return set(self.columns.get_level_values("feature"))

    for transform in self.transforms:
        tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
        seen_before = _feature_names()
        self.df = transform.fit_transform(self.df)
        seen_after = _feature_names()
        self._update_regressors(
            transform=transform,
            columns_before=seen_before,
            columns_after=seen_after,
        )

def _update_regressors(self, transform: "Transform", columns_before: Set[str], columns_after: Set[str]):
    """Register the regressor columns created by ``transform`` in ``self._regressors``.

    Parameters
    ----------
    transform:
        transform that has just been applied to the dataset
    columns_before:
        feature-level column names before the transform was applied
    columns_after:
        feature-level column names after the transform was applied

    Raises
    ------
    ValueError:
        if new columns appeared, but the transform is neither a ``FutureMixin``
        nor has an ``in_column`` attribute
    """
    unseen_columns = list(columns_after - columns_before)
    if not unseen_columns:
        # Nothing was created, so there is nothing to register.
        return

    # Imported inside the method, only once it is actually needed
    # (presumably local to avoid a circular import -- TODO confirm).
    from etna.transforms.base import FutureMixin

    if isinstance(transform, FutureMixin):
        # Every column from FutureMixin is regressor
        new_regressors = unseen_columns

    elif hasattr(transform, "in_column"):
        # Only the columns created with the other transforms from regressors are regressors
        in_column = transform.in_column  # type: ignore
        in_columns = in_column if isinstance(in_column, list) else [in_column]
        known_regressors = self.regressors

        # User defined out_columns in sklearn
        # TODO: remove this case after fixing the out_column attribute in SklearnTransform
        out_columns = getattr(transform, "out_columns", None)
        if out_columns is None:
            # User defined out_column: a single name or a list of names
            out_column = getattr(transform, "out_column", None)
            if out_column is not None:
                out_columns = out_column if isinstance(out_column, list) else [out_column]

        if out_columns is not None:
            # Output names are matched to the input columns by position.
            new_regressors = [
                out_columns[position]
                for position, column in enumerate(in_columns)
                if column in known_regressors
            ]
        else:
            # Default out_columns: a new column counts as a regressor when its
            # name contains the name of a regressor input column.
            regressors_in_column = [column for column in in_columns if column in known_regressors]
            new_regressors = [
                column
                for column in unseen_columns
                if any(regressor in column for regressor in regressors_in_column)
            ]

    else:
        raise ValueError("Transform is not FutureMixin and does not have in_column attribute!")

    # Append one by one so a name is never registered twice.
    for column in new_regressors:
        if column not in self._regressors:
            self._regressors.append(column)

def __repr__(self):
    """Return the representation of the underlying ``self.df``."""
    return repr(self.df)
Expand Down
89 changes: 89 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import List
from typing import Tuple

Expand All @@ -7,6 +8,10 @@

from etna.datasets import generate_ar_df
from etna.datasets.tsdataset import TSDataset
from etna.transforms import AddConstTransform
from etna.transforms import LagTransform
from etna.transforms import MaxAbsScalerTransform
from etna.transforms import SegmentEncoderTransform


@pytest.fixture()
Expand Down Expand Up @@ -464,3 +469,87 @@ def test_to_flatten(example_df):
obtained_df = TSDataset.to_flatten(TSDataset.to_dataset(example_df))
assert sorted_columns == sorted(obtained_df.columns)
assert (expected_df.values == obtained_df[sorted_columns].values).all()


@pytest.fixture()
def ts_with_regressors(df_and_regressors):
    """Build a daily ``TSDataset`` from the ``df_and_regressors`` fixture.

    The exogenous frame is attached with ``known_future="all"``; the third
    element of the fixture tuple is not used here.
    """
    df, df_exog, _ = df_and_regressors
    return TSDataset(df=df, freq="D", df_exog=df_exog, known_future="all")


def _test_update_regressors_transform(ts, transforms, expected_regressors):
    """Check ``ts.regressors`` after ``ts.transform`` is called with pre-fitted transforms.

    Each transform is fitted on ``ts.df`` first; the values returned by ``fit``
    are then passed to ``ts.transform``. Order of regressors is ignored.
    """
    fitted = []
    for item in transforms:
        fitted.append(item.fit(ts.df))
    ts.transform(fitted)
    assert sorted(ts.regressors) == sorted(expected_regressors)


def _test_update_regressors_fit_transform(ts, transforms, expected_regressors):
    """Check ``ts.regressors`` after ``ts.fit_transform`` is called with ``transforms``.

    Order of regressors is ignored.
    """
    ts.fit_transform(transforms)
    obtained = sorted(ts.regressors)
    expected = sorted(expected_regressors)
    assert obtained == expected


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        (
            [SegmentEncoderTransform()],
            ["regressor_1", "regressor_2", "regressor_segment_code"],
        ),
        (
            [LagTransform(in_column="target", lags=[1, 2], out_column="regressor_lag")],
            ["regressor_1", "regressor_2", "regressor_lag_1", "regressor_lag_2"],
        ),
    ],
)
def test_update_regressors_with_futuremixin_transform(ts_with_regressors, transforms, expected_regressors):
    """Columns created by the parametrized transforms must be listed among the regressors.

    Both the ``transform`` and the ``fit_transform`` paths are checked, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        (
            [MaxAbsScalerTransform(in_column="regressor_1", inplace=False, out_column="scaled")],
            ["regressor_1", "regressor_2", "scaled_regressor_1"],
        ),
        (
            [MaxAbsScalerTransform(in_column=["regressor_1", "regressor_2"], inplace=False, out_column=None)],
            [
                "regressor_1",
                "regressor_2",
                # With out_column=None the expected names are the repr of a
                # single-column transform.
                repr(MaxAbsScalerTransform(in_column=["regressor_1"], inplace=False, out_column=None)),
                repr(MaxAbsScalerTransform(in_column=["regressor_2"], inplace=False, out_column=None)),
            ],
        ),
        (
            [
                AddConstTransform(
                    in_column="regressor_1",
                    value=2,
                    inplace=False,
                    out_column="regressor_add_constant_regressor_1",
                )
            ],
            ["regressor_1", "regressor_2", "regressor_add_constant_regressor_1"],
        ),
    ],
)
def test_update_regressors_with_regressor_in_column(ts_with_regressors, transforms, expected_regressors):
    """Columns derived from a regressor ``in_column`` must be listed among the regressors.

    Both the ``transform`` and the ``fit_transform`` paths are checked, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)


@pytest.mark.parametrize(
    "transforms, expected_regressors",
    [
        (
            [MaxAbsScalerTransform(in_column="target", inplace=False, out_column="scaled_target")],
            ["regressor_1", "regressor_2"],
        ),
        (
            [AddConstTransform(in_column="target", value=2, inplace=False, out_column="add_constant_target")],
            ["regressor_1", "regressor_2"],
        ),
    ],
)
def test_update_regressors_not_add_not_regressors(ts_with_regressors, transforms, expected_regressors):
    """Columns derived from ``target`` must not be added to the regressors.

    Both the ``transform`` and the ``fit_transform`` paths are checked, each on
    its own deep copy of the dataset and of the transforms.
    """
    for check in (_test_update_regressors_transform, _test_update_regressors_fit_transform):
        check(deepcopy(ts_with_regressors), deepcopy(transforms), expected_regressors)
7 changes: 2 additions & 5 deletions tests/test_transforms/test_trend_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from etna.datasets.tsdataset import TSDataset
from etna.transforms.trend import TrendTransform
from etna.transforms.trend import _OneSegmentTrendTransform
from etna.transforms.trend import _TrendTransform

DEFAULT_SEGMENT = "segment_1"

Expand Down Expand Up @@ -60,9 +59,8 @@ def test_fit_transform_many_segments(example_tsds: TSDataset) -> None:
"""
out_column = "regressor_result"
example_tsds_original = deepcopy(example_tsds)
trend_transform = _TrendTransform(
trend_transform = TrendTransform(
in_column="target",
change_point_model=Binseg(),
detrend_model=LinearRegression(),
n_bkps=5,
out_column=out_column,
Expand All @@ -81,9 +79,8 @@ def test_inverse_transform_many_segments(example_tsds: TSDataset) -> None:
"""
Test that inverse_transform interface works correctly for many segment.
"""
trend_transform = _TrendTransform(
trend_transform = TrendTransform(
in_column="target",
change_point_model=Binseg(),
detrend_model=LinearRegression(),
n_bkps=5,
out_column="test",
Expand Down