Skip to content

MeanSegmentEncoderTransform #265

Merged
merged 7 commits into from
Nov 10, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `out_column` parameter for transforms that are not inplace ([#211](https://github.com/tinkoff-ai/etna-ts/pull/211))
- omegaconf config parser in cli ([#258](https://github.com/tinkoff-ai/etna-ts/pull/258))
- Feature relevance table calculation using feature importance ([#261](https://github.com/tinkoff-ai/etna-ts/pull/261))
- MeanSegmentEncoderTransform ([#265](https://github.com/tinkoff-ai/etna-ts/pull/265))

### Changed
- Add possibility to set custom in_column for ConfidenceIntervalOutliersTransform ([#240](https://github.com/tinkoff-ai/etna-ts/pull/240))
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from etna.transforms.scalers import MinMaxScalerTransform
from etna.transforms.scalers import RobustScalerTransform
from etna.transforms.scalers import StandardScalerTransform
from etna.transforms.segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.segment_encoder import SegmentEncoderTransform
from etna.transforms.special_days import SpecialDaysTransform
from etna.transforms.statistics import MaxTransform
Expand Down
52 changes: 50 additions & 2 deletions etna/transforms/segment_encoder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import preprocessing

from etna.transforms.base import Transform
from etna.transforms.statistics import MeanTransform


class SegmentEncoderTransform(Transform):
Expand All @@ -25,8 +27,8 @@ def fit(self, df: pd.DataFrame) -> "SegmentEncoderTransform":
-------
self
"""
segent_columns = df.columns.get_level_values("segment")
self._le.fit(segent_columns)
segment_columns = df.columns.get_level_values("segment")
self._le.fit(segment_columns)
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -59,3 +61,49 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
]
df = df.sort_index(axis=1)
return df


class MeanSegmentEncoderTransform(Transform):
    """Makes expanding mean target encoding of the segment. Creates column 'regressor_segment_mean'.

    The encoding itself is delegated to ``MeanTransform``. Rows whose target is
    missing are filled with the per-segment target mean remembered during ``fit``.
    """

    idx = pd.IndexSlice

    def __init__(self):
        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="regressor_segment_mean")
        # Per-segment target means computed in ``fit``; stays None until the transform is fitted.
        self.global_means: Optional[np.ndarray] = None

    def fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
        """
        Fit encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit expanding mean target encoder.

        Returns
        -------
        self
        """
        self.mean_encoder.fit(df)
        # One mean per "target" column, stored positionally in the column order of ``df``.
        self.global_means = df.loc[:, self.idx[:, "target"]].mean().values
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get encoded values for the segment.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        result dataframe

        Raises
        ------
        ValueError:
            if the transform is used before ``fit`` was called.
        """
        if self.global_means is None:
            # Without this guard None would be written silently into the encoded column.
            raise ValueError("MeanSegmentEncoderTransform is not fitted! Call fit() before transform().")

        df = self.mean_encoder.transform(df)

        # Rows with a missing target are detected on the first segment only and
        # then applied to every segment.
        # NOTE(review): assumes all segments miss their target at the same timestamps - confirm.
        first_segment = df.columns.get_level_values("segment").unique()[0]
        target_is_missing = df.loc[:, self.idx[first_segment, "target"]].isna()
        nan_timestamps = df.index[target_is_missing.values]

        # ``global_means`` is positional.
        # NOTE(review): assumes the segment column order here matches the one seen in ``fit`` - confirm.
        df.loc[nan_timestamps, self.idx[:, "regressor_segment_mean"]] = self.global_means
        return df
82 changes: 82 additions & 0 deletions tests/test_transforms/test_segment_encoder_transform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.metrics import R2
from etna.models import LinearMultiSegmentModel
from etna.transforms import MeanSegmentEncoderTransform
from etna.transforms import SegmentEncoderTransform


Expand Down Expand Up @@ -28,3 +33,80 @@ def test_dummy(dummy_df):
len(transformed_df.loc[:, pd.IndexSlice[:, "regressor_segment_code"]].columns) == 2
), "Number of columns not the same as segments"
assert len(dummy_df) == len(transformed_df), "Row missing"


@pytest.fixture
def simple_df() -> pd.DataFrame:
    """Two-segment dataframe converted with ``TSDataset.to_dataset``.

    Both segments span 2021-06-01 .. 2021-06-07 with daily frequency. The last
    two ``target`` values of each segment are NaN, while ``exog`` is fully populated.
    """
    timestamps = pd.date_range("2021-06-01", "2021-06-07", freq="D")
    moscow = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Moscow",
            "target": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, np.nan],
            "exog": [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
        }
    )
    omsk = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Omsk",
            "target": [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, np.nan],
            "exog": [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0],
        }
    )
    long_df = pd.concat([moscow, omsk], ignore_index=True)
    return TSDataset.to_dataset(long_df)


@pytest.fixture
def transformed_simple_df() -> pd.DataFrame:
    """Expected result of applying ``MeanSegmentEncoderTransform`` to ``simple_df``.

    Same data as ``simple_df`` plus the ``regressor_segment_mean`` column for
    each segment.
    """
    timestamps = pd.date_range("2021-06-01", "2021-06-07", freq="D")
    moscow = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Moscow",
            "target": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan, np.nan],
            "exog": [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
            "regressor_segment_mean": [0, 1, 1.5, 2, 2.5, 3, 3],
        }
    )
    omsk = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Omsk",
            "target": [10.0, 20.0, 30.0, 40.0, 50.0, np.nan, np.nan],
            "exog": [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0],
            "regressor_segment_mean": [0.0, 10.0, 15.0, 20.0, 25.0, 30, 30],
        }
    )
    long_df = pd.concat([moscow, omsk], ignore_index=True)
    return TSDataset.to_dataset(long_df)


@pytest.mark.parametrize("expected_global_means", [[3, 30]])
def test_mean_segment_encoder_fit(simple_df, expected_global_means):
    """Check that ``fit`` stores the mean of the non-missing targets for every segment."""
    encoder = MeanSegmentEncoderTransform()
    encoder.fit(simple_df)
    # Tolerant float comparison that also reports the mismatching values on failure.
    np.testing.assert_allclose(encoder.global_means, expected_global_means)


def test_mean_segment_encoder_transform(simple_df, transformed_simple_df):
    """Check that ``fit_transform`` yields exactly the expected encoded dataframe."""
    actual_df = MeanSegmentEncoderTransform().fit_transform(simple_df)
    pd.testing.assert_frame_equal(actual_df, transformed_simple_df)


@pytest.fixture
def almost_constant_ts(random_seed) -> TSDataset:
    """Daily dataset with two segments that fluctuate around 1 and 10.

    Each target is a constant level plus normal noise with standard deviation 0.1,
    covering 2021-06-01 .. 2021-07-01.
    """
    timestamps = pd.date_range("2021-06-01", "2021-07-01", freq="D")
    n_points = len(timestamps)
    # The Moscow noise is drawn before the Omsk noise, which keeps the random stream order fixed.
    moscow = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Moscow",
            "target": 1 + np.random.normal(0, 0.1, size=n_points),
        }
    )
    omsk = pd.DataFrame(
        {
            "timestamp": timestamps,
            "segment": "Omsk",
            "target": 10 + np.random.normal(0, 0.1, size=n_points),
        }
    )
    long_df = pd.concat([moscow, omsk], ignore_index=True)
    return TSDataset(df=TSDataset.to_dataset(long_df), freq="D")


def test_mean_segment_encoder_forecast(almost_constant_ts):
    """Test that MeanSegmentEncoderTransform works correctly in forecast pipeline
    and helps to correctly forecast almost constant series."""
    horizon = 5
    model = LinearMultiSegmentModel()
    encoder = MeanSegmentEncoderTransform()

    # Hold out the last ``horizon`` points and fit the pipeline on the rest.
    train, test = almost_constant_ts.train_test_split(test_size=horizon)
    train.fit_transform([encoder])
    model.fit(train)

    future = train.make_future(horizon)
    pred_mean_segment_encoding = model.forecast(future)

    metric = R2(mode="macro")

    # R2=0 => the model predicts the optimal constant
    assert np.allclose(metric(pred_mean_segment_encoding, test), 0)