Skip to content

Add target components logic to TSDataset #1153

Merged
merged 9 commits into from
Mar 7, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Target components logic to TSDataset ([#1153](https://github.com/tinkoff-ai/etna/pull/1153))
- Methods `save` and `load` to HierarchicalPipeline ([#1096](https://github.com/tinkoff-ai/etna/pull/1096))
- New data access methods in `TSDataset` : `update_columns_from_pandas`, `add_columns_from_pandas`, `drop_features` ([#809](https://github.com/tinkoff-ai/etna/pull/809))
- `PytorchForecastingDatasetBuilder` for neural networks from Pytorch Forecasting ([#971](https://github.com/tinkoff-ai/etna/pull/971))
Expand Down
77 changes: 75 additions & 2 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ def __init__(
if self.current_df_level == self.current_df_exog_level:
self.df = self._merge_exog(self.df)

self._target_components: Optional[List[str]] = None

def _get_dataframe_level(self, df: pd.DataFrame) -> Optional[str]:
"""Return the level of the passed dataframe in hierarchical structure."""
if self.hierarchical_structure is None:
Expand Down Expand Up @@ -329,6 +331,7 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional
tsdataset_slice.known_future = deepcopy(self.known_future)
tsdataset_slice._regressors = deepcopy(self.regressors)
tsdataset_slice.df_exog = self.df_exog
tsdataset_slice._target_components = self._target_components
return tsdataset_slice

@staticmethod
Expand Down Expand Up @@ -464,6 +467,11 @@ def regressors(self) -> List[str]:
"""
return self._regressors

@property
def target_components(self) -> Optional[List[str]]:
    """Get the names of the target components stored in the dataset.

    Components sum up to target.

    Returns
    -------
    :
        List of target components names, or None if there are no components
    """
    components = self._target_components
    return components

def plot(
self,
n_segments: int = 10,
Expand Down Expand Up @@ -935,6 +943,7 @@ def train_test_split(
)
train.raw_df = train_raw_df
train._regressors = self.regressors
train._target_components = self.target_components

test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns] # type: ignore
test_raw_df = self.raw_df[train_start_defined:test_end_defined] # type: ignore
Expand All @@ -947,7 +956,7 @@ def train_test_split(
)
test.raw_df = test_raw_df
test._regressors = self.regressors

test._target_components = self.target_components
return train, test

def update_columns_from_pandas(self, df_update: pd.DataFrame):
Expand Down Expand Up @@ -1003,7 +1012,18 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
drop_from_exog:
* If False, drop features only from df. Features will appear again in df after make_future.
* If True, drop features from df and df_exog. Features won't appear in df after make_future.

Raises
------
ValueError:
If ``features`` list contains target components
"""
features_contain_target_components = (self.target_components is not None) and (
len(set(features).intersection(self.target_components)) != 0
)
if features_contain_target_components:
raise ValueError("Target components can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
Expand Down Expand Up @@ -1079,13 +1099,66 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
target_names = tuple(get_target_with_quantiles(columns=self.columns))
target_level_df = self[:, current_level_segments, target_names]

return TSDataset(
ts = TSDataset(
df=target_level_df,
freq=self.freq,
df_exog=self.df_exog,
known_future=self.known_future,
hierarchical_structure=self.hierarchical_structure,
)
ts._target_components = self._target_components
return ts

def add_target_components(self, target_components_df: pd.DataFrame):
    """Add target components into dataset.

    The components are validated first; the dataset is modified only if every check passes.

    Parameters
    ----------
    target_components_df:
        Dataframe in etna wide format with target components

    Raises
    ------
    ValueError:
        If dataset already contains target components
    ValueError:
        If the set of target components names differs between segments
    ValueError:
        If components don't sum up to target
    """
    if self._target_components is not None:
        raise ValueError("Dataset already contains target components!")

    # Every segment must expose exactly the same (sorted) list of component names as the first one;
    # comparing sorted lists instead of sets also catches duplicated component columns.
    reference_segment = self.segments[0]
    components_names = sorted(target_components_df[reference_segment].columns.get_level_values("feature"))
    for segment in self.segments:
        components_names_segment = sorted(target_components_df[segment].columns.get_level_values("feature"))
        if components_names != components_names_segment:
            raise ValueError(
                f"Set of target components differs between segments '{reference_segment}' and '{segment}'!"
            )

    # ``DataFrame.sum(axis=1, level=...)`` was removed in pandas 2.0, so the per-segment sum is computed
    # by grouping the transposed frame. Groups come out sorted by segment name.
    # NOTE(review): assumes ``self[..., "target"]`` returns its columns in sorted segment order too - confirm.
    components_sum = target_components_df.T.groupby(level="segment").sum().T.values
    target_values = self[..., "target"].values
    # Components are floats in general, so compare with a tolerance rather than exactly.
    # The explicit shape check keeps the error message meaningful when timestamps or segments don't match.
    if components_sum.shape != target_values.shape or not np.allclose(components_sum, target_values):
        raise ValueError("Components don't sum up to target!")

    # Build the new dataframe before touching any attribute, so a failure here leaves the dataset unchanged.
    df_with_components = (
        pd.concat((self.df, target_components_df), axis=1)
        .loc[self.df.index]
        .sort_index(axis=1, level=("segment", "feature"))
    )
    self.df = df_with_components
    self._target_components = components_names

def get_target_components(self) -> Optional[pd.DataFrame]:
    """Get DataFrame with target components.

    Returns
    -------
    :
        Dataframe with target components, or None if the dataset has no target components
    """
    components_names = self._target_components
    return None if components_names is None else self.to_pandas(features=components_names)

@property
def columns(self) -> pd.core.indexes.multi.MultiIndex:
Expand Down
119 changes: 119 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,63 @@ def df_segments_int():
return df


@pytest.fixture
def target_components_df():
    """Build a wide-format dataframe with two constant target components for segments 1 and 2."""
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    components_by_segment = {1: (1, 2), 2: (3, 4)}
    segment_dfs = [
        pd.DataFrame(
            {
                "timestamp": timestamp,
                "target_component_a": component_a,
                "target_component_b": component_b,
                "segment": segment,
            }
        )
        for segment, (component_a, component_b) in components_by_segment.items()
    ]
    return TSDataset.to_dataset(pd.concat(segment_dfs))


@pytest.fixture
def inconsistent_target_components_names_df(target_components_df):
    """Build target components where segment '2' lacks ``target_component_a``."""
    missing_column = ("2", "target_component_a")
    return target_components_df.drop(columns=[missing_column])


@pytest.fixture
def inconsistent_target_components_names_duplication_df(target_components_df):
    """Build target components where the columns of segment '1' are present twice."""
    segment_1_components = target_components_df.loc[pd.IndexSlice[:], pd.IndexSlice["1", :]]
    return pd.concat((target_components_df, segment_1_components), axis=1)


@pytest.fixture
def inconsistent_target_components_values_df(target_components_df):
    """Build target components where one value of segment '1' is changed, so components no longer sum up to target."""
    # Work on a copy so the object returned by the ``target_components_df`` fixture is left intact.
    target_components_df = target_components_df.copy()
    # The frame is presumably indexed by timestamp, so the row is addressed by its own index label.
    # Setting through ``.loc[10, ...]`` would use ``10`` as a label and append a new row
    # instead of changing the 11th one.
    broken_timestamp = target_components_df.index[10]
    target_components_df.loc[broken_timestamp, pd.IndexSlice["1", "target_component_a"]] = 100
    return target_components_df


@pytest.fixture
def ts_without_target_components():
    """Build a daily two-segment dataset with constant targets and no target components."""
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    segment_dfs = [
        pd.DataFrame({"timestamp": timestamp, "target": target, "segment": segment})
        for segment, target in ((1, 3), (2, 7))
    ]
    wide_df = TSDataset.to_dataset(pd.concat(segment_dfs))
    return TSDataset(df=wide_df, freq="D")


@pytest.fixture
def ts_with_target_components():
    """Build a daily two-segment dataset whose two component columns are registered as target components."""
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    # (segment, target, target_component_a, target_component_b)
    segment_values = ((1, 3, 1, 2), (2, 7, 3, 4))
    segment_dfs = [
        pd.DataFrame(
            {
                "timestamp": timestamp,
                "target": target,
                "target_component_a": component_a,
                "target_component_b": component_b,
                "segment": segment,
            }
        )
        for segment, target, component_a, component_b in segment_values
    ]
    wide_df = TSDataset.to_dataset(pd.concat(segment_dfs))
    ts = TSDataset(df=wide_df, freq="D")
    ts._target_components = ["target_component_a", "target_component_b"]
    return ts


def test_check_endings_error():
"""Check that _check_endings method raises exception if some segments end with nan."""
timestamp = pd.date_range("2021-01-01", "2021-02-01")
Expand Down Expand Up @@ -416,6 +473,12 @@ def test_train_test_split_pass_regressors_to_output(df_and_regressors):
assert test.regressors == ts.regressors


def test_train_test_split_pass_target_components_to_output(ts_with_target_components):
    """Check that both outputs of train_test_split keep the target components of the source dataset."""
    source_ts = ts_with_target_components
    train, test = source_ts.train_test_split(test_size=5)
    for part in (train, test):
        assert sorted(part.target_components) == sorted(source_ts.target_components)


def test_dataset_datetime_conversion():
classic_df = generate_ar_df(periods=30, start_time="2021-06-01", n_segments=2)
classic_df["timestamp"] = classic_df["timestamp"].astype(str)
Expand Down Expand Up @@ -823,6 +886,11 @@ def test_tsdataset_idx_slice(tsdf_with_exog, start_idx, end_idx):
pd.testing.assert_frame_equal(ts_slice.df_exog, tsdf_with_exog.df_exog)


def test_tsdataset_idx_slice_pass_target_components_to_output(ts_with_target_components):
    """Check that tsdataset_idx_slice keeps the target components of the source dataset."""
    source_ts = ts_with_target_components
    sliced_components = source_ts.tsdataset_idx_slice(start_idx=1, end_idx=2).target_components
    assert sorted(sliced_components) == sorted(source_ts.target_components)


def test_to_torch_dataset_without_drop(tsdf_with_exog):
def make_samples(df):
return [{"target": df.target.values, "segment": df["segment"].values[0]}]
Expand Down Expand Up @@ -947,3 +1015,54 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r
ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future)
ts.drop_features(features=features, drop_from_exog=False)
assert sorted(ts.regressors) == sorted(expected_regressors)


def test_drop_features_throw_error_on_target_components(ts_with_target_components):
    """Check that drop_features refuses to drop target components."""
    ts = ts_with_target_components
    with pytest.raises(ValueError, match="Target components can't be dropped from the dataset!"):
        ts.drop_features(features=ts.target_components)


def test_get_target_components_on_dataset_without_components(example_tsds):
    """Check that get_target_components returns None for a dataset without target components."""
    assert example_tsds.get_target_components() is None


def test_get_target_components(
    ts_with_target_components, expected_components=["target_component_a", "target_component_b"]
):
    """Check that get_target_components returns the same frame as to_pandas restricted to the components."""
    ts = ts_with_target_components
    expected_target_components_df = ts.to_pandas(features=expected_components)
    pd.testing.assert_frame_equal(ts.get_target_components(), expected_target_components_df)


def test_add_target_components_throw_error_adding_components_second_time(
    ts_with_target_components, target_components_df
):
    """Check that components can't be added to a dataset that already has them."""
    expected_message = "Dataset already contains target components!"
    with pytest.raises(ValueError, match=expected_message):
        ts_with_target_components.add_target_components(target_components_df=target_components_df)


@pytest.mark.parametrize(
    "inconsistent_target_components_names_fixture",
    [
        "inconsistent_target_components_names_df",
        "inconsistent_target_components_names_duplication_df",
    ],
)
def test_add_target_components_throw_error_inconsistent_components_names(
    ts_without_target_components, inconsistent_target_components_names_fixture, request
):
    """Check that components with different names across segments are rejected."""
    # Fixtures can't be passed to parametrize directly, so they are resolved by name at run time.
    components_df = request.getfixturevalue(inconsistent_target_components_names_fixture)
    expected_message = "Set of target components differs between segments '1' and '2'!"
    with pytest.raises(ValueError, match=expected_message):
        ts_without_target_components.add_target_components(target_components_df=components_df)


def test_add_target_components_throw_error_inconsistent_components_values(
    ts_without_target_components, inconsistent_target_components_values_df
):
    """Check that components which don't sum up to target are rejected."""
    components_df = inconsistent_target_components_values_df
    with pytest.raises(ValueError, match="Components don't sum up to target!"):
        ts_without_target_components.add_target_components(target_components_df=components_df)


def test_add_target_components(ts_without_target_components, ts_with_target_components, target_components_df):
    """Check that add_target_components both stores the component columns and registers their names."""
    ts_without_target_components.add_target_components(target_components_df=target_components_df)
    pd.testing.assert_frame_equal(ts_without_target_components.to_pandas(), ts_with_target_components.to_pandas())
    # Equal dataframes alone don't prove the components were registered: the names must be exposed as well.
    assert sorted(ts_without_target_components.target_components) == sorted(
        ts_with_target_components.target_components
    )
8 changes: 8 additions & 0 deletions tests/test_datasets/test_hierarchical_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,3 +481,11 @@ def test_get_level_dataset_lower_level_error(simple_hierarchical_ts):
def test_get_level_dataset_with_quantiles(product_level_constant_forecast_w_quantiles, target_level, answer):
forecast = product_level_constant_forecast_w_quantiles
np.testing.assert_array_almost_equal(forecast.get_level_dataset(target_level=target_level).df.values, answer)


def test_get_level_dataset_pass_target_components_to_output(simple_hierarchical_ts):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please explain what is happening here and why?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I test that `get_level_dataset` passes `target_components` to the output dataset.
This is necessary because `get_level_dataset` creates a new dataset with the same columns but a different set of segments.

simple_hierarchical_ts._target_components = ["target_component_a", "target_component_b"]
simple_hierarchical_ts_aggregated = simple_hierarchical_ts.get_level_dataset(target_level="market")
assert sorted(simple_hierarchical_ts_aggregated.target_components) == sorted(
simple_hierarchical_ts.target_components
)