Skip to content

Commit

Permalink
Add target components logic to TSDataset (#1153)
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-hse-repository authored Mar 7, 2023
1 parent 9a29fa8 commit 3d76478
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Target components logic to TSDataset ([#1153](https://github.com/tinkoff-ai/etna/pull/1153))
- Methods `save` and `load` to HierarchicalPipeline ([#1096](https://github.com/tinkoff-ai/etna/pull/1096))
- New data access methods in `TSDataset` : `update_columns_from_pandas`, `add_columns_from_pandas`, `drop_features` ([#809](https://github.com/tinkoff-ai/etna/pull/809))
- `PytorchForecastingDatasetBuiler` for neural networks from Pytorch Forecasting ([#971](https://github.com/tinkoff-ai/etna/pull/971))
Expand Down
77 changes: 75 additions & 2 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ def __init__(
if self.current_df_level == self.current_df_exog_level:
self.df = self._merge_exog(self.df)

self._target_components: Optional[List[str]] = None

def _get_dataframe_level(self, df: pd.DataFrame) -> Optional[str]:
"""Return the level of the passed dataframe in hierarchical structure."""
if self.hierarchical_structure is None:
Expand Down Expand Up @@ -329,6 +331,7 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional
tsdataset_slice.known_future = deepcopy(self.known_future)
tsdataset_slice._regressors = deepcopy(self.regressors)
tsdataset_slice.df_exog = self.df_exog
tsdataset_slice._target_components = self._target_components
return tsdataset_slice

@staticmethod
Expand Down Expand Up @@ -464,6 +467,11 @@ def regressors(self) -> List[str]:
"""
return self._regressors

@property
def target_components(self) -> Optional[List[str]]:
    """List of target component names, or None when the dataset has no components.

    The components are additive: per segment, their values sum up to ``target``.
    """
    return self._target_components

def plot(
self,
n_segments: int = 10,
Expand Down Expand Up @@ -935,6 +943,7 @@ def train_test_split(
)
train.raw_df = train_raw_df
train._regressors = self.regressors
train._target_components = self.target_components

test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns] # type: ignore
test_raw_df = self.raw_df[train_start_defined:test_end_defined] # type: ignore
Expand All @@ -947,7 +956,7 @@ def train_test_split(
)
test.raw_df = test_raw_df
test._regressors = self.regressors

test._target_components = self.target_components
return train, test

def update_columns_from_pandas(self, df_update: pd.DataFrame):
Expand Down Expand Up @@ -1003,7 +1012,18 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
drop_from_exog:
* If False, drop features only from df. Features will appear again in df after make_future.
* If True, drop features from df and df_exog. Features won't appear in df after make_future.
Raises
------
ValueError:
If ``features`` list contains target components
"""
features_contain_target_components = (self.target_components is not None) and (
len(set(features).intersection(self.target_components)) != 0
)
if features_contain_target_components:
raise ValueError("Target components can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
Expand Down Expand Up @@ -1079,13 +1099,66 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
target_names = tuple(get_target_with_quantiles(columns=self.columns))
target_level_df = self[:, current_level_segments, target_names]

return TSDataset(
ts = TSDataset(
df=target_level_df,
freq=self.freq,
df_exog=self.df_exog,
known_future=self.known_future,
hierarchical_structure=self.hierarchical_structure,
)
ts._target_components = self._target_components
return ts

def add_target_components(self, target_components_df: pd.DataFrame):
    """Add target components into dataset.

    Parameters
    ----------
    target_components_df:
        Dataframe in etna wide format with target components

    Raises
    ------
    ValueError:
        If dataset already contains target components
    ValueError:
        If target components names differs between segments
    ValueError:
        If components don't sum up to target
    """
    # Components may be attached only once; a second call is an error.
    if self._target_components is not None:
        raise ValueError("Dataset already contains target components!")

    # Every segment must carry exactly the same set of component columns;
    # the first segment's sorted column names serve as the reference set.
    components_names = sorted(target_components_df[self.segments[0]].columns.get_level_values("feature"))
    for segment in self.segments:
        components_names_segment = sorted(target_components_df[segment].columns.get_level_values("feature"))
        if components_names != components_names_segment:
            raise ValueError(
                f"Set of target components differs between segments '{self.segments[0]}' and '{segment}'!"
            )

    # Per-segment sum of the component columns must reproduce the target.
    # NOTE(review): ``DataFrame.sum(axis=1, level=...)`` is deprecated in newer
    # pandas (removed in 2.0) — the groupby equivalent may be needed on upgrade.
    # NOTE(review): ``np.array_equal`` compares exactly; float components that
    # only approximately sum to target would be rejected — confirm intended.
    components_sum = target_components_df.sum(axis=1, level="segment")
    if not np.array_equal(components_sum.values, self[..., "target"].values):
        raise ValueError("Components don't sum up to target!")

    # Remember the component names and merge the component columns into the
    # wide df, keeping the original timestamp index and a column MultiIndex
    # sorted by (segment, feature).
    self._target_components = components_names
    self.df = (
        pd.concat((self.df, target_components_df), axis=1)
        .loc[self.df.index]
        .sort_index(axis=1, level=("segment", "feature"))
    )

def get_target_components(self) -> Optional[pd.DataFrame]:
    """Return a wide dataframe with the target components of the dataset.

    Returns
    -------
    :
        Dataframe with target components, or None if the dataset has no components.
    """
    components = self._target_components
    return None if components is None else self.to_pandas(features=components)

@property
def columns(self) -> pd.core.indexes.multi.MultiIndex:
Expand Down
119 changes: 119 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,63 @@ def df_segments_int():
return df


@pytest.fixture
def target_components_df():
    """Wide dataframe with two constant target components for each of two segments."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame({"timestamp": timestamps, "target_component_a": a, "target_component_b": b, "segment": seg})
        for seg, a, b in [(1, 1, 2), (2, 3, 4)]
    ]
    return TSDataset.to_dataset(pd.concat(long_frames))


@pytest.fixture
def inconsistent_target_components_names_df(target_components_df):
    """Components frame where segment "2" is missing component "a"."""
    return target_components_df.drop(columns=[("2", "target_component_a")])


@pytest.fixture
def inconsistent_target_components_names_duplication_df(target_components_df):
    """Components frame where the columns of segment "1" appear twice."""
    segment_one_columns = target_components_df.loc[pd.IndexSlice[:], pd.IndexSlice["1", :]]
    return pd.concat((target_components_df, segment_one_columns), axis=1)


@pytest.fixture
def inconsistent_target_components_values_df(target_components_df):
    """Components frame with one corrupted value, so sums no longer match target."""
    # A (segment, feature) tuple is exactly what pd.IndexSlice["1", "..."] builds.
    target_components_df.loc[10, ("1", "target_component_a")] = 100
    return target_components_df


@pytest.fixture
def ts_without_target_components():
    """Two-segment dataset with constant targets and no target components."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame({"timestamp": timestamps, "target": target, "segment": seg})
        for seg, target in [(1, 3), (2, 7)]
    ]
    wide_df = TSDataset.to_dataset(pd.concat(long_frames))
    return TSDataset(df=wide_df, freq="D")


@pytest.fixture
def ts_with_target_components():
    """Two-segment dataset whose two components sum up to the constant targets."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame(
            {
                "timestamp": timestamps,
                "target": a + b,
                "target_component_a": a,
                "target_component_b": b,
                "segment": seg,
            }
        )
        for seg, a, b in [(1, 1, 2), (2, 3, 4)]
    ]
    ts = TSDataset(df=TSDataset.to_dataset(pd.concat(long_frames)), freq="D")
    # Set the private attribute directly: components are pre-attached by construction.
    ts._target_components = ["target_component_a", "target_component_b"]
    return ts


def test_check_endings_error():
"""Check that _check_endings method raises exception if some segments end with nan."""
timestamp = pd.date_range("2021-01-01", "2021-02-01")
Expand Down Expand Up @@ -416,6 +473,12 @@ def test_train_test_split_pass_regressors_to_output(df_and_regressors):
assert test.regressors == ts.regressors


def test_train_test_split_pass_target_components_to_output(ts_with_target_components):
    """Both halves of a split keep the target components of the source dataset."""
    expected_components = sorted(ts_with_target_components.target_components)
    train, test = ts_with_target_components.train_test_split(test_size=5)
    assert sorted(train.target_components) == expected_components
    assert sorted(test.target_components) == expected_components


def test_dataset_datetime_conversion():
classic_df = generate_ar_df(periods=30, start_time="2021-06-01", n_segments=2)
classic_df["timestamp"] = classic_df["timestamp"].astype(str)
Expand Down Expand Up @@ -823,6 +886,11 @@ def test_tsdataset_idx_slice(tsdf_with_exog, start_idx, end_idx):
pd.testing.assert_frame_equal(ts_slice.df_exog, tsdf_with_exog.df_exog)


def test_tsdataset_idx_slice_pass_target_components_to_output(ts_with_target_components):
    """Index slicing keeps the target components of the source dataset."""
    expected_components = sorted(ts_with_target_components.target_components)
    ts_slice = ts_with_target_components.tsdataset_idx_slice(start_idx=1, end_idx=2)
    assert sorted(ts_slice.target_components) == expected_components


def test_to_torch_dataset_without_drop(tsdf_with_exog):
def make_samples(df):
return [{"target": df.target.values, "segment": df["segment"].values[0]}]
Expand Down Expand Up @@ -947,3 +1015,54 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r
ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future)
ts.drop_features(features=features, drop_from_exog=False)
assert sorted(ts.regressors) == sorted(expected_regressors)


def test_drop_features_throw_error_on_target_components(ts_with_target_components):
    """Dropping component columns via drop_features must be rejected."""
    components = ts_with_target_components.target_components
    with pytest.raises(ValueError, match="Target components can't be dropped from the dataset!"):
        ts_with_target_components.drop_features(features=components)


def test_get_target_components_on_dataset_without_components(example_tsds):
    """A dataset without components returns None from get_target_components."""
    assert example_tsds.get_target_components() is None


def test_get_target_components(
    ts_with_target_components, expected_components=["target_component_a", "target_component_b"]
):
    """get_target_components returns exactly the component columns of the dataset."""
    expected_df = ts_with_target_components.to_pandas(features=expected_components)
    actual_df = ts_with_target_components.get_target_components()
    pd.testing.assert_frame_equal(actual_df, expected_df)


def test_add_target_components_throw_error_adding_components_second_time(
    ts_with_target_components, target_components_df
):
    """Adding components to a dataset that already has them is an error."""
    with pytest.raises(ValueError, match="Dataset already contains target components!"):
        ts_with_target_components.add_target_components(target_components_df=target_components_df)


@pytest.mark.parametrize(
    "inconsistent_target_components_names_fixture",
    ["inconsistent_target_components_names_df", "inconsistent_target_components_names_duplication_df"],
)
def test_add_target_components_throw_error_inconsistent_components_names(
    ts_without_target_components, inconsistent_target_components_names_fixture, request
):
    """Component sets that differ between segments are rejected."""
    components_df = request.getfixturevalue(inconsistent_target_components_names_fixture)
    with pytest.raises(ValueError, match="Set of target components differs between segments '1' and '2'!"):
        ts_without_target_components.add_target_components(target_components_df=components_df)


def test_add_target_components_throw_error_inconsistent_components_values(
    ts_without_target_components, inconsistent_target_components_values_df
):
    """Components whose per-segment sum differs from target are rejected."""
    with pytest.raises(ValueError, match="Components don't sum up to target!"):
        ts_without_target_components.add_target_components(
            target_components_df=inconsistent_target_components_values_df
        )


def test_add_target_components(ts_without_target_components, ts_with_target_components, target_components_df):
    """Adding components reproduces the reference dataset that already has them."""
    ts_without_target_components.add_target_components(target_components_df=target_components_df)
    expected_df = ts_with_target_components.to_pandas()
    pd.testing.assert_frame_equal(ts_without_target_components.to_pandas(), expected_df)
8 changes: 8 additions & 0 deletions tests/test_datasets/test_hierarchical_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,3 +481,11 @@ def test_get_level_dataset_lower_level_error(simple_hierarchical_ts):
def test_get_level_dataset_with_quantiles(product_level_constant_forecast_w_quantiles, target_level, answer):
forecast = product_level_constant_forecast_w_quantiles
np.testing.assert_array_almost_equal(forecast.get_level_dataset(target_level=target_level).df.values, answer)


def test_get_level_dataset_pass_target_components_to_output(simple_hierarchical_ts):
    """Aggregating to a higher hierarchy level keeps the target components list."""
    simple_hierarchical_ts._target_components = ["target_component_a", "target_component_b"]
    aggregated_ts = simple_hierarchical_ts.get_level_dataset(target_level="market")
    assert sorted(aggregated_ts.target_components) == sorted(simple_hierarchical_ts.target_components)

1 comment on commit 3d76478

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.