Skip to content

Commit

Permalink
Add target components logic to TSDataset (#1153)
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-hse-repository authored Mar 7, 2023
1 parent 9a29fa8 commit 3d76478
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased
### Added
- Target components logic to TSDataset ([#1153](https://github.com/tinkoff-ai/etna/pull/1153))
- Methods `save` and `load` to HierarchicalPipeline ([#1096](https://github.com/tinkoff-ai/etna/pull/1096))
- New data access methods in `TSDataset` : `update_columns_from_pandas`, `add_columns_from_pandas`, `drop_features` ([#809](https://github.com/tinkoff-ai/etna/pull/809))
- `PytorchForecastingDatasetBuiler` for neural networks from Pytorch Forecasting ([#971](https://github.com/tinkoff-ai/etna/pull/971))
Expand Down
77 changes: 75 additions & 2 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ def __init__(
if self.current_df_level == self.current_df_exog_level:
self.df = self._merge_exog(self.df)

self._target_components: Optional[List[str]] = None

def _get_dataframe_level(self, df: pd.DataFrame) -> Optional[str]:
"""Return the level of the passed dataframe in hierarchical structure."""
if self.hierarchical_structure is None:
Expand Down Expand Up @@ -329,6 +331,7 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional
tsdataset_slice.known_future = deepcopy(self.known_future)
tsdataset_slice._regressors = deepcopy(self.regressors)
tsdataset_slice.df_exog = self.df_exog
tsdataset_slice._target_components = self._target_components
return tsdataset_slice

@staticmethod
Expand Down Expand Up @@ -464,6 +467,11 @@ def regressors(self) -> List[str]:
"""
return self._regressors

@property
def target_components(self) -> Optional[List[str]]:
    """List of target component names, or None when the dataset has no components.

    The components are additive: per segment, their values sum up to ``target``.
    """
    return self._target_components

def plot(
self,
n_segments: int = 10,
Expand Down Expand Up @@ -935,6 +943,7 @@ def train_test_split(
)
train.raw_df = train_raw_df
train._regressors = self.regressors
train._target_components = self.target_components

test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns] # type: ignore
test_raw_df = self.raw_df[train_start_defined:test_end_defined] # type: ignore
Expand All @@ -947,7 +956,7 @@ def train_test_split(
)
test.raw_df = test_raw_df
test._regressors = self.regressors

test._target_components = self.target_components
return train, test

def update_columns_from_pandas(self, df_update: pd.DataFrame):
Expand Down Expand Up @@ -1003,7 +1012,18 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
drop_from_exog:
* If False, drop features only from df. Features will appear again in df after make_future.
* If True, drop features from df and df_exog. Features won't appear in df after make_future.
Raises
------
ValueError:
If ``features`` list contains target components
"""
features_contain_target_components = (self.target_components is not None) and (
len(set(features).intersection(self.target_components)) != 0
)
if features_contain_target_components:
raise ValueError("Target components can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
Expand Down Expand Up @@ -1079,13 +1099,66 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
target_names = tuple(get_target_with_quantiles(columns=self.columns))
target_level_df = self[:, current_level_segments, target_names]

return TSDataset(
ts = TSDataset(
df=target_level_df,
freq=self.freq,
df_exog=self.df_exog,
known_future=self.known_future,
hierarchical_structure=self.hierarchical_structure,
)
ts._target_components = self._target_components
return ts

def add_target_components(self, target_components_df: pd.DataFrame):
    """Add target components into dataset.

    Parameters
    ----------
    target_components_df:
        Dataframe in etna wide format with target components

    Raises
    ------
    ValueError:
        If dataset already contains target components
    ValueError:
        If target components names differs between segments
    ValueError:
        If components don't sum up to target
    """
    # Components may be attached only once; a second call is an error.
    if self._target_components is not None:
        raise ValueError("Dataset already contains target components!")

    # Every segment must carry exactly the same set of component columns;
    # the first segment's sorted column names serve as the reference set.
    components_names = sorted(target_components_df[self.segments[0]].columns.get_level_values("feature"))
    for segment in self.segments:
        components_names_segment = sorted(target_components_df[segment].columns.get_level_values("feature"))
        if components_names != components_names_segment:
            raise ValueError(
                f"Set of target components differs between segments '{self.segments[0]}' and '{segment}'!"
            )

    # Per-segment sum of the component columns must reproduce the target.
    # NOTE(review): ``DataFrame.sum(axis=1, level=...)`` is deprecated in newer
    # pandas (removed in 2.0) — the groupby equivalent may be needed on upgrade.
    # NOTE(review): ``np.array_equal`` compares exactly; float components that
    # only approximately sum to target would be rejected — confirm intended.
    components_sum = target_components_df.sum(axis=1, level="segment")
    if not np.array_equal(components_sum.values, self[..., "target"].values):
        raise ValueError("Components don't sum up to target!")

    # Remember the component names and merge the component columns into the
    # wide df, keeping the original timestamp index and a column MultiIndex
    # sorted by (segment, feature).
    self._target_components = components_names
    self.df = (
        pd.concat((self.df, target_components_df), axis=1)
        .loc[self.df.index]
        .sort_index(axis=1, level=("segment", "feature"))
    )

def get_target_components(self) -> Optional[pd.DataFrame]:
    """Return a wide dataframe with the target components of the dataset.

    Returns
    -------
    :
        Dataframe with target components, or None if the dataset has no components.
    """
    components = self._target_components
    return None if components is None else self.to_pandas(features=components)

@property
def columns(self) -> pd.core.indexes.multi.MultiIndex:
Expand Down
119 changes: 119 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,63 @@ def df_segments_int():
return df


@pytest.fixture
def target_components_df():
    """Wide dataframe with two constant target components for each of two segments."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame({"timestamp": timestamps, "target_component_a": a, "target_component_b": b, "segment": seg})
        for seg, a, b in [(1, 1, 2), (2, 3, 4)]
    ]
    return TSDataset.to_dataset(pd.concat(long_frames))


@pytest.fixture
def inconsistent_target_components_names_df(target_components_df):
    """Components frame where segment "2" is missing component "a"."""
    return target_components_df.drop(columns=[("2", "target_component_a")])


@pytest.fixture
def inconsistent_target_components_names_duplication_df(target_components_df):
    """Components frame where the columns of segment "1" appear twice."""
    segment_one_columns = target_components_df.loc[pd.IndexSlice[:], pd.IndexSlice["1", :]]
    return pd.concat((target_components_df, segment_one_columns), axis=1)


@pytest.fixture
def inconsistent_target_components_values_df(target_components_df):
    """Components frame with one corrupted value, so sums no longer match target."""
    # A (segment, feature) tuple is exactly what pd.IndexSlice["1", "..."] builds.
    target_components_df.loc[10, ("1", "target_component_a")] = 100
    return target_components_df


@pytest.fixture
def ts_without_target_components():
    """Two-segment dataset with constant targets and no target components."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame({"timestamp": timestamps, "target": target, "segment": seg})
        for seg, target in [(1, 3), (2, 7)]
    ]
    wide_df = TSDataset.to_dataset(pd.concat(long_frames))
    return TSDataset(df=wide_df, freq="D")


@pytest.fixture
def ts_with_target_components():
    """Two-segment dataset whose two components sum up to the constant targets."""
    timestamps = pd.date_range("2021-01-01", "2021-01-15")
    long_frames = [
        pd.DataFrame(
            {
                "timestamp": timestamps,
                "target": a + b,
                "target_component_a": a,
                "target_component_b": b,
                "segment": seg,
            }
        )
        for seg, a, b in [(1, 1, 2), (2, 3, 4)]
    ]
    ts = TSDataset(df=TSDataset.to_dataset(pd.concat(long_frames)), freq="D")
    # Set the private attribute directly: components are pre-attached by construction.
    ts._target_components = ["target_component_a", "target_component_b"]
    return ts


def test_check_endings_error():
"""Check that _check_endings method raises exception if some segments end with nan."""
timestamp = pd.date_range("2021-01-01", "2021-02-01")
Expand Down Expand Up @@ -416,6 +473,12 @@ def test_train_test_split_pass_regressors_to_output(df_and_regressors):
assert test.regressors == ts.regressors


def test_train_test_split_pass_target_components_to_output(ts_with_target_components):
    """Both halves of a split keep the target components of the source dataset."""
    expected_components = sorted(ts_with_target_components.target_components)
    train, test = ts_with_target_components.train_test_split(test_size=5)
    assert sorted(train.target_components) == expected_components
    assert sorted(test.target_components) == expected_components


def test_dataset_datetime_conversion():
classic_df = generate_ar_df(periods=30, start_time="2021-06-01", n_segments=2)
classic_df["timestamp"] = classic_df["timestamp"].astype(str)
Expand Down Expand Up @@ -823,6 +886,11 @@ def test_tsdataset_idx_slice(tsdf_with_exog, start_idx, end_idx):
pd.testing.assert_frame_equal(ts_slice.df_exog, tsdf_with_exog.df_exog)


def test_tsdataset_idx_slice_pass_target_components_to_output(ts_with_target_components):
    """Index slicing keeps the target components of the source dataset."""
    expected_components = sorted(ts_with_target_components.target_components)
    ts_slice = ts_with_target_components.tsdataset_idx_slice(start_idx=1, end_idx=2)
    assert sorted(ts_slice.target_components) == expected_components


def test_to_torch_dataset_without_drop(tsdf_with_exog):
def make_samples(df):
return [{"target": df.target.values, "segment": df["segment"].values[0]}]
Expand Down Expand Up @@ -947,3 +1015,54 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r
ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future)
ts.drop_features(features=features, drop_from_exog=False)
assert sorted(ts.regressors) == sorted(expected_regressors)


def test_drop_features_throw_error_on_target_components(ts_with_target_components):
    """Dropping component columns via drop_features must be rejected."""
    components = ts_with_target_components.target_components
    with pytest.raises(ValueError, match="Target components can't be dropped from the dataset!"):
        ts_with_target_components.drop_features(features=components)


def test_get_target_components_on_dataset_without_components(example_tsds):
    """A dataset without components returns None from get_target_components."""
    assert example_tsds.get_target_components() is None


def test_get_target_components(
    ts_with_target_components, expected_components=["target_component_a", "target_component_b"]
):
    """get_target_components returns exactly the component columns of the dataset."""
    expected_df = ts_with_target_components.to_pandas(features=expected_components)
    actual_df = ts_with_target_components.get_target_components()
    pd.testing.assert_frame_equal(actual_df, expected_df)


def test_add_target_components_throw_error_adding_components_second_time(
    ts_with_target_components, target_components_df
):
    """Adding components to a dataset that already has them is an error."""
    with pytest.raises(ValueError, match="Dataset already contains target components!"):
        ts_with_target_components.add_target_components(target_components_df=target_components_df)


@pytest.mark.parametrize(
    "inconsistent_target_components_names_fixture",
    ["inconsistent_target_components_names_df", "inconsistent_target_components_names_duplication_df"],
)
def test_add_target_components_throw_error_inconsistent_components_names(
    ts_without_target_components, inconsistent_target_components_names_fixture, request
):
    """Component sets that differ between segments are rejected."""
    components_df = request.getfixturevalue(inconsistent_target_components_names_fixture)
    with pytest.raises(ValueError, match="Set of target components differs between segments '1' and '2'!"):
        ts_without_target_components.add_target_components(target_components_df=components_df)


def test_add_target_components_throw_error_inconsistent_components_values(
    ts_without_target_components, inconsistent_target_components_values_df
):
    """Components whose per-segment sum differs from target are rejected."""
    with pytest.raises(ValueError, match="Components don't sum up to target!"):
        ts_without_target_components.add_target_components(
            target_components_df=inconsistent_target_components_values_df
        )


def test_add_target_components(ts_without_target_components, ts_with_target_components, target_components_df):
    """Adding components reproduces the reference dataset that already has them."""
    ts_without_target_components.add_target_components(target_components_df=target_components_df)
    expected_df = ts_with_target_components.to_pandas()
    pd.testing.assert_frame_equal(ts_without_target_components.to_pandas(), expected_df)
8 changes: 8 additions & 0 deletions tests/test_datasets/test_hierarchical_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,3 +481,11 @@ def test_get_level_dataset_lower_level_error(simple_hierarchical_ts):
def test_get_level_dataset_with_quantiles(product_level_constant_forecast_w_quantiles, target_level, answer):
forecast = product_level_constant_forecast_w_quantiles
np.testing.assert_array_almost_equal(forecast.get_level_dataset(target_level=target_level).df.values, answer)


def test_get_level_dataset_pass_target_components_to_output(simple_hierarchical_ts):
    """Aggregating to a higher hierarchy level keeps the target components list."""
    simple_hierarchical_ts._target_components = ["target_component_a", "target_component_b"]
    aggregated_ts = simple_hierarchical_ts.get_level_dataset(target_level="market")
    assert sorted(aggregated_ts.target_components) == sorted(simple_hierarchical_ts.target_components)

1 comment on commit 3d76478

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.