Add target components logic to TSDataset
#1153
Changes from 6 commits
First changed file (TSDataset implementation):
@@ -163,6 +163,8 @@ def __init__(
        if self.current_df_level == self.current_df_exog_level:
            self.df = self._merge_exog(self.df)

        self._target_components: Optional[List[str]] = None

    def _get_dataframe_level(self, df: pd.DataFrame) -> Optional[str]:
        """Return the level of the passed dataframe in hierarchical structure."""
        if self.hierarchical_structure is None:
@@ -329,6 +331,7 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional
        tsdataset_slice.known_future = deepcopy(self.known_future)
        tsdataset_slice._regressors = deepcopy(self.regressors)
        tsdataset_slice.df_exog = self.df_exog
        tsdataset_slice._target_components = self._target_components
        return tsdataset_slice

    @staticmethod
@@ -464,6 +467,11 @@ def regressors(self) -> List[str]:
        """
        return self._regressors

    @property
    def target_components(self) -> Optional[List[str]]:
        """Get list of target components. Target components sum up to target."""
        return self._target_components

    def plot(
        self,
        n_segments: int = 10,
@@ -935,6 +943,7 @@ def train_test_split(
        )
        train.raw_df = train_raw_df
        train._regressors = self.regressors
        train._target_components = self.target_components

        test_df = self.df[test_start_defined:test_end_defined][self.raw_df.columns]  # type: ignore
        test_raw_df = self.raw_df[train_start_defined:test_end_defined]  # type: ignore
@@ -947,7 +956,7 @@ def train_test_split(
        )
        test.raw_df = test_raw_df
        test._regressors = self.regressors
        test._target_components = self.target_components
        return train, test

    def update_columns_from_pandas(self, df_update: pd.DataFrame):
@@ -1003,7 +1012,18 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
        drop_from_exog:
            * If False, drop features only from df. Features will appear again in df after make_future.
            * If True, drop features from df and df_exog. Features won't appear in df after make_future.

        Raises
        ------
        ValueError:
            If ``features`` list contains target components
        """
        features_contain_target_components = (self.target_components is not None) and (
            len(set(features).intersection(self.target_components)) != 0
        )
        if features_contain_target_components:
            raise ValueError("Target components can't be dropped from the dataset!")

        dfs = [("df", self.df)]
        if drop_from_exog:
            dfs.append(("df_exog", self.df_exog))
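A short usage sketch of the new guard (illustrative only, not code from the diff); `ts` is a hypothetical TSDataset that already contains target components:

    # `ts` is assumed to already hold target components (see add_target_components below);
    # asking drop_features to remove one of them raises.
    try:
        ts.drop_features(features=["target_component_a"])
    except ValueError as err:
        print(err)  # Target components can't be dropped from the dataset!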
@@ -1079,13 +1099,64 @@ def get_level_dataset(self, target_level: str) -> "TSDataset":
        target_names = tuple(get_target_with_quantiles(columns=self.columns))
        target_level_df = self[:, current_level_segments, target_names]

-        return TSDataset(
+        ts = TSDataset(
            df=target_level_df,
            freq=self.freq,
            df_exog=self.df_exog,
            known_future=self.known_future,
            hierarchical_structure=self.hierarchical_structure,
        )
        ts._target_components = self._target_components
        return ts

    def add_target_components(self, target_components_df: pd.DataFrame):
        """Add target components into dataset.

        Parameters
        ----------
        target_components_df:
            Dataframe in etna wide format with target components

        Raises
        ------
        ValueError:
            If dataset already contains target components
        ValueError:
            If target components names differ between segments
        ValueError:
            If components don't sum up to target
        """
        if self._target_components is not None:
            raise ValueError("Dataset already contains target components!")

        components_names = set(target_components_df.columns.get_level_values("feature"))
Review comment: What if there is duplication in names of components?
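        # Sketch only (not part of this diff) of one way to address the review question above:
        # reject duplicated component names before comparing the per-segment sets.
        # A duplicated (segment, feature) pair in the wide columns means a duplicated name.
        if target_components_df.columns.duplicated().any():
            raise ValueError("Target components names should be unique within each segment!")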
        for segment in self.segments:
            components_names_segment = set(target_components_df[segment].columns.get_level_values("feature"))
            if components_names != components_names_segment:
                raise ValueError("Set of target components differs between segments!")
Review comment: Should we write between which segments there is a difference?
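        # Sketch only (not part of this diff) answering the review question above:
        # the error could report which segment differs and how, e.g.
        #     raise ValueError(
        #         f"Set of target components differs between segments: "
        #         f"segment '{segment}' has {sorted(components_names_segment)}, "
        #         f"expected {sorted(components_names)}!"
        #     )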
        components_sum = target_components_df.sum(axis=1, level="segment")
        if not np.array_equal(components_sum.values, self[..., "target"].values):
            raise ValueError("Components don't sum up to target!")

        self._target_components = sorted(components_names)
        self.df = (
            pd.concat((self.df, target_components_df), axis=1)
            .loc[self.df.index]
            .sort_index(axis=1, level=("segment", "feature"))
        )
||
def get_target_components(self) -> Optional[pd.DataFrame]: | ||
"""Get DataFrame with target components. | ||
|
||
Returns | ||
------- | ||
: | ||
Dataframe with target components | ||
""" | ||
if self._target_components is None: | ||
return None | ||
return self.to_pandas(features=self._target_components) | ||
|
||
@property | ||
def columns(self) -> pd.core.indexes.multi.MultiIndex: | ||
|
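Taken together, the new methods give the following workflow. This is an illustrative sketch built on the test fixtures further down, not code from the diff; the component values are chosen so that they sum up to the target:

    import pandas as pd
    from etna.datasets import TSDataset

    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    target_df = TSDataset.to_dataset(
        pd.concat(
            [
                pd.DataFrame({"timestamp": timestamp, "target": 3, "segment": 1}),
                pd.DataFrame({"timestamp": timestamp, "target": 7, "segment": 2}),
            ]
        )
    )
    ts = TSDataset(df=target_df, freq="D")

    components_df = TSDataset.to_dataset(
        pd.concat(
            [
                pd.DataFrame({"timestamp": timestamp, "target_component_a": 1, "target_component_b": 2, "segment": 1}),
                pd.DataFrame({"timestamp": timestamp, "target_component_a": 3, "target_component_b": 4, "segment": 2}),
            ]
        )
    )

    print(ts.target_components)  # None: no components yet
    ts.add_target_components(target_components_df=components_df)
    print(ts.target_components)  # ['target_component_a', 'target_component_b']
    components = ts.get_target_components()  # wide dataframe with only the component columns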
Second changed file (tests for TSDataset):
@@ -170,6 +170,55 @@ def df_segments_int():
    return df


@pytest.fixture
def target_components_df():
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    df_1 = pd.DataFrame({"timestamp": timestamp, "target_component_a": 1, "target_component_b": 2, "segment": 1})
    df_2 = pd.DataFrame({"timestamp": timestamp, "target_component_a": 3, "target_component_b": 4, "segment": 2})
    df = pd.concat([df_1, df_2])
    df = TSDataset.to_dataset(df)
    return df


@pytest.fixture
def inconsistent_target_components_names_df(target_components_df):
    target_components_df = target_components_df.drop(columns=[("2", "target_component_a")])
    return target_components_df


@pytest.fixture
def inconsistent_target_components_values_df(target_components_df):
    target_components_df.loc[target_components_df.index[10], pd.IndexSlice["1", "target_component_a"]] = 100
    return target_components_df


@pytest.fixture
def ts_without_target_components():
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    df_1 = pd.DataFrame({"timestamp": timestamp, "target": 3, "segment": 1})
    df_2 = pd.DataFrame({"timestamp": timestamp, "target": 7, "segment": 2})
    df = pd.concat([df_1, df_2])
    df = TSDataset.to_dataset(df)
    ts = TSDataset(df=df, freq="D")
    return ts


@pytest.fixture
def ts_with_target_components():
    timestamp = pd.date_range("2021-01-01", "2021-01-15")
    df_1 = pd.DataFrame(
        {"timestamp": timestamp, "target": 3, "target_component_a": 1, "target_component_b": 2, "segment": 1}
    )
    df_2 = pd.DataFrame(
        {"timestamp": timestamp, "target": 7, "target_component_a": 3, "target_component_b": 4, "segment": 2}
    )
    df = pd.concat([df_1, df_2])
    df = TSDataset.to_dataset(df)
    ts = TSDataset(df=df, freq="D")
    ts._target_components = ["target_component_a", "target_component_b"]
    return ts


def test_check_endings_error():
    """Check that _check_endings method raises exception if some segments end with nan."""
    timestamp = pd.date_range("2021-01-01", "2021-02-01")
@@ -416,6 +465,12 @@ def test_train_test_split_pass_regressors_to_output(df_and_regressors):
    assert test.regressors == ts.regressors


def test_train_test_split_pass_target_components_to_output(ts_with_target_components):
    train, test = ts_with_target_components.train_test_split(test_size=5)
    assert sorted(train.target_components) == sorted(ts_with_target_components.target_components)
    assert sorted(test.target_components) == sorted(ts_with_target_components.target_components)


def test_dataset_datetime_conversion():
    classic_df = generate_ar_df(periods=30, start_time="2021-06-01", n_segments=2)
    classic_df["timestamp"] = classic_df["timestamp"].astype(str)
@@ -823,6 +878,11 @@ def test_tsdataset_idx_slice(tsdf_with_exog, start_idx, end_idx):
    pd.testing.assert_frame_equal(ts_slice.df_exog, tsdf_with_exog.df_exog)


def test_tsdataset_idx_slice_pass_target_components_to_output(ts_with_target_components):
    ts_slice = ts_with_target_components.tsdataset_idx_slice(start_idx=1, end_idx=2)
    assert sorted(ts_slice.target_components) == sorted(ts_with_target_components.target_components)


def test_to_torch_dataset_without_drop(tsdf_with_exog):
    def make_samples(df):
        return [{"target": df.target.values, "segment": df["segment"].values[0]}]
@@ -947,3 +1007,49 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r
    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future)
    ts.drop_features(features=features, drop_from_exog=False)
    assert sorted(ts.regressors) == sorted(expected_regressors)


def test_drop_features_throw_error_on_target_components(ts_with_target_components):
    with pytest.raises(ValueError, match="Target components can't be dropped from the dataset!"):
        ts_with_target_components.drop_features(features=ts_with_target_components.target_components)


def test_get_target_components_on_dataset_without_components(example_tsds):
    target_components = example_tsds.get_target_components()
    assert target_components is None


def test_get_target_components(
    ts_with_target_components, expected_components=["target_component_a", "target_component_b"]
):
    expected_target_components_df = ts_with_target_components.to_pandas(features=expected_components)
    target_components_df = ts_with_target_components.get_target_components()
    pd.testing.assert_frame_equal(target_components_df, expected_target_components_df)


def test_add_target_components_throw_error_adding_components_second_time(
    ts_with_target_components, target_components_df
):
    with pytest.raises(ValueError, match="Dataset already contains target components!"):
        ts_with_target_components.add_target_components(target_components_df=target_components_df)


def test_add_target_components_throw_error_inconsistent_components_names(
    ts_without_target_components, inconsistent_target_components_names_df
):
    with pytest.raises(ValueError, match="Set of target components differs between segments!"):
        ts_without_target_components.add_target_components(target_components_df=inconsistent_target_components_names_df)


def test_add_target_components_throw_error_inconsistent_components_values(
    ts_without_target_components, inconsistent_target_components_values_df
):
    with pytest.raises(ValueError, match="Components don't sum up to target!"):
        ts_without_target_components.add_target_components(
            target_components_df=inconsistent_target_components_values_df
        )


def test_add_target_components(ts_without_target_components, ts_with_target_components, target_components_df):
    ts_without_target_components.add_target_components(target_components_df=target_components_df)
    pd.testing.assert_frame_equal(ts_with_target_components.to_pandas(), ts_with_target_components.to_pandas())
Review comment: Probably should be: pd.testing.assert_frame_equal(ts_without_target_components.to_pandas(), ts_with_target_components.to_pandas())
Third changed file (tests for the hierarchical dataset):
@@ -481,3 +481,11 @@ def test_get_level_dataset_lower_level_error(simple_hierarchical_ts):
def test_get_level_dataset_with_quantiles(product_level_constant_forecast_w_quantiles, target_level, answer):
    forecast = product_level_constant_forecast_w_quantiles
    np.testing.assert_array_almost_equal(forecast.get_level_dataset(target_level=target_level).df.values, answer)


def test_get_level_dataset_pass_target_components_to_output(simple_hierarchical_ts):
Review comment: Can you please explain what is happening here and why?
Author reply: Here I test that get_level_dataset passes target_components to the output dataset.
    simple_hierarchical_ts._target_components = ["target_component_a", "target_component_b"]
    simple_hierarchical_ts_aggregated = simple_hierarchical_ts.get_level_dataset(target_level="market")
    assert sorted(simple_hierarchical_ts_aggregated.target_components) == sorted(
        simple_hierarchical_ts.target_components
    )
Review comment: I think we should write that if there are no components, None is returned.
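A possible wording for that (a sketch only, e.g. for the ``get_target_components`` docstring):

        Returns
        -------
        :
            Dataframe with target components;
            ``None`` if the dataset has no target components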