From 04ab10b93e972d51c846f960e28908a9d7dbf241 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 6 Dec 2021 11:18:15 +0300 Subject: [PATCH 1/7] Add logic with known_future to TSDataset, add tests on it, fix tests after change --- etna/datasets/tsdataset.py | 64 +++++--- tests/conftest.py | 4 +- tests/test_datasets/test_dataset.py | 138 ++++++++++-------- tests/test_models/test_catboost.py | 6 +- tests/test_models/test_prophet.py | 10 +- .../test_feature_importance_transform.py | 8 +- .../test_transforms/test_feature_selection.py | 2 +- tests/test_transforms/test_gale_shapley.py | 5 +- .../test_outliers_transform.py | 7 +- .../test_resample_transform.py | 20 ++- 10 files changed, 159 insertions(+), 105 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 2d8554397..84b8322c3 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -56,7 +56,8 @@ class TSDataset: >>> df_regressors["segment"] = "segment_0" >>> df_to_forecast = TSDataset.to_dataset(df_to_forecast) >>> df_regressors = TSDataset.to_dataset(df_regressors) - >>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors) + >>> known_future = [f"regressor_{i}" for i in range(5)] + >>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors, known_future=known_future) >>> tsdataset.df.head(5) segment segment_0 feature regressor_0 regressor_1 regressor_2 regressor_3 regressor_4 target @@ -70,7 +71,9 @@ class TSDataset: idx = pd.IndexSlice - def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None): + def __init__( + self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None, known_future: Sequence = () + ): """Init TSDataset. Parameters @@ -82,11 +85,14 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] df_exog: dataframe with exogenous data; if the series is known in the future features' names should start with prefix 'regressor_`. + known_future: + series from columns in df_exog[known_future] are regressors """ self.raw_df = df.copy(deep=True) self.raw_df.index = pd.to_datetime(self.raw_df.index) self.freq = freq self.df_exog = None + self.known_future = known_future self.raw_df.index = pd.to_datetime(self.raw_df.index) @@ -105,13 +111,14 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] self.df = self.raw_df.copy(deep=True) + self._regressors = self._check_known_future(known_future, df_exog) + if df_exog is not None: self.df_exog = df_exog.copy(deep=True) self.df_exog.index = pd.to_datetime(self.df_exog.index) self.df = self._merge_exog(self.df) self.transforms: Optional[Sequence["Transform"]] = None - self._update_regressors() def transform(self, transforms: Sequence["Transform"]): """Apply given transform to the data.""" @@ -120,7 +127,6 @@ def transform(self, transforms: Sequence["Transform"]): for transform in self.transforms: tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset") self.df = transform.transform(self.df) - self._update_regressors() def fit_transform(self, transforms: Sequence["Transform"]): """Fit and apply given transforms to the data.""" @@ -129,7 +135,6 @@ def fit_transform(self, transforms: Sequence["Transform"]): for transform in self.transforms: tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset") self.df = transform.fit_transform(self.df) - self._update_regressors() def __repr__(self): return self.df.__repr__() @@ -177,7 +182,9 @@ def make_future(self, future_steps: int) -> "TSDataset": ... }) >>> df_ts_format = TSDataset.to_dataset(df) >>> df_regressors_ts_format = TSDataset.to_dataset(df_regressors) - >>> ts = TSDataset(df_ts_format, "D", df_exog=df_regressors_ts_format) + >>> ts = TSDataset( + ... df_ts_format, "D", df_exog=df_regressors_ts_format, known_future=["regressor_1", "regressor_2"] + ... ) >>> ts.make_future(4) segment segment_0 segment_1 feature regressor_1 regressor_2 target regressor_1 regressor_2 target @@ -221,14 +228,32 @@ def make_future(self, future_steps: int) -> "TSDataset": return future_ts @staticmethod - def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame): - """Check that regressors in df_exog begin not later than in df and end later than in df.""" + def _check_known_future(known_future: Sequence[str], df_exog: Optional[pd.DataFrame]) -> List[str]: + """Check that `known_future` corresponds to `df_exog` and returns initial list of regressors.""" + if df_exog is not None: + columns = set(df_exog.columns.get_level_values("feature")) + known_future_unique = set(known_future) + if not known_future_unique.issubset(columns): + raise ValueError( + f"Some features in known_future are not present in df_exog: " + f"{known_future_unique.difference(columns)}" + ) + else: + return sorted(list(known_future_unique)) + elif len(known_future) > 0: + raise ValueError("There are is exogenous data to extract known future features") + else: + return [] + + @staticmethod + def _check_regressors(df: pd.DataFrame, df_regressors: pd.DataFrame): + """Check that regressors begin not later than in df and end later than in df.""" + # TODO: check performance df_segments = df.columns.get_level_values("segment") for segment in df_segments: - target = df[segment]["target"].dropna() - exog_regressor_columns = [x for x in set(df_exog[segment].columns) if x.startswith("regressor")] - for series in exog_regressor_columns: - exog_series = df_exog[segment][series].dropna() + target = df.loc[:, pd.IndexSlice[segment, "target"]].dropna() + for series in df_regressors.columns.get_level_values("feature"): + exog_series = df_regressors.loc[:, pd.IndexSlice[segment, series]].dropna() if target.index.min() < exog_series.index.min(): raise ValueError( f"All the regressor series should start not later than corresponding 'target'." @@ -243,7 +268,9 @@ def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame): ) def _merge_exog(self, df: pd.DataFrame) -> pd.DataFrame: - self._check_regressors(df=df, df_exog=self.df_exog) + segments = sorted(set(df.columns.get_level_values("segment"))) + df_regressors = self.df_exog.loc[:, pd.IndexSlice[segments, self.known_future]] + self._check_regressors(df=df, df_regressors=df_regressors) df = pd.merge(df, self.df_exog, left_index=True, right_index=True, how="left").sort_index(axis=1, level=(0, 1)) return df @@ -279,13 +306,6 @@ def segments(self) -> List[str]: """ return self.df.columns.get_level_values("segment").unique().tolist() - def _update_regressors(self): - result = set() - for column in self.columns.get_level_values("feature"): - if column.startswith("regressor"): - result.add(column) - self._regressors = list(result) - @property def regressors(self) -> List[str]: """Get list of all regressors across all segments in dataset. @@ -307,7 +327,9 @@ def regressors(self) -> List[str]: ... ) >>> df_exog = pd.concat([df_regressors_1, df_regressors_2], ignore_index=True) >>> df_exog_ts_format = TSDataset.to_dataset(df_exog) - >>> ts = TSDataset(df_ts_format, df_exog=df_exog_ts_format, freq="D") + >>> ts = TSDataset( + ... df_ts_format, df_exog=df_exog_ts_format, freq="D", known_future=["regressor_1"] + ... ) >>> ts.regressors ['regressor_1'] """ diff --git a/tests/conftest.py b/tests/conftest.py index b1e832fa7..4e67bb31a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -207,7 +207,7 @@ def example_reg_tsds(random_seed) -> TSDataset: df = TSDataset.to_dataset(df) exog = TSDataset.to_dataset(exog) - tsds = TSDataset(df, freq="D", df_exog=exog) + tsds = TSDataset(df, freq="D", df_exog=exog, known_future=["regressor_exog_weekend"]) return tsds @@ -235,7 +235,7 @@ def outliers_tsds(): df.columns.names = ["segment", "feature"] exog = df.copy() - exog.columns = pd.MultiIndex.from_arrays([["1", "2"], ["exog", "exog"]]) + exog.columns.set_levels(["exog"], level="feature", inplace=True) tsds = TSDataset(df, "1d", exog) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 2f9516865..2375cdb5f 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,3 +1,4 @@ +from typing import List from typing import Tuple import numpy as np @@ -6,7 +7,6 @@ from etna.datasets import generate_ar_df from etna.datasets.tsdataset import TSDataset -from etna.transforms import DateFlagsTransform @pytest.fixture() @@ -19,24 +19,18 @@ def tsdf_with_exog(random_seed) -> TSDataset: df_2["target"] = [x ** 0.5 + np.random.uniform(-2, 2) for x in list(range(len(df_2)))] classic_df = pd.concat([df_1, df_2], ignore_index=True) - df = classic_df.pivot(index="timestamp", columns="segment") - df = df.reorder_levels([1, 0], axis=1) - df = df.sort_index(axis=1) - df.columns.names = ["segment", "feature"] + df = TSDataset.to_dataset(classic_df) - exog = generate_ar_df(start_time="2021-01-01", periods=600, n_segments=2) - exog = exog.pivot(index="timestamp", columns="segment") - exog = exog.reorder_levels([1, 0], axis=1) - exog = exog.sort_index(axis=1) - exog.columns.names = ["segment", "feature"] - exog.columns = pd.MultiIndex.from_arrays([["Moscow", "Omsk"], ["exog", "exog"]]) + classic_df_exog = generate_ar_df(start_time="2021-01-01", periods=600, n_segments=2) + classic_df_exog.rename(columns={"target": "exog"}, inplace=True) + df_exog = TSDataset.to_dataset(classic_df_exog) - ts = TSDataset(df=df, df_exog=exog, freq="1D") + ts = TSDataset(df=df, df_exog=df_exog, freq="1D") return ts @pytest.fixture() -def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame]: +def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: timestamp = pd.date_range("2021-01-01", "2021-02-01") df_1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "segment": "1"}) df_2 = pd.DataFrame({"timestamp": timestamp[5:], "target": 12, "segment": "2"}) @@ -49,10 +43,10 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame]: df_exog = pd.concat([df_1, df_2], ignore_index=True) df_exog = TSDataset.to_dataset(df_exog) - return df, df_exog + return df, df_exog, ["regressor_1", "regressor_2"] -def test_check_endings_error_raise(): +def test_check_endings_error(): """Check that _check_endings method raises exception if some segments end with nan.""" timestamp = pd.date_range("2021-01-01", "2021-02-01") df1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "segment": "1"}) @@ -65,7 +59,7 @@ def test_check_endings_error_raise(): ts._check_endings() -def test_check_endings_error_pass(): +def test_check_endings_pass(): """Check that _check_endings method passes if there is no nans at the end of all segments.""" timestamp = pd.date_range("2021-01-01", "2021-02-01") df1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "segment": "1"}) @@ -76,6 +70,36 @@ def test_check_endings_error_pass(): ts._check_endings() +def test_check_known_future_error_no_df_exog(): + """Check that _check_known_future raises exception if there are no df_exog, but known_future isn't empty.""" + with pytest.raises(ValueError, match="There are is exogenous data"): + _ = TSDataset._check_known_future(["regressor_1"], None) + + +def test_check_known_future_error_not_matching(df_and_regressors): + """Check that _check_known_future raises exception if df_exog doesn't contain some features in known_future.""" + _, df_exog, known_future = df_and_regressors + known_future.append("regressor_new") + with pytest.raises(ValueError, match="Some features in known_future are not present in df_exog"): + _ = TSDataset._check_known_future(known_future, df_exog) + + +def test_check_known_future_pass_all_empty(): + """Check that _check_known_future passes if known_future and df_exog are empty.""" + regressors = TSDataset._check_known_future([], None) + assert len(regressors) == 0 + + +@pytest.mark.parametrize( + "known_future", [[], ["regressor_1"], ["regressor_1", "regressor_2"], ["regressor_1", "regressor_1"]] +) +def test_check_known_future_pass_non_empty(df_and_regressors, known_future): + _, df_exog, _ = df_and_regressors + """Check that _check_known_future passes if df_exog is not empty.""" + regressors = TSDataset._check_known_future(known_future, df_exog) + assert regressors == sorted(list(set(known_future))) + + def test_categorical_after_call_to_pandas(): classic_df = generate_ar_df(periods=30, start_time="2021-06-01", n_segments=2) classic_df["categorical_column"] = [0] * 30 + [1] * 30 @@ -306,44 +330,49 @@ def test_make_future_with_exog(): def test_make_future_with_regressors(df_and_regressors): - df, df_exog = df_and_regressors - ts = TSDataset(df=df, df_exog=df_exog, freq="D") + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) ts_future = ts.make_future(10) assert np.all(ts_future.index == pd.date_range(ts.index.max() + pd.Timedelta("1D"), periods=10, freq="D")) assert set(ts_future.columns.get_level_values("feature")) == {"target", "regressor_1", "regressor_2"} +def test_make_future_warn_not_enough_regressors(df_and_regressors): + """Check that warning is thrown if regressors don't have enough values for the future.""" + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) + with pytest.warns(UserWarning, match="Some regressors don't have enough values"): + ts.make_future(ts.df_exog.shape[0] + 100) + + @pytest.mark.parametrize("exog_starts_later,exog_ends_earlier", ((True, False), (False, True), (True, True))) -def test_dataset_check_exog_raise_error(exog_starts_later: bool, exog_ends_earlier: bool): - start_time = "2021-01-10" if exog_starts_later else "2021-01-01" - end_time = "2021-01-20" if exog_ends_earlier else "2021-02-01" +def test_check_regressors_error(exog_starts_later: bool, exog_ends_earlier: bool): + """Check that error is raised if regressors don't have enough values for the train data.""" + start_time_main = "2021-01-01" + end_time_main = "2021-02-01" + start_time_regressors = "2021-01-10" if exog_starts_later else start_time_main + end_time_regressors = "2021-01-20" if exog_ends_earlier else end_time_main + timestamp = pd.date_range("2021-01-01", "2021-02-01") df1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "segment": "1"}) df2 = pd.DataFrame({"timestamp": timestamp[5:], "target": 12, "segment": "2"}) df = pd.concat([df1, df2], ignore_index=True) df = TSDataset.to_dataset(df) - timestamp = pd.date_range(start_time, end_time) + timestamp = pd.date_range(start_time_regressors, end_time_regressors) df1 = pd.DataFrame({"timestamp": timestamp, "regressor_aaa": 1, "segment": "1"}) df2 = pd.DataFrame({"timestamp": timestamp[5:], "regressor_aaa": 2, "segment": "2"}) - dfexog = pd.concat([df1, df2], ignore_index=True) - dfexog = TSDataset.to_dataset(dfexog) + df_regressors = pd.concat([df1, df2], ignore_index=True) + df_regressors = TSDataset.to_dataset(df_regressors) with pytest.raises(ValueError): - TSDataset._check_regressors(df=df, df_exog=dfexog) + TSDataset._check_regressors(df=df, df_regressors=df_regressors) -def test_dataset_check_exog_pass(df_and_regressors): - df, df_exog = df_and_regressors - _ = TSDataset._check_regressors(df=df, df_exog=df_exog) - - -def test_warn_not_enough_exog(df_and_regressors): - """Check that warning is thrown if regressors don't have enough values.""" - df, df_exog = df_and_regressors - ts = TSDataset(df=df, df_exog=df_exog, freq="D") - with pytest.warns(UserWarning, match="Some regressors don't have enough values"): - ts.make_future(ts.df_exog.shape[0] + 100) +def test_check_regressors_pass(df_and_regressors): + """Check that regressors check on creation passes with correct regressors.""" + df, df_exog, _ = df_and_regressors + _ = TSDataset._check_regressors(df=df, df_regressors=df_exog) def test_getitem_only_date(tsdf_with_exog): @@ -378,13 +407,20 @@ def test_getitem_all_indexes(tsdf_with_exog): pd.testing.assert_frame_equal(df_expected, df_slice) -def test_finding_regressors(df_and_regressors): - """Check that ts.regressors property works correctly.""" - df, df_exog = df_and_regressors - ts = TSDataset(df=df, df_exog=df_exog, freq="D") +def test_finding_regressors_marked(df_and_regressors): + """Check that ts.regressors property works correctly when regressors set.""" + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=["regressor_1", "regressor_2"]) assert sorted(ts.regressors) == ["regressor_1", "regressor_2"] +def test_finding_regressors_unmarked(df_and_regressors): + """Check that ts.regressors property works correctly when regressors don't set.""" + df, df_exog, _ = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D") + assert sorted(ts.regressors) == [] + + def test_head_default(tsdf_with_exog): assert np.all(tsdf_with_exog.head() == tsdf_with_exog.df.head()) @@ -393,28 +429,6 @@ def test_tail_default(tsdf_with_exog): np.all(tsdf_with_exog.tail() == tsdf_with_exog.df.tail()) -def test_updating_regressors_fit_transform(df_and_regressors): - """Check that ts.regressors is updated after making ts.fit_transform().""" - df, df_exog = df_and_regressors - ts = TSDataset(df=df, df_exog=df_exog, freq="D") - date_flags_transform = DateFlagsTransform( - day_number_in_week=True, - day_number_in_month=False, - week_number_in_month=False, - week_number_in_year=False, - month_number_in_year=False, - year_number=False, - is_weekend=True, - out_column="regressor_dateflag", - ) - initial_regressors = set(ts.regressors) - ts.fit_transform(transforms=[date_flags_transform]) - final_regressors = set(ts.regressors) - expected_columns = {"regressor_dateflag_day_number_in_week", "regressor_dateflag_is_weekend"} - assert initial_regressors.issubset(final_regressors) - assert final_regressors.difference(initial_regressors) == expected_columns - - def test_right_format_sorting(): """Need to check if to_dataset method does not mess up with data and column names, sorting it with no respect to each other diff --git a/tests/test_models/test_catboost.py b/tests/test_models/test_catboost.py index 6d5b8030f..ff0bc449c 100644 --- a/tests/test_models/test_catboost.py +++ b/tests/test_models/test_catboost.py @@ -31,11 +31,9 @@ def test_run(catboostmodel, new_format_df): def test_run_with_reg(catboostmodel, new_format_df, new_format_exog): df = new_format_df exog = new_format_exog - exog.columns = pd.MultiIndex.from_arrays( - [exog.columns.get_level_values("segment").unique().tolist(), ["regressor_exog", "regressor_exog"]] - ) + exog.columns.set_levels(["regressor_exog"], level="feature", inplace=True) - ts = TSDataset(df, "1d", df_exog=exog) + ts = TSDataset(df, "1d", df_exog=exog, known_future=["regressor_exog"]) lags = LagTransform(lags=[3, 4, 5], in_column="target") lags_exog = LagTransform(lags=[3, 4, 5, 6], in_column="regressor_exog") diff --git a/tests/test_models/test_prophet.py b/tests/test_models/test_prophet.py index 4b0505e74..c1a1b0890 100644 --- a/tests/test_models/test_prophet.py +++ b/tests/test_models/test_prophet.py @@ -23,16 +23,12 @@ def test_run_with_reg(new_format_df, new_format_exog): df = new_format_df regressors = new_format_exog.copy() - regressors.columns = pd.MultiIndex.from_arrays( - [regressors.columns.get_level_values("segment").unique().tolist(), ["regressor_exog", "regressor_exog"]] - ) + regressors.columns.set_levels(["regressor_exog"], level="feature", inplace=True) regressors_cap = new_format_exog.copy() - regressors_cap.columns = pd.MultiIndex.from_arrays( - [regressors_cap.columns.get_level_values("segment").unique().tolist(), ["regressor_cap", "regressor_cap"]] - ) + regressors_cap.columns.set_levels(["regressor_cap"], level="feature", inplace=True) exog = pd.concat([regressors, regressors_cap], axis=1) - ts = TSDataset(df, "1d", df_exog=exog) + ts = TSDataset(df, "1d", df_exog=exog, known_future=["regressor_exog", "regressor_cap"]) model = ProphetModel() model.fit(ts) diff --git a/tests/test_transforms/test_feature_importance_transform.py b/tests/test_transforms/test_feature_importance_transform.py index f79b1e5f2..1a1cbc97a 100644 --- a/tests/test_transforms/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_importance_transform.py @@ -54,8 +54,14 @@ def ts_with_regressors(): df_exog_all_segments = pd.concat(classic_exog_list) # construct TSDataset + known_future = list(set(df_exog.columns).difference({"timestamp"})) df = df[df["timestamp"] <= timestamp[200]] - return TSDataset(df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog_all_segments), freq="D") + return TSDataset( + df=TSDataset.to_dataset(df), + df_exog=TSDataset.to_dataset(df_exog_all_segments), + freq="D", + known_future=known_future, + ) @pytest.mark.parametrize( diff --git a/tests/test_transforms/test_feature_selection.py b/tests/test_transforms/test_feature_selection.py index e8c07a3c9..e25afde1a 100644 --- a/tests/test_transforms/test_feature_selection.py +++ b/tests/test_transforms/test_feature_selection.py @@ -25,7 +25,7 @@ def ts_with_complex_exog(random_seed) -> TSDataset: df = TSDataset.to_dataset(df) df_exog = TSDataset.to_dataset(df_exog) - ts = TSDataset(df=df, freq="D", df_exog=df_exog) + ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=["regressor_1", "regressor_2"]) return ts diff --git a/tests/test_transforms/test_gale_shapley.py b/tests/test_transforms/test_gale_shapley.py index 08fdb0401..a11a21beb 100644 --- a/tests/test_transforms/test_gale_shapley.py +++ b/tests/test_transforms/test_gale_shapley.py @@ -31,7 +31,10 @@ def ts_with_large_regressors_number(random_seed) -> TSDataset: tmp = generate_ar_df(periods=150, start_time="2020-01-01", n_segments=3, ar_coef=[1], random_seed=i) exog_df = exog_df.merge(tmp.rename({"target": f"regressor_{i + 1}"}, axis=1), on=["timestamp", "segment"]) - ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(exog_df)) + known_future = [f"regressor_{i}" for i in range(1, 9)] + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(exog_df), known_future=known_future + ) return ts diff --git a/tests/test_transforms/test_outliers_transform.py b/tests/test_transforms/test_outliers_transform.py index 483000fff..484c89c59 100644 --- a/tests/test_transforms/test_outliers_transform.py +++ b/tests/test_transforms/test_outliers_transform.py @@ -31,7 +31,12 @@ def outliers_solid_tsds(): df = pd.concat([df1, df2], ignore_index=True) df_exog = df.copy() df_exog.columns = ["timestamp", "regressor_1", "segment"] - ts = TSDataset(df=TSDataset.to_dataset(df).iloc[:-10], df_exog=TSDataset.to_dataset(df_exog), freq="D") + ts = TSDataset( + df=TSDataset.to_dataset(df).iloc[:-10], + df_exog=TSDataset.to_dataset(df_exog), + freq="D", + known_future=["regressor_1"], + ) return ts diff --git a/tests/test_transforms/test_resample_transform.py b/tests/test_transforms/test_resample_transform.py index 15c2d0729..47c4a053b 100644 --- a/tests/test_transforms/test_resample_transform.py +++ b/tests/test_transforms/test_resample_transform.py @@ -58,7 +58,9 @@ def daily_exog_ts() -> Dict[str, Union[TSDataset, DistributionDict]]: } ) - ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog)) + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] + ) distribution = {"segment_1": target1, "segment_2": target2} return {"ts": ts, "distribution": distribution} @@ -97,7 +99,9 @@ def inplace_resampled_daily_exog_ts() -> TSDataset: ) df_exog = pd.concat([df_exog1, df_exog2], ignore_index=True) - ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog)) + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] + ) return ts @@ -137,7 +141,9 @@ def noninplace_resampled_daily_exog_ts() -> TSDataset: ) df_exog = pd.concat([df_exog1, df_exog2], ignore_index=True) - ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog)) + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] + ) return ts @@ -189,7 +195,9 @@ def weekly_exog_same_start_ts() -> Dict[str, Union[TSDataset, DistributionDict]] } ) distribution = {"segment_1": target1, "segment_2": target2} - ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog)) + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] + ) return {"ts": ts, "distribution": distribution} @@ -242,7 +250,9 @@ def weekly_exog_diff_start_ts() -> Dict[str, Union[TSDataset, DistributionDict]] } ) - ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog)) + ts = TSDataset( + df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] + ) distribution = {"segment_1": target1, "segment_2": target2} return {"ts": ts, "distribution": distribution} From 209216c7f9e801034a1330b39bc45ec761110641 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 6 Dec 2021 15:52:06 +0300 Subject: [PATCH 2/7] Fix typo in error message --- etna/datasets/tsdataset.py | 2 +- tests/test_datasets/test_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 84b8322c3..dd89258dd 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -241,7 +241,7 @@ def _check_known_future(known_future: Sequence[str], df_exog: Optional[pd.DataFr else: return sorted(list(known_future_unique)) elif len(known_future) > 0: - raise ValueError("There are is exogenous data to extract known future features") + raise ValueError("There is no exogenous data to extract known future features from") else: return [] diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 2375cdb5f..214772c45 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -72,7 +72,7 @@ def test_check_endings_pass(): def test_check_known_future_error_no_df_exog(): """Check that _check_known_future raises exception if there are no df_exog, but known_future isn't empty.""" - with pytest.raises(ValueError, match="There are is exogenous data"): + with pytest.raises(ValueError, match="There is no exogenous data"): _ = TSDataset._check_known_future(["regressor_1"], None) From b623792918dc9e884726eb7f58a3366d007525d9 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 6 Dec 2021 16:28:15 +0300 Subject: [PATCH 3/7] Add all literal and test for it --- etna/datasets/tsdataset.py | 34 ++++++++++++++++++----------- tests/test_datasets/test_dataset.py | 20 +++++++++++++---- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index dd89258dd..4a47756bf 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -6,6 +6,8 @@ from typing import Sequence from typing import Tuple from typing import Union +from typing_extensions import Literal +from copy import copy import numpy as np import pandas as pd @@ -72,7 +74,7 @@ class TSDataset: idx = pd.IndexSlice def __init__( - self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None, known_future: Sequence = () + self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None, known_future: Union[Literal["all"], Sequence] = () ): """Init TSDataset. @@ -86,13 +88,13 @@ def __init__( dataframe with exogenous data; if the series is known in the future features' names should start with prefix 'regressor_`. known_future: - series from columns in df_exog[known_future] are regressors + columns in df_exog[known_future] that are regressors, + if "all" value is given, all columns are meant to be regressors """ self.raw_df = df.copy(deep=True) self.raw_df.index = pd.to_datetime(self.raw_df.index) self.freq = freq self.df_exog = None - self.known_future = known_future self.raw_df.index = pd.to_datetime(self.raw_df.index) @@ -111,7 +113,8 @@ def __init__( self.df = self.raw_df.copy(deep=True) - self._regressors = self._check_known_future(known_future, df_exog) + self.known_future = self._check_known_future(known_future, df_exog) + self._regressors = copy(self.known_future) if df_exog is not None: self.df_exog = df_exog.copy(deep=True) @@ -228,22 +231,27 @@ def make_future(self, future_steps: int) -> "TSDataset": return future_ts @staticmethod - def _check_known_future(known_future: Sequence[str], df_exog: Optional[pd.DataFrame]) -> List[str]: + def _check_known_future(known_future: Union[Literal["all"], Sequence], df_exog: Optional[pd.DataFrame]) -> List[str]: """Check that `known_future` corresponds to `df_exog` and returns initial list of regressors.""" - if df_exog is not None: - columns = set(df_exog.columns.get_level_values("feature")) + if df_exog is None: + exog_columns = [] + else: + exog_columns = set(df_exog.columns.get_level_values("feature")) + + if isinstance(known_future, str): + if known_future == "all": + return sorted(list(exog_columns)) + else: + raise ValueError("The only possible literal is 'all'") + else: known_future_unique = set(known_future) - if not known_future_unique.issubset(columns): + if not known_future_unique.issubset(exog_columns): raise ValueError( f"Some features in known_future are not present in df_exog: " - f"{known_future_unique.difference(columns)}" + f"{known_future_unique.difference(exog_columns)}" ) else: return sorted(list(known_future_unique)) - elif len(known_future) > 0: - raise ValueError("There is no exogenous data to extract known future features from") - else: - return [] @staticmethod def _check_regressors(df: pd.DataFrame, df_regressors: pd.DataFrame): diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 214772c45..498e8ea47 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -70,9 +70,15 @@ def test_check_endings_pass(): ts._check_endings() +def test_check_known_future_wrong_literal(): + """Check that _check_known_future raises exception if wrong literal is given.""" + with pytest.raises(ValueError, match="The only possible literal is 'all'"): + _ = TSDataset._check_known_future("wrong-literal", None) + + def test_check_known_future_error_no_df_exog(): """Check that _check_known_future raises exception if there are no df_exog, but known_future isn't empty.""" - with pytest.raises(ValueError, match="There is no exogenous data"): + with pytest.raises(ValueError, match="Some features in known_future are not present in df_exog"): _ = TSDataset._check_known_future(["regressor_1"], None) @@ -91,13 +97,19 @@ def test_check_known_future_pass_all_empty(): @pytest.mark.parametrize( - "known_future", [[], ["regressor_1"], ["regressor_1", "regressor_2"], ["regressor_1", "regressor_1"]] + "known_future, expected_columns", [ + ([], []), + (["regressor_1"], ["regressor_1"]), + (["regressor_1", "regressor_2"], ["regressor_1", "regressor_2"]), + (["regressor_1", "regressor_1"], ["regressor_1"]), + ("all", ["regressor_1", "regressor_2"]) + ] ) -def test_check_known_future_pass_non_empty(df_and_regressors, known_future): +def test_check_known_future_pass_non_empty(df_and_regressors, known_future, expected_columns): _, df_exog, _ = df_and_regressors """Check that _check_known_future passes if df_exog is not empty.""" regressors = TSDataset._check_known_future(known_future, df_exog) - assert regressors == sorted(list(set(known_future))) + assert regressors == expected_columns def test_categorical_after_call_to_pandas(): From 4e876fc68848320ceebd34e1279142e901bcfc03 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 6 Dec 2021 17:02:07 +0300 Subject: [PATCH 4/7] Fix mypy issue, reformat code --- etna/datasets/tsdataset.py | 16 +++++++++++----- tests/test_datasets/test_dataset.py | 7 ++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 4a47756bf..21bebaa30 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -1,17 +1,17 @@ import math import warnings +from copy import copy from typing import TYPE_CHECKING from typing import List from typing import Optional from typing import Sequence from typing import Tuple from typing import Union -from typing_extensions import Literal -from copy import copy import numpy as np import pandas as pd from matplotlib import pyplot as plt +from typing_extensions import Literal from etna.loggers import tslogger @@ -74,7 +74,11 @@ class TSDataset: idx = pd.IndexSlice def __init__( - self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None, known_future: Union[Literal["all"], Sequence] = () + self, + df: pd.DataFrame, + freq: str, + df_exog: Optional[pd.DataFrame] = None, + known_future: Union[Literal["all"], Sequence] = (), ): """Init TSDataset. @@ -231,10 +235,12 @@ def make_future(self, future_steps: int) -> "TSDataset": return future_ts @staticmethod - def _check_known_future(known_future: Union[Literal["all"], Sequence], df_exog: Optional[pd.DataFrame]) -> List[str]: + def _check_known_future( + known_future: Union[Literal["all"], Sequence], df_exog: Optional[pd.DataFrame] + ) -> List[str]: """Check that `known_future` corresponds to `df_exog` and returns initial list of regressors.""" if df_exog is None: - exog_columns = [] + exog_columns = set() else: exog_columns = set(df_exog.columns.get_level_values("feature")) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 498e8ea47..5b31b71a5 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -97,13 +97,14 @@ def test_check_known_future_pass_all_empty(): @pytest.mark.parametrize( - "known_future, expected_columns", [ + "known_future, expected_columns", + [ ([], []), (["regressor_1"], ["regressor_1"]), (["regressor_1", "regressor_2"], ["regressor_1", "regressor_2"]), (["regressor_1", "regressor_1"], ["regressor_1"]), - ("all", ["regressor_1", "regressor_2"]) - ] + ("all", ["regressor_1", "regressor_2"]), + ], ) def test_check_known_future_pass_non_empty(df_and_regressors, known_future, expected_columns): _, df_exog, _ = df_and_regressors From 2cb31c0506c9d47f67dd666ea18cc17d3229addf Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Mon, 6 Dec 2021 17:21:37 +0300 Subject: [PATCH 5/7] Replace known_future list with all where it is possible --- etna/datasets/tsdataset.py | 7 +++---- tests/conftest.py | 2 +- tests/test_models/test_catboost.py | 2 +- tests/test_models/test_prophet.py | 2 +- .../test_feature_importance_transform.py | 3 +-- tests/test_transforms/test_gale_shapley.py | 5 +---- .../test_outliers_transform.py | 2 +- .../test_resample_transform.py | 20 +++++-------------- 8 files changed, 14 insertions(+), 29 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 21bebaa30..028745766 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -58,8 +58,7 @@ class TSDataset: >>> df_regressors["segment"] = "segment_0" >>> df_to_forecast = TSDataset.to_dataset(df_to_forecast) >>> df_regressors = TSDataset.to_dataset(df_regressors) - >>> known_future = [f"regressor_{i}" for i in range(5)] - >>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors, known_future=known_future) + >>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors, known_future="all") >>> tsdataset.df.head(5) segment segment_0 feature regressor_0 regressor_1 regressor_2 regressor_3 regressor_4 target @@ -190,7 +189,7 @@ def make_future(self, future_steps: int) -> "TSDataset": >>> df_ts_format = TSDataset.to_dataset(df) >>> df_regressors_ts_format = TSDataset.to_dataset(df_regressors) >>> ts = TSDataset( - ... df_ts_format, "D", df_exog=df_regressors_ts_format, known_future=["regressor_1", "regressor_2"] + ... df_ts_format, "D", df_exog=df_regressors_ts_format, known_future="all" ... ) >>> ts.make_future(4) segment segment_0 segment_1 @@ -342,7 +341,7 @@ def regressors(self) -> List[str]: >>> df_exog = pd.concat([df_regressors_1, df_regressors_2], ignore_index=True) >>> df_exog_ts_format = TSDataset.to_dataset(df_exog) >>> ts = TSDataset( - ... df_ts_format, df_exog=df_exog_ts_format, freq="D", known_future=["regressor_1"] + ... df_ts_format, df_exog=df_exog_ts_format, freq="D", known_future="all" ... ) >>> ts.regressors ['regressor_1'] diff --git a/tests/conftest.py b/tests/conftest.py index 4e67bb31a..78391e2b6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -207,7 +207,7 @@ def example_reg_tsds(random_seed) -> TSDataset: df = TSDataset.to_dataset(df) exog = TSDataset.to_dataset(exog) - tsds = TSDataset(df, freq="D", df_exog=exog, known_future=["regressor_exog_weekend"]) + tsds = TSDataset(df, freq="D", df_exog=exog, known_future="all") return tsds diff --git a/tests/test_models/test_catboost.py b/tests/test_models/test_catboost.py index ff0bc449c..9d4493fd4 100644 --- a/tests/test_models/test_catboost.py +++ b/tests/test_models/test_catboost.py @@ -33,7 +33,7 @@ def test_run_with_reg(catboostmodel, new_format_df, new_format_exog): exog = new_format_exog exog.columns.set_levels(["regressor_exog"], level="feature", inplace=True) - ts = TSDataset(df, "1d", df_exog=exog, known_future=["regressor_exog"]) + ts = TSDataset(df, "1d", df_exog=exog, known_future="all") lags = LagTransform(lags=[3, 4, 5], in_column="target") lags_exog = LagTransform(lags=[3, 4, 5, 6], in_column="regressor_exog") diff --git a/tests/test_models/test_prophet.py b/tests/test_models/test_prophet.py index c1a1b0890..f0c6ec438 100644 --- a/tests/test_models/test_prophet.py +++ b/tests/test_models/test_prophet.py @@ -28,7 +28,7 @@ def test_run_with_reg(new_format_df, new_format_exog): regressors_cap.columns.set_levels(["regressor_cap"], level="feature", inplace=True) exog = pd.concat([regressors, regressors_cap], axis=1) - ts = TSDataset(df, "1d", df_exog=exog, known_future=["regressor_exog", "regressor_cap"]) + ts = TSDataset(df, "1d", df_exog=exog, known_future="all") model = ProphetModel() model.fit(ts) diff --git a/tests/test_transforms/test_feature_importance_transform.py b/tests/test_transforms/test_feature_importance_transform.py index 1a1cbc97a..f9ad96d28 100644 --- a/tests/test_transforms/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_importance_transform.py @@ -54,13 +54,12 @@ def ts_with_regressors(): df_exog_all_segments = pd.concat(classic_exog_list) # construct TSDataset - known_future = list(set(df_exog.columns).difference({"timestamp"})) df = df[df["timestamp"] <= timestamp[200]] return TSDataset( df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog_all_segments), freq="D", - known_future=known_future, + known_future="all", ) diff --git a/tests/test_transforms/test_gale_shapley.py b/tests/test_transforms/test_gale_shapley.py index a11a21beb..e454fe8ed 100644 --- a/tests/test_transforms/test_gale_shapley.py +++ b/tests/test_transforms/test_gale_shapley.py @@ -31,10 +31,7 @@ def ts_with_large_regressors_number(random_seed) -> TSDataset: tmp = generate_ar_df(periods=150, start_time="2020-01-01", n_segments=3, ar_coef=[1], random_seed=i) exog_df = exog_df.merge(tmp.rename({"target": f"regressor_{i + 1}"}, axis=1), on=["timestamp", "segment"]) - known_future = [f"regressor_{i}" for i in range(1, 9)] - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(exog_df), known_future=known_future - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(exog_df), known_future="all") return ts diff --git a/tests/test_transforms/test_outliers_transform.py b/tests/test_transforms/test_outliers_transform.py index 484c89c59..913002931 100644 --- a/tests/test_transforms/test_outliers_transform.py +++ b/tests/test_transforms/test_outliers_transform.py @@ -35,7 +35,7 @@ def outliers_solid_tsds(): df=TSDataset.to_dataset(df).iloc[:-10], df_exog=TSDataset.to_dataset(df_exog), freq="D", - known_future=["regressor_1"], + known_future="all", ) return ts diff --git a/tests/test_transforms/test_resample_transform.py b/tests/test_transforms/test_resample_transform.py index 47c4a053b..1f06cc287 100644 --- a/tests/test_transforms/test_resample_transform.py +++ b/tests/test_transforms/test_resample_transform.py @@ -58,9 +58,7 @@ def daily_exog_ts() -> Dict[str, Union[TSDataset, DistributionDict]]: } ) - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future="all") distribution = {"segment_1": target1, "segment_2": target2} return {"ts": ts, "distribution": distribution} @@ -99,9 +97,7 @@ def inplace_resampled_daily_exog_ts() -> TSDataset: ) df_exog = pd.concat([df_exog1, df_exog2], ignore_index=True) - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future="all") return ts @@ -141,9 +137,7 @@ def noninplace_resampled_daily_exog_ts() -> TSDataset: ) df_exog = pd.concat([df_exog1, df_exog2], ignore_index=True) - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future="all") return ts @@ -195,9 +189,7 @@ def weekly_exog_same_start_ts() -> Dict[str, Union[TSDataset, DistributionDict]] } ) distribution = {"segment_1": target1, "segment_2": target2} - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future="all") return {"ts": ts, "distribution": distribution} @@ -250,9 +242,7 @@ def weekly_exog_diff_start_ts() -> Dict[str, Union[TSDataset, DistributionDict]] } ) - ts = TSDataset( - df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future=["regressor_exog"] - ) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="D", df_exog=TSDataset.to_dataset(df_exog), known_future="all") distribution = {"segment_1": target1, "segment_2": target2} return {"ts": ts, "distribution": distribution} From 35007e90f119585263dbc55f9bcfe1a83c4d2acb Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 8 Dec 2021 11:00:19 +0300 Subject: [PATCH 6/7] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index daf052f82..c7e13daec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +### Added +- Add regressors logic to TSDatasets init ([#357](https://github.com/tinkoff-ai/etna/pull/357)) + ## [1.4.0] - 2021-12-03 ### Added - ACF plot ([#318](https://github.com/tinkoff-ai/etna/pull/318)) From 7f0253148e1a9a04a2f837f85e994ffb893786c0 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 8 Dec 2021 11:02:29 +0300 Subject: [PATCH 7/7] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7e13daec..f7e795f26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] ### Added - Add regressors logic to TSDatasets init ([#357](https://github.com/tinkoff-ai/etna/pull/357))