From d9c24de7dd4ca0c69b962e8ed1497ea21c817e24 Mon Sep 17 00:00:00 2001 From: Mr-Geekman <36005824+Mr-Geekman@users.noreply.github.com> Date: Tue, 7 Mar 2023 15:12:22 +0300 Subject: [PATCH] Teach `DifferencingTransform` to `inverse_transform` with NaNs (#1155) --- CHANGELOG.md | 1 + etna/transforms/math/differencing.py | 36 +++++-- .../test_inference/test_inverse_transform.py | 12 +-- .../test_math/test_differencing_transform.py | 96 ++++++++++++------- 4 files changed, 91 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9edf0ddc..7c0645712 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix `SklearnTransform` in per-segment mode to work on subset of segments and raise error on new segments ([#1107](https://github.com/tinkoff-ai/etna/pull/1107)) - Fix `OutliersTransform` and its children to raise error on new segments ([#1139](https://github.com/tinkoff-ai/etna/pull/1139)) - Fix `DifferencingTransform` to raise error on new segments during `transform` and `inverse_transform` in inplace mode ([#1141](https://github.com/tinkoff-ai/etna/pull/1141)) +- Teach `DifferencingTransform` to `inverse_transform` with NaNs ([#1155](https://github.com/tinkoff-ai/etna/pull/1155)) ## [1.14.0] - 2022-12-16 ### Added - Add python 3.10 support ([#1005](https://github.com/tinkoff-ai/etna/pull/1005)) diff --git a/etna/transforms/math/differencing.py b/etna/transforms/math/differencing.py index 6b0463ef1..692066429 100644 --- a/etna/transforms/math/differencing.py +++ b/etna/transforms/math/differencing.py @@ -16,7 +16,8 @@ class _SingleDifferencingTransform(Transform): """Calculate a time series differences of order 1. - This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. 
+ During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs. Notes ----- @@ -86,6 +87,11 @@ def fit(self, df: pd.DataFrame) -> "_SingleDifferencingTransform": Returns ------- result: _SingleDifferencingTransform + + Raises + ------ + ValueError: + if NaNs are present inside the segment """ segments = sorted(set(df.columns.get_level_values("segment"))) fit_df = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy() @@ -124,11 +130,8 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: transformed = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy() for current_segment in segments: to_transform = transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]] - start_idx = to_transform.first_valid_index() # make a differentiation - transformed.loc[start_idx:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.loc[ - start_idx: - ].diff(periods=self.period) + transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.diff(periods=self.period) if self.inplace: result_df = df.copy() @@ -188,10 +191,6 @@ def _reconstruct_test(self, df: pd.DataFrame, columns_to_inverse: Set[str]) -> p init_df = init_df[segments] to_transform = pd.concat([init_df, to_transform]) - # validate values inside the series to transform - if to_transform.isna().sum().sum() > 0: - raise ValueError(f"There should be no NaNs inside the segments") - # run reconstruction and save the result to_transform = self._make_inv_diff(to_transform) result_df.loc[:, pd.IndexSlice[segments, column]] = to_transform @@ -210,6 +209,13 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- result: transformed DataFrame. 
+ + Raises + ------ + ValueError: + if inverse transform is applied neither to the full train nor to a test that goes after the train + ValueError: + if inverse transform is applied to a test that goes after the train with a gap """ # we assume this to be fitted self._train_timestamp = cast(pd.DatetimeIndex, self._train_timestamp) @@ -241,7 +247,8 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: class DifferencingTransform(Transform): """Calculate a time series differences. - This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs. Notes ----- @@ -334,6 +341,11 @@ def fit(self, df: pd.DataFrame) -> "DifferencingTransform": Returns ------- result: DifferencingTransform + + Raises + ------ + ValueError: + if NaNs are present inside the segment """ # this is made because transforms of high order may need some columns created by transforms of lower order result_df = df.copy() @@ -395,6 +407,10 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: if transform isn't fitted NotImplementedError: if there are segments that weren't present during training + ValueError: + if inverse transform is applied neither to the full train nor to a test that goes after the train + ValueError: + if inverse transform is applied to a test that goes after the train with a gap """ self._check_is_fitted() if not self.inplace: diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index f2a2c8f8e..890056a2c 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -311,6 +311,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments
(LogTransform(in_column="target", inplace=True), "positive_ts"), (LogTransform(in_column="positive", inplace=True), "ts_with_exog"), (DifferencingTransform(in_column="target", inplace=False), "regular_ts"), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"), (MADTransform(in_column="target", window=14), "regular_ts"), (MaxTransform(in_column="target", window=14), "regular_ts"), @@ -389,17 +390,6 @@ def test_inverse_transform_future_subset_segments(self, transform, dataset_name, ts = request.getfixturevalue(dataset_name) self._test_inverse_transform_future_subset_segments(ts, transform, segments=["segment_2"]) - @to_be_fixed(ValueError, match="There should be no NaNs inside the segments") - @pytest.mark.parametrize( - "transform, dataset_name", - [ - (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), - ], - ) - def test_inverse_transform_future_subset_difference_fail(self, transform, dataset_name, request): - ts = request.getfixturevalue(dataset_name) - self._test_inverse_transform_future_subset_segments(ts, transform, segments=["segment_2"]) - class TestInverseTransformTrainNewSegments: """Test inverse transform on train part of new segments. 
diff --git a/tests/test_transforms/test_math/test_differencing_transform.py b/tests/test_transforms/test_math/test_differencing_transform.py index c26708d42..ee2ea1ed2 100644 --- a/tests/test_transforms/test_math/test_differencing_transform.py +++ b/tests/test_transforms/test_math/test_differencing_transform.py @@ -40,6 +40,18 @@ def df_nans() -> pd.DataFrame: return df +@pytest.fixture +def df_nans_middle() -> pd.DataFrame: + """Create DataFrame with nans in the middle of the segment.""" + timestamp = pd.date_range("2021-01-01", "2021-04-01") + df_1 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]), "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]) * 2, "segment": "2"}) + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(df) + df.iloc[5:10, 0] = np.NaN + return df + + @pytest.fixture def df_segments_split(df_nans) -> Tuple[pd.DataFrame, pd.DataFrame]: """Create a pair of DataFrames with different segments.""" @@ -106,15 +118,10 @@ def check_transform( series_init = df.loc[:, pd.IndexSlice[segment, "target"]] series_transformed = transformed_df.loc[:, pd.IndexSlice[segment, out_column]] - series_init = series_init.loc[series_init.first_valid_index() :] - series_transformed = series_transformed.loc[series_transformed.first_valid_index() :] - - assert series_init.shape[0] == series_transformed.shape[0] + order * period - for _ in range(order): - series_init = series_init.diff(periods=period).iloc[period:] + series_init = series_init.diff(periods=period) - assert np.all(series_init == series_transformed) + assert series_init.equals(series_transformed) def check_inverse_transform_not_inplace( @@ -134,10 +141,10 @@ def check_inverse_transform_inplace_train(transform: GeneralDifferencingTransfor assert inverse_transformed_df.equals(df) -def check_inverse_transform_inplace_test( +def check_inverse_transform_inplace_filled_test( transform: 
GeneralDifferencingTransform, period: int, order: int, df: pd.DataFrame ): - """Check that differencing transform correctly makes inverse_transform on test data in inplace mode.""" + """Check that differencing transform correctly makes inverse_transform on filled test data in inplace mode.""" ts = TSDataset(df, freq="D") ts_train, ts_test = ts.train_test_split(test_size=20) ts_train.fit_transform(transforms=[transform]) @@ -158,6 +165,19 @@ def check_inverse_transform_inplace_test( assert np.all(future_ts.to_pandas() == ts_test.to_pandas()) +def check_inverse_transform_inplace_unfilled_test(transform: GeneralDifferencingTransform, df: pd.DataFrame): + """Check that differencing transform correctly makes inverse_transform on unfilled test data in inplace mode.""" + ts = TSDataset(df, freq="D") + ts_train, ts_test = ts.train_test_split(test_size=20) + ts_train.fit_transform(transforms=[transform]) + + future_ts = ts_train.make_future(20) + + # check values from inverse_transform + future_ts.inverse_transform() + assert future_ts.to_pandas().isna().all().all() + + def check_inverse_transform_inplace_test_quantiles(transform: GeneralDifferencingTransform, df: pd.DataFrame): """Check that differencing transform correctly makes inverse_transform on test data with quantiles.""" ts = TSDataset(df, freq="D") @@ -345,6 +365,25 @@ def test_full_transform(period, order, inplace, out_column, df_nans): check_transform(transform, period, order, out_column, df_nans, df_nans) +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_single_transform_nans_middle(period, inplace, out_column, df_nans, df_nans_middle): + """Test that _SingleDifferencingTransform generates correct values in transform with NaNs in the middle.""" + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=inplace, out_column=out_column) + check_transform(transform, period, 1, out_column, df_nans, 
df_nans_middle) + + +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_full_transform_nans_middle(period, order, inplace, out_column, df_nans, df_nans_middle): + """Test that DifferencingTransform generates correct values in transform with NaNs in the middle.""" + transform = DifferencingTransform( + in_column="target", period=period, order=order, inplace=inplace, out_column=out_column + ) + check_transform(transform, period, order, out_column, df_nans, df_nans_middle) + + @pytest.mark.parametrize("period", [1, 7]) def test_single_transform_not_inplace_new_segments(period, df_segments_split): """Test that _SingleDifferencingTransform generates correct values in transform on new segments in non-inplace mode.""" @@ -466,43 +505,34 @@ def test_full_inverse_transform_inplace_train(period, order, df_nans): check_inverse_transform_inplace_train(transform, df_nans) -@pytest.mark.parametrize( - "transform", - [ - _SingleDifferencingTransform(in_column="target", period=1, inplace=True), - DifferencingTransform(in_column="target", period=1, order=1, inplace=True), - ], -) -def test_general_inverse_transform_inplace_test_fail_nans(transform, df_nans): - """Test that differencing transform fails to make inverse_transform on test data if there are NaNs.""" - ts = TSDataset(df_nans, freq="D") - ts_train, ts_test = ts.train_test_split(test_size=20) - - ts_train.fit_transform(transforms=[transform]) +@pytest.mark.parametrize("period", [1, 7]) +def test_single_inverse_transform_inplace_filled_test(period, df_nans): + """Test that _SingleDifferencingTransform correctly makes inverse_transform on filled test data in inplace mode.""" + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True) + check_inverse_transform_inplace_filled_test(transform, period, 1, df_nans) - # make predictions by hand only on one segment - 
future_ts = ts_train.make_future(20) - future_ts.df.loc[:, pd.IndexSlice["1", "target"]] = np.NaN - future_ts.df.loc[:, pd.IndexSlice["2", "target"]] = 2 - # check fail on inverse_transform - with pytest.raises(ValueError, match="There should be no NaNs inside the segments"): - future_ts.inverse_transform() +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +def test_full_inverse_transform_inplace_filled_test(period, order, df_nans): + """Test that DifferencingTransform correctly makes inverse_transform on filled test data in inplace mode.""" + transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True) + check_inverse_transform_inplace_filled_test(transform, period, order, df_nans) @pytest.mark.parametrize("period", [1, 7]) def test_single_inverse_transform_inplace_test(period, df_nans): - """Test that _SingleDifferencingTransform correctly makes inverse_transform on test data in inplace mode.""" + """Test that _SingleDifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode.""" transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True) - check_inverse_transform_inplace_test(transform, period, 1, df_nans) + check_inverse_transform_inplace_unfilled_test(transform, df_nans) @pytest.mark.parametrize("period", [1, 7]) @pytest.mark.parametrize("order", [1, 2]) def test_full_inverse_transform_inplace_test(period, order, df_nans): - """Test that DifferencingTransform correctly makes inverse_transform on test data in inplace mode.""" + """Test that DifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode.""" transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True) - check_inverse_transform_inplace_test(transform, period, order, df_nans) + check_inverse_transform_inplace_unfilled_test(transform, df_nans) @pytest.mark.parametrize("period", [1, 7])