Skip to content

Add TSDatasets.__init__ regressors logic #357

Merged
merged 7 commits into from
Dec 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Add regressors logic to TSDatasets init ([#357](https://github.com/tinkoff-ai/etna/pull/357))

## [1.4.0] - 2021-12-03
### Added
- ACF plot ([#318](https://github.com/tinkoff-ai/etna/pull/318))
Expand Down
77 changes: 56 additions & 21 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import warnings
from copy import copy
from typing import TYPE_CHECKING
from typing import List
from typing import Optional
Expand All @@ -10,6 +11,7 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from typing_extensions import Literal

from etna.loggers import tslogger

Expand Down Expand Up @@ -56,7 +58,7 @@ class TSDataset:
>>> df_regressors["segment"] = "segment_0"
>>> df_to_forecast = TSDataset.to_dataset(df_to_forecast)
>>> df_regressors = TSDataset.to_dataset(df_regressors)
>>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors)
>>> tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors, known_future="all")
>>> tsdataset.df.head(5)
segment segment_0
feature regressor_0 regressor_1 regressor_2 regressor_3 regressor_4 target
Expand All @@ -70,7 +72,13 @@ class TSDataset:

idx = pd.IndexSlice

def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame] = None):
def __init__(
self,
df: pd.DataFrame,
freq: str,
df_exog: Optional[pd.DataFrame] = None,
known_future: Union[Literal["all"], Sequence] = (),
):
"""Init TSDataset.

Parameters
Expand All @@ -82,6 +90,9 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame]
df_exog:
dataframe with exogenous data;
if the series is known in the future, features' names should start with prefix 'regressor_'.
known_future:
columns in df_exog[known_future] that are regressors;
if "all" value is given, all columns are meant to be regressors
"""
self.raw_df = df.copy(deep=True)
self.raw_df.index = pd.to_datetime(self.raw_df.index)
Expand All @@ -105,13 +116,15 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame]

self.df = self.raw_df.copy(deep=True)

self.known_future = self._check_known_future(known_future, df_exog)
self._regressors = copy(self.known_future)

if df_exog is not None:
self.df_exog = df_exog.copy(deep=True)
self.df_exog.index = pd.to_datetime(self.df_exog.index)
self.df = self._merge_exog(self.df)

self.transforms: Optional[Sequence["Transform"]] = None
self._update_regressors()

def transform(self, transforms: Sequence["Transform"]):
"""Apply given transform to the data."""
Expand All @@ -120,7 +133,6 @@ def transform(self, transforms: Sequence["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.transform(self.df)
self._update_regressors()

def fit_transform(self, transforms: Sequence["Transform"]):
"""Fit and apply given transforms to the data."""
Expand All @@ -129,7 +141,6 @@ def fit_transform(self, transforms: Sequence["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.fit_transform(self.df)
self._update_regressors()

def __repr__(self):
return self.df.__repr__()
Expand Down Expand Up @@ -177,7 +188,9 @@ def make_future(self, future_steps: int) -> "TSDataset":
... })
>>> df_ts_format = TSDataset.to_dataset(df)
>>> df_regressors_ts_format = TSDataset.to_dataset(df_regressors)
>>> ts = TSDataset(df_ts_format, "D", df_exog=df_regressors_ts_format)
>>> ts = TSDataset(
... df_ts_format, "D", df_exog=df_regressors_ts_format, known_future="all"
... )
>>> ts.make_future(4)
segment segment_0 segment_1
feature regressor_1 regressor_2 target regressor_1 regressor_2 target
Expand Down Expand Up @@ -221,14 +234,39 @@ def make_future(self, future_steps: int) -> "TSDataset":
return future_ts

@staticmethod
def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame):
"""Check that regressors in df_exog begin not later than in df and end later than in df."""
def _check_known_future(
    known_future: Union[Literal["all"], Sequence], df_exog: Optional[pd.DataFrame]
) -> List[str]:
    """Check that `known_future` corresponds to `df_exog` and return the initial list of regressors.

    Parameters
    ----------
    known_future:
        either the literal "all" or a sequence of feature names
    df_exog:
        dataframe with exogenous data or None;
        its columns are read through the "feature" level

    Returns
    -------
    :
        sorted list of unique regressor names;
        for "all" these are all features of df_exog (empty if df_exog is None)

    Raises
    ------
    ValueError:
        if known_future is a string other than "all"
    ValueError:
        if known_future contains features that are not present in df_exog
    """
    if df_exog is None:
        exog_columns = set()
    else:
        exog_columns = set(df_exog.columns.get_level_values("feature"))

    # A string is handled separately from other sequences: the only accepted literal is "all".
    if isinstance(known_future, str):
        if known_future != "all":
            raise ValueError("The only possible literal is 'all'")
        return sorted(exog_columns)

    known_future_unique = set(known_future)
    missing = known_future_unique.difference(exog_columns)
    if missing:
        raise ValueError(f"Some features in known_future are not present in df_exog: {missing}")
    return sorted(known_future_unique)

@staticmethod
def _check_regressors(df: pd.DataFrame, df_regressors: pd.DataFrame):
"""Check that regressors begin not later than in df and end later than in df."""
# TODO: check performance
df_segments = df.columns.get_level_values("segment")
for segment in df_segments:
target = df[segment]["target"].dropna()
exog_regressor_columns = [x for x in set(df_exog[segment].columns) if x.startswith("regressor")]
for series in exog_regressor_columns:
exog_series = df_exog[segment][series].dropna()
target = df.loc[:, pd.IndexSlice[segment, "target"]].dropna()
for series in df_regressors.columns.get_level_values("feature"):
exog_series = df_regressors.loc[:, pd.IndexSlice[segment, series]].dropna()
if target.index.min() < exog_series.index.min():
raise ValueError(
f"All the regressor series should start not later than corresponding 'target'."
Expand All @@ -243,7 +281,9 @@ def _check_regressors(df: pd.DataFrame, df_exog: pd.DataFrame):
)

def _merge_exog(self, df: pd.DataFrame) -> pd.DataFrame:
    """Check the known-future regressors against `df` and left-join `self.df_exog` onto it.

    Parameters
    ----------
    df:
        dataframe whose columns have a "segment" level

    Returns
    -------
    :
        `df` merged with `self.df_exog` on the index (left join),
        with columns sorted by both column levels

    Raises
    ------
    ValueError:
        propagated from `_check_regressors` when the regressor series do not pass its check
    """
    segments = sorted(set(df.columns.get_level_values("segment")))
    # Only the features listed in known_future are passed to the regressor check,
    # restricted to the segments that are present in df.
    df_regressors = self.df_exog.loc[:, pd.IndexSlice[segments, self.known_future]]
    self._check_regressors(df=df, df_regressors=df_regressors)
    df = pd.merge(df, self.df_exog, left_index=True, right_index=True, how="left").sort_index(axis=1, level=(0, 1))
    return df

Expand Down Expand Up @@ -279,13 +319,6 @@ def segments(self) -> List[str]:
"""
return self.df.columns.get_level_values("segment").unique().tolist()

def _update_regressors(self):
    """Recompute `self._regressors` from the feature names of `self.columns`.

    Every name on the "feature" level that starts with "regressor" is collected once;
    the result is stored as a sorted list so that its order does not depend on set iteration order.
    """
    feature_names = self.columns.get_level_values("feature")
    self._regressors = sorted({name for name in feature_names if name.startswith("regressor")})

iKintosh marked this conversation as resolved.
Show resolved Hide resolved
@property
def regressors(self) -> List[str]:
"""Get list of all regressors across all segments in dataset.
Expand All @@ -307,7 +340,9 @@ def regressors(self) -> List[str]:
... )
>>> df_exog = pd.concat([df_regressors_1, df_regressors_2], ignore_index=True)
>>> df_exog_ts_format = TSDataset.to_dataset(df_exog)
>>> ts = TSDataset(df_ts_format, df_exog=df_exog_ts_format, freq="D")
>>> ts = TSDataset(
... df_ts_format, df_exog=df_exog_ts_format, freq="D", known_future="all"
... )
>>> ts.regressors
['regressor_1']
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def example_reg_tsds(random_seed) -> TSDataset:
df = TSDataset.to_dataset(df)
exog = TSDataset.to_dataset(exog)

tsds = TSDataset(df, freq="D", df_exog=exog)
tsds = TSDataset(df, freq="D", df_exog=exog, known_future="all")

return tsds

Expand Down Expand Up @@ -235,7 +235,7 @@ def outliers_tsds():
df.columns.names = ["segment", "feature"]

exog = df.copy()
exog.columns = pd.MultiIndex.from_arrays([["1", "2"], ["exog", "exog"]])
exog.columns.set_levels(["exog"], level="feature", inplace=True)

tsds = TSDataset(df, "1d", exog)

Expand Down
Loading