Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into issue-323
Browse files Browse the repository at this point in the history
# Conflicts:
#	CHANGELOG.md
  • Loading branch information
d.a.bunin committed Dec 1, 2021
2 parents 3ea1f4a + 96b3666 commit 2c3f79e
Show file tree
Hide file tree
Showing 13 changed files with 448 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,5 @@ config.env
.devcontainer
/docs/source/api/
tmp
wandb
wandb
!examples/wandb
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Add `ts.inverse_transform` as final step at `Pipeline.fit` method ([#316](https://github.com/tinkoff-ai/etna/pull/316))
- Make test_ts optional in plot_forecast ([#321](https://github.com/tinkoff-ai/etna/pull/321))
- Speed up inference for multisegment regression models ([#333](https://github.com/tinkoff-ai/etna/pull/333))
- Speed up SegmentEncoderTransform ([#331](https://github.com/tinkoff-ai/etna/pull/331))

## [1.3.3] - 2021-11-24
Expand All @@ -22,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Spell checking for source code and md files ([#303](https://github.com/tinkoff-ai/etna/pull/303))
- ResampleWithDistributionTransform ([#296](https://github.com/tinkoff-ai/etna/pull/296))
- Add function to duplicate exogenous data ([#305](https://github.com/tinkoff-ai/etna/pull/305))
- FourierTransform ([#306](https://github.com/tinkoff-ai/etna/pull/306))

### Changed
- Rename confidence interval to prediction interval, start working with quantiles instead of interval_width ([#285](https://github.com/tinkoff-ai/etna/pull/285))
Expand Down
21 changes: 6 additions & 15 deletions etna/models/catboost.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from catboost import Pool
Expand Down Expand Up @@ -41,7 +42,7 @@ def fit(self, df: pd.DataFrame) -> "_CatBoostModel":
self.model.fit(train_pool)
return self

def predict(self, df: pd.DataFrame) -> list:
def predict(self, df: pd.DataFrame) -> np.ndarray:
features = df.drop(columns=["timestamp", "target"])
predict_pool = Pool(features, cat_features=self._categorical)
pred = self.model.predict(predict_pool)
Expand Down Expand Up @@ -294,19 +295,9 @@ def forecast(self, ts: TSDataset) -> TSDataset:
DataFrame
Models result
"""
result_list = list()
for segment in ts.segments:
segment_predict = self._forecast_segment(self._base_model, segment, ts)
result_list.append(segment_predict)

result_df = pd.concat(result_list, ignore_index=True)
result_df = result_df.set_index(["timestamp", "segment"])

df = ts.to_pandas(flatten=True)
df = df.set_index(["timestamp", "segment"])
df = df.combine_first(result_df).reset_index()

df = TSDataset.to_dataset(df)
ts.df = df
horizon = len(ts.df)
x = ts.to_pandas(flatten=True).drop(["segment"], axis=1)
y = self._base_model.predict(x).reshape(-1, horizon).T
ts.loc[:, pd.IndexSlice[:, "target"]] = y
ts.inverse_transform()
return ts
18 changes: 4 additions & 14 deletions etna/models/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,9 @@ def forecast(self, ts: TSDataset) -> TSDataset:
DataFrame
Models result
"""
result_list = list()
for segment in ts.segments:
segment_predict = self._forecast_segment(self._base_model, segment, ts)
result_list.append(segment_predict)

result_df = pd.concat(result_list, ignore_index=True)
result_df = result_df.set_index(["timestamp", "segment"])

df = ts.to_pandas(flatten=True)
df = df.set_index(["timestamp", "segment"])
df = df.combine_first(result_df).reset_index()

df = TSDataset.to_dataset(df)
ts.df = df
horizon = len(ts.df)
x = ts.to_pandas(flatten=True).drop(["segment"], axis=1)
y = self._base_model.predict(x).reshape(-1, horizon).T
ts.loc[:, pd.IndexSlice[:, "target"]] = y
ts.inverse_transform()
return ts
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from etna.transforms.detrend import TheilSenTrendTransform
from etna.transforms.feature_importance import TreeFeatureSelectionTransform
from etna.transforms.filter import FilterFeaturesTransform
from etna.transforms.fourier import FourierTransform
from etna.transforms.gale_shapley import GaleShapleyFeatureSelectionTransform
from etna.transforms.imputation import TimeSeriesImputerTransform
from etna.transforms.lags import LagTransform
Expand Down
135 changes: 135 additions & 0 deletions etna/transforms/fourier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import math
from typing import Optional
from typing import Sequence

import numpy as np
import pandas as pd

from etna.transforms.base import Transform


class FourierTransform(Transform):
"""Adds fourier features to the dataset."""

def __init__(
self,
period: float,
order: Optional[int] = None,
mods: Optional[Sequence[int]] = None,
out_column: Optional[str] = None,
):
"""Create instance of FourierTransform.
Parameters
----------
period:
the period of the seasonality to capture in frequency units of time series, it should be >= 2
order:
upper order of Fourier components to include, it should be >= 1 and <= ceil(period/2))
mods:
alternative and precise way of defining which harmonics will be used,
for example `mods=[1, 3, 4]` means that sin of the first order
and sin and cos of the second order will be used,
mods should be >= 1 and < period
out_column:
if set, name of added column, the final name will be '{out_columnt}_{mod}',
don't forget to add 'regressor_' prefix
if don't set, name will be 'regressor_{repr}', repr will represent class that creates exactly this column
Raises
------
ValueError:
if period < 2
ValueError:
if both or none of order, mods is set
ValueError:
if order is < 1 or > ceil(period/2)
ValueError:
if at least one mod is < 1 or >= period
Notes
-----
To understand how transform works we recommend: https://otexts.com/fpp2/useful-predictors.html#fourier-series
* Parameter `period` is responsible for the seasonality we want to capture.
* Parameters `order` and `mods` define which harmonics will be used.
Parameter `order` is a more user-friendly version of `mods`.
For example, `order=2` can be represented as `mods=[1, 2, 3, 4]` if `period` > 4 and
as `mods=[1, 2, 3]` if 3 <= `period` <= 4.
"""
if period < 2:
raise ValueError("Period should be at least 2")
self.period = period
self.mods: Sequence[int]

if order is not None and mods is None:
if order < 1 or order > math.ceil(period / 2):
raise ValueError("Order should be within [1, ceil(period/2)] range")
self.mods = [mod for mod in range(1, 2 * order + 1) if mod < period]
elif mods is not None and order is None:
if min(mods) < 1 or max(mods) >= period:
raise ValueError("Every mod should be within [1, int(period)) range")
self.mods = mods
else:
raise ValueError("There should be exactly one option set: order or mods")

self.out_column = out_column

def fit(self, df: pd.DataFrame) -> "FourierTransform":
"""Fit method does nothing and is kept for compatibility.
Parameters
----------
df:
dataframe with data.
Returns
-------
result: FourierTransform
"""
return self

def _get_column_name(self, mod: int) -> str:
if self.out_column is None:
return f"regressor_{FourierTransform(period=self.period, mods=[mod]).__repr__()}"
else:
return f"{self.out_column}_{mod}"

@staticmethod
def _construct_answer(df: pd.DataFrame, features: pd.DataFrame) -> pd.DataFrame:
dataframes = []
for seg in df.columns.get_level_values("segment").unique():
tmp = df[seg].join(features)
_idx = tmp.columns.to_frame()
_idx.insert(0, "segment", seg)
tmp.columns = pd.MultiIndex.from_frame(_idx)
dataframes.append(tmp)

result = pd.concat(dataframes, axis=1).sort_index(axis=1)
result.columns.names = ["segment", "feature"]
return result

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add harmonics to the dataset.
Parameters
----------
df:
dataframe with data to transform.
Returns
-------
result: pd.Dataframe
transformed dataframe
"""
features = pd.DataFrame(index=df.index)
elapsed = np.arange(features.shape[0]) / self.period

for mod in self.mods:
order = (mod + 1) // 2
is_cos = mod % 2 == 0

features[self._get_column_name(mod)] = np.sin(2 * np.pi * order * elapsed + np.pi / 2 * is_cos)

return self._construct_answer(df, features)
27 changes: 27 additions & 0 deletions examples/wandb/sweeps/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Using WandB with ETNA library

## Colab example

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EBSqqBPaYgLWCRdpC5vMy9RiLBsCEd7I?usp=sharing)

![](assets/etna-wandb.png)

[Sweep Dashboard](https://wandb.ai/martins0n/wandb-etna-sweep/sweeps/c7e0r8sq/overview?workspace=user-martins0n)

## Steps to start

- Define your pipeline and hyperparameters in `pipeline.yaml`, in example we will optimize number of iterations `iterations` and `learning-rate`

- Define WandB sweeps config `sweep.yaml` and push it to cloud:

```bash
WANDB_PROJECT=<project_name> WandB sweep sweep.yaml
```

- You may change `dataloader` function and add additional parameters for WandB logger like tags for example in `run.py`

- Run WandB agent for hyperparameters optimization start:

```bash
wandb agent <user_name>/<project_name>/<sweep_id>
```
Binary file added examples/wandb/sweeps/assets/etna-wandb.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
23 changes: 23 additions & 0 deletions examples/wandb/sweeps/pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
backtest:
n_folds: 3
n_jobs: 1
metrics:
- _target_: etna.metrics.MAE
- _target_: etna.metrics.MSE
- _target_: etna.metrics.MAPE
- _target_: etna.metrics.SMAPE
- _target_: etna.metrics.R2
pipeline:
_target_: etna.pipeline.Pipeline
horizon: 10
model:
_target_: etna.models.CatBoostModelMultiSegment
iterations: ${iterations}
learning_rate: ${learning-rate}
transforms:
- _target_: etna.transforms.SegmentEncoderTransform
- _target_: etna.transforms.LagTransform
in_column: target
lags: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
iterations: null
learning-rate: null
60 changes: 60 additions & 0 deletions examples/wandb/sweeps/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Example of using WandB with ETNA library.
Current script could be used for sweeps and simple validation runs.
"""

import argparse
import random
from typing import Any, Dict

import hydra_slayer
import numpy as np
from etna.datasets import TSDataset, generate_ar_df
from etna.loggers import WandbLogger, tslogger
from etna.pipeline import Pipeline
from omegaconf import OmegaConf

SEED = 11
random.seed(SEED)
np.random.seed(SEED)

# Default config loading
config = OmegaConf.load("pipeline.yaml")


# Define arguments for WandB sweep parameters
args = argparse.ArgumentParser()
args.add_argument("--iterations", type=int)
args.add_argument("--learning-rate", type=float)
for key, value in vars(args.parse_args()).items():
if value:
config[key] = value

# Config for Pipeline and backtesting pipeline
config = OmegaConf.to_container(config, resolve=True)
pipeline = config["pipeline"]
backtest = config["backtest"]


# Define WandbLogger and passing it to global library logger
# It will not log child processes in case of `spawn` (OSX or Windows)
wblogger = WandbLogger(project="test-run", config=pipeline)
tslogger.add(wblogger)


def dataloader() -> TSDataset:
df = generate_ar_df(periods=300, start_time="2021-01-02", n_segments=10)
df = TSDataset.to_dataset(df)
ts = TSDataset(df=df, freq="1D")
return ts


if __name__ == "__main__":

ts = dataloader()

pipeline: Pipeline = hydra_slayer.get_from_params(**pipeline)

backtest_configs: Dict[str, Any] = hydra_slayer.get_from_params(**backtest)

metrics, forecast, info = pipeline.backtest(ts=ts, **backtest_configs)
18 changes: 18 additions & 0 deletions examples/wandb/sweeps/sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
program:
run.py
method: bayes
parameters:
learning-rate:
min: 0.0001
max: 0.1
iterations:
distribution: int_uniform
min: 2
max: 30
metric:
name: MAE_median
goal: minimize
command:
- python
- run.py
- ${args}
Loading

0 comments on commit 2c3f79e

Please sign in to comment.