From 2ef5fca6ce17fe7942f6f394fca21257bc72b0eb Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 20 Oct 2021 13:35:39 +0300 Subject: [PATCH 1/3] Add Ensembles notebook --- README.md | 6 +- docs/source/tutorials.rst | 1 + examples/ensembles.ipynb | 988 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 994 insertions(+), 1 deletion(-) create mode 100644 examples/ensembles.ipynb diff --git a/README.md b/README.md index f05a0e8f5..268be2f50 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ We have also prepared a set of tutorials for an easy introduction: - Custom Distance - Visualisation -#### 04. [Deep learning models](https://github.com/tinkoff-ai/etna-ts/tree/master/examples/NN_examples.ipynb) +#### 06. [Deep learning models](https://github.com/tinkoff-ai/etna-ts/tree/master/examples/NN_examples.ipynb) - Creating TSDataset - Architecture - Testing models @@ -106,6 +106,10 @@ We have also prepared a set of tutorials for an easy introduction: - TFT - Simple Model +#### 07. [Ensembles](https://github.com/tinkoff-ai/etna-ts/tree/master/examples/ensembles.ipynb) +- VotingEnsemble +- StackingEnsemble + ## Documentation ETNA documentation is available [here](https://etna-docs.netlify.app/). 
diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index 2687264b1..92d576548 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -13,3 +13,4 @@ Tutorials tutorials/outliers tutorials/clustering tutorials/custom_transform_and_model + tutorials/ensembles diff --git a/examples/ensembles.ipynb b/examples/ensembles.ipynb new file mode 100644 index 000000000..b9b88ff4e --- /dev/null +++ b/examples/ensembles.ipynb @@ -0,0 +1,988 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4793ef5", + "metadata": {}, + "source": [ + "# Ensembles notebook" + ] + }, + { + "cell_type": "markdown", + "id": "4949ce7f", + "metadata": {}, + "source": [ + "This notebook contains simple examples of using ensemble models with the ETNA library.\n", + "\n", + "**Table of Contents**\n", + "\n", + "* [Load Dataset](#chapter1) \n", + "* [Build Pipelines](#chapter2)\n", + "* [Ensembles](#chapter3)\n", + " * [VotingEnsemble](#section_3_1)\n", + " * [StackingEnsemble](#section_3_2)\n", + " * [Results](#section_3_3)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7b9df4dc", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "f82360d8", + "metadata": {}, + "source": [ + "## 1. Load Dataset \n", + "\n", + "In this notebook we will work with the dataset contains only one segment with monthly wine sales." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "639d0580", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from etna.datasets import TSDataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "01e2fcee", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "original_df = pd.read_csv(\"data/monthly-australian-wine-sales.csv\")\n", + "original_df[\"timestamp\"] = pd.to_datetime(original_df[\"month\"])\n", + "original_df[\"target\"] = original_df[\"sales\"]\n", + "original_df.drop(columns=[\"month\", \"sales\"], inplace=True)\n", + "original_df[\"segment\"] = \"main\"\n", + "original_df.head()\n", + "df = TSDataset.to_dataset(original_df)\n", + "ts = TSDataset(df=df, freq=\"MS\")\n", + "ts.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "c879183c", + "metadata": {}, + "source": [ + "## 2. Build Pipelines \n", + "\n", + "Given the sales' history, we want to select the best model (pipeline) to forecast future sales." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0ee58fa6", + "metadata": {}, + "outputs": [], + "source": [ + "from etna.pipeline import Pipeline\n", + "from etna.models import NaiveModel, SeasonalMovingAverageModel, ProphetModel, CatBoostModelMultiSegment\n", + "from etna.transforms import LagTransform\n", + "from etna.metrics import MAE, MSE, SMAPE, MAPE\n", + "HORIZON = 3\n", + "N_FOLDS = 5" + ] + }, + { + "cell_type": "markdown", + "id": "b6815f49", + "metadata": {}, + "source": [ + "Let's build several pipelines using different models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0dc26e4", + "metadata": {}, + "outputs": [], + "source": [ + "naive_pipeline = Pipeline(model=NaiveModel(lag=12), transforms=[], horizon=HORIZON)\n", + "seasonalma_pipeline = Pipeline(\n", + " model=SeasonalMovingAverageModel(window=5, seasonality=12), transforms=[], horizon=HORIZON\n", + ")\n", + "prophet_pipeline = Pipeline(model=ProphetModel(), transforms=[], horizon=HORIZON)\n", + "catboost_pipeline = Pipeline(\n", + " model=CatBoostModelMultiSegment(),\n", + " transforms=[LagTransform(lags=[6, 7, 8, 9, 10, 11, 12], in_column=\"target\")],\n", + " 
horizon=HORIZON,\n", + ")\n", + "pipeline_names = [\"naive\", \"moving average\", \"prophet\", \"catboost\"]\n", + "pipelines = [naive_pipeline, seasonalma_pipeline, prophet_pipeline, catboost_pipeline]" + ] + }, + { + "cell_type": "markdown", + "id": "106e3885", + "metadata": {}, + "source": [ + "And evaluate their performance on the backtest" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "53c1a0b9", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.4s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.6s remaining: 9.9s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.5s remaining: 6.4s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.6s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.6s finished\n", + "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.6s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.4s remaining: 9.6s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.2s remaining: 6.2s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 14.8s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 14.8s finished\n", + "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 6.1s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 10.4s remaining: 15.6s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 14.6s remaining: 9.7s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 22.5s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 22.5s finished\n", + "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent 
workers.\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 4.1s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 7.2s remaining: 10.7s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 10.2s remaining: 6.8s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.7s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.7s finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
naive2437.4666671.089199e+079.94988610.222106
moving average1913.8266676.113701e+067.8975707.824056
prophet2174.8390536.577668e+068.8911479.047454
catboost2224.6610628.699304e+069.0956149.733325
\n", + "
" ], + "text/plain": [ + " MAE MSE SMAPE MAPE\n", + "naive 2437.466667 1.089199e+07 9.949886 10.222106\n", + "moving average 1913.826667 6.113701e+06 7.897570 7.824056\n", + "prophet 2174.839053 6.577668e+06 8.891147 9.047454\n", + "catboost 2224.661062 8.699304e+06 9.095614 9.733325" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics = []\n", + "for pipeline in pipelines:\n", + " metrics.append(\n", + " pipeline.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=5\n", + " )[0].iloc[:, 1:]\n", + " )\n", + "metrics = pd.concat(metrics)\n", + "metrics.index = pipeline_names\n", + "metrics" + ] + }, + { + "cell_type": "markdown", + "id": "b9581f2a", + "metadata": {}, + "source": [ + "## 3. Ensembles \n", + "To improve the performance of the individual models, we can try to make ensembles out of them. Our library contains two ensembling methods, which we will try out now." + ] + }, + { + "cell_type": "markdown", + "id": "f0e7e3e6", + "metadata": {}, + "source": [ + "### 3.1 VotingEnsemble\n", + "\n", + "`VotingEnsemble` forecasts future values with weighted averaging of its `pipelines`' forecasts." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5338aeea", + "metadata": {}, + "outputs": [], + "source": [ + "from etna.ensembles import VotingEnsemble" + ] + }, + { + "cell_type": "markdown", + "id": "9f7ee7db", + "metadata": {}, + "source": [ + "By default, `VotingEnsemble` uses **uniform** weights for the pipelines' forecasts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1c4029fc", + "metadata": {}, + "outputs": [], + "source": [ + "voting_ensemble = VotingEnsemble(pipelines=pipelines, n_jobs=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f1cb83b8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 6.7s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 11.1s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 11.2s remaining: 7.5s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.3s remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.3s finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
voting ensemble(uniform weights)2089.2143277.245760e+068.5948288.83881
\n", + "
" + ], + "text/plain": [ + " MAE MSE SMAPE MAPE\n", + "voting ensemble(uniform weights) 2089.214327 7.245760e+06 8.594828 8.83881" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_ensamble_metrics = voting_ensemble.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", + ")[0].iloc[:, 1:]\n", + "voting_ensamble_metrics.index = [\"voting ensemble(uniform weights)\"]\n", + "voting_ensamble_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "7d661818", + "metadata": {}, + "source": [ + "You can specify the `weights` manually, the higher weight the more you trust the base model. The `weights` are automatically normalized." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7be24d69", + "metadata": {}, + "outputs": [], + "source": [ + "voting_ensemble_custom_weights = VotingEnsemble(pipelines=pipelines, weights=[1, 16, 9, 4], n_jobs=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "beed0ae4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 6.8s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 11.4s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 11.5s remaining: 7.6s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.5s remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.5s finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
voting ensemble(custom weights)1961.7131726.152859e+068.0941238.19029
\n", + "
" ], + "text/plain": [ + " MAE MSE SMAPE MAPE\n", + "voting ensemble(custom weights) 1961.713172 6.152859e+06 8.094123 8.19029" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_ensamble_custom_weights_metrics = voting_ensemble_custom_weights.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", + ")[0].iloc[:, 1:]\n", + "voting_ensamble_custom_weights_metrics.index = [\"voting ensemble(custom weights)\"]\n", + "voting_ensamble_custom_weights_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "a26b503b", + "metadata": {}, + "source": [ + "### 3.2 StackingEnsemble\n", + "`StackingEnsemble` forecasts future values using a metamodel to combine the forecasts of its `pipelines`." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "78c46663", + "metadata": {}, + "outputs": [], + "source": [ + "from etna.ensembles import StackingEnsemble" + ] + }, + { + "cell_type": "markdown", + "id": "3b430668", + "metadata": {}, + "source": [ + "By default, `StackingEnsemble` uses only the pipelines' forecasts as features for the `final_model`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "273626b1", + "metadata": {}, + "outputs": [], + "source": [ + "stacking_ensemble_unfeatured = StackingEnsemble(pipelines=pipelines, cv=10, n_jobs=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "272cc433", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 27.2s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 32.0s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 50.6s remaining: 33.7s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.2min remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.2min finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
stacking ensemble2002.6573757.227136e+068.3296968.461256
\n", + "
" + ], + "text/plain": [ + " MAE MSE SMAPE MAPE\n", + "stacking ensemble 2002.657375 7.227136e+06 8.329696 8.461256" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stacking_ensamble_unfeatured_metrics = stacking_ensemble_unfeatured.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", + ")[0].iloc[:, 1:]\n", + "stacking_ensamble_unfeatured_metrics.index = [\"stacking ensemble\"]\n", + "stacking_ensamble_unfeatured_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "60eaa513", + "metadata": {}, + "source": [ + "However, sometimes it might be useful to combine them with the other features, which were used to fit the base models. You can specify the additional features using the `features_to_use` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "acdf24fb", + "metadata": {}, + "outputs": [], + "source": [ + "stacking_ensemble_featured = StackingEnsemble(pipelines=pipelines, cv=10, features_to_use=\"all\", n_jobs=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "906ffd7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 29.8s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 35.8s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 53.3s remaining: 35.5s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
stacking ensemble + features2012.2460217.284254e+068.2926198.525519
\n", + "
" + ], + "text/plain": [ + " MAE MSE SMAPE MAPE\n", + "stacking ensemble + features 2012.246021 7.284254e+06 8.292619 8.525519" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stacking_ensamble_featured_metrics = stacking_ensemble_featured.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", + ")[0].iloc[:, 1:]\n", + "stacking_ensamble_featured_metrics.index = [\"stacking ensemble + features\"]\n", + "stacking_ensamble_featured_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "051a0ba0", + "metadata": {}, + "source": [ + "In addition, it is also possible to specify the `final_model`. You can use any regression model with the sklearn interface for this purpose." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "df04bfc8", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import Lasso" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b6f4f011", + "metadata": {}, + "outputs": [], + "source": [ + "stacking_ensemble_final_model = StackingEnsemble(\n", + " pipelines=pipelines, final_model=Lasso(alpha=10), cv=10, features_to_use=\"all\", n_jobs=4\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "be99b902", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 29.0s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 35.1s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 52.3s remaining: 34.8s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min finished\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
stacking ensemble + features + lasso2012.244697.284238e+068.2926158.525514
\n", + "
" + ], + "text/plain": [ + " MAE MSE SMAPE \\\n", + "stacking ensemble + features + lasso 2012.24469 7.284238e+06 8.292615 \n", + "\n", + " MAPE \n", + "stacking ensemble + features + lasso 8.525514 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stacking_ensamble_final_model_metrics = stacking_ensemble_final_model.backtest(\n", + " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", + ")[0].iloc[:, 1:]\n", + "stacking_ensamble_final_model_metrics.index = [\"stacking ensemble + features + lasso\"]\n", + "stacking_ensamble_final_model_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "c975d5c5", + "metadata": {}, + "source": [ + "### 3.3 Results\n", + "Finally, let's take a look at the results of our experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c2f1d397", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MAEMSESMAPEMAPE
naive2437.4666671.089199e+079.94988610.222106
moving average1913.8266676.113701e+067.8975707.824056
prophet2174.8390536.577668e+068.8911479.047454
catboost2224.6610628.699304e+069.0956149.733325
voting ensemble(uniform weights)2089.2143277.245760e+068.5948288.838810
voting ensemble(custom weights)1961.7131726.152859e+068.0941238.190290
stacking ensemble2002.6573757.227136e+068.3296968.461256
stacking ensemble + features2012.2460217.284254e+068.2926198.525519
stacking ensemble + features + lasso2012.2446907.284238e+068.2926158.525514
\n", + "
" + ], + "text/plain": [ + " MAE MSE SMAPE \\\n", + "naive 2437.466667 1.089199e+07 9.949886 \n", + "moving average 1913.826667 6.113701e+06 7.897570 \n", + "prophet 2174.839053 6.577668e+06 8.891147 \n", + "catboost 2224.661062 8.699304e+06 9.095614 \n", + "voting ensemble(uniform weights) 2089.214327 7.245760e+06 8.594828 \n", + "voting ensemble(custom weights) 1961.713172 6.152859e+06 8.094123 \n", + "stacking ensemble 2002.657375 7.227136e+06 8.329696 \n", + "stacking ensemble + features 2012.246021 7.284254e+06 8.292619 \n", + "stacking ensemble + features + lasso 2012.244690 7.284238e+06 8.292615 \n", + "\n", + " MAPE \n", + "naive 10.222106 \n", + "moving average 7.824056 \n", + "prophet 9.047454 \n", + "catboost 9.733325 \n", + "voting ensemble(uniform weights) 8.838810 \n", + "voting ensemble(custom weights) 8.190290 \n", + "stacking ensemble 8.461256 \n", + "stacking ensemble + features 8.525519 \n", + "stacking ensemble + features + lasso 8.525514 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics = pd.concat(\n", + " [\n", + " metrics,\n", + " voting_ensamble_metrics,\n", + " voting_ensamble_custom_weights_metrics,\n", + " stacking_ensamble_unfeatured_metrics,\n", + " stacking_ensamble_featured_metrics,\n", + " stacking_ensamble_final_model_metrics,\n", + " ]\n", + ")\n", + "metrics\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfad369f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "etna-Wq0dxOda-py3.8", + "language": "python", + "name": "etna-wq0dxoda-py3.8" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 
d5633800438b4a7f75d33c49f6cc67ed3075bc28 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 20 Oct 2021 13:38:06 +0300 Subject: [PATCH 2/3] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 774aac4b8..2e8259440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add 'in_column' parameter to get_anomalies methods([#199](https://github.com/tinkoff-ai/etna-ts/pull/199)) - Clustering notebook ([#152](https://github.com/tinkoff-ai/etna-ts/pull/152)) - StackingEnsemble ([#195](https://github.com/tinkoff-ai/etna-ts/pull/195)) +- Ensembles notebook ([#218](https://github.com/tinkoff-ai/etna-ts/pull/218)) ### Changed - Delete offset from WindowStatisticsTransform ([#111](https://github.com/tinkoff-ai/etna-ts/pull/111)) From 59669d819dea397d1985756a558045bf0ab7cf1c Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 22 Oct 2021 11:37:11 +0300 Subject: [PATCH 3/3] Fixes --- examples/ensembles.ipynb | 498 +++++++-------------------------------- 1 file changed, 79 insertions(+), 419 deletions(-) diff --git a/examples/ensembles.ipynb b/examples/ensembles.ipynb index b9b88ff4e..e5ae036b1 100644 --- a/examples/ensembles.ipynb +++ b/examples/ensembles.ipynb @@ -43,7 +43,7 @@ "source": [ "## 1. Load Dataset \n", "\n", - "In this notebook we will work with the dataset contains only one segment with monthly wine sales." + "In this notebook we will work with a dataset that contains only one segment with monthly wine sales. The workflow for a dataset containing more segments is exactly the same." 
] }, { @@ -106,7 +106,7 @@ "outputs": [], "source": [ "from etna.pipeline import Pipeline\n", - "from etna.models import NaiveModel, SeasonalMovingAverageModel, ProphetModel, CatBoostModelMultiSegment\n", + "from etna.models import NaiveModel, SeasonalMovingAverageModel, CatBoostModelMultiSegment\n", "from etna.transforms import LagTransform\n", "from etna.metrics import MAE, MSE, SMAPE, MAPE\n", "HORIZON = 3\n", @@ -132,14 +132,13 @@ "seasonalma_pipeline = Pipeline(\n", " model=SeasonalMovingAverageModel(window=5, seasonality=12), transforms=[], horizon=HORIZON\n", ")\n", - "prophet_pipeline = Pipeline(model=ProphetModel(), transforms=[], horizon=HORIZON)\n", "catboost_pipeline = Pipeline(\n", " model=CatBoostModelMultiSegment(),\n", " transforms=[LagTransform(lags=[6, 7, 8, 9, 10, 11, 12], in_column=\"target\")],\n", " horizon=HORIZON,\n", ")\n", - "pipeline_names = [\"naive\", \"moving average\", \"prophet\", \"catboost\"]\n", - "pipelines = [naive_pipeline, seasonalma_pipeline, prophet_pipeline, catboost_pipeline]" + "pipeline_names = [\"naive\", \"moving average\", \"catboost\"]\n", + "pipelines = [naive_pipeline, seasonalma_pipeline, catboost_pipeline]" ] }, { @@ -163,29 +162,23 @@ "output_type": "stream", "text": [ "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", - "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.4s\n", - "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.6s remaining: 9.9s\n", - "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.5s remaining: 6.4s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.6s remaining: 0.0s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.6s finished\n", - "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", - "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.6s\n", - "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.4s remaining: 9.6s\n", - "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.2s remaining: 6.2s\n", 
- "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 14.8s remaining: 0.0s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 14.8s finished\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.7s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.5s remaining: 9.7s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.8s remaining: 6.5s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.8s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.8s finished\n", "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", - "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 6.1s\n", - "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 10.4s remaining: 15.6s\n", - "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 14.6s remaining: 9.7s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 22.5s remaining: 0.0s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 22.5s finished\n", + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.4s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.7s remaining: 10.0s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.9s remaining: 6.6s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.3s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.3s finished\n", "[Parallel(n_jobs=5)]: Using backend MultiprocessingBackend with 5 concurrent workers.\n", - "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 4.1s\n", - "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 7.2s remaining: 10.7s\n", - "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 10.2s remaining: 6.8s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.7s remaining: 0.0s\n", - "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 16.7s finished\n" + "[Parallel(n_jobs=5)]: Done 1 tasks | elapsed: 3.7s\n", + "[Parallel(n_jobs=5)]: Done 2 out of 5 | elapsed: 6.7s remaining: 10.0s\n", + "[Parallel(n_jobs=5)]: Done 3 out of 5 | elapsed: 9.6s remaining: 6.4s\n", +
"[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.7s remaining: 0.0s\n", + "[Parallel(n_jobs=5)]: Done 5 out of 5 | elapsed: 15.7s finished\n" ] }, { @@ -231,13 +224,6 @@ " 7.824056\n", " \n", " \n", - " prophet\n", - " 2174.839053\n", - " 6.577668e+06\n", - " 8.891147\n", - " 9.047454\n", - " \n", - " \n", " catboost\n", " 2224.661062\n", " 8.699304e+06\n", @@ -252,7 +238,6 @@ " MAE MSE SMAPE MAPE\n", "naive 2437.466667 1.089199e+07 9.949886 10.222106\n", "moving average 1913.826667 6.113701e+06 7.897570 7.824056\n", - "prophet 2174.839053 6.577668e+06 8.891147 9.047454\n", "catboost 2224.661062 8.699304e+06 9.095614 9.733325" ] }, @@ -308,7 +293,9 @@ "id": "9f7ee7db", "metadata": {}, "source": [ - "By default, `VotingEnsemble` uses **uniform** weights for the pipelines' forecasts." + "By default, `VotingEnsemble` uses **uniform** weights for the pipelines' forecasts. However, you can specify the weights manually using the `weights` parameter. The higher the weight, the more you trust the base model. \n", + "\n", + "*Note*: The `weights` are automatically normalized." 
] }, { @@ -318,7 +305,7 @@ "metadata": {}, "outputs": [], "source": [ - "voting_ensemble = VotingEnsemble(pipelines=pipelines, n_jobs=4)" + "voting_ensemble = VotingEnsemble(pipelines=pipelines, weights=[1, 9, 4], n_jobs=4)" ] }, { @@ -332,11 +319,11 @@ "output_type": "stream", "text": [ "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 6.7s\n", - "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 11.1s\n", - "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 11.2s remaining: 7.5s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.3s remaining: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.3s finished\n" + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 4.6s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 7.9s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 7.9s remaining: 5.3s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 8.8s remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 8.8s finished\n" ] }, { @@ -368,19 +355,19 @@ " \n", " \n", " \n", - " voting ensemble(uniform weights)\n", - " 2089.214327\n", - " 7.245760e+06\n", - " 8.594828\n", - " 8.83881\n", + " voting ensemble\n", + " 1969.545363\n", + " 6.558551e+06\n", + " 8.125364\n", + " 8.257041\n", " \n", " \n", "\n", "" ], "text/plain": [ - " MAE MSE SMAPE MAPE\n", - "voting ensemble(uniform weights) 2089.214327 7.245760e+06 8.594828 8.83881" + " MAE MSE SMAPE MAPE\n", + "voting ensemble 1969.545363 6.558551e+06 8.125364 8.257041" ] }, "execution_count": 9, @@ -392,103 +379,10 @@ "voting_ensamble_metrics = voting_ensemble.backtest(\n", " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", ")[0].iloc[:, 1:]\n", - "voting_ensamble_metrics.index = [\"voting ensemble(uniform weights)\"]\n", + "voting_ensamble_metrics.index = [\"voting ensemble\"]\n", "voting_ensamble_metrics" ] }, - { - "cell_type": "markdown", - 
"id": "7d661818", - "metadata": {}, - "source": [ - "You can specify the `weights` manually, the higher weight the more you trust the base model. The `weights` are automatically normalized." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7be24d69", - "metadata": {}, - "outputs": [], - "source": [ - "voting_ensemble_custom_weights = VotingEnsemble(pipelines=pipelines, weights=[1, 16, 9, 4], n_jobs=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "beed0ae4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 6.8s\n", - "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 11.4s\n", - "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 11.5s remaining: 7.6s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.5s remaining: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 13.5s finished\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MAEMSESMAPEMAPE
voting ensemble(custom weights)1961.7131726.152859e+068.0941238.19029
\n", - "
" - ], - "text/plain": [ - " MAE MSE SMAPE MAPE\n", - "voting ensemble(custom weights) 1961.713172 6.152859e+06 8.094123 8.19029" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "voting_ensamble_custom_weights_metrics = voting_ensemble_custom_weights.backtest(\n", - " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", - ")[0].iloc[:, 1:]\n", - "voting_ensamble_custom_weights_metrics.index = [\"voting ensemble(custom weights)\"]\n", - "voting_ensamble_custom_weights_metrics" - ] - }, { "cell_type": "markdown", "id": "a26b503b", @@ -500,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "78c46663", "metadata": {}, "outputs": [], @@ -513,12 +407,17 @@ "id": "3b430668", "metadata": {}, "source": [ - "By default, `StackingEnsemble` uses only the pipelines' forecasts as features for the `final_model`." + "By default, `StackingEnsemble` uses only the pipelines' forecasts as features for the `final_model`. However, you can specify the additional features using the `features_to_use` parameter. The following values are possible:\n", + "+ **None** - use only the pipelines' forecasts (default)\n", + "+ **List[str]** - use the pipelines' forecasts + features from the list\n", + "+ **\"all\"** - use all the available features\n", + "\n", + "*Note:* It is possible to use only the features available for the base models." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "273626b1", "metadata": {}, "outputs": [], @@ -528,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "id": "272cc433", "metadata": {}, "outputs": [ @@ -537,11 +436,11 @@ "output_type": "stream", "text": [ "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 27.2s\n", - "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 32.0s\n", - "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 50.6s remaining: 33.7s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.2min remaining: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.2min finished\n" + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 12.3s\n", + "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 16.0s\n", + "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 21.2s remaining: 14.1s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 29.1s remaining: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 29.1s finished\n" ] }, { @@ -574,10 +473,10 @@ " \n", " \n", " stacking ensemble\n", - " 2002.657375\n", - " 7.227136e+06\n", - " 8.329696\n", - " 8.461256\n", + " 2109.063596\n", + " 8.609287e+06\n", + " 8.694399\n", + " 8.678903\n", " \n", " \n", "\n", @@ -585,113 +484,20 @@ ], "text/plain": [ " MAE MSE SMAPE MAPE\n", - "stacking ensemble 2002.657375 7.227136e+06 8.329696 8.461256" + "stacking ensemble 2109.063596 8.609287e+06 8.694399 8.678903" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stacking_ensamble_unfeatured_metrics = stacking_ensemble_unfeatured.backtest(\n", + "stacking_ensamble_metrics = stacking_ensemble_unfeatured.backtest(\n", " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", ")[0].iloc[:, 1:]\n", - "stacking_ensamble_unfeatured_metrics.index = 
[\"stacking ensemble\"]\n", - "stacking_ensamble_unfeatured_metrics" - ] - }, - { - "cell_type": "markdown", - "id": "60eaa513", - "metadata": {}, - "source": [ - "However, sometimes it might be useful to combine them with the other features, which were used to fit the base models. You can specify the additional features using the `features_to_use` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "acdf24fb", - "metadata": {}, - "outputs": [], - "source": [ - "stacking_ensemble_featured = StackingEnsemble(pipelines=pipelines, cv=10, features_to_use=\"all\", n_jobs=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "906ffd7b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 29.8s\n", - "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 35.8s\n", - "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 53.3s remaining: 35.5s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min remaining: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min finished\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MAEMSESMAPEMAPE
stacking ensemble + features2012.2460217.284254e+068.2926198.525519
\n", - "
" - ], - "text/plain": [ - " MAE MSE SMAPE MAPE\n", - "stacking ensemble + features 2012.246021 7.284254e+06 8.292619 8.525519" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stacking_ensamble_featured_metrics = stacking_ensemble_featured.backtest(\n", - " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", - ")[0].iloc[:, 1:]\n", - "stacking_ensamble_featured_metrics.index = [\"stacking ensemble + features\"]\n", - "stacking_ensamble_featured_metrics" + "stacking_ensamble_metrics.index = [\"stacking ensemble\"]\n", + "stacking_ensamble_metrics" ] }, { @@ -702,106 +508,6 @@ "In addition, it is also possible to specify the `final_model`. You can use any regression model with the sklearn interface for this purpose." ] }, - { - "cell_type": "code", - "execution_count": 17, - "id": "df04bfc8", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import Lasso" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b6f4f011", - "metadata": {}, - "outputs": [], - "source": [ - "stacking_ensemble_final_model = StackingEnsemble(\n", - " pipelines=pipelines, final_model=Lasso(alpha=10), cv=10, features_to_use=\"all\", n_jobs=4\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "be99b902", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 29.0s\n", - "[Parallel(n_jobs=2)]: Done 2 tasks | elapsed: 35.1s\n", - "[Parallel(n_jobs=2)]: Done 3 out of 5 | elapsed: 52.3s remaining: 34.8s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min remaining: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 1.3min finished\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MAEMSESMAPEMAPE
stacking ensemble + features + lasso2012.244697.284238e+068.2926158.525514
\n", - "
" - ], - "text/plain": [ - " MAE MSE SMAPE \\\n", - "stacking ensemble + features + lasso 2012.24469 7.284238e+06 8.292615 \n", - "\n", - " MAPE \n", - "stacking ensemble + features + lasso 8.525514 " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stacking_ensamble_final_model_metrics = stacking_ensemble_final_model.backtest(\n", - " ts=ts, metrics=[MAE(), MSE(), SMAPE(), MAPE()], n_folds=N_FOLDS, aggregate_metrics=True, n_jobs=2\n", - ")[0].iloc[:, 1:]\n", - "stacking_ensamble_final_model_metrics.index = [\"stacking ensemble + features + lasso\"]\n", - "stacking_ensamble_final_model_metrics" - ] - }, { "cell_type": "markdown", "id": "c975d5c5", @@ -813,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "id": "c2f1d397", "metadata": {}, "outputs": [ @@ -860,13 +566,6 @@ " 7.824056\n", " \n", " \n", - " prophet\n", - " 2174.839053\n", - " 6.577668e+06\n", - " 8.891147\n", - " 9.047454\n", - " \n", - " \n", " catboost\n", " 2224.661062\n", " 8.699304e+06\n", @@ -874,69 +573,33 @@ " 9.733325\n", " \n", " \n", - " voting ensemble(uniform weights)\n", - " 2089.214327\n", - " 7.245760e+06\n", - " 8.594828\n", - " 8.838810\n", - " \n", - " \n", - " voting ensemble(custom weights)\n", - " 1961.713172\n", - " 6.152859e+06\n", - " 8.094123\n", - " 8.190290\n", + " voting ensemble\n", + " 1969.545363\n", + " 6.558551e+06\n", + " 8.125364\n", + " 8.257041\n", " \n", " \n", " stacking ensemble\n", - " 2002.657375\n", - " 7.227136e+06\n", - " 8.329696\n", - " 8.461256\n", - " \n", - " \n", - " stacking ensemble + features\n", - " 2012.246021\n", - " 7.284254e+06\n", - " 8.292619\n", - " 8.525519\n", - " \n", - " \n", - " stacking ensemble + features + lasso\n", - " 2012.244690\n", - " 7.284238e+06\n", - " 8.292615\n", - " 8.525514\n", + " 2109.063596\n", + " 8.609287e+06\n", + " 8.694399\n", + " 8.678903\n", " \n", " \n", "\n", "" ], "text/plain": [ - " MAE MSE SMAPE \\\n", - 
"naive 2437.466667 1.089199e+07 9.949886 \n", - "moving average 1913.826667 6.113701e+06 7.897570 \n", - "prophet 2174.839053 6.577668e+06 8.891147 \n", - "catboost 2224.661062 8.699304e+06 9.095614 \n", - "voting ensemble(uniform weights) 2089.214327 7.245760e+06 8.594828 \n", - "voting ensemble(custom weights) 1961.713172 6.152859e+06 8.094123 \n", - "stacking ensemble 2002.657375 7.227136e+06 8.329696 \n", - "stacking ensemble + features 2012.246021 7.284254e+06 8.292619 \n", - "stacking ensemble + features + lasso 2012.244690 7.284238e+06 8.292615 \n", - "\n", - " MAPE \n", - "naive 10.222106 \n", - "moving average 7.824056 \n", - "prophet 9.047454 \n", - "catboost 9.733325 \n", - "voting ensemble(uniform weights) 8.838810 \n", - "voting ensemble(custom weights) 8.190290 \n", - "stacking ensemble 8.461256 \n", - "stacking ensemble + features 8.525519 \n", - "stacking ensemble + features + lasso 8.525514 " + " MAE MSE SMAPE MAPE\n", + "naive 2437.466667 1.089199e+07 9.949886 10.222106\n", + "moving average 1913.826667 6.113701e+06 7.897570 7.824056\n", + "catboost 2224.661062 8.699304e+06 9.095614 9.733325\n", + "voting ensemble 1969.545363 6.558551e+06 8.125364 8.257041\n", + "stacking ensemble 2109.063596 8.609287e+06 8.694399 8.678903" ] }, - "execution_count": 20, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -946,19 +609,16 @@ " [\n", " metrics,\n", " voting_ensamble_metrics,\n", - " voting_ensamble_custom_weights_metrics,\n", - " stacking_ensamble_unfeatured_metrics,\n", - " stacking_ensamble_featured_metrics,\n", - " stacking_ensamble_final_model_metrics,\n", + " stacking_ensamble_metrics\n", " ]\n", ")\n", - "metrics\n" + "metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "cfad369f", + "id": "70c0f874", "metadata": {}, "outputs": [], "source": []