From 61f9044aea966d20fa62b23dd72f62fe534437d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 26 Jul 2024 16:39:30 -0600 Subject: [PATCH 1/2] support returning multiple time features from function --- nbs/compat.ipynb | 4 + nbs/feature_engineering.ipynb | 468 +++++++++++++++++---------- utilsforecast/compat.py | 3 + utilsforecast/feature_engineering.py | 23 +- 4 files changed, 321 insertions(+), 177 deletions(-) diff --git a/nbs/compat.ipynb b/nbs/compat.ipynb index df66a56..11ea4af 100644 --- a/nbs/compat.ipynb +++ b/nbs/compat.ipynb @@ -36,6 +36,7 @@ "try:\n", " import polars as pl\n", " from polars import DataFrame as pl_DataFrame\n", + " from polars import Expr as pl_Expr\n", " from polars import Series as pl_Series\n", "\n", " POLARS_INSTALLED = True\n", @@ -45,6 +46,9 @@ " class pl_DataFrame:\n", " ...\n", "\n", + " class pl_Expr:\n", + " ...\n", + "\n", " class pl_Series:\n", " ...\n", "\n", diff --git a/nbs/feature_engineering.ipynb b/nbs/feature_engineering.ipynb index 1d067da..ccae0f3 100644 --- a/nbs/feature_engineering.ipynb +++ b/nbs/feature_engineering.ipynb @@ -46,7 +46,7 @@ "import pandas as pd\n", "\n", "import utilsforecast.processing as ufp\n", - "from utilsforecast.compat import DataFrame, Series, pl, pl_DataFrame\n", + "from utilsforecast.compat import DataFrame, pl, pl_DataFrame, pl_Expr\n", "from utilsforecast.validation import validate_format, validate_freq" ] }, @@ -834,12 +834,16 @@ "source": [ "#| exporti\n", "def _compute_time_feature(\n", - " times: Union[Series, pd.Index],\n", + " times: Union[pd.Index, pl_Expr],\n", " feature: Union[str, Callable],\n", - ") -> Union[Series, pd.Index]:\n", + ") -> Tuple[Union[str, List[str]], Union[pd.DataFrame, pl_Expr, List[pl_Expr], pd.Index, np.ndarray]]:\n", " if callable(feature):\n", - " feat_name = feature.__name__\n", " feat_vals = feature(times)\n", + " if isinstance(feat_vals, pd.DataFrame):\n", + " feat_name = feat_vals.columns.tolist()\n", + " feat_vals = feat_vals.to_numpy()\n", + " else:\n", + " feat_name = feature.__name__\n", " else:\n", " feat_name = feature\n", " if isinstance(times, pd.DatetimeIndex):\n", @@ -868,9 +872,13 @@ " exprs = []\n", " for feature in features:\n", " name, vals = _compute_time_feature(pl.col(time_col), feature)\n", - " exprs.append(vals.alias(name))\n", + " if isinstance(vals, list):\n", + " exprs.extend(vals)\n", + " else:\n", + " assert isinstance(vals, pl_Expr)\n", + " exprs.append(vals.alias(name))\n", " feats = unique_times.to_frame().with_columns(*exprs)\n", - " df = df.join(feats, on=time_col, how=\"left\")\n", + " df = df.join(feats, on=time_col, how='left')\n", " return df" ] }, @@ -939,27 +947,141 @@ "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsymonthday
002000-10-050.428973105
102000-10-061.423626106
202000-10-072.311782107
302000-10-083.192191108
402000-10-094.148767109
..................
109642001-05-104.058910510
109742001-05-115.178157511
109842001-05-126.133142512
109942001-05-130.403709513
110042001-05-141.081779514
\n", + "

1101 rows × 5 columns

\n", + "
" + ], "text/plain": [ - "( unique_id ds y month day\n", - " 0 0 2000-10-05 0.428973 10 5\n", - " 1 0 2000-10-06 1.423626 10 6\n", - " 2 0 2000-10-07 2.311782 10 7\n", - " 3 0 2000-10-08 3.192191 10 8\n", - " 4 0 2000-10-09 4.148767 10 9\n", - " ... ... ... ... ... ...\n", - " 1096 4 2001-05-10 4.058910 5 10\n", - " 1097 4 2001-05-11 5.178157 5 11\n", - " 1098 4 2001-05-12 6.133142 5 12\n", - " 1099 4 2001-05-13 0.403709 5 13\n", - " 1100 4 2001-05-14 1.081779 5 14\n", - " \n", - " [1101 rows x 5 columns],\n", - " unique_id ds month day\n", - " 0 0 2001-05-15 5 15\n", - " 1 1 2001-05-15 5 15\n", - " 2 2 2001-05-15 5 15\n", - " 3 3 2001-05-15 5 15\n", - " 4 4 2001-05-15 5 15)" + " unique_id ds y month day\n", + "0 0 2000-10-05 0.428973 10 5\n", + "1 0 2000-10-06 1.423626 10 6\n", + "2 0 2000-10-07 2.311782 10 7\n", + "3 0 2000-10-08 3.192191 10 8\n", + "4 0 2000-10-09 4.148767 10 9\n", + "... ... ... ... ... ...\n", + "1096 4 2001-05-10 4.058910 5 10\n", + "1097 4 2001-05-11 5.178157 5 11\n", + "1098 4 2001-05-12 6.133142 5 12\n", + "1099 4 2001-05-13 0.403709 5 13\n", + "1100 4 2001-05-14 1.081779 5 14\n", + "\n", + "[1101 rows x 5 columns]" ] }, "execution_count": null, @@ -968,7 +1090,99 @@ } ], "source": [ - "time_features(series, freq='D', features=['month', 'day'], h=1)" + "transformed_df, future_df = time_features(series, freq='D', features=['month', 'day'], h=1)\n", + "transformed_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5d07f9d-66f3-47c7-8e61-bc8294e58440", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsmonthday
002001-05-15515
112001-05-15515
222001-05-15515
332001-05-15515
442001-05-15515
\n", + "
" + ], + "text/plain": [ + " unique_id ds month day\n", + "0 0 2001-05-15 5 15\n", + "1 1 2001-05-15 5 15\n", + "2 2 2001-05-15 5 15\n", + "3 3 2001-05-15 5 15\n", + "4 4 2001-05-15 5 15" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "future_df" ] }, { @@ -1062,6 +1276,8 @@ " cos1_28\n", " day\n", " is_weekend\n", + " even_day\n", + " even_month\n", " \n", " \n", " \n", @@ -1077,6 +1293,8 @@ " -9.009683e-01\n", " 5\n", " False\n", + " True\n", + " True\n", " \n", " \n", " 1\n", @@ -1090,6 +1308,8 @@ " -9.749276e-01\n", " 6\n", " False\n", + " False\n", + " True\n", " \n", " \n", " 2\n", @@ -1103,6 +1323,8 @@ " -1.000000e+00\n", " 7\n", " True\n", + " True\n", + " True\n", " \n", " \n", " 3\n", @@ -1116,6 +1338,8 @@ " -9.749281e-01\n", " 8\n", " True\n", + " False\n", + " True\n", " \n", " \n", " 4\n", @@ -1129,6 +1353,8 @@ " -9.009693e-01\n", " 9\n", " False\n", + " False\n", + " True\n", " \n", " \n", " ...\n", @@ -1142,6 +1368,8 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 1096\n", @@ -1155,6 +1383,8 @@ " 4.338843e-01\n", " 10\n", " False\n", + " True\n", + " False\n", " \n", " \n", " 1097\n", @@ -1168,6 +1398,8 @@ " 2.225177e-01\n", " 11\n", " False\n", + " False\n", + " False\n", " \n", " \n", " 1098\n", @@ -1181,6 +1413,8 @@ " 4.251100e-07\n", " 12\n", " True\n", + " True\n", + " False\n", " \n", " \n", " 1099\n", @@ -1194,6 +1428,8 @@ " -2.225243e-01\n", " 13\n", " True\n", + " False\n", + " False\n", " \n", " \n", " 1100\n", @@ -1207,10 +1443,12 @@ " -4.338835e-01\n", " 14\n", " False\n", + " False\n", + " False\n", " \n", " \n", "\n", - "

1101 rows × 10 columns

\n", + "

1101 rows × 12 columns

\n", "" ], "text/plain": [ @@ -1227,20 +1465,20 @@ "1099 4 2001-05-13 0.403709 372.0 0.781840 0.623479 0.974927 \n", "1100 4 2001-05-14 1.081779 373.0 0.974928 -0.222520 0.900969 \n", "\n", - " cos1_28 day is_weekend \n", - "0 -9.009683e-01 5 False \n", - "1 -9.749276e-01 6 False \n", - "2 -1.000000e+00 7 True \n", - "3 -9.749281e-01 8 True \n", - "4 -9.009693e-01 9 False \n", - "... ... ... ... \n", - "1096 4.338843e-01 10 False \n", - "1097 2.225177e-01 11 False \n", - "1098 4.251100e-07 12 True \n", - "1099 -2.225243e-01 13 True \n", - "1100 -4.338835e-01 14 False \n", + " cos1_28 day is_weekend even_day even_month \n", + "0 -9.009683e-01 5 False True True \n", + "1 -9.749276e-01 6 False False True \n", + "2 -1.000000e+00 7 True True True \n", + "3 -9.749281e-01 8 True False True \n", + "4 -9.009693e-01 9 False False True \n", + "... ... ... ... ... ... \n", + "1096 4.338843e-01 10 False True False \n", + "1097 2.225177e-01 11 False False False \n", + "1098 4.251100e-07 12 True True False \n", + "1099 -2.225243e-01 13 True False False \n", + "1100 -4.338835e-01 14 False False False \n", "\n", - "[1101 rows x 10 columns]" + "[1101 rows x 12 columns]" ] }, "execution_count": null, @@ -1249,18 +1487,34 @@ } ], "source": [ - "def is_weekend(times):\n", + "def is_weekend(times: Union[pd.Index, pl.Expr]):\n", " if isinstance(times, pd.Index):\n", " dow = times.weekday + 1 # monday=0 in pandas and 1 in polars\n", " else:\n", " dow = times.dt.weekday()\n", " return dow >= 6\n", "\n", + "def even_days_and_months(times: Union[pd.Index, pl.Expr]):\n", + " if isinstance(times, pd.Index):\n", + " out = pd.DataFrame(\n", + " {\n", + " 'even_day': (times.weekday + 1) % 2 == 0,\n", + " 'even_month': times.month % 2 == 0,\n", + " }\n", + " )\n", + " else:\n", + " # for polars you can return a list of expressions\n", + " out = [\n", + " (times.dt.weekday() % 2 == 0).alias('even_day'),\n", + " (times.dt.month() % 2 == 0).alias('even_month'),\n", + " ]\n", + " return out\n", + "\n", "features = [\n", " trend,\n", " partial(fourier, season_length=7, k=1),\n", " partial(fourier, season_length=28, k=1),\n", - " partial(time_features, features=['day', is_weekend]),\n", + " partial(time_features, features=['day', is_weekend, even_days_and_months]),\n", "]\n", "transformed_df, future_df = pipeline(\n", " series,\n", @@ -1271,134 +1525,6 @@ "transformed_df" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c454175-20d9-4031-8c23-91a9e0f7c416", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unique_iddstrendsin1_7cos1_7sin1_28cos1_28dayis_weekend
002001-05-15374.00.433871-0.9009750.781829-0.62349315False
112001-05-15374.00.433871-0.9009750.781829-0.62349315False
222001-05-15374.00.433871-0.9009750.781829-0.62349315False
332001-05-15374.00.433871-0.9009750.781829-0.62349315False
442001-05-15374.00.433871-0.9009750.781829-0.62349315False
\n", - "
" - ], - "text/plain": [ - " unique_id ds trend sin1_7 cos1_7 sin1_28 cos1_28 day \\\n", - "0 0 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493 15 \n", - "1 1 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493 15 \n", - "2 2 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493 15 \n", - "3 3 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493 15 \n", - "4 4 2001-05-15 374.0 0.433871 -0.900975 0.781829 -0.623493 15 \n", - "\n", - " is_weekend \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "future_df" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/utilsforecast/compat.py b/utilsforecast/compat.py index fc8f6ce..fa21f5f 100644 --- a/utilsforecast/compat.py +++ b/utilsforecast/compat.py @@ -14,6 +14,7 @@ try: import polars as pl from polars import DataFrame as pl_DataFrame + from polars import Expr as pl_Expr from polars import Series as pl_Series POLARS_INSTALLED = True @@ -22,6 +23,8 @@ class pl_DataFrame: ... + class pl_Expr: ... + class pl_Series: ... POLARS_INSTALLED = False diff --git a/utilsforecast/feature_engineering.py b/utilsforecast/feature_engineering.py index 9e83e7e..496ea77 100644 --- a/utilsforecast/feature_engineering.py +++ b/utilsforecast/feature_engineering.py @@ -11,7 +11,7 @@ import pandas as pd import utilsforecast.processing as ufp -from .compat import DataFrame, Series, pl, pl_DataFrame +from .compat import DataFrame, pl, pl_DataFrame, pl_Expr from .validation import validate_format, validate_freq # %% ../nbs/feature_engineering.ipynb 4 @@ -194,12 +194,19 @@ def trend( # %% ../nbs/feature_engineering.ipynb 15 def _compute_time_feature( - times: Union[Series, pd.Index], + times: Union[pd.Index, pl_Expr], feature: Union[str, Callable], -) -> Union[Series, pd.Index]: +) -> Tuple[ + Union[str, List[str]], + Union[pd.DataFrame, pl_Expr, List[pl_Expr], pd.Index, np.ndarray], +]: if callable(feature): - feat_name = feature.__name__ feat_vals = feature(times) + if isinstance(feat_vals, pd.DataFrame): + feat_name = feat_vals.columns.tolist() + feat_vals = feat_vals.to_numpy() + else: + feat_name = feature.__name__ else: feat_name = feature if isinstance(times, pd.DatetimeIndex): @@ -229,7 +236,11 @@ def _add_time_features( exprs = [] for feature in features: name, vals = _compute_time_feature(pl.col(time_col), feature) - exprs.append(vals.alias(name)) + if isinstance(vals, list): + exprs.extend(vals) + else: + assert isinstance(vals, pl_Expr) + exprs.append(vals.alias(name)) feats = unique_times.to_frame().with_columns(*exprs) df = df.join(feats, on=time_col, how="left") return df @@ -283,7 +294,7 @@ def time_features( future = _add_time_features(df=future, features=features, time_col=time_col) return transformed, future -# %% ../nbs/feature_engineering.ipynb 18 +# %% ../nbs/feature_engineering.ipynb 19 def pipeline( df: DataFrame, features: List[Callable], From d904c009e1d07f9db94e4287f85fe6c1736b4fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 26 Jul 2024 16:43:24 -0600 Subject: [PATCH 2/2] remove annotations in example --- nbs/feature_engineering.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nbs/feature_engineering.ipynb b/nbs/feature_engineering.ipynb index ccae0f3..f470cd2 100644 --- a/nbs/feature_engineering.ipynb +++ b/nbs/feature_engineering.ipynb @@ -1487,14 +1487,14 @@ } ], "source": [ - "def is_weekend(times: Union[pd.Index, pl.Expr]):\n", + "def is_weekend(times):\n", " if isinstance(times, pd.Index):\n", " dow = times.weekday + 1 # monday=0 in pandas and 1 in polars\n", " else:\n", " dow = times.dt.weekday()\n", " return dow >= 6\n", "\n", - "def even_days_and_months(times: Union[pd.Index, pl.Expr]):\n", + "def even_days_and_months(times):\n", " if isinstance(times, pd.Index):\n", " out = pd.DataFrame(\n", " {\n",