Merge branch 'main' into feature/docs_faq_api_call

Nixtla · Aug 30, 2024 · 955df2e · 955df2e
2 parents 85546d3 + f44938a
commit 955df2e
Show file tree

Hide file tree

Showing 4 changed files with 502 additions and 174 deletions.
diff --git a/nbs/docs/capabilities/forecast/02_exogenous_variables.ipynb b/nbs/docs/capabilities/forecast/02_exogenous_variables.ipynb
@@ -48,27 +48,16 @@
    "source": [
     "# Add exogenous variables\n",
     "\n",
-    "To model with exogenous features, include them in the DataFrame you pass to the `forecast` method. Provide the future values of these exogenous features over the forecast horizon using the `X_df` parameter."
+    "To model with exogenous features, you have two options, which can also be combined:\n",
+    "1. Use historical exogenous variables: include these variables in the DataFrame you pass to the `forecast` method.\n",
+    "2. Use future exogenous variables: include these variables in the DataFrame you pass to the `forecast` method and provide the future values of these exogenous features over the forecast horizon using the `X_df` parameter."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/markdown": [
-       "[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Nixtla/nixtla/blob/main/nbs/docs/capabilities/forecast/02_exogenous_variables.ipynb)"
-      ],
-      "text/plain": [
-       "<IPython.core.display.Markdown object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "#| echo: false\n",
     "if not IN_COLAB:\n",
@@ -120,25 +109,47 @@
     "    nixtla_client = NixtlaClient()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Historical exogenous variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read data\n",
+    "df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-with-ex-vars.csv')\n",
+    "\n",
+    "# Forecast\n",
+    "forecast_df = nixtla_client.forecast(\n",
+    "    df=df, \n",
+    "    h=24,\n",
+    "    id_col='unique_id',\n",
+    "    target_col='y',\n",
+    "    time_col='ds'\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Future exogenous variables"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:nixtla.nixtla_client:Validating inputs...\n",
-      "INFO:nixtla.nixtla_client:Preprocessing dataframes...\n",
-      "INFO:nixtla.nixtla_client:Inferred freq: H\n",
-      "INFO:nixtla.nixtla_client:Using the following exogenous variables: Exogenous1, Exogenous2, day_0, day_1, day_2, day_3, day_4, day_5, day_6\n",
-      "INFO:nixtla.nixtla_client:Calling Forecast Endpoint...\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Read data\n",
+    "import numpy as np\n",
     "df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-with-ex-vars.csv')\n",
     "\n",
     "# Load the future value of exogenous variables over the forecast horizon\n",
@@ -155,6 +166,39 @@
     ")\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Historical and future exogenous variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read data\n",
+    "df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-with-ex-vars.csv')\n",
+    "\n",
+    "# Load the future value of exogenous variables over the forecast horizon\n",
+    "future_ex_vars_df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-future-ex-vars.csv')\n",
+    "\n",
+    "# We will only use 2 of the exogenous variables in future_ex_vars_df. The columns of df that are not included in future_ex_vars_df will be treated as historical exogenous variables, since their future values have not been supplied.\n",
+    "future_ex_vars_df = future_ex_vars_df[[\"unique_id\", \"ds\", \"Exogenous1\", \"Exogenous2\"]]\n",
+    "\n",
+    "# Forecast\n",
+    "forecast_df = nixtla_client.forecast(\n",
+    "    df=df, \n",
+    "    X_df=future_ex_vars_df, \n",
+    "    h=24,\n",
+    "    id_col='unique_id',\n",
+    "    target_col='y',\n",
+    "    time_col='ds'\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -176,11 +220,6 @@
    "source": [
     "For more details on using exogenous features with TimeGPT, read our in-depth tutorials on [Exogenous variables](https://docs.nixtla.io/docs/tutorials-exogenous_variables) and on [Categorical variables](https://docs.nixtla.io/docs/tutorials-categorical_variables)."
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
   }
  ],
  "metadata": {

diff --git a/nbs/docs/tutorials/01_exogenous_variables.ipynb b/nbs/docs/tutorials/01_exogenous_variables.ipynb
diff --git a/nbs/nixtla_client.ipynb b/nbs/nixtla_client.ipynb
@@ -419,33 +419,28 @@
     "    time_col: str,\n",
     "    target_col: str,\n",
     ") -> Tuple[DFType, Optional[DFType]]:\n",
-    "    exogs_df = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
+    "\n",
+    "    exog_list = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
+    "\n",
     "    if X_df is None:\n",
-    "        if exogs_df:\n",
-    "            warnings.warn(\n",
-    "                f'`df` contains the following exogenous features: {exogs_df}, '\n",
-    "                'but `X_df` was not provided. They will be ignored.'\n",
-    "            )\n",
-    "            df = df[[id_col, time_col, target_col]]\n",
+    "        df = df[[id_col, time_col, target_col, *exog_list]]\n",
     "        return df, None\n",
-    "    exogs_X = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
-    "    missing_df = set(exogs_X) - set(exogs_df)\n",
-    "    if missing_df:\n",
+    "\n",
+    "    futr_exog_list = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
+    "    hist_exog_list = list(set(exog_list) - set(futr_exog_list))\n",
+    "\n",
+    "    # Catch the case where X_df provides future exogenous features that are not present in df\n",
+    "    missing_futr = set(futr_exog_list) - set(exog_list)\n",
+    "    if missing_futr:\n",
     "        raise ValueError(\n",
-    "            'The following exogenous features are present in `X_df` '\n",
-    "            f'but not in `df`: {missing_df}.'\n",
+    "            \"The following exogenous features are present in `X_df` \"\n",
+    "            f\"but not in `df`: {missing_futr}.\"\n",
     "        )\n",
-    "    missing_X_df = set(exogs_df) - set(exogs_X)\n",
-    "    if missing_X_df:\n",
-    "        warnings.warn(\n",
-    "            'The following exogenous features are present in `df` '\n",
-    "            f'but not in `X_df`: {missing_X_df}. They will be ignored'\n",
-    "        )\n",
-    "        exogs_df = [c for c in exogs_df if c in exogs_X]\n",
-    "        df = df[[id_col, time_col, target_col, *exogs_df]]\n",
-    "    if exogs_df != exogs_X:\n",
-    "        # rearrange columns\n",
-    "        X_df = X_df[[id_col, time_col, *exogs_df]]\n",
+    "\n",
+    "    # Make sure df and X_df list the future exogenous features in the same order (historical ones go last in df)\n",
+    "    df = df[[id_col, time_col, target_col, *futr_exog_list, *hist_exog_list]]\n",
+    "    X_df = X_df[[id_col, time_col, *futr_exog_list]]\n",
+    "\n",
     "    return df, X_df\n",
     "\n",
     "def _validate_input_size(\n",
@@ -530,10 +525,12 @@
     "            df=X_df, id_col=id_col, time_col=time_col, target_col=None,\n",
     "        )\n",
     "        X_future = processed_X.data.T\n",
+    "        futr_cols = [c for c in X_df.columns if c not in (id_col, time_col)]\n",
     "    else:\n",
     "        X_future = None\n",
+    "        futr_cols = None\n",
     "    x_cols = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n",
-    "    return processed, X_future, x_cols\n",
+    "    return processed, X_future, x_cols, futr_cols\n",
     "\n",
     "def _forecast_payload_to_in_sample(payload):\n",
     "    in_sample_payload = {\n",
@@ -681,7 +678,16 @@
     "        def ensure_contiguous_arrays(d: Dict[str, Any]) -> None:\n",
     "            for k, v in d.items():\n",
     "                if isinstance(v, np.ndarray):\n",
-    "                    d[k] = np.ascontiguousarray(v)\n",
+    "                    if np.issubdtype(v.dtype, np.floating):\n",
+    "                        v_cont = np.ascontiguousarray(v, dtype=np.float32)\n",
+    "                        d[k] = np.nan_to_num(v_cont, \n",
+    "                                            nan=np.nan, \n",
+    "                                            posinf=np.finfo(np.float32).max, \n",
+    "                                            neginf=np.finfo(np.float32).min,\n",
+    "                                            copy=False)\n",
+    "                    else:\n",
+    "                        d[k] = np.ascontiguousarray(v)\n",
+    "\n",
     "                elif isinstance(v, dict):\n",
     "                    ensure_contiguous_arrays(v) \n",
     "\n",
@@ -737,7 +743,7 @@
     "            offsets = [0] + [sum(p['series']['sizes']) for p in payloads[:-1]]\n",
     "            resp['idxs'] = np.hstack(\n",
     "                [\n",
-    "                    np.array(res['idxs']) + offset\n",
+    "                    np.array(res['idxs'], dtype=np.int64) + offset\n",
     "                    for res, offset in zip(results, offsets)\n",
     "                ]\n",
     "            )\n",
@@ -980,7 +986,7 @@
     "            )\n",
     "\n",
     "        logger.info('Preprocessing dataframes...')\n",
-    "        processed, X_future, x_cols = _preprocess(\n",
+    "        processed, X_future, x_cols, futr_cols = _preprocess(\n",
     "            df=df,\n",
     "            X_df=X_df,\n",
     "            h=h,\n",
@@ -1003,7 +1009,13 @@
     "            processed = _tail(processed, new_input_size)\n",
     "        if processed.data.shape[1] > 1:\n",
     "            X = processed.data[:, 1:].T\n",
-    "            logger.info(f'Using the following exogenous features: {x_cols}')\n",
+    "            if futr_cols is not None:\n",
+    "                hist_exog_set= set(x_cols) - set(futr_cols)\n",
+    "                if hist_exog_set:\n",
+    "                    logger.info(f'Using historical exogenous features: {list(hist_exog_set)}')\n",
+    "                logger.info(f'Using future exogenous features: {futr_cols}')\n",
+    "            else:\n",
+    "                logger.info(f'Using historical exogenous features: {x_cols}')\n",
     "        else:\n",
     "            X = None\n",
     "\n",
@@ -1173,7 +1185,7 @@
     "        model_input_size, model_horizon = self._get_model_params(model, standard_freq)\n",
     "\n",
     "        logger.info('Preprocessing dataframes...')\n",
-    "        processed, _, x_cols = _preprocess(\n",
+    "        processed, _, x_cols, _ = _preprocess(\n",
     "            df=df,\n",
     "            X_df=None,\n",
     "            h=0,\n",
@@ -1358,7 +1370,7 @@
     "            step_size = h\n",
     "\n",
     "        logger.info('Preprocessing dataframes...')\n",
-    "        processed, _, x_cols = _preprocess(\n",
+    "        processed, _, x_cols, _ = _preprocess(\n",
     "            df=df,\n",
     "            X_df=None,\n",
     "            h=0,\n",
@@ -1428,8 +1440,8 @@
     "                resp = self._make_partitioned_requests(client, 'v2/cross_validation', payloads)\n",
     "\n",
     "        # assemble result\n",
-    "        idxs = np.array(resp['idxs'])\n",
-    "        sizes = np.array(resp['sizes'])\n",
+    "        idxs = np.array(resp['idxs'], dtype=np.int64)\n",
+    "        sizes = np.array(resp['sizes'], dtype=np.int64)\n",
     "        window_starts = np.arange(0, sizes.sum(), h)\n",
     "        cutoff_idxs = np.repeat(idxs[window_starts] - 1, h)\n",
     "        out = type(df)(\n",
@@ -2462,26 +2474,6 @@
     ")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| hide\n",
-    "# test for showing the correct warning if X_df is missing but df has exogenous columns\n",
-    "df = generate_series(n_series=2, min_length=5, max_length=20, n_static_features=3)\n",
-    "missing_exogenous = df.columns.drop(['unique_id', 'ds', 'y']).tolist()\n",
-    "expected_warning = (\n",
-    "    f'`df` contains the following exogenous features: {missing_exogenous}, '\n",
-    "    'but `X_df` was not provided. They will be ignored.'      \n",
-    ")\n",
-    "\n",
-    "with warnings.catch_warnings(record=True) as w:\n",
-    "    forecasts = nixtla_client.forecast(df, h=5)\n",
-    "    assert any(expected_warning in str(warning.message) for warning in w)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},