Check for NaNs and Infs in TensorboardMetric (#2628)

Summary: Pull Request resolved: #2628. Raises a ValueError in bulk_fetch_trial_data if a NaN or an Inf is found. This will get wrapped up in a MetricFetchE and handled appropriately in the Scheduler (e.g. INFO if we intend to try to fetch again, WARN if it comes from a tracking metric, and the trial is marked ABANDONED if the metric is needed for the optimization; see https://fburl.com/code/eq37gghi). Reviewed By: Balandat. Differential Revision: D60670356. fbshipit-source-id: 7f011f87c9ade9f1bf8b11a9ad2f2c34857bfd20
facebook · Aug 5, 2024 · 46464a5 · 46464a5
1 parent 8587587
commit 46464a5
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 0 deletions.
diff --git a/ax/metrics/tensorboard.py b/ax/metrics/tensorboard.py
@@ -13,6 +13,8 @@
 from logging import Logger
 from typing import Any, Dict, List, Optional
 
+import numpy as np
+
 import pandas as pd
 from ax.core.base_trial import BaseTrial
 from ax.core.map_data import MapData, MapKeyInfo
@@ -166,6 +168,10 @@ def bulk_fetch_trial_data(
                         .reset_index()
                     )
 
+                    # If there are any NaNs or Infs in the data, raise an Exception
+                    if np.any(~np.isfinite(df["mean"])):
+                        raise ValueError("Found NaNs or Infs in data")
+
                     # Apply per-metric post-processing
                     # Apply cumulative "best" (min if lower_is_better)
                     if metric.cumulative_best:

diff --git a/ax/metrics/tests/test_tensorboard.py b/ax/metrics/tests/test_tensorboard.py
@@ -10,8 +10,11 @@
 from typing import List, Sequence
 from unittest import mock
 
+import numpy as np
+
 import pandas as pd
 from ax.core.map_data import MapData
+from ax.core.metric import MetricFetchE
 from ax.metrics.tensorboard import TensorboardMetric
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.core_stubs import get_trial
@@ -82,6 +85,61 @@ def test_fetch_trial_data(self) -> None:
 
             self.assertTrue(df.equals(expected_df))
 
+    def test_fetch_trial_data_with_bad_data(self) -> None:
+        nan_data = [1, 2, np.nan, 4]
+        nan_multiplexer = _get_fake_multiplexer(fake_data=nan_data)
+
+        with mock.patch.object(
+            TensorboardMetric,
+            "_get_event_multiplexer_for_trial",
+            return_value=nan_multiplexer,
+        ):
+            metric = TensorboardMetric(
+                name="loss",
+                tag="loss",
+            )
+
+            trial = get_trial()
+
+            result = metric.fetch_trial_data(trial=trial)
+
+            err = assert_is_instance(result.unwrap_err(), MetricFetchE)
+            self.assertEqual(
+                err.message,
+                "Failed to fetch data for loss",
+            )
+            self.assertEqual(
+                str(err.exception),
+                "Found NaNs or Infs in data",
+            )
+
+        inf_data = [1, 2, np.inf, 4]
+        inf_multiplexer = _get_fake_multiplexer(fake_data=inf_data)
+
+        with mock.patch.object(
+            TensorboardMetric,
+            "_get_event_multiplexer_for_trial",
+            return_value=inf_multiplexer,
+        ):
+            metric = TensorboardMetric(
+                name="loss",
+                tag="loss",
+            )
+
+            trial = get_trial()
+
+            result = metric.fetch_trial_data(trial=trial)
+
+            err = assert_is_instance(result.unwrap_err(), MetricFetchE)
+            self.assertEqual(
+                err.message,
+                "Failed to fetch data for loss",
+            )
+            self.assertEqual(
+                str(err.exception),
+                "Found NaNs or Infs in data",
+            )
+
     def test_smoothing(self) -> None:
         fake_data = [8.0, 4.0, 2.0, 1.0]
         smoothing = 0.5