Skip to content

Commit

Permalink
Check for Nans and Infs in TensorboardMetric (#2628)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #2628

Raises a ValueError in bulk_fetch_trial_data if a NaN or an Inf is found. This will get wrapped up in a MetricFetchE and handled appropriately in the Scheduler (e.g. INFO if we intend to try to fetch again, WARN if coming from a tracking metric, or marking the trial as ABANDONED if the metric is needed for the optimization; see https://fburl.com/code/eq37gghi).

Reviewed By: Balandat

Differential Revision: D60670356

fbshipit-source-id: 7f011f87c9ade9f1bf8b11a9ad2f2c34857bfd20
  • Loading branch information
mpolson64 authored and facebook-github-bot committed Aug 5, 2024
1 parent 8587587 commit 46464a5
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 0 deletions.
6 changes: 6 additions & 0 deletions ax/metrics/tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from logging import Logger
from typing import Any, Dict, List, Optional

import numpy as np

import pandas as pd
from ax.core.base_trial import BaseTrial
from ax.core.map_data import MapData, MapKeyInfo
Expand Down Expand Up @@ -166,6 +168,10 @@ def bulk_fetch_trial_data(
.reset_index()
)

# If there are any NaNs or Infs in the data, raise an Exception
if np.any(~np.isfinite(df["mean"])):
raise ValueError("Found NaNs or Infs in data")

# Apply per-metric post-processing
# Apply cumulative "best" (min if lower_is_better)
if metric.cumulative_best:
Expand Down
58 changes: 58 additions & 0 deletions ax/metrics/tests/test_tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@
from typing import List, Sequence
from unittest import mock

import numpy as np

import pandas as pd
from ax.core.map_data import MapData
from ax.core.metric import MetricFetchE
from ax.metrics.tensorboard import TensorboardMetric
from ax.utils.common.testutils import TestCase
from ax.utils.testing.core_stubs import get_trial
Expand Down Expand Up @@ -82,6 +85,61 @@ def test_fetch_trial_data(self) -> None:

self.assertTrue(df.equals(expected_df))

def test_fetch_trial_data_with_bad_data(self) -> None:
    """Fetching fails with a ``MetricFetchE`` when the data is non-finite.

    Each scenario feeds a series containing a single non-finite value
    (NaN, +Inf, -Inf) through a fake multiplexer and checks that the fetch
    result is an error whose message is "Failed to fetch data for loss" and
    whose wrapped exception reads "Found NaNs or Infs in data".
    """
    bad_series = {
        "nan": [1, 2, np.nan, 4],
        "inf": [1, 2, np.inf, 4],
        "negative inf": [1, 2, -np.inf, 4],
    }

    for label, fake_data in bad_series.items():
        # subTest keeps the scenarios independent: a failure in one is
        # reported without hiding the outcome of the others.
        with self.subTest(bad_value=label):
            multiplexer = _get_fake_multiplexer(fake_data=fake_data)

            # The patch only needs to be active while the metric fetches;
            # the assertions below inspect the already-returned result.
            with mock.patch.object(
                TensorboardMetric,
                "_get_event_multiplexer_for_trial",
                return_value=multiplexer,
            ):
                metric = TensorboardMetric(
                    name="loss",
                    tag="loss",
                )
                trial = get_trial()
                result = metric.fetch_trial_data(trial=trial)

            err = assert_is_instance(result.unwrap_err(), MetricFetchE)
            self.assertEqual(
                err.message,
                "Failed to fetch data for loss",
            )
            self.assertEqual(
                str(err.exception),
                "Found NaNs or Infs in data",
            )

def test_smoothing(self) -> None:
fake_data = [8.0, 4.0, 2.0, 1.0]
smoothing = 0.5
Expand Down

0 comments on commit 46464a5

Please sign in to comment.