diff --git a/ax/service/scheduler.py b/ax/service/scheduler.py index 4faf63b3f21..7b643b19a94 100644 --- a/ax/service/scheduler.py +++ b/ax/service/scheduler.py @@ -77,8 +77,10 @@ """ FAILURE_EXCEEDED_MSG = ( "Failure rate exceeds the tolerated trial failure rate of {f_rate} (at least " - "{n_failed} out of first {n_ran} trials failed). Checks are triggered both at " - "the end of a optimization and if at least {min_failed} trials have failed." + "{n_failed} out of first {n_ran} trials failed or were abandoned). Checks are " + "triggered both at the end of an optimization and if at least {min_failed} trials " + "have either failed, or have been abandoned, potentially automatically due to " + "issues with the trial." ) @@ -850,10 +852,11 @@ def error_if_failure_rate_exceeded(self, force_check: bool = False) -> None: ): return - num_ran_in_scheduler = ( - len(self.experiment.trials) - self._num_preexisting_trials + num_ran_in_scheduler = sum( + 1 + for idx, t in self.experiment.trials.items() + if idx >= self._num_preexisting_trials and t.status.is_terminal ) - failure_rate_exceeded = ( num_bad_in_scheduler / num_ran_in_scheduler ) > self.options.tolerated_trial_failure_rate