[air/execution] Fix new execution backend for BOHB (#34828)
The new execution backend did not work with BOHB. The reason was a faulty check when starting actors: we should not eagerly start paused trials, as the scheduler may want to keep them paused for synchronous training. However, a leftover piece of code allowed trials that had once been pending but were then paused to be started anyway.

This PR fixes the bug by requiring trials to be strictly pending, and also makes sure that the `trial_to_run` is set to PENDING when chosen by the scheduler. BOHB tests are now enabled for the new execution backend.

Signed-off-by: Kai Fricke <[email protected]>
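
For illustration, below is a minimal, self-contained sketch of the staging rule this commit enforces. The names used here (TrialStatus, Trial, Controller, stage_scheduler_choice, maybe_stage) are hypothetical and only mirror the control flow described in the commit message; they are not the actual Ray Tune internals.

from dataclasses import dataclass, field
from enum import Enum
from typing import List, Set


class TrialStatus(Enum):
    PENDING = "PENDING"
    PAUSED = "PAUSED"
    RUNNING = "RUNNING"


@dataclass
class Trial:
    # Hypothetical stand-in for a Tune trial, not the real ray.tune Trial.
    name: str
    status: TrialStatus = TrialStatus.PENDING


@dataclass
class Controller:
    staged: Set[str] = field(default_factory=set)

    def stage_scheduler_choice(self, trial_to_run: Trial) -> None:
        # The scheduler explicitly chose this trial, so it is set to PENDING
        # before an actor is staged for it (mirrors the added line in
        # _maybe_add_actors).
        trial_to_run.status = TrialStatus.PENDING
        self.staged.add(trial_to_run.name)

    def maybe_stage(self, candidates: List[Trial]) -> None:
        for trial in candidates:
            # Strictly PENDING only: PAUSED trials are skipped so that a
            # scheduler like BOHB can keep them paused for synchronous
            # bracket evaluation (mirrors the tightened membership check).
            if trial.status is not TrialStatus.PENDING:
                continue
            self.staged.add(trial.name)


if __name__ == "__main__":
    paused = Trial("t1", TrialStatus.PAUSED)
    pending = Trial("t2")
    ctrl = Controller()
    ctrl.maybe_stage([paused, pending])
    print(ctrl.staged)  # {'t2'} -- the paused trial is not eagerly started
    ctrl.stage_scheduler_choice(paused)
    print(paused.status, ctrl.staged)  # TrialStatus.PENDING {'t1', 't2'}

Running the sketch prints {'t2'} first: the paused trial is left untouched by candidate staging, while the scheduler's explicit choice is flipped to PENDING before it is staged.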
krfricke authored Apr 28, 2023
1 parent 922e2c8 commit 92f4fe7
Showing 3 changed files with 3 additions and 10 deletions.
5 changes: 3 additions & 2 deletions python/ray/tune/execution/tune_controller.py
@@ -437,6 +437,7 @@ def _maybe_add_actors(self) -> None:
             and trial_to_run not in self._trial_to_actor
         ):
             logger.debug(f"Staging trial to run: {trial_to_run}")
+            self._set_trial_status(trial_to_run, Trial.PENDING)
             self._staged_trials.add(trial_to_run)
             self._actor_cache.increase_max(trial_to_run.placement_group_factory)
             # schedule_trial_actor also potentially uses cached actors
@@ -460,7 +461,7 @@ def _maybe_add_actors(candidates: List[Trial]):
             # If the trial is part of the list, but not of the set,
             # we just ignore it. Removing it from the list on status
             # change is too expensive.
-            if trial not in (self._pending_trials | self._paused_trials):
+            if trial not in self._pending_trials:
                 continue

             if trial in self._trial_to_actor:
@@ -541,7 +542,7 @@ def _schedule_trial_actor(self, trial: Trial):
         """
         logger.debug(f"Trying to schedule new ACTOR for trial {trial}")

-        self._set_trial_status(trial, Trial.PENDING)
+        assert trial.status == Trial.PENDING

         trial.init_logdir()
         # We checkpoint metadata here to try mitigating logdir duplication
4 changes: 0 additions & 4 deletions python/ray/tune/tests/test_trial_scheduler.py
@@ -807,10 +807,6 @@ def result(score, ts):
             [t.status for t in trials], [Trial.PAUSED, Trial.PENDING, Trial.PAUSED]
         )

-    @pytest.mark.skipif(
-        os.environ.get("TUNE_NEW_EXECUTION") == "1",
-        reason="BOHB does not currently work with the new execution backend.",
-    )
     def testNonstopBOHB(self):
         from ray.tune.search.bohb import TuneBOHB

4 changes: 0 additions & 4 deletions python/ray/tune/tests/test_tune_restore_warm_start.py
@@ -470,10 +470,6 @@ def cost(space, reporter):
         return search_alg, cost


-@pytest.mark.skipif(
-    os.environ.get("TUNE_NEW_EXECUTION") == "1",
-    reason="BOHB does not currently work with the new execution backend.",
-)
 class BOHBWarmStartTest(AbstractWarmStartTest, unittest.TestCase):
     def set_basic_conf(self):
         space = {"width": tune.uniform(0, 20), "height": tune.uniform(-100, 100)}
