[tune] Never block for results (#18391)
* [tune] Never block for results

* Fix tests

* Block in tests

* Add comment to test
krfricke authored Sep 9, 2021
1 parent 0126837 commit 395976c
Showing 13 changed files with 32 additions and 8 deletions.
6 changes: 4 additions & 2 deletions doc/source/tune/user-guide.rst
@@ -844,9 +844,11 @@ These are the environment variables Ray Tune currently considers:
but never longer than this value. Defaults to 100 (seconds).
* **TUNE_RESULT_BUFFER_MIN_TIME_S**: Additionally, you can specify a minimum time to buffer results. Defaults to 0.
* **TUNE_SYNCER_VERBOSITY**: Amount of command output when using Tune with Docker Syncer. Defaults to 0.
* **TUNE_TRIAL_RESULT_WAIT_TIME_S**: Maximum amount of time Ray Tune will block waiting for a result from a running trial.
Defaults to 1 (second).
* **TUNE_TRIAL_STARTUP_GRACE_PERIOD**: Amount of time after starting a trial that Ray Tune checks for successful
trial startups. After the grace period, Tune will block until a result from a running trial is received. Can
be disabled by setting this to lower or equal to 0.
trial startups. After the grace period, Tune will block for up to ``TUNE_TRIAL_RESULT_WAIT_TIME_S`` seconds
until a result from a running trial is received. Can be disabled by setting this to a value lower than or equal to 0.
* **TUNE_WARN_THRESHOLD_S**: Threshold for logging if a Tune event loop operation takes too long. Defaults to 0.5 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state
for at least this many seconds. If the Ray Tune job is stuck in this state (most likely due to insufficient resources),
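To illustrate how these variables interact, here is a minimal sketch that sets the new wait time and the startup grace period before launching a run. The values and the trainable are hypothetical examples, not recommendations from this commit:

    import os

    # Hypothetical values for illustration only.
    # Wait at most 5 seconds for a trial result before the event loop moves on.
    os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "5"
    # Keep checking for successful trial startups for 10 seconds after launch.
    os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "10"

    from ray import tune

    def trainable(config):
        # Report a single result per trial.
        tune.report(score=config["x"] ** 2)

    tune.run(trainable, config={"x": tune.grid_search([1, 2, 3])})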
17 changes: 13 additions & 4 deletions python/ray/tune/tests/test_actor_reuse.py
@@ -1,8 +1,9 @@
from collections import defaultdict
import os
import pickle
import unittest
import sys
from collections import defaultdict
import time
import unittest

import ray
from ray import tune, logger
@@ -24,6 +25,7 @@ def setup(self, config):
self.num_resets = 0
self.iter = 0
self.msg = config.get("message", "No message")
self.sleep = int(config.get("sleep", 0))

def step(self):
self.iter += 1
@@ -32,6 +34,9 @@ def step(self):
print("PRINT_STDERR: {}".format(self.msg), file=sys.stderr)
logger.info("LOG_STDERR: {}".format(self.msg))

if self.sleep:
time.sleep(self.sleep)

return {
"id": self.config["id"],
"num_resets": self.num_resets,
@@ -219,13 +224,17 @@ def tearDown(self):
def testMultiTrialReuse(self):
register_trainable("foo2", create_resettable_class())

# Log to default files
# We sleep here for one second so that the third actor
# does not finish training before the fourth can be scheduled.
# This helps ensure that both remote runners are re-used and
# not just one.
[trial1, trial2, trial3, trial4] = tune.run(
"foo2",
config={
"message": tune.grid_search(
["First", "Second", "Third", "Fourth"]),
"id": -1
"id": -1,
"sleep": 1,
},
reuse_actors=True).trials

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_cluster.py
@@ -32,6 +32,7 @@
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "9999"


def _check_trial_running(trial):
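This and the following test files all apply the same pattern: the startup grace period is disabled and the result wait time is set to a very large value, so inside tests the event loop still effectively blocks for results and stays deterministic. A hypothetical skeleton of that setup (the class name and resource numbers are illustrative, not taken from this commit):

    import os
    import unittest

    import ray

    class ExampleTuneTest(unittest.TestCase):
        def setUp(self):
            # Disable the startup grace period so freshly started trials are
            # not special-cased by the executor.
            os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
            # Use a huge wait time so the event loop effectively blocks for
            # results, restoring the old behavior for deterministic tests.
            os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
            ray.init(num_cpus=2)

        def tearDown(self):
            ray.shutdown()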
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_progress_reporter.py
@@ -257,6 +257,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto"

def mock_trial(self, status, i):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_ray_trial_executor.py
@@ -60,6 +60,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

self.trial_executor = RayTrialExecutor(queue_trials=False)
ray.init(num_cpus=2, ignore_reinit_error=True)
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_sync.py
@@ -24,6 +24,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=2)

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner.py
@@ -21,6 +21,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
_register_all() # re-register the evicted objects

def tearDown(self):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_2.py
@@ -42,6 +42,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def tearDown(self):
ray.shutdown()
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_3.py
@@ -32,6 +32,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto" # Reset default

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_scheduler_pbt.py
@@ -172,6 +172,7 @@ def get_virt_mem(cls):
class PopulationBasedTrainingSynchTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
ray.init(num_cpus=2)

def MockTrainingFuncSync(config, checkpoint_dir=None):
2 changes: 2 additions & 0 deletions python/ray/tune/tests/test_tune_restore.py
@@ -101,6 +101,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def testExperimentInterrupted(self):
import multiprocessing
@@ -218,6 +219,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

# Change back to local_mode=True after this is resolved:
# https://github.com/ray-project/ray/issues/13932
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_tune_server.py
@@ -32,6 +32,7 @@ def basicSetup(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=4, num_gpus=1)
port = get_valid_port()
6 changes: 4 additions & 2 deletions python/ray/tune/trial_runner.py
@@ -280,6 +280,8 @@ def __init__(self,
self._cached_trial_decisions = {}
self._queued_trial_decisions = {}
self._updated_queue = False
self._result_wait_time = int(
os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

self._stop_queue = []
self._should_stop_experiment = False # used by TuneServer
@@ -619,10 +621,10 @@ def _start_trial(trial: Trial) -> bool:

if may_handle_events:
if self.trial_executor.get_running_trials():
timeout = None
timeout = self._result_wait_time
if self.trial_executor.in_staging_grace_period():
timeout = 0.1
self._process_events(timeout=timeout) # blocking
self._process_events(timeout=timeout)
else:
self._run_and_catch(self.trial_executor.on_no_available_trials)

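In effect, the runner now reads the wait time once at construction and uses it as an upper bound on each event-processing step, instead of passing timeout=None and blocking indefinitely for the next result. A paraphrased sketch of that decision (names are simplified, not the exact implementation):

    import os

    # Read once when the trial runner is constructed (defaults to 1 second).
    result_wait_time = int(os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

    def select_event_timeout(executor, result_wait_time):
        # While trials are still starting up, poll quickly.
        if executor.in_staging_grace_period():
            return 0.1
        # Otherwise wait at most TUNE_TRIAL_RESULT_WAIT_TIME_S seconds for a
        # result instead of blocking until one arrives.
        return result_wait_time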
