[tune] Never block for results #18391

Merged 4 commits on Sep 9, 2021
6 changes: 4 additions & 2 deletions doc/source/tune/user-guide.rst
@@ -844,9 +844,11 @@ These are the environment variables Ray Tune currently considers:
but never longer than this value. Defaults to 100 (seconds).
* **TUNE_RESULT_BUFFER_MIN_TIME_S**: Additionally, you can specify a minimum time to buffer results. Defaults to 0.
* **TUNE_SYNCER_VERBOSITY**: Amount of command output when using Tune with Docker Syncer. Defaults to 0.
+ * **TUNE_TRIAL_RESULT_WAIT_TIME_S**: Amount of time Ray Tune will block until a result from a running trial is received.
+   Defaults to 1 (second).
* **TUNE_TRIAL_STARTUP_GRACE_PERIOD**: Amount of time after starting a trial that Ray Tune checks for successful
-   trial startups. After the grace period, Tune will block until a result from a running trial is received. Can
-   be disabled by setting this to lower or equal to 0.
+   trial startups. After the grace period, Tune will block for up to ``TUNE_TRIAL_RESULT_WAIT_TIME_S`` seconds
+   until a result from a running trial is received. Can be disabled by setting this to lower or equal to 0.
* **TUNE_WARN_THRESHOLD_S**: Threshold for logging if a Tune event loop operation takes too long. Defaults to 0.5 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state
for this amount of seconds. If the Ray Tune job is stuck in this state (most likely due to insufficient resources),
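Below is a minimal usage sketch of the two environment variables documented above; the trainable ``my_trainable`` and the config key ``factor`` are illustrative, not part of Tune. Setting the variables in the driver process before calling ``tune.run`` is sufficient, since the trial runner reads them when it is constructed.

```python
import os

# Wait at most 5 seconds for a result from a running trial before the event
# loop moves on (illustrative value; the documented default is 1 second).
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "5"
# A value lower than or equal to 0 disables the startup grace period check.
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"

from ray import tune


def my_trainable(config):
    # Report a few results so the event loop has something to collect.
    for step in range(3):
        tune.report(score=step * config["factor"])


tune.run(my_trainable, config={"factor": 2})
```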
17 changes: 13 additions & 4 deletions python/ray/tune/tests/test_actor_reuse.py
@@ -1,8 +1,9 @@
- from collections import defaultdict
import os
import pickle
- import unittest
import sys
+ from collections import defaultdict
+ import time
+ import unittest

import ray
from ray import tune, logger
@@ -24,6 +25,7 @@ def setup(self, config):
self.num_resets = 0
self.iter = 0
self.msg = config.get("message", "No message")
self.sleep = int(config.get("sleep", 0))

def step(self):
self.iter += 1
@@ -32,6 +34,9 @@ def step(self):
print("PRINT_STDERR: {}".format(self.msg), file=sys.stderr)
logger.info("LOG_STDERR: {}".format(self.msg))

if self.sleep:
time.sleep(self.sleep)

return {
"id": self.config["id"],
"num_resets": self.num_resets,
@@ -219,13 +224,17 @@ def tearDown(self):
def testMultiTrialReuse(self):
register_trainable("foo2", create_resettable_class())

- # Log to default files
+ # We sleep here for one second so that the third actor
+ # does not finish training before the fourth can be scheduled.
+ # This helps ensure that both remote runners are re-used and
+ # not just one.
[trial1, trial2, trial3, trial4] = tune.run(
"foo2",
config={
"message": tune.grid_search(
["First", "Second", "Third", "Fourth"]),
- "id": -1
+ "id": -1,
+ "sleep": 1,
},
Contributor comment:
Maybe add some comments on why this ActorReuseMultiTest needs sleep to work (vs. ActorReuseTest, which does not)?

I don't have a good answer yet as to how, but when we restructure the Tune code, do you think it would also be a good time to revisit testability? In particular, how to emulate timing in tests in a consistent and readable way, preferably with less sleeping so that tests run faster.

reuse_actors=True).trials

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_cluster.py
@@ -32,6 +32,7 @@
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "9999"


def _check_trial_running(trial):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_progress_reporter.py
@@ -257,6 +257,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto"

def mock_trial(self, status, i):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_ray_trial_executor.py
@@ -60,6 +60,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

self.trial_executor = RayTrialExecutor(queue_trials=False)
ray.init(num_cpus=2, ignore_reinit_error=True)
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_sync.py
@@ -24,6 +24,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=2)

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner.py
@@ -21,6 +21,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
_register_all() # re-register the evicted objects

def tearDown(self):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_2.py
@@ -42,6 +42,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def tearDown(self):
ray.shutdown()
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_3.py
@@ -32,6 +32,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto" # Reset default

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_scheduler_pbt.py
@@ -172,6 +172,7 @@ def get_virt_mem(cls):
class PopulationBasedTrainingSynchTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
ray.init(num_cpus=2)

def MockTrainingFuncSync(config, checkpoint_dir=None):
2 changes: 2 additions & 0 deletions python/ray/tune/tests/test_tune_restore.py
@@ -101,6 +101,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def testExperimentInterrupted(self):
import multiprocessing
@@ -218,6 +219,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

# Change back to local_mode=True after this is resolved:
# https://github.com/ray-project/ray/issues/13932
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_tune_server.py
@@ -32,6 +32,7 @@ def basicSetup(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=4, num_gpus=1)
port = get_valid_port()
6 changes: 4 additions & 2 deletions python/ray/tune/trial_runner.py
@@ -280,6 +280,8 @@ def __init__(self,
self._cached_trial_decisions = {}
self._queued_trial_decisions = {}
self._updated_queue = False
self._result_wait_time = int(
os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

self._stop_queue = []
self._should_stop_experiment = False # used by TuneServer
@@ -619,10 +621,10 @@ def _start_trial(trial: Trial) -> bool:

if may_handle_events:
if self.trial_executor.get_running_trials():
- timeout = None
+ timeout = self._result_wait_time
if self.trial_executor.in_staging_grace_period():
timeout = 0.1
- self._process_events(timeout=timeout)  # blocking
+ self._process_events(timeout=timeout)
else:
self._run_and_catch(self.trial_executor.on_no_available_trials)
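The behavioral change above is that the event loop now waits for trial results with a bounded timeout read from ``TUNE_TRIAL_RESULT_WAIT_TIME_S`` instead of blocking indefinitely (``timeout = None``). The following is a minimal sketch of the same bounded-wait pattern outside of Tune, using ``ray.wait`` directly; the ``slow_task`` remote function and the printed summary are illustrative, not Tune internals.

```python
import os
import time

import ray

ray.init(ignore_reinit_error=True)


@ray.remote
def slow_task():
    # Simulate a trial that takes a long time to produce its next result.
    time.sleep(10)
    return "result"


# Read the wait time the same way the trial runner does, defaulting to 1 second.
result_wait_time = int(os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

futures = [slow_task.remote()]
# timeout=None would block until a result arrives; a bounded timeout lets the
# caller continue (e.g. start or stop other trials) if nothing is ready yet.
ready, not_ready = ray.wait(futures, timeout=result_wait_time)
print(f"ready: {len(ready)}, still pending: {len(not_ready)}")
```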

Expand Down