[tune] Never block for results (#18391)
* [tune] Never block for results

* Fix tests

* Block in tests

* Add comment to test
krfricke authored Sep 9, 2021
1 parent 0126837 commit 395976c
Showing 13 changed files with 32 additions and 8 deletions.
6 changes: 4 additions & 2 deletions doc/source/tune/user-guide.rst
@@ -844,9 +844,11 @@ These are the environment variables Ray Tune currently considers:
but never longer than this value. Defaults to 100 (seconds).
* **TUNE_RESULT_BUFFER_MIN_TIME_S**: Additionally, you can specify a minimum time to buffer results. Defaults to 0.
* **TUNE_SYNCER_VERBOSITY**: Amount of command output when using Tune with Docker Syncer. Defaults to 0.
* **TUNE_TRIAL_RESULT_WAIT_TIME_S**: Maximum amount of time Ray Tune will block waiting for a result from a running trial.
Defaults to 1 (second).
* **TUNE_TRIAL_STARTUP_GRACE_PERIOD**: Amount of time after starting a trial that Ray Tune checks for successful
trial startups. After the grace period, Tune will block until a result from a running trial is received. Can
be disabled by setting this to lower or equal to 0.
trial startups. After the grace period, Tune will block for up to ``TUNE_TRIAL_RESULT_WAIT_TIME_S`` seconds
until a result from a running trial is received. Can be disabled by setting this to a value lower than or equal to 0.
* **TUNE_WARN_THRESHOLD_S**: Threshold for logging if a Tune event loop operation takes too long. Defaults to 0.5 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state
for at least this many seconds. If the Ray Tune job is stuck in this state (most likely due to insufficient resources),
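To illustrate how these variables interact, here is a minimal sketch that sets the new wait time and the startup grace period before launching a run. The values and the trainable are hypothetical examples, not recommendations from this commit:

    import os

    # Hypothetical values for illustration only.
    # Wait at most 5 seconds for a trial result before the event loop moves on.
    os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "5"
    # Keep checking for successful trial startups for 10 seconds after launch.
    os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "10"

    from ray import tune

    def trainable(config):
        # Report a single result per trial.
        tune.report(score=config["x"] ** 2)

    tune.run(trainable, config={"x": tune.grid_search([1, 2, 3])})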
17 changes: 13 additions & 4 deletions python/ray/tune/tests/test_actor_reuse.py
@@ -1,8 +1,9 @@
from collections import defaultdict
import os
import pickle
import unittest
import sys
from collections import defaultdict
import time
import unittest

import ray
from ray import tune, logger
@@ -24,6 +25,7 @@ def setup(self, config):
self.num_resets = 0
self.iter = 0
self.msg = config.get("message", "No message")
self.sleep = int(config.get("sleep", 0))

def step(self):
self.iter += 1
@@ -32,6 +34,9 @@ def step(self):
print("PRINT_STDERR: {}".format(self.msg), file=sys.stderr)
logger.info("LOG_STDERR: {}".format(self.msg))

if self.sleep:
time.sleep(self.sleep)

return {
"id": self.config["id"],
"num_resets": self.num_resets,
@@ -219,13 +224,17 @@ def tearDown(self):
def testMultiTrialReuse(self):
register_trainable("foo2", create_resettable_class())

# Log to default files
# We sleep here for one second so that the third actor
# does not finish training before the fourth can be scheduled.
# This helps ensure that both remote runners are re-used and
# not just one.
[trial1, trial2, trial3, trial4] = tune.run(
"foo2",
config={
"message": tune.grid_search(
["First", "Second", "Third", "Fourth"]),
"id": -1
"id": -1,
"sleep": 1,
},
reuse_actors=True).trials

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_cluster.py
@@ -32,6 +32,7 @@
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "9999"


def _check_trial_running(trial):
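This and the following test files all apply the same pattern: the startup grace period is disabled and the result wait time is set to a very large value, so inside tests the event loop still effectively blocks for results and stays deterministic. A hypothetical skeleton of that setup (the class name and resource numbers are illustrative, not taken from this commit):

    import os
    import unittest

    import ray

    class ExampleTuneTest(unittest.TestCase):
        def setUp(self):
            # Disable the startup grace period so freshly started trials are
            # not special-cased by the executor.
            os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
            # Use a huge wait time so the event loop effectively blocks for
            # results, restoring the old behavior for deterministic tests.
            os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
            ray.init(num_cpus=2)

        def tearDown(self):
            ray.shutdown()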
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_progress_reporter.py
@@ -257,6 +257,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto"

def mock_trial(self, status, i):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_ray_trial_executor.py
@@ -60,6 +60,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

self.trial_executor = RayTrialExecutor(queue_trials=False)
ray.init(num_cpus=2, ignore_reinit_error=True)
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_sync.py
@@ -24,6 +24,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=2)

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner.py
@@ -21,6 +21,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
_register_all() # re-register the evicted objects

def tearDown(self):
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_2.py
@@ -42,6 +42,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def tearDown(self):
ray.shutdown()
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_runner_3.py
@@ -32,6 +32,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto" # Reset default

1 change: 1 addition & 0 deletions python/ray/tune/tests/test_trial_scheduler_pbt.py
@@ -172,6 +172,7 @@ def get_virt_mem(cls):
class PopulationBasedTrainingSynchTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"
ray.init(num_cpus=2)

def MockTrainingFuncSync(config, checkpoint_dir=None):
2 changes: 2 additions & 0 deletions python/ray/tune/tests/test_tune_restore.py
@@ -101,6 +101,7 @@ def setUp(self) -> None:
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

def testExperimentInterrupted(self):
import multiprocessing
@@ -218,6 +219,7 @@ def setUp(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1.5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

# Change back to local_mode=True after this is resolved:
# https://github.com/ray-project/ray/issues/13932
1 change: 1 addition & 0 deletions python/ray/tune/tests/test_tune_server.py
@@ -32,6 +32,7 @@ def basicSetup(self):
os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
# Block for results even when placement groups are pending
os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

ray.init(num_cpus=4, num_gpus=1)
port = get_valid_port()
6 changes: 4 additions & 2 deletions python/ray/tune/trial_runner.py
@@ -280,6 +280,8 @@ def __init__(self,
self._cached_trial_decisions = {}
self._queued_trial_decisions = {}
self._updated_queue = False
self._result_wait_time = int(
os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

self._stop_queue = []
self._should_stop_experiment = False # used by TuneServer
@@ -619,10 +621,10 @@ def _start_trial(trial: Trial) -> bool:

if may_handle_events:
if self.trial_executor.get_running_trials():
timeout = None
timeout = self._result_wait_time
if self.trial_executor.in_staging_grace_period():
timeout = 0.1
self._process_events(timeout=timeout) # blocking
self._process_events(timeout=timeout)
else:
self._run_and_catch(self.trial_executor.on_no_available_trials)

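In effect, the runner now reads the wait time once at construction and uses it as an upper bound on each event-processing step, instead of passing timeout=None and blocking indefinitely for the next result. A paraphrased sketch of that decision (names are simplified, not the exact implementation):

    import os

    # Read once when the trial runner is constructed (defaults to 1 second).
    result_wait_time = int(os.getenv("TUNE_TRIAL_RESULT_WAIT_TIME_S", "1"))

    def select_event_timeout(executor, result_wait_time):
        # While trials are still starting up, poll quickly.
        if executor.in_staging_grace_period():
            return 0.1
        # Otherwise wait at most TUNE_TRIAL_RESULT_WAIT_TIME_S seconds for a
        # result instead of blocking until one arrives.
        return result_wait_time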
