From 076e9f6f56446ff6df150eb0c41d7e390f765dbd Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Tue, 9 Aug 2022 16:48:48 -0400 Subject: [PATCH] [core] Allow reuse of cluster address if Ray is not running (#27666) Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu> Cluster address is now written to a temp file. Previously, we raised an error if `ray start --head` tried to reuse the old cluster address in the temp file, even if Ray was no longer running. This PR allows `ray start --head` to continue if it can't find any GCS process associated with the recorded cluster address. Related issue number: Closes #27021. --- python/ray/scripts/scripts.py | 7 ++++++- python/ray/tests/test_cli.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index e83ed443ec70..23ea55667423 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -730,7 +730,12 @@ def start( if address is None: default_address = f"{ray_params.node_ip_address}:{port}" bootstrap_address = services.find_bootstrap_address(temp_dir) - if default_address == bootstrap_address: + if ( + default_address == bootstrap_address + and bootstrap_address in services.find_gcs_addresses() + ): + # The default address is already in use by a local running GCS + # instance. raise ConnectionError( f"Ray is trying to start at {default_address}, " f"but is already running at {bootstrap_address}. 
" diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index fb5f6aee0903..97286004c14d 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -292,6 +292,7 @@ def test_ray_start(configure_lang, monkeypatch, tmp_path): # Check that --temp-dir arg worked: assert os.path.isfile(os.path.join(temp_dir, "ray_current_cluster")) assert os.path.isdir(os.path.join(temp_dir, "session_latest")) + _die_on_error(result) _die_on_error(runner.invoke(scripts.stop)) @@ -300,6 +301,24 @@ def test_ray_start(configure_lang, monkeypatch, tmp_path): else: _check_output_via_pattern("test_ray_start.txt", result) + # Check that we can rerun `ray start` even though the cluster address file + # is already written. + _die_on_error( + runner.invoke( + scripts.start, + [ + "--head", + "--log-style=pretty", + "--log-color", + "False", + "--port", + "0", + "--temp-dir", + temp_dir, + ], + ) + ) + def _ray_start_hook(ray_params, head): os.makedirs(ray_params.temp_dir, exist_ok=True)