Skip to content

Commit

Permalink
[core] Allow reuse of cluster address if Ray is not running (#27666)
Browse files Browse the repository at this point in the history
Signed-off-by: Stephanie Wang [email protected]

The cluster address is now written to a temp file. Previously, `ray start --head` raised an error whenever it would reuse the cluster address recorded in that temp file, even if Ray was no longer running. This PR allows `ray start --head` to proceed if it cannot find any GCS process associated with the recorded cluster address.
Related issue number

Closes #27021.
  • Loading branch information
stephanie-wang authored and scv119 committed Aug 10, 2022
1 parent 155b367 commit 076e9f6
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
7 changes: 6 additions & 1 deletion python/ray/scripts/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,12 @@ def start(
if address is None:
default_address = f"{ray_params.node_ip_address}:{port}"
bootstrap_address = services.find_bootstrap_address(temp_dir)
if default_address == bootstrap_address:
if (
default_address == bootstrap_address
and bootstrap_address in services.find_gcs_addresses()
):
# The default address is already in use by a local running GCS
# instance.
raise ConnectionError(
f"Ray is trying to start at {default_address}, "
f"but is already running at {bootstrap_address}. "
Expand Down
19 changes: 19 additions & 0 deletions python/ray/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ def test_ray_start(configure_lang, monkeypatch, tmp_path):
# Check that --temp-dir arg worked:
assert os.path.isfile(os.path.join(temp_dir, "ray_current_cluster"))
assert os.path.isdir(os.path.join(temp_dir, "session_latest"))
_die_on_error(result)

_die_on_error(runner.invoke(scripts.stop))

Expand All @@ -300,6 +301,24 @@ def test_ray_start(configure_lang, monkeypatch, tmp_path):
else:
_check_output_via_pattern("test_ray_start.txt", result)

# Check that we can rerun `ray start` even though the cluster address file
# is already written.
_die_on_error(
runner.invoke(
scripts.start,
[
"--head",
"--log-style=pretty",
"--log-color",
"False",
"--port",
"0",
"--temp-dir",
temp_dir,
],
)
)


def _ray_start_hook(ray_params, head):
os.makedirs(ray_params.temp_dir, exist_ok=True)
Expand Down

0 comments on commit 076e9f6

Please sign in to comment.