Skip to content

Commit

Permalink
[autoscaler] Rephrase RayletUnexpectedlyDied to RayletTerminated for more generic error message (ray-project#37857)
Browse files (browse the repository at this point in the history)

---------

Signed-off-by: rickyyx <[email protected]>
Signed-off-by: e428265 <[email protected]>
  • Loading branch information
rickyyx authored and arvind-chandra committed Aug 31, 2023
1 parent a8ad294 commit ea59b97
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 28 deletions.
2 changes: 1 addition & 1 deletion python/ray/autoscaler/_private/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,7 @@ def format_info_string(

failure_lines = []
for ip, node_type in autoscaler_summary.failed_nodes:
line = f" {node_type}: RayletUnexpectedlyDied (ip: {ip})"
line = f" {node_type}: NodeTerminated (ip: {ip})"
failure_lines.append(line)
if autoscaler_summary.node_availability_summary:
records = sorted(
Expand Down
6 changes: 5 additions & 1 deletion python/ray/autoscaler/v2/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
from enum import Enum
from typing import Dict, List, Optional

NODE_DEATH_CAUSE_RAYLET_DIED = "RayletUnexpectedlyDied"
# TODO(rickyx): once we have graceful shutdown, we could populate
# the failure detail with the actual termination message. As of now,
# we will use a more generic message to include cases such as:
# (idle termination, node death, crash, preemption, etc)
NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated"


@dataclass
Expand Down
2 changes: 1 addition & 1 deletion python/ray/autoscaler/v2/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ def test_cluster_status_formatter():
127.0.0.3: worker_node, starting ray
Recent failures:
worker_node: LaunchFailed (latest_attempt: 02:46:40) - Insufficient capacity
worker_node: RayletUnexpectedlyDied (ip: 127.0.0.5)
worker_node: NodeTerminated (ip: 127.0.0.5)
Resources
--------------------------------------------------------
Expand Down
50 changes: 25 additions & 25 deletions python/ray/tests/test_resource_demand_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3138,7 +3138,7 @@ def test_info_string():
1.2.3.4: m4.4xlarge, waiting-for-ssh
1.2.3.5: m4.4xlarge, waiting-for-ssh
Recent failures:
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3218,7 +3218,7 @@ def test_info_string_verbose():
1.2.3.4: m4.4xlarge, waiting-for-ssh
1.2.3.5: m4.4xlarge, waiting-for-ssh
Recent failures:
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3322,7 +3322,7 @@ def test_info_string_verbose_node_types():
1.2.3.4: m4.4xlarge, waiting-for-ssh
1.2.3.5: m4.4xlarge, waiting-for-ssh
Recent failures:
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3410,7 +3410,7 @@ def test_info_string_verbose_no_breakdown():
1.2.3.4: m4.4xlarge, waiting-for-ssh
1.2.3.5: m4.4xlarge, waiting-for-ssh
Recent failures:
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3501,7 +3501,7 @@ def test_info_string_with_launch_failures():
Recent failures:
A100: InstanceLimitExceeded (latest_attempt: 13:03:02)
Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3590,7 +3590,7 @@ def test_info_string_with_launch_failures_verbose():
Recent failures:
A100: InstanceLimitExceeded (latest_attempt: 13:03:02) - you should fix it
Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - desc
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6)
p3.2xlarge: NodeTerminated (ip: 1.2.3.6)
Resources
--------------------------------------------------------
Expand Down Expand Up @@ -3657,25 +3657,25 @@ def test_info_string_failed_node_cap():
1.2.3.4: m4.4xlarge, waiting-for-ssh
1.2.3.5: m4.4xlarge, waiting-for-ssh
Recent failures:
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.99)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.98)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.97)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.96)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.95)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.94)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.93)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.92)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.91)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.90)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.89)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.88)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.87)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.86)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.85)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.84)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.83)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.82)
p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.81)
p3.2xlarge: NodeTerminated (ip: 1.2.3.99)
p3.2xlarge: NodeTerminated (ip: 1.2.3.98)
p3.2xlarge: NodeTerminated (ip: 1.2.3.97)
p3.2xlarge: NodeTerminated (ip: 1.2.3.96)
p3.2xlarge: NodeTerminated (ip: 1.2.3.95)
p3.2xlarge: NodeTerminated (ip: 1.2.3.94)
p3.2xlarge: NodeTerminated (ip: 1.2.3.93)
p3.2xlarge: NodeTerminated (ip: 1.2.3.92)
p3.2xlarge: NodeTerminated (ip: 1.2.3.91)
p3.2xlarge: NodeTerminated (ip: 1.2.3.90)
p3.2xlarge: NodeTerminated (ip: 1.2.3.89)
p3.2xlarge: NodeTerminated (ip: 1.2.3.88)
p3.2xlarge: NodeTerminated (ip: 1.2.3.87)
p3.2xlarge: NodeTerminated (ip: 1.2.3.86)
p3.2xlarge: NodeTerminated (ip: 1.2.3.85)
p3.2xlarge: NodeTerminated (ip: 1.2.3.84)
p3.2xlarge: NodeTerminated (ip: 1.2.3.83)
p3.2xlarge: NodeTerminated (ip: 1.2.3.82)
p3.2xlarge: NodeTerminated (ip: 1.2.3.81)
Resources
--------------------------------------------------------
Expand Down

0 comments on commit ea59b97

Please sign in to comment.