diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 9eeafd938252..7d1d70b07946 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -796,7 +796,7 @@ def format_info_string( failure_lines = [] for ip, node_type in autoscaler_summary.failed_nodes: - line = f" {node_type}: RayletUnexpectedlyDied (ip: {ip})" + line = f" {node_type}: NodeTerminated (ip: {ip})" failure_lines.append(line) if autoscaler_summary.node_availability_summary: records = sorted( diff --git a/python/ray/autoscaler/v2/schema.py b/python/ray/autoscaler/v2/schema.py index 302d3cf53895..86ace52582bf 100644 --- a/python/ray/autoscaler/v2/schema.py +++ b/python/ray/autoscaler/v2/schema.py @@ -3,7 +3,11 @@ from enum import Enum from typing import Dict, List, Optional -NODE_DEATH_CAUSE_RAYLET_DIED = "RayletUnexpectedlyDied" +# TODO(rickyx): once we have graceful shutdown, we could populate +# the failure detail with the actual termination message. As of now, +# we will use a more generic message to include cases such as: +# (idle termination, node death, crash, preemption, etc) +NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated" @dataclass diff --git a/python/ray/autoscaler/v2/tests/test_utils.py b/python/ray/autoscaler/v2/tests/test_utils.py index 51a1861a034d..f2f9aad6354f 100644 --- a/python/ray/autoscaler/v2/tests/test_utils.py +++ b/python/ray/autoscaler/v2/tests/test_utils.py @@ -510,7 +510,7 @@ def test_cluster_status_formatter(): 127.0.0.3: worker_node, starting ray Recent failures: worker_node: LaunchFailed (latest_attempt: 02:46:40) - Insufficient capacity - worker_node: RayletUnexpectedlyDied (ip: 127.0.0.5) + worker_node: NodeTerminated (ip: 127.0.0.5) Resources -------------------------------------------------------- diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index b63a6216b013..12ab20c9089f 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -3138,7 +3138,7 @@ def test_info_string(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3218,7 +3218,7 @@ def test_info_string_verbose(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3322,7 +3322,7 @@ def test_info_string_verbose_node_types(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3410,7 +3410,7 @@ def test_info_string_verbose_no_breakdown(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3501,7 +3501,7 @@ def test_info_string_with_launch_failures(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3590,7 +3590,7 @@ def test_info_string_with_launch_failures_verbose(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) - you should fix it Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - desc - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3657,25 +3657,25 @@ def test_info_string_failed_node_cap(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.99) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.98) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.97) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.96) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.95) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.94) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.93) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.92) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.91) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.90) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.89) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.88) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.87) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.86) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.85) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.84) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.83) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.82) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.81) + p3.2xlarge: NodeTerminated (ip: 1.2.3.99) + p3.2xlarge: NodeTerminated (ip: 1.2.3.98) + p3.2xlarge: NodeTerminated (ip: 1.2.3.97) + p3.2xlarge: NodeTerminated (ip: 1.2.3.96) + p3.2xlarge: NodeTerminated (ip: 1.2.3.95) + p3.2xlarge: NodeTerminated (ip: 1.2.3.94) + p3.2xlarge: NodeTerminated (ip: 1.2.3.93) + p3.2xlarge: NodeTerminated (ip: 1.2.3.92) + p3.2xlarge: NodeTerminated (ip: 1.2.3.91) + p3.2xlarge: NodeTerminated (ip: 1.2.3.90) + p3.2xlarge: NodeTerminated (ip: 1.2.3.89) + p3.2xlarge: NodeTerminated (ip: 1.2.3.88) + p3.2xlarge: NodeTerminated (ip: 1.2.3.87) + p3.2xlarge: NodeTerminated (ip: 1.2.3.86) + p3.2xlarge: NodeTerminated (ip: 1.2.3.85) + p3.2xlarge: NodeTerminated (ip: 1.2.3.84) + p3.2xlarge: NodeTerminated (ip: 1.2.3.83) + p3.2xlarge: NodeTerminated (ip: 1.2.3.82) + p3.2xlarge: NodeTerminated (ip: 1.2.3.81) Resources --------------------------------------------------------