From cf8ea9e5f149ab8f97a79d99f3d86e5d35eb9046 Mon Sep 17 00:00:00 2001 From: rickyyx Date: Thu, 27 Jul 2023 16:35:28 +0000 Subject: [PATCH 1/2] update Signed-off-by: rickyyx --- python/ray/autoscaler/_private/util.py | 2 +- python/ray/autoscaler/v2/schema.py | 6 ++- python/ray/autoscaler/v2/tests/test_utils.py | 2 +- .../tests/test_resource_demand_scheduler.py | 50 +++++++++---------- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 9eeafd938252..d33a6abab002 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -796,7 +796,7 @@ def format_info_string( failure_lines = [] for ip, node_type in autoscaler_summary.failed_nodes: - line = f" {node_type}: RayletUnexpectedlyDied (ip: {ip})" + line = f" {node_type}: RayletTerminated (ip: {ip})" failure_lines.append(line) if autoscaler_summary.node_availability_summary: records = sorted( diff --git a/python/ray/autoscaler/v2/schema.py b/python/ray/autoscaler/v2/schema.py index 302d3cf53895..e48fedc49e4f 100644 --- a/python/ray/autoscaler/v2/schema.py +++ b/python/ray/autoscaler/v2/schema.py @@ -3,7 +3,11 @@ from enum import Enum from typing import Dict, List, Optional -NODE_DEATH_CAUSE_RAYLET_DIED = "RayletUnexpectedlyDied" +# TODO(rickyx): once we have graceful shutdown, we could populate +# the failure detail with the actual termination message. As of now, +# we will use a more generic message to include cases such as: +# (idle termination, node death, crash, preemption, etc) +NODE_DEATH_CAUSE_RAYLET_DIED = "RayletTerminated" @dataclass diff --git a/python/ray/autoscaler/v2/tests/test_utils.py b/python/ray/autoscaler/v2/tests/test_utils.py index 51a1861a034d..6a18ce7ed42a 100644 --- a/python/ray/autoscaler/v2/tests/test_utils.py +++ b/python/ray/autoscaler/v2/tests/test_utils.py @@ -510,7 +510,7 @@ def test_cluster_status_formatter(): 127.0.0.3: worker_node, starting ray Recent failures: worker_node: LaunchFailed (latest_attempt: 02:46:40) - Insufficient capacity - worker_node: RayletUnexpectedlyDied (ip: 127.0.0.5) + worker_node: RayletTerminated (ip: 127.0.0.5) Resources -------------------------------------------------------- diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index b63a6216b013..7e112afb0e91 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -3138,7 +3138,7 @@ def test_info_string(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3218,7 +3218,7 @@ def test_info_string_verbose(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3322,7 +3322,7 @@ def test_info_string_verbose_node_types(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3410,7 +3410,7 @@ def test_info_string_verbose_no_breakdown(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3501,7 +3501,7 @@ def test_info_string_with_launch_failures(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3590,7 +3590,7 @@ def test_info_string_with_launch_failures_verbose(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) - you should fix it Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - desc - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.6) + p3.2xlarge: RayletTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3657,25 +3657,25 @@ def test_info_string_failed_node_cap(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.99) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.98) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.97) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.96) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.95) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.94) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.93) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.92) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.91) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.90) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.89) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.88) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.87) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.86) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.85) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.84) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.83) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.82) - p3.2xlarge: RayletUnexpectedlyDied (ip: 1.2.3.81) + p3.2xlarge: RayletTerminated (ip: 1.2.3.99) + p3.2xlarge: RayletTerminated (ip: 1.2.3.98) + p3.2xlarge: RayletTerminated (ip: 1.2.3.97) + p3.2xlarge: RayletTerminated (ip: 1.2.3.96) + p3.2xlarge: RayletTerminated (ip: 1.2.3.95) + p3.2xlarge: RayletTerminated (ip: 1.2.3.94) + p3.2xlarge: RayletTerminated (ip: 1.2.3.93) + p3.2xlarge: RayletTerminated (ip: 1.2.3.92) + p3.2xlarge: RayletTerminated (ip: 1.2.3.91) + p3.2xlarge: RayletTerminated (ip: 1.2.3.90) + p3.2xlarge: RayletTerminated (ip: 1.2.3.89) + p3.2xlarge: RayletTerminated (ip: 1.2.3.88) + p3.2xlarge: RayletTerminated (ip: 1.2.3.87) + p3.2xlarge: RayletTerminated (ip: 1.2.3.86) + p3.2xlarge: RayletTerminated (ip: 1.2.3.85) + p3.2xlarge: RayletTerminated (ip: 1.2.3.84) + p3.2xlarge: RayletTerminated (ip: 1.2.3.83) + p3.2xlarge: RayletTerminated (ip: 1.2.3.82) + p3.2xlarge: RayletTerminated (ip: 1.2.3.81) Resources -------------------------------------------------------- From 3d4eaa48da5138df95f5107d1ce5be5d932bcee7 Mon Sep 17 00:00:00 2001 From: rickyyx Date: Thu, 27 Jul 2023 20:43:17 +0000 Subject: [PATCH 2/2] nit Signed-off-by: rickyyx --- python/ray/autoscaler/_private/util.py | 2 +- python/ray/autoscaler/v2/schema.py | 2 +- python/ray/autoscaler/v2/tests/test_utils.py | 2 +- .../tests/test_resource_demand_scheduler.py | 50 +++++++++---------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index d33a6abab002..7d1d70b07946 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -796,7 +796,7 @@ def format_info_string( failure_lines = [] for ip, node_type in autoscaler_summary.failed_nodes: - line = f" {node_type}: RayletTerminated (ip: {ip})" + line = f" {node_type}: NodeTerminated (ip: {ip})" failure_lines.append(line) if autoscaler_summary.node_availability_summary: records = sorted( diff --git a/python/ray/autoscaler/v2/schema.py b/python/ray/autoscaler/v2/schema.py index e48fedc49e4f..86ace52582bf 100644 --- a/python/ray/autoscaler/v2/schema.py +++ b/python/ray/autoscaler/v2/schema.py @@ -7,7 +7,7 @@ # the failure detail with the actual termination message. As of now, # we will use a more generic message to include cases such as: # (idle termination, node death, crash, preemption, etc) -NODE_DEATH_CAUSE_RAYLET_DIED = "RayletTerminated" +NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated" @dataclass diff --git a/python/ray/autoscaler/v2/tests/test_utils.py b/python/ray/autoscaler/v2/tests/test_utils.py index 6a18ce7ed42a..f2f9aad6354f 100644 --- a/python/ray/autoscaler/v2/tests/test_utils.py +++ b/python/ray/autoscaler/v2/tests/test_utils.py @@ -510,7 +510,7 @@ def test_cluster_status_formatter(): 127.0.0.3: worker_node, starting ray Recent failures: worker_node: LaunchFailed (latest_attempt: 02:46:40) - Insufficient capacity - worker_node: RayletTerminated (ip: 127.0.0.5) + worker_node: NodeTerminated (ip: 127.0.0.5) Resources -------------------------------------------------------- diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 7e112afb0e91..12ab20c9089f 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -3138,7 +3138,7 @@ def test_info_string(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3218,7 +3218,7 @@ def test_info_string_verbose(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3322,7 +3322,7 @@ def test_info_string_verbose_node_types(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3410,7 +3410,7 @@ def test_info_string_verbose_no_breakdown(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3501,7 +3501,7 @@ def test_info_string_with_launch_failures(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3590,7 +3590,7 @@ def test_info_string_with_launch_failures_verbose(): Recent failures: A100: InstanceLimitExceeded (latest_attempt: 13:03:02) - you should fix it Inferentia-Spot: InsufficientInstanceCapacity (latest_attempt: 13:03:01) - desc - p3.2xlarge: RayletTerminated (ip: 1.2.3.6) + p3.2xlarge: NodeTerminated (ip: 1.2.3.6) Resources -------------------------------------------------------- @@ -3657,25 +3657,25 @@ def test_info_string_failed_node_cap(): 1.2.3.4: m4.4xlarge, waiting-for-ssh 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: - p3.2xlarge: RayletTerminated (ip: 1.2.3.99) - p3.2xlarge: RayletTerminated (ip: 1.2.3.98) - p3.2xlarge: RayletTerminated (ip: 1.2.3.97) - p3.2xlarge: RayletTerminated (ip: 1.2.3.96) - p3.2xlarge: RayletTerminated (ip: 1.2.3.95) - p3.2xlarge: RayletTerminated (ip: 1.2.3.94) - p3.2xlarge: RayletTerminated (ip: 1.2.3.93) - p3.2xlarge: RayletTerminated (ip: 1.2.3.92) - p3.2xlarge: RayletTerminated (ip: 1.2.3.91) - p3.2xlarge: RayletTerminated (ip: 1.2.3.90) - p3.2xlarge: RayletTerminated (ip: 1.2.3.89) - p3.2xlarge: RayletTerminated (ip: 1.2.3.88) - p3.2xlarge: RayletTerminated (ip: 1.2.3.87) - p3.2xlarge: RayletTerminated (ip: 1.2.3.86) - p3.2xlarge: RayletTerminated (ip: 1.2.3.85) - p3.2xlarge: RayletTerminated (ip: 1.2.3.84) - p3.2xlarge: RayletTerminated (ip: 1.2.3.83) - p3.2xlarge: RayletTerminated (ip: 1.2.3.82) - p3.2xlarge: RayletTerminated (ip: 1.2.3.81) + p3.2xlarge: NodeTerminated (ip: 1.2.3.99) + p3.2xlarge: NodeTerminated (ip: 1.2.3.98) + p3.2xlarge: NodeTerminated (ip: 1.2.3.97) + p3.2xlarge: NodeTerminated (ip: 1.2.3.96) + p3.2xlarge: NodeTerminated (ip: 1.2.3.95) + p3.2xlarge: NodeTerminated (ip: 1.2.3.94) + p3.2xlarge: NodeTerminated (ip: 1.2.3.93) + p3.2xlarge: NodeTerminated (ip: 1.2.3.92) + p3.2xlarge: NodeTerminated (ip: 1.2.3.91) + p3.2xlarge: NodeTerminated (ip: 1.2.3.90) + p3.2xlarge: NodeTerminated (ip: 1.2.3.89) + p3.2xlarge: NodeTerminated (ip: 1.2.3.88) + p3.2xlarge: NodeTerminated (ip: 1.2.3.87) + p3.2xlarge: NodeTerminated (ip: 1.2.3.86) + p3.2xlarge: NodeTerminated (ip: 1.2.3.85) + p3.2xlarge: NodeTerminated (ip: 1.2.3.84) + p3.2xlarge: NodeTerminated (ip: 1.2.3.83) + p3.2xlarge: NodeTerminated (ip: 1.2.3.82) + p3.2xlarge: NodeTerminated (ip: 1.2.3.81) Resources --------------------------------------------------------