From ff4e88f6a30255329ccd30a6dce9071392b5f53b Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Wed, 15 Jun 2022 17:04:12 -0700 Subject: [PATCH 01/16] Set message field of status if allocations/initializations have been pending for too long --- python/ray/serve/deployment_state.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 4168b3be2de4..0d6dc2c70de2 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1496,7 +1496,7 @@ def _check_and_update_replicas(self) -> bool: if len(pending_allocation) > 0: required, available = slow_start_replicas[0][0].resource_requirements() - logger.warning( + message = ( f"Deployment '{self._name}' has " f"{len(pending_allocation)} replicas that have taken " f"more than {SLOW_STARTUP_WARNING_S}s to be scheduled. " @@ -1506,16 +1506,34 @@ def _check_and_update_replicas(self) -> bool: f"Resources required for each replica: {required}, " f"resources available: {available}." ) + logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() + # If status is UNHEALTHY, give it higher priority over the stuck + # allocations problem when propagating status back to user + if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: + self._curr_status_info = DeploymentStatusInfo( + name=self._name, + status=DeploymentStatus.UPDATING, + message=message, + ) if len(pending_initialization) > 0: - logger.warning( + message = ( f"Deployment '{self._name}' has " f"{len(pending_initialization)} replicas that have taken " f"more than {SLOW_STARTUP_WARNING_S}s to initialize. This " f"may be caused by a slow __init__ or reconfigure method." ) + logger.warning(message) + # If status is UNHEALTHY, give it higher priority over the stuck + # initializations problem when propagating status back to user + if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: + self._curr_status_info = DeploymentStatusInfo( + name=self._name, + status=DeploymentStatus.UPDATING, + message=message, + ) self._prev_startup_warning = time.time() From 9eb7c871f57a10e92b2ea58e955b47a5526d5198 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Wed, 15 Jun 2022 18:23:01 -0700 Subject: [PATCH 02/16] format changes --- python/ray/serve/deployment_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 0d6dc2c70de2..45fbd397de5e 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,7 +1509,7 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, give it higher priority over the stuck + # If status is UNHEALTHY, give it higher priority over the stuck # allocations problem when propagating status back to user if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( @@ -1526,7 +1526,7 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." 
) logger.warning(message) - # If status is UNHEALTHY, give it higher priority over the stuck + # If status is UNHEALTHY, give it higher priority over the stuck # initializations problem when propagating status back to user if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( From 8d43a1b35cd61a9c3829befa14f9fb27b3645ae4 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 12:15:59 -0700 Subject: [PATCH 03/16] Add unit test --- python/ray/serve/controller.py | 12 ++++++ python/ray/serve/tests/test_standalone2.py | 43 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index f654f1e4db41..bec255f70704 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -157,6 +157,18 @@ def _stop_one_running_replica_for_testing(self, deployment_name): deployment_name ]._stop_one_running_replica_for_testing() + def _get_slow_startup_warning_period_s(self) -> float: + return ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S + + def _get_slow_startup_warning_s(self) -> float: + return ray.serve.deployment_state.SLOW_STARTUP_WARNING_S + + def _set_slow_startup_warning_period_s(self, period: float) -> None: + ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S = period + + def _set_slow_startup_warning_s(self, time_limit: float) -> None: + ray.serve.deployment_state.SLOW_STARTUP_WARNING_S = time_limit + async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]): """Proxy long pull client's listen request. diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index d3a0118c7022..5dfa84c271f8 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -138,6 +138,49 @@ def f(*args): serve.shutdown() +def test_updating_status_message(shutdown_ray): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + @serve.deployment( + num_replicas=5, + ray_actor_options={"num_cpus": 1}, + ) + def f(*args): + pass + + original_slow_startup_warning_period_s = ( + client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + client._controller._set_slow_startup_warning_period_s.remote(1) + client._controller._set_slow_startup_warning_s.remote(1) + f.deploy(_blocking=False) + + def updating_message(): + deployment_status = client.get_serve_status().deployment_statuses[0] + message_substring = "more than 1s to be scheduled." 
+ return (deployment_status.status == "UPDATING") and ( + message_substring in deployment_status.message + ) + + wait_for_condition(updating_message, timeout=2) + # Reset slow startup warning threshold in case bugs that cause different + # tests to share state occur + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + + serve.shutdown() + ray.shutdown() + + @pytest.mark.parametrize("detached", [True, False]) def test_refresh_controller_after_death(shutdown_ray, detached): """Check if serve.start() refreshes the controller handle if it's dead.""" From b9bdb4c4d2428c30801e84ea2622f95d93ea031e Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 12:19:05 -0700 Subject: [PATCH 04/16] change comments --- python/ray/serve/deployment_state.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 45fbd397de5e..5ec933e1a7f8 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,8 +1509,9 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, give it higher priority over the stuck - # allocations problem when propagating status back to user + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be + # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( name=self._name, @@ -1526,8 +1527,9 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." ) logger.warning(message) - # If status is UNHEALTHY, give it higher priority over the stuck - # initializations problem when propagating status back to user + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be + # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( name=self._name, From a5e5b886f5eadbb4528ff7dec6461a56d1cf8c1f Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 14:59:07 -0700 Subject: [PATCH 05/16] minor fixes --- python/ray/serve/deployment_state.py | 8 +++---- python/ray/serve/tests/test_standalone2.py | 27 ++++++++++++++-------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 5ec933e1a7f8..6d3dce21ec15 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,8 +1509,8 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, leave the status and message as is. - # The issue that caused the deployment to be unhealthy should be + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be # prioritized over this resource availability issue. 
if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( @@ -1527,8 +1527,8 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." ) logger.warning(message) - # If status is UNHEALTHY, leave the status and message as is. - # The issue that caused the deployment to be unhealthy should be + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 5dfa84c271f8..6c8259e6fe42 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -139,6 +139,8 @@ def f(*args): def test_updating_status_message(shutdown_ray): + """Check if status message says if a serve deployment has taken a long time""" + ray.init(num_cpus=2) client = serve.start(detached=True) @@ -149,15 +151,15 @@ def test_updating_status_message(shutdown_ray): def f(*args): pass - original_slow_startup_warning_period_s = ( + original_slow_startup_warning_period_s = ray.get( client._controller._get_slow_startup_warning_period_s.remote() ) - original_slow_startup_warning_s = ( + original_slow_startup_warning_s = ray.get( client._controller._get_slow_startup_warning_s.remote() ) # Lower slow startup warning threshold to 1 second to reduce test duration - client._controller._set_slow_startup_warning_period_s.remote(1) - client._controller._set_slow_startup_warning_s.remote(1) + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) f.deploy(_blocking=False) def updating_message(): @@ -168,13 +170,18 @@ def updating_message(): ) wait_for_condition(updating_message, timeout=2) - # Reset slow startup warning threshold in case bugs that cause different - # tests to share state occur - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) ) - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) ) serve.shutdown() From 885b722cd5f629a25e2e2d910827161225d80356 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 09:40:43 -0700 Subject: [PATCH 06/16] Add unit test for checking that an unhealthy status should not be overrided with resource availability issue --- python/ray/serve/tests/test_standalone2.py | 93 ++++++++++++++++------ 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 6c8259e6fe42..ddc98c4f1e54 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -51,6 +51,40 @@ def start_and_shutdown_ray_cli_class(): yield +@pytest.fixture() +def lower_slow_startup_threshold_and_reset(): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + original_slow_startup_warning_period_s = ray.get( + 
client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ray.get( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + + yield client + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + ) + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + ) + + serve.shutdown() + ray.shutdown() + + def test_standalone_actor_outside_serve(): # https://github.com/ray-project/ray/issues/20066 @@ -138,12 +172,10 @@ def f(*args): serve.shutdown() -def test_updating_status_message(shutdown_ray): +def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): """Check if status message says if a serve deployment has taken a long time""" - ray.init(num_cpus=2) - client = serve.start(detached=True) - + client = lower_slow_startup_threshold_and_reset @serve.deployment( num_replicas=5, ray_actor_options={"num_cpus": 1}, @@ -151,15 +183,6 @@ def test_updating_status_message(shutdown_ray): def f(*args): pass - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() - ) - # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) f.deploy(_blocking=False) def updating_message(): @@ -171,21 +194,39 @@ def updating_message(): wait_for_condition(updating_message, timeout=2) - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) + +def test_unhealthy_override_updating_status( + shutdown_ray, lower_slow_startup_threshold_and_reset +): + """ + Check that if status is UNHEALTHY and there is a resource availability + issue, the status should not change. The issue that caused the deployment to + be unhealthy should be prioritized over this resource availability issue. 
+ """ + + client = lower_slow_startup_threshold_and_reset + @serve.deployment + class f: + def __init__(self): + self.num = 5 / 0 + + def __call__(self, request): + pass + + f.deploy(_blocking=False) + + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UNHEALTHY", + timeout=5, ) - serve.shutdown() - ray.shutdown() + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UPDATING", + timeout=5, + ) @pytest.mark.parametrize("detached", [True, False]) From 4e0d28b1ccc7fb5e178da112c15b497bc44bbbc8 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 09:42:05 -0700 Subject: [PATCH 07/16] format changes --- python/ray/serve/tests/test_standalone2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index ddc98c4f1e54..767cf3f99455 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -176,6 +176,7 @@ def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_ """Check if status message says if a serve deployment has taken a long time""" client = lower_slow_startup_threshold_and_reset + @serve.deployment( num_replicas=5, ray_actor_options={"num_cpus": 1}, @@ -199,12 +200,13 @@ def test_unhealthy_override_updating_status( shutdown_ray, lower_slow_startup_threshold_and_reset ): """ - Check that if status is UNHEALTHY and there is a resource availability + Check that if status is UNHEALTHY and there is a resource availability issue, the status should not change. The issue that caused the deployment to be unhealthy should be prioritized over this resource availability issue. 
""" client = lower_slow_startup_threshold_and_reset + @serve.deployment class f: def __init__(self): @@ -216,8 +218,7 @@ def __call__(self, request): f.deploy(_blocking=False) wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status - == "UNHEALTHY", + lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", timeout=5, ) From 44dff49cbed468608507f6276ac66faa6cf92c74 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 13:08:04 -0700 Subject: [PATCH 08/16] move unit tests from test_standalone2 to test_standalone --- python/ray/serve/tests/test_standalone.py | 92 ++++++++++++++++++++++ python/ray/serve/tests/test_standalone2.py | 92 ---------------------- 2 files changed, 92 insertions(+), 92 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 042bf5d8d3a1..0fb7733b6e61 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -53,6 +53,40 @@ def ray_cluster(): cluster.shutdown() +@pytest.fixture() +def lower_slow_startup_threshold_and_reset(): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + original_slow_startup_warning_period_s = ray.get( + client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ray.get( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + + yield client + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + ) + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + ) + + serve.shutdown() + ray.shutdown() + + def test_shutdown(ray_shutdown): ray.init(num_cpus=16) serve.start(http_options=dict(port=8003)) @@ -672,5 +706,63 @@ def emit(self, record): ray.shutdown() +def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): + """Check if status message says if a serve deployment has taken a long time""" + + client = lower_slow_startup_threshold_and_reset + + @serve.deployment( + num_replicas=5, + ray_actor_options={"num_cpus": 1}, + ) + def f(*args): + pass + + f.deploy(_blocking=False) + + def updating_message(): + deployment_status = client.get_serve_status().deployment_statuses[0] + message_substring = "more than 1s to be scheduled." + return (deployment_status.status == "UPDATING") and ( + message_substring in deployment_status.message + ) + + wait_for_condition(updating_message, timeout=2) + + +def test_unhealthy_override_updating_status( + shutdown_ray, lower_slow_startup_threshold_and_reset +): + """ + Check that if status is UNHEALTHY and there is a resource availability + issue, the status should not change. The issue that caused the deployment to + be unhealthy should be prioritized over this resource availability issue. 
+ """ + + client = lower_slow_startup_threshold_and_reset + + @serve.deployment + class f: + def __init__(self): + self.num = 5 / 0 + + def __call__(self, request): + pass + + f.deploy(_blocking=False) + + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", + timeout=5, + ) + + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UPDATING", + timeout=5, + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 767cf3f99455..d3a0118c7022 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -51,40 +51,6 @@ def start_and_shutdown_ray_cli_class(): yield -@pytest.fixture() -def lower_slow_startup_threshold_and_reset(): - ray.init(num_cpus=2) - client = serve.start(detached=True) - - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() - ) - # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) - - yield client - - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) - ) - - serve.shutdown() - ray.shutdown() - - def test_standalone_actor_outside_serve(): # https://github.com/ray-project/ray/issues/20066 @@ -172,64 +138,6 @@ def f(*args): serve.shutdown() -def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): - """Check if status message says if a serve deployment has taken a long time""" - - client = lower_slow_startup_threshold_and_reset - - @serve.deployment( - num_replicas=5, - ray_actor_options={"num_cpus": 1}, - ) - def f(*args): - pass - - f.deploy(_blocking=False) - - def updating_message(): - deployment_status = client.get_serve_status().deployment_statuses[0] - message_substring = "more than 1s to be scheduled." - return (deployment_status.status == "UPDATING") and ( - message_substring in deployment_status.message - ) - - wait_for_condition(updating_message, timeout=2) - - -def test_unhealthy_override_updating_status( - shutdown_ray, lower_slow_startup_threshold_and_reset -): - """ - Check that if status is UNHEALTHY and there is a resource availability - issue, the status should not change. The issue that caused the deployment to - be unhealthy should be prioritized over this resource availability issue. 
- """ - - client = lower_slow_startup_threshold_and_reset - - @serve.deployment - class f: - def __init__(self): - self.num = 5 / 0 - - def __call__(self, request): - pass - - f.deploy(_blocking=False) - - wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", - timeout=5, - ) - - with pytest.raises(RuntimeError): - wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status - == "UPDATING", - timeout=5, - ) - - @pytest.mark.parametrize("detached", [True, False]) def test_refresh_controller_after_death(shutdown_ray, detached): """Check if serve.start() refreshes the controller handle if it's dead.""" From d5d85ab42961512368b6a90365a747d200ac897f Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 13:29:41 -0700 Subject: [PATCH 09/16] fix bug --- python/ray/serve/tests/test_standalone.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 0fb7733b6e61..2aa9266dc83e 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -706,7 +706,7 @@ def emit(self, record): ray.shutdown() -def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): +def test_updating_status_message(lower_slow_startup_threshold_and_reset): """Check if status message says if a serve deployment has taken a long time""" client = lower_slow_startup_threshold_and_reset @@ -730,9 +730,7 @@ def updating_message(): wait_for_condition(updating_message, timeout=2) -def test_unhealthy_override_updating_status( - shutdown_ray, lower_slow_startup_threshold_and_reset -): +def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset): """ Check that if status is UNHEALTHY and there is a resource availability issue, the status should not change. 
The issue that caused the deployment to From b8fb9fe0069bac06af6c6de5c82ec168aad1bd39 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 15:21:21 -0700 Subject: [PATCH 10/16] increase timeout --- python/ray/serve/tests/test_standalone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 2aa9266dc83e..17ecb87317f5 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -727,7 +727,7 @@ def updating_message(): message_substring in deployment_status.message ) - wait_for_condition(updating_message, timeout=2) + wait_for_condition(updating_message, timeout=20) def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset): @@ -758,7 +758,7 @@ def __call__(self, request): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UPDATING", - timeout=5, + timeout=20, ) From 077e42dd6bd5fc335d1cee23f6562dbe79fb6463 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 16:35:38 -0700 Subject: [PATCH 11/16] change timeout --- python/ray/serve/tests/test_standalone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 17ecb87317f5..bd861fc775dc 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -751,14 +751,14 @@ def __call__(self, request): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", - timeout=5, + timeout=20, ) with pytest.raises(RuntimeError): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UPDATING", - timeout=20, + timeout=10, ) From de6cc2c7474be342db30c96158c594246afc0a7a Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 24 Jun 2022 14:43:31 -0700 Subject: [PATCH 12/16] modify global variables through environment variables --- python/ray/serve/controller.py | 12 ------- python/ray/serve/deployment_state.py | 6 ++-- python/ray/serve/tests/test_standalone.py | 40 +++++++++-------------- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index c4237163f3cd..001ab0652a7b 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -165,18 +165,6 @@ def _stop_one_running_replica_for_testing(self, deployment_name): deployment_name ]._stop_one_running_replica_for_testing() - def _get_slow_startup_warning_period_s(self) -> float: - return ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S - - def _get_slow_startup_warning_s(self) -> float: - return ray.serve.deployment_state.SLOW_STARTUP_WARNING_S - - def _set_slow_startup_warning_period_s(self, period: float) -> None: - ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S = period - - def _set_slow_startup_warning_s(self, time_limit: float) -> None: - ray.serve.deployment_state.SLOW_STARTUP_WARNING_S = time_limit - async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]): """Proxy long pull client's listen request. 
diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 6d3dce21ec15..dbfb3bd0c21c 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -71,8 +71,10 @@ class ReplicaHealthCheckResponse(Enum): CHECKPOINT_KEY = "serve-deployment-state-checkpoint" -SLOW_STARTUP_WARNING_S = 30 -SLOW_STARTUP_WARNING_PERIOD_S = 30 +SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30)) +SLOW_STARTUP_WARNING_PERIOD_S = int( + os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) +) ALL_REPLICA_STATES = list(ReplicaState) USE_PLACEMENT_GROUP = os.environ.get("SERVE_USE_PLACEMENT_GROUP", "1") != "0" diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 4a194d79dd1d..1d4a6841563f 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,37 +54,29 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - ray.init(num_cpus=2) - client = serve.start(detached=True) - - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() + original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S", "30") + original_slow_startup_warning_period_s = os.getenv( + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" ) # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = "1" + os.environ["SERVE_SLOW_STARTUP_WARNING_PERIOD_S"] = "1" - yield client + ray.init(num_cpus=2) + client = serve.start(detached=True) - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) - ) + yield client serve.shutdown() ray.shutdown() + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s + os.environ[ + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" + ] = original_slow_startup_warning_period_s + def test_shutdown(ray_shutdown): ray.init(num_cpus=16) @@ -770,7 +762,7 @@ def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_res @serve.deployment class f: def __init__(self): - self.num = 5 / 0 + self.num = 1 / 0 def __call__(self, request): pass From 6dc874c43c15f9b1da75f90659a666ece168391c Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 24 Jun 2022 14:45:37 -0700 Subject: [PATCH 13/16] format --- python/ray/serve/tests/test_standalone.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 1d4a6841563f..7aa4c9e78744 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -697,6 +697,7 @@ def emit(self, record): serve.shutdown() ray.shutdown() + def test_recovering_controller_no_redeploy(): """Ensure controller doesn't redeploy running deployments when recovering.""" ray.init(namespace="x") From 
e85550193c66ba37c604cd2063fff638878a69bf Mon Sep 17 00:00:00 2001 From: zcin Date: Mon, 27 Jun 2022 09:42:13 -0700 Subject: [PATCH 14/16] Update python/ray/serve/tests/test_standalone.py Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> --- python/ray/serve/tests/test_standalone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 7aa4c9e78744..573e44731c40 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,7 +54,7 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S", "30") + original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") original_slow_startup_warning_period_s = os.getenv( "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" ) From 6e0e76d7c6d40b1472fff689d73f8e1c707d01b4 Mon Sep 17 00:00:00 2001 From: zcin Date: Mon, 27 Jun 2022 09:42:26 -0700 Subject: [PATCH 15/16] Update python/ray/serve/tests/test_standalone.py Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> --- python/ray/serve/tests/test_standalone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 573e44731c40..0c9fa35078ae 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -56,7 +56,7 @@ def ray_cluster(): def lower_slow_startup_threshold_and_reset(): original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") original_slow_startup_warning_period_s = os.getenv( - "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" ) # Lower slow startup warning threshold to 1 second to reduce test duration os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = "1" From 6bcb8c63f579cf23696d41c172692abe6641518d Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Mon, 27 Jun 2022 10:36:21 -0700 Subject: [PATCH 16/16] change method of getting environment variables --- python/ray/serve/deployment_state.py | 4 ++-- python/ray/serve/tests/test_standalone.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index dbfb3bd0c21c..3500d6d3f07a 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -71,9 +71,9 @@ class ReplicaHealthCheckResponse(Enum): CHECKPOINT_KEY = "serve-deployment-state-checkpoint" -SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30)) +SLOW_STARTUP_WARNING_S = int(os.environ.get("SERVE_SLOW_STARTUP_WARNING_S", 30)) SLOW_STARTUP_WARNING_PERIOD_S = int( - os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) + os.environ.get("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) ) ALL_REPLICA_STATES = list(ReplicaState) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 0c9fa35078ae..511c60727aef 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,8 +54,8 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") - original_slow_startup_warning_period_s = os.getenv( + original_slow_startup_warning_s = 
os.environ.get("SERVE_SLOW_STARTUP_WARNING_S") + original_slow_startup_warning_period_s = os.environ.get( "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" ) # Lower slow startup warning threshold to 1 second to reduce test duration @@ -72,10 +72,12 @@ def lower_slow_startup_threshold_and_reset(): # Reset slow startup warning threshold to prevent state sharing across unit # tests - os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s - os.environ[ - "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" - ] = original_slow_startup_warning_period_s + if original_slow_startup_warning_s is not None: + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s + if original_slow_startup_warning_period_s is not None: + os.environ[ + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" + ] = original_slow_startup_warning_period_s def test_shutdown(ray_shutdown):