From ff4e88f6a30255329ccd30a6dce9071392b5f53b Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Wed, 15 Jun 2022 17:04:12 -0700 Subject: [PATCH 01/16] Set message field of status if allocations/initializations have been pending for too long --- python/ray/serve/deployment_state.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 4168b3be2de4..0d6dc2c70de2 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1496,7 +1496,7 @@ def _check_and_update_replicas(self) -> bool: if len(pending_allocation) > 0: required, available = slow_start_replicas[0][0].resource_requirements() - logger.warning( + message = ( f"Deployment '{self._name}' has " f"{len(pending_allocation)} replicas that have taken " f"more than {SLOW_STARTUP_WARNING_S}s to be scheduled. " @@ -1506,16 +1506,34 @@ def _check_and_update_replicas(self) -> bool: f"Resources required for each replica: {required}, " f"resources available: {available}." ) + logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() + # If status is UNHEALTHY, give it higher priority over the stuck + # allocations problem when propagating status back to user + if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: + self._curr_status_info = DeploymentStatusInfo( + name=self._name, + status=DeploymentStatus.UPDATING, + message=message, + ) if len(pending_initialization) > 0: - logger.warning( + message = ( f"Deployment '{self._name}' has " f"{len(pending_initialization)} replicas that have taken " f"more than {SLOW_STARTUP_WARNING_S}s to initialize. This " f"may be caused by a slow __init__ or reconfigure method." ) + logger.warning(message) + # If status is UNHEALTHY, give it higher priority over the stuck + # initializations problem when propagating status back to user + if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: + self._curr_status_info = DeploymentStatusInfo( + name=self._name, + status=DeploymentStatus.UPDATING, + message=message, + ) self._prev_startup_warning = time.time() From 9eb7c871f57a10e92b2ea58e955b47a5526d5198 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Wed, 15 Jun 2022 18:23:01 -0700 Subject: [PATCH 02/16] format changes --- python/ray/serve/deployment_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 0d6dc2c70de2..45fbd397de5e 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,7 +1509,7 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, give it higher priority over the stuck + # If status is UNHEALTHY, give it higher priority over the stuck # allocations problem when propagating status back to user if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( @@ -1526,7 +1526,7 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." 
) logger.warning(message) - # If status is UNHEALTHY, give it higher priority over the stuck + # If status is UNHEALTHY, give it higher priority over the stuck # initializations problem when propagating status back to user if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( From 8d43a1b35cd61a9c3829befa14f9fb27b3645ae4 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 12:15:59 -0700 Subject: [PATCH 03/16] Add unit test --- python/ray/serve/controller.py | 12 ++++++ python/ray/serve/tests/test_standalone2.py | 43 ++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index f654f1e4db41..bec255f70704 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -157,6 +157,18 @@ def _stop_one_running_replica_for_testing(self, deployment_name): deployment_name ]._stop_one_running_replica_for_testing() + def _get_slow_startup_warning_period_s(self) -> float: + return ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S + + def _get_slow_startup_warning_s(self) -> float: + return ray.serve.deployment_state.SLOW_STARTUP_WARNING_S + + def _set_slow_startup_warning_period_s(self, period: float) -> None: + ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S = period + + def _set_slow_startup_warning_s(self, time_limit: float) -> None: + ray.serve.deployment_state.SLOW_STARTUP_WARNING_S = time_limit + async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]): """Proxy long pull client's listen request. diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index d3a0118c7022..5dfa84c271f8 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -138,6 +138,49 @@ def f(*args): serve.shutdown() +def test_updating_status_message(shutdown_ray): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + @serve.deployment( + num_replicas=5, + ray_actor_options={"num_cpus": 1}, + ) + def f(*args): + pass + + original_slow_startup_warning_period_s = ( + client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + client._controller._set_slow_startup_warning_period_s.remote(1) + client._controller._set_slow_startup_warning_s.remote(1) + f.deploy(_blocking=False) + + def updating_message(): + deployment_status = client.get_serve_status().deployment_statuses[0] + message_substring = "more than 1s to be scheduled." 
+ return (deployment_status.status == "UPDATING") and ( + message_substring in deployment_status.message + ) + + wait_for_condition(updating_message, timeout=2) + # Reset slow startup warning threshold in case bugs that cause different + # tests to share state occur + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + + serve.shutdown() + ray.shutdown() + + @pytest.mark.parametrize("detached", [True, False]) def test_refresh_controller_after_death(shutdown_ray, detached): """Check if serve.start() refreshes the controller handle if it's dead.""" From b9bdb4c4d2428c30801e84ea2622f95d93ea031e Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 12:19:05 -0700 Subject: [PATCH 04/16] change comments --- python/ray/serve/deployment_state.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 45fbd397de5e..5ec933e1a7f8 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,8 +1509,9 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, give it higher priority over the stuck - # allocations problem when propagating status back to user + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be + # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( name=self._name, @@ -1526,8 +1527,9 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." ) logger.warning(message) - # If status is UNHEALTHY, give it higher priority over the stuck - # initializations problem when propagating status back to user + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be + # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( name=self._name, From a5e5b886f5eadbb4528ff7dec6461a56d1cf8c1f Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 17 Jun 2022 14:59:07 -0700 Subject: [PATCH 05/16] minor fixes --- python/ray/serve/deployment_state.py | 8 +++---- python/ray/serve/tests/test_standalone2.py | 27 ++++++++++++++-------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 5ec933e1a7f8..6d3dce21ec15 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -1509,8 +1509,8 @@ def _check_and_update_replicas(self) -> bool: logger.warning(message) if _SCALING_LOG_ENABLED: print_verbose_scaling_log() - # If status is UNHEALTHY, leave the status and message as is. - # The issue that caused the deployment to be unhealthy should be + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be # prioritized over this resource availability issue. 
if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( @@ -1527,8 +1527,8 @@ def _check_and_update_replicas(self) -> bool: f"may be caused by a slow __init__ or reconfigure method." ) logger.warning(message) - # If status is UNHEALTHY, leave the status and message as is. - # The issue that caused the deployment to be unhealthy should be + # If status is UNHEALTHY, leave the status and message as is. + # The issue that caused the deployment to be unhealthy should be # prioritized over this resource availability issue. if self._curr_status_info.status != DeploymentStatus.UNHEALTHY: self._curr_status_info = DeploymentStatusInfo( diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 5dfa84c271f8..6c8259e6fe42 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -139,6 +139,8 @@ def f(*args): def test_updating_status_message(shutdown_ray): + """Check if status message says if a serve deployment has taken a long time""" + ray.init(num_cpus=2) client = serve.start(detached=True) @@ -149,15 +151,15 @@ def test_updating_status_message(shutdown_ray): def f(*args): pass - original_slow_startup_warning_period_s = ( + original_slow_startup_warning_period_s = ray.get( client._controller._get_slow_startup_warning_period_s.remote() ) - original_slow_startup_warning_s = ( + original_slow_startup_warning_s = ray.get( client._controller._get_slow_startup_warning_s.remote() ) # Lower slow startup warning threshold to 1 second to reduce test duration - client._controller._set_slow_startup_warning_period_s.remote(1) - client._controller._set_slow_startup_warning_s.remote(1) + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) f.deploy(_blocking=False) def updating_message(): @@ -168,13 +170,18 @@ def updating_message(): ) wait_for_condition(updating_message, timeout=2) - # Reset slow startup warning threshold in case bugs that cause different - # tests to share state occur - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) ) - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) ) serve.shutdown() From 885b722cd5f629a25e2e2d910827161225d80356 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 09:40:43 -0700 Subject: [PATCH 06/16] Add unit test for checking that an unhealthy status should not be overrided with resource availability issue --- python/ray/serve/tests/test_standalone2.py | 93 ++++++++++++++++------ 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 6c8259e6fe42..ddc98c4f1e54 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -51,6 +51,40 @@ def start_and_shutdown_ray_cli_class(): yield +@pytest.fixture() +def lower_slow_startup_threshold_and_reset(): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + original_slow_startup_warning_period_s = ray.get( + 
client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ray.get( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + + yield client + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + ) + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + ) + + serve.shutdown() + ray.shutdown() + + def test_standalone_actor_outside_serve(): # https://github.com/ray-project/ray/issues/20066 @@ -138,12 +172,10 @@ def f(*args): serve.shutdown() -def test_updating_status_message(shutdown_ray): +def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): """Check if status message says if a serve deployment has taken a long time""" - ray.init(num_cpus=2) - client = serve.start(detached=True) - + client = lower_slow_startup_threshold_and_reset @serve.deployment( num_replicas=5, ray_actor_options={"num_cpus": 1}, @@ -151,15 +183,6 @@ def test_updating_status_message(shutdown_ray): def f(*args): pass - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() - ) - # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) f.deploy(_blocking=False) def updating_message(): @@ -171,21 +194,39 @@ def updating_message(): wait_for_condition(updating_message, timeout=2) - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) + +def test_unhealthy_override_updating_status( + shutdown_ray, lower_slow_startup_threshold_and_reset +): + """ + Check that if status is UNHEALTHY and there is a resource availability + issue, the status should not change. The issue that caused the deployment to + be unhealthy should be prioritized over this resource availability issue. 
+ """ + + client = lower_slow_startup_threshold_and_reset + @serve.deployment + class f: + def __init__(self): + self.num = 5 / 0 + + def __call__(self, request): + pass + + f.deploy(_blocking=False) + + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UNHEALTHY", + timeout=5, ) - serve.shutdown() - ray.shutdown() + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UPDATING", + timeout=5, + ) @pytest.mark.parametrize("detached", [True, False]) From 4e0d28b1ccc7fb5e178da112c15b497bc44bbbc8 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 09:42:05 -0700 Subject: [PATCH 07/16] format changes --- python/ray/serve/tests/test_standalone2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index ddc98c4f1e54..767cf3f99455 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -176,6 +176,7 @@ def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_ """Check if status message says if a serve deployment has taken a long time""" client = lower_slow_startup_threshold_and_reset + @serve.deployment( num_replicas=5, ray_actor_options={"num_cpus": 1}, @@ -199,12 +200,13 @@ def test_unhealthy_override_updating_status( shutdown_ray, lower_slow_startup_threshold_and_reset ): """ - Check that if status is UNHEALTHY and there is a resource availability + Check that if status is UNHEALTHY and there is a resource availability issue, the status should not change. The issue that caused the deployment to be unhealthy should be prioritized over this resource availability issue. 
""" client = lower_slow_startup_threshold_and_reset + @serve.deployment class f: def __init__(self): @@ -216,8 +218,7 @@ def __call__(self, request): f.deploy(_blocking=False) wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status - == "UNHEALTHY", + lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", timeout=5, ) From 44dff49cbed468608507f6276ac66faa6cf92c74 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 13:08:04 -0700 Subject: [PATCH 08/16] move unit tests from test_standalone2 to test_standalone --- python/ray/serve/tests/test_standalone.py | 92 ++++++++++++++++++++++ python/ray/serve/tests/test_standalone2.py | 92 ---------------------- 2 files changed, 92 insertions(+), 92 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 042bf5d8d3a1..0fb7733b6e61 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -53,6 +53,40 @@ def ray_cluster(): cluster.shutdown() +@pytest.fixture() +def lower_slow_startup_threshold_and_reset(): + ray.init(num_cpus=2) + client = serve.start(detached=True) + + original_slow_startup_warning_period_s = ray.get( + client._controller._get_slow_startup_warning_period_s.remote() + ) + original_slow_startup_warning_s = ray.get( + client._controller._get_slow_startup_warning_s.remote() + ) + # Lower slow startup warning threshold to 1 second to reduce test duration + ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) + ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + + yield client + + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + ray.get( + client._controller._set_slow_startup_warning_period_s.remote( + original_slow_startup_warning_period_s + ) + ) + ray.get( + client._controller._set_slow_startup_warning_s.remote( + original_slow_startup_warning_s + ) + ) + + serve.shutdown() + ray.shutdown() + + def test_shutdown(ray_shutdown): ray.init(num_cpus=16) serve.start(http_options=dict(port=8003)) @@ -672,5 +706,63 @@ def emit(self, record): ray.shutdown() +def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): + """Check if status message says if a serve deployment has taken a long time""" + + client = lower_slow_startup_threshold_and_reset + + @serve.deployment( + num_replicas=5, + ray_actor_options={"num_cpus": 1}, + ) + def f(*args): + pass + + f.deploy(_blocking=False) + + def updating_message(): + deployment_status = client.get_serve_status().deployment_statuses[0] + message_substring = "more than 1s to be scheduled." + return (deployment_status.status == "UPDATING") and ( + message_substring in deployment_status.message + ) + + wait_for_condition(updating_message, timeout=2) + + +def test_unhealthy_override_updating_status( + shutdown_ray, lower_slow_startup_threshold_and_reset +): + """ + Check that if status is UNHEALTHY and there is a resource availability + issue, the status should not change. The issue that caused the deployment to + be unhealthy should be prioritized over this resource availability issue. 
+ """ + + client = lower_slow_startup_threshold_and_reset + + @serve.deployment + class f: + def __init__(self): + self.num = 5 / 0 + + def __call__(self, request): + pass + + f.deploy(_blocking=False) + + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", + timeout=5, + ) + + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: client.get_serve_status().deployment_statuses[0].status + == "UPDATING", + timeout=5, + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 767cf3f99455..d3a0118c7022 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -51,40 +51,6 @@ def start_and_shutdown_ray_cli_class(): yield -@pytest.fixture() -def lower_slow_startup_threshold_and_reset(): - ray.init(num_cpus=2) - client = serve.start(detached=True) - - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() - ) - # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) - - yield client - - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) - ) - - serve.shutdown() - ray.shutdown() - - def test_standalone_actor_outside_serve(): # https://github.com/ray-project/ray/issues/20066 @@ -172,64 +138,6 @@ def f(*args): serve.shutdown() -def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): - """Check if status message says if a serve deployment has taken a long time""" - - client = lower_slow_startup_threshold_and_reset - - @serve.deployment( - num_replicas=5, - ray_actor_options={"num_cpus": 1}, - ) - def f(*args): - pass - - f.deploy(_blocking=False) - - def updating_message(): - deployment_status = client.get_serve_status().deployment_statuses[0] - message_substring = "more than 1s to be scheduled." - return (deployment_status.status == "UPDATING") and ( - message_substring in deployment_status.message - ) - - wait_for_condition(updating_message, timeout=2) - - -def test_unhealthy_override_updating_status( - shutdown_ray, lower_slow_startup_threshold_and_reset -): - """ - Check that if status is UNHEALTHY and there is a resource availability - issue, the status should not change. The issue that caused the deployment to - be unhealthy should be prioritized over this resource availability issue. 
- """ - - client = lower_slow_startup_threshold_and_reset - - @serve.deployment - class f: - def __init__(self): - self.num = 5 / 0 - - def __call__(self, request): - pass - - f.deploy(_blocking=False) - - wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", - timeout=5, - ) - - with pytest.raises(RuntimeError): - wait_for_condition( - lambda: client.get_serve_status().deployment_statuses[0].status - == "UPDATING", - timeout=5, - ) - - @pytest.mark.parametrize("detached", [True, False]) def test_refresh_controller_after_death(shutdown_ray, detached): """Check if serve.start() refreshes the controller handle if it's dead.""" From d5d85ab42961512368b6a90365a747d200ac897f Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 13:29:41 -0700 Subject: [PATCH 09/16] fix bug --- python/ray/serve/tests/test_standalone.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 0fb7733b6e61..2aa9266dc83e 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -706,7 +706,7 @@ def emit(self, record): ray.shutdown() -def test_updating_status_message(shutdown_ray, lower_slow_startup_threshold_and_reset): +def test_updating_status_message(lower_slow_startup_threshold_and_reset): """Check if status message says if a serve deployment has taken a long time""" client = lower_slow_startup_threshold_and_reset @@ -730,9 +730,7 @@ def updating_message(): wait_for_condition(updating_message, timeout=2) -def test_unhealthy_override_updating_status( - shutdown_ray, lower_slow_startup_threshold_and_reset -): +def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset): """ Check that if status is UNHEALTHY and there is a resource availability issue, the status should not change. 
The issue that caused the deployment to From b8fb9fe0069bac06af6c6de5c82ec168aad1bd39 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 15:21:21 -0700 Subject: [PATCH 10/16] increase timeout --- python/ray/serve/tests/test_standalone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 2aa9266dc83e..17ecb87317f5 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -727,7 +727,7 @@ def updating_message(): message_substring in deployment_status.message ) - wait_for_condition(updating_message, timeout=2) + wait_for_condition(updating_message, timeout=20) def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset): @@ -758,7 +758,7 @@ def __call__(self, request): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UPDATING", - timeout=5, + timeout=20, ) From 077e42dd6bd5fc335d1cee23f6562dbe79fb6463 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 21 Jun 2022 16:35:38 -0700 Subject: [PATCH 11/16] change timeout --- python/ray/serve/tests/test_standalone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 17ecb87317f5..bd861fc775dc 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -751,14 +751,14 @@ def __call__(self, request): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", - timeout=5, + timeout=20, ) with pytest.raises(RuntimeError): wait_for_condition( lambda: client.get_serve_status().deployment_statuses[0].status == "UPDATING", - timeout=20, + timeout=10, ) From de6cc2c7474be342db30c96158c594246afc0a7a Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 24 Jun 2022 14:43:31 -0700 Subject: [PATCH 12/16] modify global variables through environment variables --- python/ray/serve/controller.py | 12 ------- python/ray/serve/deployment_state.py | 6 ++-- python/ray/serve/tests/test_standalone.py | 40 +++++++++-------------- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index c4237163f3cd..001ab0652a7b 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -165,18 +165,6 @@ def _stop_one_running_replica_for_testing(self, deployment_name): deployment_name ]._stop_one_running_replica_for_testing() - def _get_slow_startup_warning_period_s(self) -> float: - return ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S - - def _get_slow_startup_warning_s(self) -> float: - return ray.serve.deployment_state.SLOW_STARTUP_WARNING_S - - def _set_slow_startup_warning_period_s(self, period: float) -> None: - ray.serve.deployment_state.SLOW_STARTUP_WARNING_PERIOD_S = period - - def _set_slow_startup_warning_s(self, time_limit: float) -> None: - ray.serve.deployment_state.SLOW_STARTUP_WARNING_S = time_limit - async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]): """Proxy long pull client's listen request. 
diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index 6d3dce21ec15..dbfb3bd0c21c 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -71,8 +71,10 @@ class ReplicaHealthCheckResponse(Enum): CHECKPOINT_KEY = "serve-deployment-state-checkpoint" -SLOW_STARTUP_WARNING_S = 30 -SLOW_STARTUP_WARNING_PERIOD_S = 30 +SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30)) +SLOW_STARTUP_WARNING_PERIOD_S = int( + os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) +) ALL_REPLICA_STATES = list(ReplicaState) USE_PLACEMENT_GROUP = os.environ.get("SERVE_USE_PLACEMENT_GROUP", "1") != "0" diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 4a194d79dd1d..1d4a6841563f 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,37 +54,29 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - ray.init(num_cpus=2) - client = serve.start(detached=True) - - original_slow_startup_warning_period_s = ray.get( - client._controller._get_slow_startup_warning_period_s.remote() - ) - original_slow_startup_warning_s = ray.get( - client._controller._get_slow_startup_warning_s.remote() + original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S", "30") + original_slow_startup_warning_period_s = os.getenv( + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" ) # Lower slow startup warning threshold to 1 second to reduce test duration - ray.get(client._controller._set_slow_startup_warning_period_s.remote(1)) - ray.get(client._controller._set_slow_startup_warning_s.remote(1)) + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = "1" + os.environ["SERVE_SLOW_STARTUP_WARNING_PERIOD_S"] = "1" - yield client + ray.init(num_cpus=2) + client = serve.start(detached=True) - # Reset slow startup warning threshold to prevent state sharing across unit - # tests - ray.get( - client._controller._set_slow_startup_warning_period_s.remote( - original_slow_startup_warning_period_s - ) - ) - ray.get( - client._controller._set_slow_startup_warning_s.remote( - original_slow_startup_warning_s - ) - ) + yield client serve.shutdown() ray.shutdown() + # Reset slow startup warning threshold to prevent state sharing across unit + # tests + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s + os.environ[ + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" + ] = original_slow_startup_warning_period_s + def test_shutdown(ray_shutdown): ray.init(num_cpus=16) @@ -770,7 +762,7 @@ def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_res @serve.deployment class f: def __init__(self): - self.num = 5 / 0 + self.num = 1 / 0 def __call__(self, request): pass From 6dc874c43c15f9b1da75f90659a666ece168391c Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Fri, 24 Jun 2022 14:45:37 -0700 Subject: [PATCH 13/16] format --- python/ray/serve/tests/test_standalone.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 1d4a6841563f..7aa4c9e78744 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -697,6 +697,7 @@ def emit(self, record): serve.shutdown() ray.shutdown() + def test_recovering_controller_no_redeploy(): """Ensure controller doesn't redeploy running deployments when recovering.""" ray.init(namespace="x") From 
e85550193c66ba37c604cd2063fff638878a69bf Mon Sep 17 00:00:00 2001 From: zcin Date: Mon, 27 Jun 2022 09:42:13 -0700 Subject: [PATCH 14/16] Update python/ray/serve/tests/test_standalone.py Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> --- python/ray/serve/tests/test_standalone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 7aa4c9e78744..573e44731c40 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,7 +54,7 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S", "30") + original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") original_slow_startup_warning_period_s = os.getenv( "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" ) From 6e0e76d7c6d40b1472fff689d73f8e1c707d01b4 Mon Sep 17 00:00:00 2001 From: zcin Date: Mon, 27 Jun 2022 09:42:26 -0700 Subject: [PATCH 15/16] Update python/ray/serve/tests/test_standalone.py Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> --- python/ray/serve/tests/test_standalone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 573e44731c40..0c9fa35078ae 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -56,7 +56,7 @@ def ray_cluster(): def lower_slow_startup_threshold_and_reset(): original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") original_slow_startup_warning_period_s = os.getenv( - "SERVE_SLOW_STARTUP_WARNING_PERIOD_S", "30" + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" ) # Lower slow startup warning threshold to 1 second to reduce test duration os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = "1" From 6bcb8c63f579cf23696d41c172692abe6641518d Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Mon, 27 Jun 2022 10:36:21 -0700 Subject: [PATCH 16/16] change method of getting environment variables --- python/ray/serve/deployment_state.py | 4 ++-- python/ray/serve/tests/test_standalone.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/deployment_state.py b/python/ray/serve/deployment_state.py index dbfb3bd0c21c..3500d6d3f07a 100644 --- a/python/ray/serve/deployment_state.py +++ b/python/ray/serve/deployment_state.py @@ -71,9 +71,9 @@ class ReplicaHealthCheckResponse(Enum): CHECKPOINT_KEY = "serve-deployment-state-checkpoint" -SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30)) +SLOW_STARTUP_WARNING_S = int(os.environ.get("SERVE_SLOW_STARTUP_WARNING_S", 30)) SLOW_STARTUP_WARNING_PERIOD_S = int( - os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) + os.environ.get("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30) ) ALL_REPLICA_STATES = list(ReplicaState) diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 0c9fa35078ae..511c60727aef 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -54,8 +54,8 @@ def ray_cluster(): @pytest.fixture() def lower_slow_startup_threshold_and_reset(): - original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") - original_slow_startup_warning_period_s = os.getenv( + original_slow_startup_warning_s = 
os.environ.get("SERVE_SLOW_STARTUP_WARNING_S") + original_slow_startup_warning_period_s = os.environ.get( "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" ) # Lower slow startup warning threshold to 1 second to reduce test duration @@ -72,10 +72,12 @@ def lower_slow_startup_threshold_and_reset(): # Reset slow startup warning threshold to prevent state sharing across unit # tests - os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s - os.environ[ - "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" - ] = original_slow_startup_warning_period_s + if original_slow_startup_warning_s is not None: + os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s + if original_slow_startup_warning_period_s is not None: + os.environ[ + "SERVE_SLOW_STARTUP_WARNING_PERIOD_S" + ] = original_slow_startup_warning_period_s def test_shutdown(ray_shutdown):