From 0ecaf51a65d1b37d45ff1074fdcc913085cf725b Mon Sep 17 00:00:00 2001 From: sihanwang41 Date: Tue, 25 Apr 2023 20:51:51 +0000 Subject: [PATCH 1/5] [Serve] Unset health replica stats when replica is under STOPPING Signed-off-by: sihanwang41 --- python/ray/serve/_private/deployment_state.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index 63c981150c1d..422443acd14d 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -1283,8 +1283,7 @@ def _stop_or_update_outdated_version_replicas(self, max_to_stop=math.inf) -> int # normal scale-up process. if replica.version.requires_actor_restart(self._target_state.version): code_version_changes += 1 - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) replicas_changed = True # Otherwise, only lightweight options in deployment config is a mismatch, so # we update it dynamically without restarting the replica. @@ -1463,8 +1462,7 @@ def _scale_deployment_replicas(self) -> bool: f"Adding STOPPING to replica_tag: {replica}, " f"deployment_name: {self._name}" ) - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) return replicas_changed @@ -1583,8 +1581,7 @@ def _check_startup_replicas( self._replica_constructor_retry_counter += 1 replicas_failed = True - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) elif start_status in [ ReplicaStartupStatus.PENDING_ALLOCATION, ReplicaStartupStatus.PENDING_INITIALIZATION, @@ -1597,8 +1594,7 @@ def _check_startup_replicas( # Does it make sense to stop replicas in PENDING_ALLOCATION # state? if is_slow and stop_on_slow: - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica, graceful_stop=False) else: self._replicas.add(original_state, replica) @@ -1619,6 +1615,18 @@ def _check_startup_replicas( return slow_replicas, transitioned_to_running + def _stop_replica(self, replica, graceful_stop=True): + """Stop replica + 1. Stop the replica. + 2. Change the replica into stopping state. + 3. Set the health replica stats to 0. + """ + replica.stop(graceful=graceful_stop) + self._replicas.add(ReplicaState.STOPPING, replica) + self.health_check_gauge.set( + 0, tags={"deployment": self._name, "replica": replica.replica_tag} + ) + def _check_and_update_replicas(self) -> bool: """ Check current state of all DeploymentReplica being tracked, and compare @@ -1644,8 +1652,7 @@ def _check_and_update_replicas(self) -> bool: self.health_check_gauge.set( 0, tags={"deployment": self._name, "replica": replica.replica_tag} ) - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica, graceful_stop=False) # If this is a replica of the target version, the deployment # enters the "UNHEALTHY" status until the replica is # recovered or a new deploy happens. @@ -1855,8 +1862,7 @@ def _stop_all_replicas(self) -> bool: ReplicaState.RECOVERING, ] ): - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) replica_changed = True return replica_changed From 216b80b40c63ecc41612b5792afcee131d6f8166 Mon Sep 17 00:00:00 2001 From: sihanwang41 Date: Thu, 27 Apr 2023 00:13:30 +0000 Subject: [PATCH 2/5] Add test Signed-off-by: sihanwang41 --- python/ray/serve/tests/test_standalone3.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/python/ray/serve/tests/test_standalone3.py b/python/ray/serve/tests/test_standalone3.py index 9170bb4fe700..31f333ee99c6 100644 --- a/python/ray/serve/tests/test_standalone3.py +++ b/python/ray/serve/tests/test_standalone3.py @@ -380,5 +380,43 @@ def predict(self, inp): ) +def test_replica_health_metric(ray_instance): + """Test replica health metrics""" + + @serve.deployment(num_replicas=2) + def f(): + return "hello" + + serve.run(f.bind()) + + def count_live_replica_metrics(): + resp = requests.get("http://127.0.0.1:9999").text + resp = resp.split("\n") + count = 0 + for metrics in resp: + if "# HELP" in metrics or "# TYPE" in metrics: + continue + if "serve_deployment_replica_healthy" in metrics: + if "1.0" in metrics: + count += 1 + return count + + wait_for_condition( + lambda: count_live_replica_metrics() == 2, timeout=60, retry_interval_ms=500 + ) + + # Add more replicas + serve.run(f.options(num_replicas=10).bind()) + wait_for_condition( + lambda: count_live_replica_metrics() == 10, timeout=60, retry_interval_ms=500 + ) + + # delete the application + serve.delete("default_f") + wait_for_condition( + lambda: count_live_replica_metrics() == 0, timeout=60, retry_interval_ms=500 + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) From 501ce81ce1fb2e4ef27bb8a0bd582456c9c13d43 Mon Sep 17 00:00:00 2001 From: sihanwang41 Date: Thu, 27 Apr 2023 03:52:28 +0000 Subject: [PATCH 3/5] Update Signed-off-by: sihanwang41 --- python/ray/serve/tests/test_standalone3.py | 77 +++++++++++----------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/python/ray/serve/tests/test_standalone3.py b/python/ray/serve/tests/test_standalone3.py index 31f333ee99c6..1dcabaf6ddbe 100644 --- a/python/ray/serve/tests/test_standalone3.py +++ b/python/ray/serve/tests/test_standalone3.py @@ -196,6 +196,45 @@ def verify_metrics(): serve.shutdown() +def test_replica_health_metric(ray_instance): + """Test replica health metrics""" + + @serve.deployment(num_replicas=2) + def f(): + return "hello" + + serve.run(f.bind()) + + def count_live_replica_metrics(): + resp = requests.get("http://127.0.0.1:9999").text + resp = resp.split("\n") + count = 0 + for metrics in resp: + if "# HELP" in metrics or "# TYPE" in metrics: + continue + if "serve_deployment_replica_healthy" in metrics: + if "1.0" in metrics: + count += 1 + return count + + wait_for_condition( + lambda: count_live_replica_metrics() == 2, timeout=60, retry_interval_ms=500 + ) + + # Add more replicas + serve.run(f.options(num_replicas=10).bind()) + wait_for_condition( + lambda: count_live_replica_metrics() == 10, timeout=60, retry_interval_ms=500 + ) + + # delete the application + serve.delete("default_f") + wait_for_condition( + lambda: count_live_replica_metrics() == 0, timeout=60, retry_interval_ms=500 + ) + serve.shutdown() + + def test_shutdown_remote(start_and_shutdown_ray_cli_function): """Check that serve.shutdown() works on a remote Ray cluster.""" @@ -380,43 +419,5 @@ def predict(self, inp): ) -def test_replica_health_metric(ray_instance): - """Test replica health metrics""" - - @serve.deployment(num_replicas=2) - def f(): - return "hello" - - serve.run(f.bind()) - - def count_live_replica_metrics(): - resp = requests.get("http://127.0.0.1:9999").text - resp = resp.split("\n") - count = 0 - for metrics in resp: - if "# HELP" in metrics or "# TYPE" in metrics: - continue - if "serve_deployment_replica_healthy" in metrics: - if "1.0" in metrics: - count += 1 - return count - - wait_for_condition( - lambda: count_live_replica_metrics() == 2, timeout=60, retry_interval_ms=500 - ) - - # Add more replicas - serve.run(f.options(num_replicas=10).bind()) - wait_for_condition( - lambda: count_live_replica_metrics() == 10, timeout=60, retry_interval_ms=500 - ) - - # delete the application - serve.delete("default_f") - wait_for_condition( - lambda: count_live_replica_metrics() == 0, timeout=60, retry_interval_ms=500 - ) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) From 09de3f140acd8064c522a9ec80a470bed3743c49 Mon Sep 17 00:00:00 2001 From: sihanwang41 Date: Thu, 27 Apr 2023 04:45:31 +0000 Subject: [PATCH 4/5] Update Signed-off-by: sihanwang41 --- python/ray/serve/tests/test_standalone3.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/tests/test_standalone3.py b/python/ray/serve/tests/test_standalone3.py index 1dcabaf6ddbe..d1666f62cd97 100644 --- a/python/ray/serve/tests/test_standalone3.py +++ b/python/ray/serve/tests/test_standalone3.py @@ -22,6 +22,7 @@ from ray.exceptions import RayActorError from ray.serve._private.constants import ( SYNC_HANDLE_IN_DAG_FEATURE_FLAG_ENV_KEY, + SERVE_DEFAULT_APP_NAME, ) from ray.serve.context import get_global_client from ray.tests.conftest import call_ray_stop_only # noqa: F401 @@ -218,19 +219,19 @@ def count_live_replica_metrics(): return count wait_for_condition( - lambda: count_live_replica_metrics() == 2, timeout=60, retry_interval_ms=500 + lambda: count_live_replica_metrics() == 2, timeout=120, retry_interval_ms=500 ) # Add more replicas serve.run(f.options(num_replicas=10).bind()) wait_for_condition( - lambda: count_live_replica_metrics() == 10, timeout=60, retry_interval_ms=500 + lambda: count_live_replica_metrics() == 10, timeout=120, retry_interval_ms=500 ) # delete the application - serve.delete("default_f") + serve.delete(SERVE_DEFAULT_APP_NAME) wait_for_condition( - lambda: count_live_replica_metrics() == 0, timeout=60, retry_interval_ms=500 + lambda: count_live_replica_metrics() == 0, timeout=120, retry_interval_ms=500 ) serve.shutdown() From 0f673ed17a594fe658d2349c6f038fa14eb6e085 Mon Sep 17 00:00:00 2001 From: sihanwang41 Date: Thu, 27 Apr 2023 17:19:53 +0000 Subject: [PATCH 5/5] Update Signed-off-by: sihanwang41 --- python/ray/serve/tests/test_standalone3.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ray/serve/tests/test_standalone3.py b/python/ray/serve/tests/test_standalone3.py index d1666f62cd97..47b7173b187c 100644 --- a/python/ray/serve/tests/test_standalone3.py +++ b/python/ray/serve/tests/test_standalone3.py @@ -197,6 +197,11 @@ def verify_metrics(): serve.shutdown() +@pytest.mark.parametrize( + "ray_instance", + [], + indirect=True, +) def test_replica_health_metric(ray_instance): """Test replica health metrics"""