[Serve] Set app_msg to empty string by default (ray-project#35646)

The ApplicationState currently updates its _status directly without necessarily updating the corresponding message. This causes stale messages to sometimes appear with updated statuses. This change sets the app_msg to an empty string by default, so the message is cleared or updated whenever the status is changed. Related issue number Closes ray-project#35508
shrekris-anyscale · May 24, 2023 · 902b759 · 902b759
1 parent f6410b0
commit 902b759
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 22 deletions.
diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py
@@ -41,7 +41,7 @@ def __init__(
 
         self._name = name
         self._deploy_obj_ref = deploy_obj_ref
-        self._app_msg = ""
+        self._status_msg = ""
         self._deployment_state_manager = deployment_state_manager
         self._deployment_params: List[Dict] = []
         # This set tracks old deployments that are being deleted
@@ -92,7 +92,7 @@ def deployments(self) -> List[str]:
 
     def delete(self):
         """Delete the application"""
-        self._status = ApplicationStatus.DELETING
+        self._update_status(ApplicationStatus.DELETING)
 
     def deploy(self, deployment_params: List[Dict]) -> List[str]:
         """Deploy the application.
@@ -141,13 +141,13 @@ def deploy(self, deployment_params: List[Dict]) -> List[str]:
                 "path in your application to avoid this issue."
             )
 
-        self._status = ApplicationStatus.DEPLOYING
+        self._update_status(ApplicationStatus.DEPLOYING)
         return cur_deployments_to_delete
 
     def update_obj_ref(self, deploy_obj_ref: ObjectRef, deployment_time: int):
         self._deploy_obj_ref = deploy_obj_ref
         self._deployment_timestamp = deployment_time
-        self._status = ApplicationStatus.DEPLOYING
+        self._update_status(ApplicationStatus.DEPLOYING)
 
     def _process_terminating_deployments(self):
         """Update the tracking for all deployments being deleted
@@ -202,42 +202,60 @@ def update(self):
                     ray.get(finished[0])
                     logger.info(f"Deploy task for app '{self._name}' ran successfully.")
                 except RayTaskError as e:
-                    self._status = ApplicationStatus.DEPLOY_FAILED
                     # NOTE(zcin): we should use str(e) instead of traceback.format_exc()
                     # here because the full details of the error is not displayed
                     # properly with traceback.format_exc(). RayTaskError has its own
                     # custom __str__ function.
-                    self._app_msg = f"Deploying app '{self._name}' failed:\n{str(e)}"
-                    logger.warning(self._app_msg)
+                    self._update_status(
+                        ApplicationStatus.DEPLOY_FAILED,
+                        status_msg=f"Deploying app '{self._name}' failed:\n{str(e)}",
+                    )
+                    logger.warning(self._status_msg)
                     return
                 except RuntimeEnvSetupError:
-                    self._status = ApplicationStatus.DEPLOY_FAILED
-                    self._app_msg = (
-                        f"Runtime env setup for app '{self._name}' "
-                        f"failed:\n{traceback.format_exc()}"
+                    self._update_status(
+                        ApplicationStatus.DEPLOY_FAILED,
+                        status_msg=(
+                            f"Runtime env setup for app '{self._name}' "
+                            f"failed:\n{traceback.format_exc()}"
+                        ),
                     )
-                    logger.warning(self._app_msg)
+                    logger.warning(self._status_msg)
                     return
                 except Exception:
-                    self._status = ApplicationStatus.DEPLOY_FAILED
-                    self._app_msg = (
-                        "Unexpected error occured while deploying application "
-                        f"'{self._name}':\n{traceback.format_exc()}"
+                    self._update_status(
+                        ApplicationStatus.DEPLOY_FAILED,
+                        status_msg=(
+                            "Unexpected error occured while deploying "
+                            f"application '{self._name}':"
+                            f"\n{traceback.format_exc()}"
+                        ),
                     )
-                    logger.warning(self._app_msg)
+                    logger.warning(self._status_msg)
                     return
             deployments_statuses = (
                 self._deployment_state_manager.get_deployment_statuses(self.deployments)
             )
             num_health_deployments = 0
+            unhealthy_deployment_names = []
             for deployment_status in deployments_statuses:
                 if deployment_status.status == DeploymentStatus.UNHEALTHY:
-                    self._status = ApplicationStatus.DEPLOY_FAILED
-                    return
+                    unhealthy_deployment_names.append(deployment_status.name)
                 if deployment_status.status == DeploymentStatus.HEALTHY:
                     num_health_deployments += 1
+
+            if len(unhealthy_deployment_names) != 0:
+                self._update_status(
+                    ApplicationStatus.DEPLOY_FAILED,
+                    status_msg=(
+                        "The following deployments are UNHEALTHY: "
+                        f"{unhealthy_deployment_names}"
+                    ),
+                )
+                return
+
             if num_health_deployments == len(deployments_statuses):
-                self._status = ApplicationStatus.RUNNING
+                self._update_status(ApplicationStatus.RUNNING)
 
             self._process_terminating_deployments()
 
@@ -249,7 +267,7 @@ def get_application_status_info(self) -> ApplicationStatusInfo:
         """Return the application status information"""
         return ApplicationStatusInfo(
             self._status,
-            message=self._app_msg,
+            message=self._status_msg,
             deployment_timestamp=self._deployment_timestamp,
         )
 
@@ -269,6 +287,10 @@ def list_deployment_details(self) -> Dict[str, DeploymentDetails]:
         }
         return {k: v for k, v in details.items() if v is not None}
 
+    def _update_status(self, status: ApplicationStatus, status_msg: str = ""):
+        self._status = status
+        self._status_msg = status_msg
+
 
 class ApplicationStateManager:
     def __init__(self, deployment_state_manager):

diff --git a/python/ray/serve/tests/test_application_state.py b/python/ray/serve/tests/test_application_state.py
@@ -115,7 +115,7 @@ def test_update_app_running(mocked_application_state_manager):
 
 
 def test_update_app_deploy_failed(mocked_application_state_manager):
-    """Test DEPLOYING -> DEPLOY_FAILED"""
+    """Test DEPLOYING -> DEPLOY_FAILED -> DEPLOYING -> RUNNING"""
     app_state_manager, deployment_state_manager = mocked_application_state_manager
     app_state_manager.deploy_application("test_app", [{"name": "d1"}])
     # Simulate controller
@@ -128,8 +128,31 @@ def test_update_app_deploy_failed(mocked_application_state_manager):
     app_status = app_state_manager.get_app_status("test_app")
     assert app_status.status == ApplicationStatus.DEPLOY_FAILED
     # rerun update, application status should not make difference
+    deploy_failed_msg = app_status.message
+    assert len(deploy_failed_msg) != 0
     app_state_manager.update()
     assert app_status.status == ApplicationStatus.DEPLOY_FAILED
+    assert app_status.message == deploy_failed_msg
+
+    app_state_manager.deploy_application("test_app", [{"name": "d1"}, {"name": "d2"}])
+    # Simulate controller
+    deployment_state_manager.deploy("d1", None)
+    deployment_state_manager.deploy("d2", None)
+
+    app_status = app_state_manager.get_app_status("test_app")
+    assert app_status.status == ApplicationStatus.DEPLOYING
+    assert app_status.message != deploy_failed_msg
+    deployment_state_manager.set_deployment_statuses_healthy("d1")
+    deployment_state_manager.set_deployment_statuses_healthy("d2")
+    app_state_manager.update()
+    app_status = app_state_manager.get_app_status("test_app")
+    assert app_status.status == ApplicationStatus.RUNNING
+    running_msg = app_status.message
+    assert running_msg != deploy_failed_msg
+    # rerun update, application status should not make difference
+    app_state_manager.update()
+    assert app_status.status == ApplicationStatus.RUNNING
+    assert app_status.message == running_msg
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")