[serve] change default for max and target ongoing requests
Change the default value of `max_ongoing_requests` from 100 to 5 and the default value of `target_ongoing_requests` from 1.0 to 2.0. Log a warning when a deployment or Serve config relies on either old default, and update the docs and tests to match.


Signed-off-by: Cindy Zhang <[email protected]>
zcin committed Jun 18, 2024
1 parent 4eb232c commit 5ad7f14
Showing 10 changed files with 99 additions and 42 deletions.
37 changes: 37 additions & 0 deletions dashboard/modules/serve/serve_rest_api_impl.py
@@ -167,6 +167,8 @@ async def put_all_applications(self, req: Request) -> Response:
text=repr(e),
)

self.log_config_change_default_warning(config)

config_http_options = config.http_options.dict()
location = ProxyLocation._to_deployment_mode(config.proxy_location)
full_http_options = dict({"location": location}, **config_http_options)
@@ -213,6 +215,41 @@ def validate_http_options(self, client, http_options):
f"updated: {divergent_http_options}."
)

def log_config_change_default_warning(self, config):
from ray.serve.config import AutoscalingConfig

for deployment in [
d for app in config.applications for d in app.deployments
]:
if "max_ongoing_requests" not in deployment.dict(exclude_unset=True):
logger.warning(
"The default value for `max_ongoing_requests` has changed "
"from 100 to 5 in Ray 2.31.0."
)
break

for deployment in [
d for app in config.applications for d in app.deployments
]:
if isinstance(deployment.autoscaling_config, dict):
autoscaling_config = deployment.autoscaling_config
elif isinstance(deployment.autoscaling_config, AutoscalingConfig):
autoscaling_config = deployment.autoscaling_config.dict(
exclude_unset=True
)
else:
continue

if (
"target_num_ongoing_requests_per_replica" not in autoscaling_config
and "target_ongoing_requests" not in autoscaling_config
):
logger.warning(
"The default value for `target_ongoing_requests` has changed "
"from 1.0 to 2.0 in Ray 2.31.0."
)
break
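The unset-field checks above hinge on pydantic's `exclude_unset`: a field the user never set is omitted from the dict even when its value equals the default. A small standalone sketch (the `Opts` model is illustrative, not part of this diff):

```python
from pydantic import BaseModel

class Opts(BaseModel):
    max_ongoing_requests: int = 5

# Never set by the user -> omitted, so the warning above would fire.
print(Opts().dict(exclude_unset=True))                        # {}
# Explicitly set, even to the default value -> present, no warning.
print(Opts(max_ongoing_requests=5).dict(exclude_unset=True))  # {'max_ongoing_requests': 5}
```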

async def get_serve_controller(self):
"""Gets the ServeController to the this cluster's Serve app.
12 changes: 6 additions & 6 deletions doc/source/serve/advanced-guides/advanced-autoscaling.md
@@ -14,12 +14,12 @@ In this section, we go into more detail about Serve autoscaling concepts as well

To define what the steady state of your deployments should be, set values for `target_ongoing_requests` and `max_ongoing_requests`.

#### **target_num_ongoing_requests_per_replica [default=1]**
#### **target_num_ongoing_requests_per_replica [default=2]**
This parameter is renamed to `target_ongoing_requests`. `target_num_ongoing_requests_per_replica` will be removed in a future release.

#### **target_ongoing_requests [default=1]**
#### **target_ongoing_requests [default=2]**
:::{note}
The default for `target_ongoing_requests` will be changed to 2.0 in an upcoming Ray release. You can continue to set it manually to override the default.
The default for `target_ongoing_requests` changed from 1.0 to 2.0 in Ray 2.31.0. You can continue to set it manually to override the default.
:::
Serve scales the number of replicas for a deployment up or down based on the average number of ongoing requests per replica. Specifically, Serve compares the *actual* number of ongoing requests per replica with the target value you set in the autoscaling config and makes upscale or downscale decisions from that. Set the target value with `target_ongoing_requests`, and Serve attempts to ensure that each replica has roughly that number
of requests being processed and waiting in the queue.
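For instance, setting the target explicitly in a deployment might look like the following minimal sketch (the class body and values are illustrative):

```python
from ray import serve

@serve.deployment(
    autoscaling_config={
        "target_ongoing_requests": 2,  # desired steady-state load per replica
        "min_replicas": 1,
        "max_replicas": 10,
    },
)
class MyModel:
    async def __call__(self, request) -> str:
        return "ok"
```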
@@ -30,12 +30,12 @@ Always load test your workloads. For example, if the use case is latency sensiti
As an example, suppose you have two replicas of a synchronous deployment that has 100ms latency, serving a traffic load of 30 QPS. Then Serve assigns requests to replicas faster than the replicas can finish processing them; more and more requests queue up at the replica (these requests are "ongoing requests") as time progresses, and then the average number of ongoing requests at each replica steadily increases. Latency also increases because new requests have to wait for old requests to finish processing. If you set `target_ongoing_requests = 1`, Serve detects a higher than desired number of ongoing requests per replica, and adds more replicas. At 3 replicas, your system would be able to process 30 QPS with 1 ongoing request per replica on average.
:::
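The arithmetic in that example follows directly from Little's law; a quick sketch using the numbers from the text:

```python
qps = 30          # offered load, requests per second
latency_s = 0.1   # 100 ms per request, processed synchronously
target = 1        # target_ongoing_requests per replica

total_ongoing = qps * latency_s           # 3.0 ongoing requests on average
replicas_needed = total_ongoing / target  # 3.0 -> 3 replicas
print(replicas_needed)
```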

#### **max_concurrent_queries [default=100] (DEPRECATED)**
#### **max_concurrent_queries [default=5] (DEPRECATED)**
This parameter is renamed to `max_ongoing_requests`. `max_concurrent_queries` will be removed in a future release.

#### **max_ongoing_requests [default=100]**
#### **max_ongoing_requests [default=5]**
:::{note}
The default for `max_ongoing_requests` will be changed to 5 in an upcoming Ray release. You can continue to set it manually to override the default.
The default for `max_ongoing_requests` changed from 100 to 5 in Ray 2.31.0. You can continue to set it manually to override the default.
:::
There is also a maximum queue limit that proxies respect when assigning requests to replicas. Define the limit with `max_ongoing_requests`. Set `max_ongoing_requests` to about 20% to 50% higher than `target_ongoing_requests`. Note that `target_ongoing_requests` should always be strictly less than `max_ongoing_requests`; otherwise the deployment never scales up.
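Concretely, the sizing rule might look like this sketch, with `max_ongoing_requests` set 50% above the target (values illustrative):

```python
from ray import serve

@serve.deployment(
    max_ongoing_requests=3,  # strictly greater than the target, ~50% higher
    autoscaling_config={
        "target_ongoing_requests": 2,
        "min_replicas": 1,
        "max_replicas": 10,
    },
)
class SizedModel:
    async def __call__(self, request) -> str:
        return "ok"
```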

11 changes: 1 addition & 10 deletions python/ray/serve/_private/application_state.py
@@ -21,11 +21,7 @@
TargetCapacityDirection,
)
from ray.serve._private.config import DeploymentConfig
from ray.serve._private.constants import (
NEW_DEFAULT_MAX_ONGOING_REQUESTS,
RAY_SERVE_ENABLE_TASK_EVENTS,
SERVE_LOGGER_NAME,
)
from ray.serve._private.constants import RAY_SERVE_ENABLE_TASK_EVENTS, SERVE_LOGGER_NAME
from ray.serve._private.deploy_utils import (
deploy_args_to_deployment_info,
get_app_code_version,
@@ -1110,11 +1106,6 @@ def override_deployment_info(
# `num_replicas="auto"`
if options.get("num_replicas") == "auto":
options["num_replicas"] = None
if (
"max_ongoing_requests"
not in original_options["user_configured_option_names"]
):
options["max_ongoing_requests"] = NEW_DEFAULT_MAX_ONGOING_REQUESTS

new_config = AutoscalingConfig.default().dict()
# If `autoscaling_config` is specified, its values override
10 changes: 1 addition & 9 deletions python/ray/serve/_private/config.py
@@ -25,7 +25,6 @@
DEFAULT_HEALTH_CHECK_TIMEOUT_S,
DEFAULT_MAX_ONGOING_REQUESTS,
MAX_REPLICAS_PER_NODE_MAX_VALUE,
NEW_DEFAULT_MAX_ONGOING_REQUESTS,
)
from ray.serve._private.utils import DEFAULT, DeploymentOptionUpdateType
from ray.serve.config import AutoscalingConfig
@@ -87,7 +86,7 @@ class DeploymentConfig(BaseModel):
handles requests to this deployment. Defaults to 1.
max_ongoing_requests: The maximum number of queries
that are sent to a replica of this deployment without receiving
a response. Defaults to 100.
a response. Defaults to 5.
max_queued_requests: Maximum number of requests to this deployment that will be
queued at each *caller* (proxy or DeploymentHandle). Once this limit is
reached, subsequent requests will raise a BackPressureError (for handles) or
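For context, a minimal sketch of how `max_queued_requests` surfaces at a handle (assumes a running Ray cluster; the deployment body is illustrative):

```python
from ray import serve
from ray.serve.exceptions import BackPressureError

@serve.deployment(max_ongoing_requests=5, max_queued_requests=10)
class Model:
    async def __call__(self, x: int) -> int:
        return x * 2

handle = serve.run(Model.bind())
try:
    result = handle.remote(1).result()
except BackPressureError:
    # Raised once 10 requests are already queued at this handle.
    result = None
```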
@@ -332,19 +331,12 @@ def handle_num_replicas_auto(
"""Return modified `max_ongoing_requests` and `autoscaling_config`
for when num_replicas="auto".
If `max_ongoing_requests` is unspecified (DEFAULT.VALUE), returns
the modified value NEW_DEFAULT_MAX_ONGOING_REQUESTS. Otherwise,
doesn't change `max_ongoing_requests`.
If `autoscaling_config` is unspecified, returns the modified value
AutoscalingConfig.default().
If it is specified, the specified fields in `autoscaling_config`
override that of AutoscalingConfig.default().
"""

if max_ongoing_requests is DEFAULT.VALUE:
max_ongoing_requests = NEW_DEFAULT_MAX_ONGOING_REQUESTS

if autoscaling_config in [DEFAULT.VALUE, None]:
# If autoscaling config wasn't specified, use default
# configuration
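A minimal sketch of the `num_replicas="auto"` path this helper handles (deployment body illustrative). With the 2.31.0 defaults, the deployment simply falls back to the global `max_ongoing_requests=5` and `AutoscalingConfig.default()` unless individual fields are overridden:

```python
from ray import serve

# num_replicas="auto" enables autoscaling with AutoscalingConfig.default();
# max_ongoing_requests now falls back to the global default of 5.
@serve.deployment(num_replicas="auto")
class AutoScaled:
    async def __call__(self, x: int) -> int:
        return x + 1

app = AutoScaled.bind()
```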
4 changes: 2 additions & 2 deletions python/ray/serve/_private/constants.py
@@ -102,8 +102,8 @@
DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S = 2
DEFAULT_HEALTH_CHECK_PERIOD_S = 10
DEFAULT_HEALTH_CHECK_TIMEOUT_S = 30
DEFAULT_MAX_ONGOING_REQUESTS = 100
NEW_DEFAULT_MAX_ONGOING_REQUESTS = 5
DEFAULT_MAX_ONGOING_REQUESTS = 5
DEFAULT_TARGET_ONGOING_REQUESTS = 2

# HTTP Proxy health check configs
PROXY_HEALTH_CHECK_TIMEOUT_S = (
35 changes: 33 additions & 2 deletions python/ray/serve/api.py
@@ -306,9 +306,9 @@ class MyDeployment:
can be updated dynamically without restarting the replicas of the
deployment. The user_config must be fully JSON-serializable.
max_concurrent_queries: [DEPRECATED] Maximum number of queries that are sent to
a replica of this deployment without receiving a response. Defaults to 100.
a replica of this deployment without receiving a response. Defaults to 5.
max_ongoing_requests: Maximum number of requests that are sent to a
replica of this deployment without receiving a response. Defaults to 100.
replica of this deployment without receiving a response. Defaults to 5.
max_queued_requests: [EXPERIMENTAL] Maximum number of requests to this
deployment that will be queued at each *caller* (proxy or DeploymentHandle).
Once this limit is reached, subsequent requests will raise a
@@ -350,10 +350,30 @@ class MyDeployment:
"version."
)

if (
isinstance(autoscaling_config, dict)
and "target_num_ongoing_requests_per_replica" not in autoscaling_config
and "target_ongoing_requests" not in autoscaling_config
) or (
isinstance(autoscaling_config, AutoscalingConfig)
and "target_num_ongoing_requests_per_replica"
not in autoscaling_config.dict(exclude_unset=True)
and "target_ongoing_requests"
not in autoscaling_config.dict(exclude_unset=True)
):
logger.warning(
"The default value for `target_ongoing_requests` has changed from 1.0 "
"to 2.0 in Ray 2.31.0."
)

if max_ongoing_requests is None:
raise ValueError("`max_ongoing_requests` must be non-null, got None.")
elif max_ongoing_requests is DEFAULT.VALUE:
if max_concurrent_queries is None:
logger.warning(
"The default value for `max_ongoing_requests` has changed from "
"100 to 5 in Ray 2.31.0."
)
max_ongoing_requests = DEFAULT_MAX_ONGOING_REQUESTS
else:
max_ongoing_requests = max_concurrent_queries
@@ -408,6 +428,12 @@ class MyDeployment:
"been deprecated and replaced by `max_ongoing_requests`."
)

if max_ongoing_requests is DEFAULT.VALUE:
logger.warning(
"The default value for `max_ongoing_requests` has changed from 100 to 5 in "
"Ray 2.31.0."
)

if isinstance(logging_config, LoggingConfig):
logging_config = logging_config.dict()

@@ -935,3 +961,8 @@ async def __call__(self, val: int) -> int:
ServeUsageTag.SERVE_GET_DEPLOYMENT_HANDLE_API_USED.record("1")

return client.get_handle(deployment_name, app_name, check_exists=_check_exists)


@PublicAPI
def get_tracing_context() -> Optional[Dict]:
    """Return the tracing context for the current request (currently a stub that always returns None)."""
    return None
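Anyone who wants the pre-2.31.0 behavior can pin both values explicitly, which also avoids the new warnings; a sketch (class name illustrative):

```python
from ray import serve

@serve.deployment(
    max_ongoing_requests=100,  # old default, now must be set explicitly
    autoscaling_config={
        "target_ongoing_requests": 1.0,  # old default
        "min_replicas": 1,
        "max_replicas": 10,
    },
)
class LegacyDefaults:
    async def __call__(self) -> str:
        return "ok"
```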
7 changes: 4 additions & 3 deletions python/ray/serve/config.py
@@ -20,6 +20,7 @@
DEFAULT_GRPC_PORT,
DEFAULT_HTTP_HOST,
DEFAULT_HTTP_PORT,
DEFAULT_TARGET_ONGOING_REQUESTS,
DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
SERVE_LOGGER_NAME,
)
@@ -42,7 +43,7 @@ class AutoscalingConfig(BaseModel):

# DEPRECATED: replaced by target_ongoing_requests
target_num_ongoing_requests_per_replica: PositiveFloat = Field(
default=1.0,
default=DEFAULT_TARGET_ONGOING_REQUESTS,
description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
)
# Will default to 2.0 in the future.
@@ -134,10 +135,10 @@ def serialize_policy(self) -> None:
@classmethod
def default(cls):
return cls(
target_num_ongoing_requests_per_replica=DEFAULT_TARGET_ONGOING_REQUESTS,
target_ongoing_requests=DEFAULT_TARGET_ONGOING_REQUESTS,
min_replicas=1,
max_replicas=100,
target_num_ongoing_requests_per_replica=2,
target_ongoing_requests=2,
)

def get_policy(self) -> Callable:
9 changes: 6 additions & 3 deletions python/ray/serve/tests/test_api.py
@@ -14,7 +14,10 @@
from ray._private.test_utils import SignalActor, wait_for_condition
from ray.serve._private.api import call_app_builder_with_args_if_necessary
from ray.serve._private.common import DeploymentID
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve._private.constants import (
DEFAULT_MAX_ONGOING_REQUESTS,
SERVE_DEFAULT_APP_NAME,
)
from ray.serve.deployment import Application
from ray.serve.exceptions import RayServeException
from ray.serve.handle import DeploymentHandle
@@ -1034,10 +1037,10 @@ class A:
).bind()

serve.run(serve.deployment(max_concurrent_queries=None)(A).bind())
assert get_max_ongoing_requests() == 100
assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS

serve.run(serve.deployment(A).options(max_concurrent_queries=None).bind())
assert get_max_ongoing_requests() == 100
assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS

serve.run(
serve.deployment(max_ongoing_requests=8, max_concurrent_queries=None)(A).bind()
10 changes: 6 additions & 4 deletions python/ray/serve/tests/test_autoscaling_policy.py
@@ -297,7 +297,7 @@ async def __call__(self):
await signal.wait.remote()
return "sup"

@serve.deployment(graceful_shutdown_timeout_s=1)
@serve.deployment(graceful_shutdown_timeout_s=1, max_ongoing_requests=50)
class Router:
def __init__(self, handle: DeploymentHandle):
if use_get_handle_api:
@@ -1144,8 +1144,9 @@ def test_max_ongoing_requests_set_to_one(

@serve.deployment(
autoscaling_config=AutoscalingConfig(
target_ongoing_requests=1.0,
min_replicas=1,
max_replicas=5,
max_replicas=3,
upscale_delay_s=0.5,
downscale_delay_s=0.5,
metrics_interval_s=0.5,
@@ -1171,11 +1172,12 @@ async def f():
# 2. Wait for the number of waiters on signal to increase by 1.
# 3. Assert the number of replicas has increased by 1.
refs = []
for i in range(5):
for i in range(3):
refs.append(h.remote())

def check_num_waiters(target: int):
assert ray.get(signal.cur_num_waiters.remote()) == target
num_waiters = ray.get(signal.cur_num_waiters.remote())
assert num_waiters == target
return True

wait_for_condition(check_num_waiters, target=i + 1)
6 changes: 3 additions & 3 deletions python/ray/serve/tests/test_controller.py
@@ -156,15 +156,15 @@ def autoscaling_app():
"message": "",
"deployment_config": {
"name": "autoscaling_app",
"max_concurrent_queries": 100,
"max_ongoing_requests": 100,
"max_concurrent_queries": 5,
"max_ongoing_requests": 5,
"max_queued_requests": -1,
"user_config": None,
"autoscaling_config": {
"min_replicas": 1,
"initial_replicas": None,
"max_replicas": 10,
"target_num_ongoing_requests_per_replica": 1.0,
"target_num_ongoing_requests_per_replica": 2.0,
"target_ongoing_requests": None,
"metrics_interval_s": 10.0,
"look_back_period_s": 30.0,