diff --git a/dashboard/modules/serve/serve_rest_api_impl.py b/dashboard/modules/serve/serve_rest_api_impl.py
index 3d59ebaf9c94..d1aaa8a9dabe 100644
--- a/dashboard/modules/serve/serve_rest_api_impl.py
+++ b/dashboard/modules/serve/serve_rest_api_impl.py
@@ -167,6 +167,8 @@ async def put_all_applications(self, req: Request) -> Response:
                 text=repr(e),
             )
 
+        self.log_config_change_default_warning(config)
+
         config_http_options = config.http_options.dict()
         location = ProxyLocation._to_deployment_mode(config.proxy_location)
         full_http_options = dict({"location": location}, **config_http_options)
@@ -213,6 +215,41 @@ def validate_http_options(self, client, http_options):
                 f"updated: {divergent_http_options}."
             )
 
+    def log_config_change_default_warning(self, config):
+        from ray.serve.config import AutoscalingConfig
+
+        for deployment in [
+            d for app in config.applications for d in app.deployments
+        ]:
+            if "max_ongoing_requests" not in deployment.dict(exclude_unset=True):
+                logger.warning(
+                    "The default value for `max_ongoing_requests` has changed "
+                    "from 100 to 5 in Ray 2.31.0."
+                )
+                break
+
+        for deployment in [
+            d for app in config.applications for d in app.deployments
+        ]:
+            if isinstance(deployment.autoscaling_config, dict):
+                autoscaling_config = deployment.autoscaling_config
+            elif isinstance(deployment.autoscaling_config, AutoscalingConfig):
+                autoscaling_config = deployment.autoscaling_config.dict(
+                    exclude_unset=True
+                )
+            else:
+                continue
+
+            if (
+                "target_num_ongoing_requests_per_replica" not in autoscaling_config
+                and "target_ongoing_requests" not in autoscaling_config
+            ):
+                logger.warning(
+                    "The default value for `target_ongoing_requests` has changed "
+                    "from 1.0 to 2.0 in Ray 2.31.0."
+                )
+                break
+
     async def get_serve_controller(self):
         """Gets the ServeController to the this cluster's Serve app.
diff --git a/doc/source/serve/advanced-guides/advanced-autoscaling.md b/doc/source/serve/advanced-guides/advanced-autoscaling.md
index c8514d4e746b..08becbea1137 100644
--- a/doc/source/serve/advanced-guides/advanced-autoscaling.md
+++ b/doc/source/serve/advanced-guides/advanced-autoscaling.md
@@ -14,12 +14,12 @@ In this section, we go into more detail about Serve autoscaling concepts as well
 
 To define what the steady state of your deployments should be, set values for `target_ongoing_requests` and `max_ongoing_requests`.
 
-#### **target_num_ongoing_requests_per_replica [default=1]**
+#### **target_num_ongoing_requests_per_replica [default=2]**
 This parameter is renamed to `target_ongoing_requests`. `target_num_ongoing_requests_per_replica` will be removed in a future release.
 
-#### **target_ongoing_requests [default=1]**
+#### **target_ongoing_requests [default=2]**
 :::{note}
-The default for `target_ongoing_requests` will be changed to 2.0 in an upcoming Ray release. You can continue to set it manually to override the default.
+The default for `target_ongoing_requests` changed from 1.0 to 2.0 in Ray 2.31.0. You can continue to set it manually to override the default.
 :::
 Serve scales the number of replicas for a deployment up or down based on the average number of ongoing requests per replica. Specifically, Serve compares the *actual* number of ongoing requests per replica with the target value you set in the autoscaling config and makes upscale or downscale decisions from that. Set the target value with `target_ongoing_requests`, and Serve attempts to ensure that each replica has roughly that number of requests being processed and waiting in the queue.
@@ -30,12 +30,12 @@ Always load test your workloads. For example, if the use case is latency sensiti
 
 As an example, suppose you have two replicas of a synchronous deployment that has 100ms latency, serving a traffic load of 30 QPS. Then Serve assigns requests to replicas faster than the replicas can finish processing them; more and more requests queue up at the replica (these requests are "ongoing requests") as time progresses, and then the average number of ongoing requests at each replica steadily increases. Latency also increases because new requests have to wait for old requests to finish processing. If you set `target_ongoing_requests = 1`, Serve detects a higher than desired number of ongoing requests per replica, and adds more replicas. At 3 replicas, your system would be able to process 30 QPS with 1 ongoing request per replica on average.
 :::
 
-#### **max_concurrent_queries [default=100] (DEPRECATED)**
+#### **max_concurrent_queries [default=5] (DEPRECATED)**
 This parameter is renamed to `max_ongoing_requests`. `max_concurrent_queries` will be removed in a future release.
 
-#### **max_ongoing_requests [default=100]**
+#### **max_ongoing_requests [default=5]**
 :::{note}
-The default for `max_ongoing_requests` will be changed to 5 in an upcoming Ray release. You can continue to set it manually to override the default.
+The default for `max_ongoing_requests` changed from 100 to 5 in Ray 2.31.0. You can continue to set it manually to override the default.
 :::
 There is also a maximum queue limit that proxies respect when assigning requests to replicas. Define the limit with `max_ongoing_requests`. Set `max_ongoing_requests` to ~20 to 50% higher than `target_ongoing_requests`. Note that `target_ongoing_requests` should always be strictly less than `max_ongoing_requests`, otherwise the deployment never scales up.
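The docs guidance above pairs naturally with setting both knobs explicitly. A minimal sketch (the deployment body and values are illustrative, not part of this patch) that pins both values and keeps `max_ongoing_requests` ~20% above the target, so behavior is identical before and after Ray 2.31.0:

from ray import serve

# Hypothetical deployment: explicitly set values always override the
# defaults, so neither new-default warning fires for this config.
@serve.deployment(
    max_ongoing_requests=12,  # ~20% above the target, per the docs guidance
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 10,
        "target_ongoing_requests": 10,
    },
)
class MyModel:
    async def __call__(self, request) -> str:
        return "ok"

app = MyModel.bind()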
diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py
index 8a77bdfeaf8a..75042306b053 100644
--- a/python/ray/serve/_private/application_state.py
+++ b/python/ray/serve/_private/application_state.py
@@ -21,11 +21,7 @@
     TargetCapacityDirection,
 )
 from ray.serve._private.config import DeploymentConfig
-from ray.serve._private.constants import (
-    NEW_DEFAULT_MAX_ONGOING_REQUESTS,
-    RAY_SERVE_ENABLE_TASK_EVENTS,
-    SERVE_LOGGER_NAME,
-)
+from ray.serve._private.constants import RAY_SERVE_ENABLE_TASK_EVENTS, SERVE_LOGGER_NAME
 from ray.serve._private.deploy_utils import (
     deploy_args_to_deployment_info,
     get_app_code_version,
@@ -1110,11 +1106,6 @@ def override_deployment_info(
         # `num_replicas="auto"`
         if options.get("num_replicas") == "auto":
             options["num_replicas"] = None
-            if (
-                "max_ongoing_requests"
-                not in original_options["user_configured_option_names"]
-            ):
-                options["max_ongoing_requests"] = NEW_DEFAULT_MAX_ONGOING_REQUESTS
 
             new_config = AutoscalingConfig.default().dict()
             # If `autoscaling_config` is specified, its values override
diff --git a/python/ray/serve/_private/config.py b/python/ray/serve/_private/config.py
index 6396ed7685f4..37a6b3662ab7 100644
--- a/python/ray/serve/_private/config.py
+++ b/python/ray/serve/_private/config.py
@@ -25,7 +25,6 @@
     DEFAULT_HEALTH_CHECK_TIMEOUT_S,
     DEFAULT_MAX_ONGOING_REQUESTS,
     MAX_REPLICAS_PER_NODE_MAX_VALUE,
-    NEW_DEFAULT_MAX_ONGOING_REQUESTS,
 )
 from ray.serve._private.utils import DEFAULT, DeploymentOptionUpdateType
 from ray.serve.config import AutoscalingConfig
@@ -87,7 +86,7 @@ class DeploymentConfig(BaseModel):
             handles requests to this deployment. Defaults to 1.
         max_ongoing_requests: The maximum number of queries that is sent to
             a replica of this deployment without receiving
-            a response. Defaults to 100.
+            a response. Defaults to 5.
         max_queued_requests: Maximum number of requests to this deployment
             that will be queued at each *caller* (proxy or DeploymentHandle).
             Once this limit is reached, subsequent requests will raise a
            BackPressureError (for handles) or
@@ -332,19 +331,12 @@ def handle_num_replicas_auto(
     """Return modified `max_ongoing_requests` and `autoscaling_config`
     for when num_replicas="auto".
 
-    If `max_ongoing_requests` is unspecified (DEFAULT.VALUE), returns
-    the modified value NEW_DEFAULT_MAX_ONGOING_REQUESTS. Otherwise,
-    doesn't change `max_ongoing_requests`.
-
     If `autoscaling_config` is unspecified, returns the modified value
     AutoscalingConfig.default(). If it is specified, the specified
     fields in `autoscaling_config` override that of
     AutoscalingConfig.default().
     """
-    if max_ongoing_requests is DEFAULT.VALUE:
-        max_ongoing_requests = NEW_DEFAULT_MAX_ONGOING_REQUESTS
-
     if autoscaling_config in [DEFAULT.VALUE, None]:
         # If autoscaling config wasn't specified, use default
        # configuration
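With the special-casing removed, `num_replicas="auto"` now only fills in the autoscaling config; an unset `max_ongoing_requests` falls through to the global default of 5 like any other deployment. A sketch of the resulting behavior (names are illustrative):

from ray import serve

# Under num_replicas="auto", unspecified autoscaling fields come from
# AutoscalingConfig.default(); fields given here override it. The unset
# max_ongoing_requests now simply resolves to the global default (5).
@serve.deployment(
    num_replicas="auto",
    autoscaling_config={"max_replicas": 20},
)
class AutoScaled:
    def __call__(self) -> str:
        return "ok"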
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index ff9b7156dd78..4f0e1ed4a766 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -102,8 +102,8 @@
 DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S = 2
 DEFAULT_HEALTH_CHECK_PERIOD_S = 10
 DEFAULT_HEALTH_CHECK_TIMEOUT_S = 30
-DEFAULT_MAX_ONGOING_REQUESTS = 100
-NEW_DEFAULT_MAX_ONGOING_REQUESTS = 5
+DEFAULT_MAX_ONGOING_REQUESTS = 5
+DEFAULT_TARGET_ONGOING_REQUESTS = 2
 
 # HTTP Proxy health check configs
 PROXY_HEALTH_CHECK_TIMEOUT_S = (
diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py
index 00f28cddacd7..b149bd570f55 100644
--- a/python/ray/serve/api.py
+++ b/python/ray/serve/api.py
@@ -306,9 +306,9 @@ class MyDeployment:
             can be updated dynamically without restarting the replicas of the
             deployment. The user_config must be fully JSON-serializable.
         max_concurrent_queries: [DEPRECATED] Maximum number of queries that are sent to
-            a replica of this deployment without receiving a response. Defaults to 100.
+            a replica of this deployment without receiving a response. Defaults to 5.
         max_ongoing_requests: Maximum number of requests that are sent to a
-            replica of this deployment without receiving a response. Defaults to 100.
+            replica of this deployment without receiving a response. Defaults to 5.
         max_queued_requests: [EXPERIMENTAL] Maximum number of requests to this
             deployment that will be queued at each *caller* (proxy or
             DeploymentHandle). Once this limit is reached, subsequent requests
             will raise a
@@ -350,10 +350,30 @@ class MyDeployment:
             "version."
         )
 
+    if (
+        isinstance(autoscaling_config, dict)
+        and "target_num_ongoing_requests_per_replica" not in autoscaling_config
+        and "target_ongoing_requests" not in autoscaling_config
+    ) or (
+        isinstance(autoscaling_config, AutoscalingConfig)
+        and "target_num_ongoing_requests_per_replica"
+        not in autoscaling_config.dict(exclude_unset=True)
+        and "target_ongoing_requests"
+        not in autoscaling_config.dict(exclude_unset=True)
+    ):
+        logger.warning(
+            "The default value for `target_ongoing_requests` has changed from 1.0 "
+            "to 2.0 in Ray 2.31.0."
+        )
+
     if max_ongoing_requests is None:
         raise ValueError("`max_ongoing_requests` must be non-null, got None.")
     elif max_ongoing_requests is DEFAULT.VALUE:
         if max_concurrent_queries is None:
+            logger.warning(
+                "The default value for `max_ongoing_requests` has changed from "
+                "100 to 5 in Ray 2.31.0."
+            )
             max_ongoing_requests = DEFAULT_MAX_ONGOING_REQUESTS
         else:
             max_ongoing_requests = max_concurrent_queries
@@ -408,6 +428,12 @@ class MyDeployment:
         "been deprecated and replaced by `max_ongoing_requests`."
     )
 
+    if max_ongoing_requests is DEFAULT.VALUE:
+        logger.warning(
+            "The default value for `max_ongoing_requests` has changed from 100 to 5 in "
+            "Ray 2.31.0."
+        )
+
     if isinstance(logging_config, LoggingConfig):
         logging_config = logging_config.dict()
@@ -935,3 +961,8 @@ async def __call__(self, val: int) -> int:
 
     ServeUsageTag.SERVE_GET_DEPLOYMENT_HANDLE_API_USED.record("1")
     return client.get_handle(deployment_name, app_name, check_exists=_check_exists)
+
+
+@PublicAPI
+def get_tracing_context() -> Optional[Dict]:
+    return None
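The new warnings key off pydantic's exclude_unset semantics: a field left at its default does not appear in `dict(exclude_unset=True)`, which is what distinguishes "never set" from "explicitly set to the default value". A small illustration (assumes pydantic v1-style `.dict()`, which is what the patch itself calls):

from ray.serve.config import AutoscalingConfig

unset = AutoscalingConfig(min_replicas=1, max_replicas=10)
explicit = AutoscalingConfig(
    min_replicas=1, max_replicas=10, target_ongoing_requests=2.0
)

# `unset` would trigger the target_ongoing_requests warning above;
# `explicit` would not, even though both end up targeting 2.0.
assert "target_ongoing_requests" not in unset.dict(exclude_unset=True)
assert "target_ongoing_requests" in explicit.dict(exclude_unset=True)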
diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py
index bd169f36cebd..5521d4a5519f 100644
--- a/python/ray/serve/config.py
+++ b/python/ray/serve/config.py
@@ -20,6 +20,7 @@
     DEFAULT_GRPC_PORT,
     DEFAULT_HTTP_HOST,
     DEFAULT_HTTP_PORT,
+    DEFAULT_TARGET_ONGOING_REQUESTS,
     DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
     SERVE_LOGGER_NAME,
 )
@@ -42,7 +43,7 @@ class AutoscalingConfig(BaseModel):
     # DEPRECATED: replaced by target_ongoing_requests
     target_num_ongoing_requests_per_replica: PositiveFloat = Field(
-        default=1.0,
+        default=DEFAULT_TARGET_ONGOING_REQUESTS,
         description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
     )
     # Will default to 1.0 in the future.
@@ -134,10 +135,10 @@ def serialize_policy(self) -> None:
     @classmethod
     def default(cls):
         return cls(
+            target_num_ongoing_requests_per_replica=DEFAULT_TARGET_ONGOING_REQUESTS,
+            target_ongoing_requests=DEFAULT_TARGET_ONGOING_REQUESTS,
             min_replicas=1,
             max_replicas=100,
-            target_num_ongoing_requests_per_replica=2,
-            target_ongoing_requests=2,
         )
 
     def get_policy(self) -> Callable:
diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py
index c246dcfd8436..1b832a13c9d2 100644
--- a/python/ray/serve/tests/test_api.py
+++ b/python/ray/serve/tests/test_api.py
@@ -14,7 +14,10 @@
 from ray._private.test_utils import SignalActor, wait_for_condition
 from ray.serve._private.api import call_app_builder_with_args_if_necessary
 from ray.serve._private.common import DeploymentID
-from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
+from ray.serve._private.constants import (
+    DEFAULT_MAX_ONGOING_REQUESTS,
+    SERVE_DEFAULT_APP_NAME,
+)
 from ray.serve.deployment import Application
 from ray.serve.exceptions import RayServeException
 from ray.serve.handle import DeploymentHandle
@@ -1034,10 +1037,10 @@ class A:
     ).bind()
 
     serve.run(serve.deployment(max_concurrent_queries=None)(A).bind())
-    assert get_max_ongoing_requests() == 100
+    assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS
 
     serve.run(serve.deployment(A).options(max_concurrent_queries=None).bind())
-    assert get_max_ongoing_requests() == 100
+    assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS
 
     serve.run(
         serve.deployment(max_ongoing_requests=8, max_concurrent_queries=None)(A).bind()
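For reference, the updated `default()` classmethod above is equivalent to spelling the new constant out by hand. A sketch (relies on pydantic's field-wise model equality; values mirror the hunk above):

from ray.serve.config import AutoscalingConfig

# Mirrors the updated default(): both target fields pinned to 2
# (DEFAULT_TARGET_ONGOING_REQUESTS), with 1-100 replica bounds.
assert AutoscalingConfig.default() == AutoscalingConfig(
    target_num_ongoing_requests_per_replica=2,
    target_ongoing_requests=2,
    min_replicas=1,
    max_replicas=100,
)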
diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py
index d14a25cbb6d3..6a0164db75d9 100644
--- a/python/ray/serve/tests/test_autoscaling_policy.py
+++ b/python/ray/serve/tests/test_autoscaling_policy.py
@@ -297,7 +297,7 @@ async def __call__(self):
         await signal.wait.remote()
         return "sup"
 
-    @serve.deployment(graceful_shutdown_timeout_s=1)
+    @serve.deployment(graceful_shutdown_timeout_s=1, max_ongoing_requests=50)
     class Router:
         def __init__(self, handle: DeploymentHandle):
             if use_get_handle_api:
@@ -1144,8 +1144,9 @@ def test_max_ongoing_requests_set_to_one(
     @serve.deployment(
         autoscaling_config=AutoscalingConfig(
+            target_ongoing_requests=1.0,
             min_replicas=1,
-            max_replicas=5,
+            max_replicas=3,
             upscale_delay_s=0.5,
             downscale_delay_s=0.5,
             metrics_interval_s=0.5,
@@ -1171,11 +1172,12 @@ async def f():
     # 2. Wait for the number of waiters on signal to increase by 1.
     # 3. Assert the number of replicas has increased by 1.
     refs = []
-    for i in range(5):
+    for i in range(3):
         refs.append(h.remote())
 
         def check_num_waiters(target: int):
-            assert ray.get(signal.cur_num_waiters.remote()) == target
+            num_waiters = ray.get(signal.cur_num_waiters.remote())
+            assert num_waiters == target
             return True
 
         wait_for_condition(check_num_waiters, target=i + 1)
diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py
index 2588fedd5f19..5c885e47bb77 100644
--- a/python/ray/serve/tests/test_controller.py
+++ b/python/ray/serve/tests/test_controller.py
@@ -156,15 +156,15 @@ def autoscaling_app():
         "message": "",
         "deployment_config": {
             "name": "autoscaling_app",
-            "max_concurrent_queries": 100,
-            "max_ongoing_requests": 100,
+            "max_concurrent_queries": 5,
+            "max_ongoing_requests": 5,
             "max_queued_requests": -1,
             "user_config": None,
             "autoscaling_config": {
                 "min_replicas": 1,
                 "initial_replicas": None,
                 "max_replicas": 10,
-                "target_num_ongoing_requests_per_replica": 1.0,
+                "target_num_ongoing_requests_per_replica": 2.0,
                 "target_ongoing_requests": None,
                 "metrics_interval_s": 10.0,
                 "look_back_period_s": 30.0,
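The controller expectations above are the end-to-end check that both new defaults (`max_ongoing_requests=5`, target 2.0) propagate through unchanged configs. Outside the test suite, a hypothetical helper mirroring the dashboard check added in the first hunk could flag deployments in a Serve config that still rely on the old default:

def deployments_relying_on_default(config) -> list:
    """Hypothetical helper (not part of this patch): return deployments in a
    Serve config schema that never set max_ongoing_requests explicitly and
    therefore pick up the new default of 5."""
    return [
        d
        for app in config.applications
        for d in app.deployments
        if "max_ongoing_requests" not in d.dict(exclude_unset=True)
    ]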