ray-project · edoakes · Jul 16, 2024 · Jul 16, 2024
@@ -14,9 +14,6 @@ In this section, we go into more detail about Serve autoscaling concepts as well
 
 To define what the steady state of your deployments should be, set values for `target_ongoing_requests` and `max_ongoing_requests`.
 
-#### **target_num_ongoing_requests_per_replica [default=2]**
-This parameter is renamed to `target_ongoing_requests`. `target_num_ongoing_requests_per_replica` will be removed in a future release.
-
 #### **target_ongoing_requests [default=2]**
 :::{note}
 The default for `target_ongoing_requests` changed from 1.0 to 2.0 in Ray 2.32.0. You can continue to set it manually to override the default.

@@ -45,7 +45,7 @@ You can set `num_replicas="auto"` and override its default values (shown above)
 
 Let's dive into what each of these parameters does.
 
-* **target_ongoing_requests** (replaces the deprecated `target_num_ongoing_requests_per_replica`) is the average number of ongoing requests per replica that the Serve autoscaler tries to ensure. You can adjust it based on your request processing length (the longer the requests, the smaller this number should be) as well as your latency objective (the shorter you want your latency to be, the smaller this number should be).
+* **target_ongoing_requests** is the average number of ongoing requests per replica that the Serve autoscaler tries to maintain. You can adjust it based on your request processing length (the longer the requests, the smaller this number should be) as well as your latency objective (the shorter you want your latency to be, the smaller this number should be).
 * **max_ongoing_requests** is the maximum number of ongoing requests allowed for a replica. Note this parameter is not part of the autoscaling config because it's relevant to all deployments, but it's important to set it relative to the target value if you turn on autoscaling for your deployment.
 * **min_replicas** is the minimum number of replicas for the deployment. Set this to 0 if there are long periods of no traffic and some extra tail latency during upscale is acceptable. Otherwise, set this to what you think you need for low traffic.
 * **max_replicas** is the maximum number of replicas for the deployment. Set this to ~20% higher than what you think you need for peak traffic.

diff --git a/java/serve/src/main/java/io/ray/serve/config/AutoscalingConfig.java b/java/serve/src/main/java/io/ray/serve/config/AutoscalingConfig.java
@@ -6,7 +6,6 @@ public class AutoscalingConfig implements Serializable {
   private static final long serialVersionUID = 9135422781025005216L;
   private int minReplicas = 1;
   private int maxReplicas = 1;
-  private int targetNumOngoingRequestsPerReplica = 1;
   private int targetOngoingRequests = 1;
   /** How often to scrape for metrics */
   private double metricsIntervalS = 10.0;
@@ -35,14 +34,6 @@ public void setMaxReplicas(int maxReplicas) {
     this.maxReplicas = maxReplicas;
   }
 
-  public int getTargetNumOngoingRequestsPerReplica() {
-    return targetNumOngoingRequestsPerReplica;
-  }
-
-  public void setTargetNumOngoingRequestsPerReplica(int targetNumOngoingRequestsPerReplica) {
-    this.targetNumOngoingRequestsPerReplica = targetNumOngoingRequestsPerReplica;
-  }
-
   public int getTargetOngoingRequests() {
     return targetOngoingRequests;
   }
@@ -95,7 +86,6 @@ public io.ray.serve.generated.AutoscalingConfig toProto() {
     return io.ray.serve.generated.AutoscalingConfig.newBuilder()
         .setMinReplicas(minReplicas)
         .setMaxReplicas(maxReplicas)
-        .setTargetNumOngoingRequestsPerReplica(targetNumOngoingRequestsPerReplica)
         .setTargetOngoingRequests(targetOngoingRequests)
         .setMetricsIntervalS(metricsIntervalS)
         .setLookBackPeriodS(lookBackPeriodS)

diff --git a/python/ray/dashboard/modules/serve/serve_rest_api_impl.py b/python/ray/dashboard/modules/serve/serve_rest_api_impl.py
@@ -238,10 +238,7 @@ def log_config_change_default_warning(self, config):
                 else:
                     continue
 
-                if (
-                    "target_num_ongoing_requests_per_replica" not in autoscaling_config
-                    and "target_ongoing_requests" not in autoscaling_config
-                ):
+                if "target_ongoing_requests" not in autoscaling_config:
                     logger.warning(
                         "The default value for `target_ongoing_requests` has changed "
                         "from 1.0 to 2.0 in Ray 2.32.0."

diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py
@@ -329,28 +329,9 @@ class MyDeployment:
     if autoscaling_config not in [DEFAULT.VALUE, None]:
         if (
             isinstance(autoscaling_config, dict)
-            and "target_num_ongoing_requests_per_replica" in autoscaling_config
-        ) or (
-            isinstance(autoscaling_config, AutoscalingConfig)
-            and "target_num_ongoing_requests_per_replica"
-            in autoscaling_config.dict(exclude_unset=True)
-        ):
-            logger.warning(
-                "DeprecationWarning: `target_num_ongoing_requests_per_replica` in "
-                "`autoscaling_config` has been deprecated and replaced by "
-                "`target_ongoing_requests`. "
-                "`target_num_ongoing_requests_per_replica` will be removed in a future "
-                "version."
-            )
-
-        if (
-            isinstance(autoscaling_config, dict)
-            and "target_num_ongoing_requests_per_replica" not in autoscaling_config
             and "target_ongoing_requests" not in autoscaling_config
         ) or (
             isinstance(autoscaling_config, AutoscalingConfig)
-            and "target_num_ongoing_requests_per_replica"
-            not in autoscaling_config.dict(exclude_unset=True)
             and "target_ongoing_requests"
             not in autoscaling_config.dict(exclude_unset=True)
         ):

diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py
@@ -41,13 +41,7 @@ class AutoscalingConfig(BaseModel):
     initial_replicas: Optional[NonNegativeInt] = None
     max_replicas: PositiveInt = 1
 
-    # DEPRECATED: replaced by target_ongoing_requests
-    target_num_ongoing_requests_per_replica: PositiveFloat = Field(
-        default=DEFAULT_TARGET_ONGOING_REQUESTS,
-        description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
-    )
-    # Will default to 1.0 in the future.
-    target_ongoing_requests: Optional[PositiveFloat] = None
+    target_ongoing_requests: PositiveFloat = DEFAULT_TARGET_ONGOING_REQUESTS
 
     # How often to scrape for metrics
     metrics_interval_s: PositiveFloat = 10.0
@@ -135,7 +129,6 @@ def serialize_policy(self) -> None:
     @classmethod
     def default(cls):
         return cls(
-            target_num_ongoing_requests_per_replica=DEFAULT_TARGET_ONGOING_REQUESTS,
             target_ongoing_requests=DEFAULT_TARGET_ONGOING_REQUESTS,
             min_replicas=1,
             max_replicas=100,
@@ -158,9 +151,7 @@ def get_downscaling_factor(self) -> PositiveFloat:
         return self.downscale_smoothing_factor or self.smoothing_factor
 
     def get_target_ongoing_requests(self) -> PositiveFloat:
-        return (
-            self.target_ongoing_requests or self.target_num_ongoing_requests_per_replica
-        )
+        return self.target_ongoing_requests
 
 
 # Keep in sync with ServeDeploymentMode in dashboard/client/src/type/serve.ts

diff --git a/python/ray/serve/deployment.py b/python/ray/serve/deployment.py
@@ -442,18 +442,6 @@ def options(
 
         if autoscaling_config is not DEFAULT.VALUE:
             new_deployment_config.autoscaling_config = autoscaling_config
-            if (
-                new_deployment_config.autoscaling_config
-                and "target_num_ongoing_requests_per_replica"
-                in new_deployment_config.autoscaling_config.dict(exclude_unset=True)
-            ):
-                logger.warning(
-                    "DeprecationWarning: `target_num_ongoing_requests_per_replica` in "
-                    "`autoscaling_config` has been deprecated and replaced by "
-                    "`target_ongoing_requests`. Note that "
-                    "`target_num_ongoing_requests_per_replica` will be removed in a "
-                    "future version."
-                )
 
         if graceful_shutdown_wait_loop_s is not DEFAULT.VALUE:
             new_deployment_config.graceful_shutdown_wait_loop_s = (

diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py
@@ -122,45 +122,22 @@ def check_num_requests_ge(client, id: DeploymentID, expected: int):
 
 
 class TestAutoscalingMetrics:
-    @pytest.mark.parametrize(
-        "use_target_ongoing_requests,use_target_num_ongoing_requests_per_replica",
-        [(True, True), (True, False), (False, True)],
-    )
-    def test_basic(
-        self,
-        serve_instance,
-        use_target_num_ongoing_requests_per_replica,
-        use_target_ongoing_requests,
-    ):
+    def test_basic(self, serve_instance):
         """Test that request metrics are sent correctly to the controller."""
 
         client = serve_instance
         signal = SignalActor.remote()
 
-        autoscaling_config = {
-            "metrics_interval_s": 0.1,
-            "min_replicas": 1,
-            "max_replicas": 10,
-            "upscale_delay_s": 0,
-            "downscale_delay_s": 0,
-            "look_back_period_s": 1,
-        }
-        if (
-            use_target_ongoing_requests
-            and not use_target_num_ongoing_requests_per_replica
-        ):
-            autoscaling_config["target_ongoing_requests"] = 10
-        elif (
-            use_target_ongoing_requests and use_target_num_ongoing_requests_per_replica
-        ):
-            autoscaling_config["target_ongoing_requests"] = 10
-            # Random setting, should get ignored
-            autoscaling_config["target_num_ongoing_requests_per_replica"] = 234
-        else:
-            autoscaling_config["target_num_ongoing_requests_per_replica"] = 10
-
         @serve.deployment(
-            autoscaling_config=autoscaling_config,
+            autoscaling_config={
+                "metrics_interval_s": 0.1,
+                "min_replicas": 1,
+                "max_replicas": 10,
+                "target_ongoing_requests": 10,
+                "upscale_delay_s": 0,
+                "downscale_delay_s": 0,
+                "look_back_period_s": 1,
+            },
             # We will send many requests. This will make sure replicas are
             # killed quickly during cleanup.
             graceful_shutdown_timeout_s=1,

diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py
@@ -163,8 +163,7 @@ def autoscaling_app():
                                     "min_replicas": 1,
                                     "initial_replicas": None,
                                     "max_replicas": 10,
-                                    "target_num_ongoing_requests_per_replica": 2.0,
-                                    "target_ongoing_requests": None,
+                                    "target_ongoing_requests": 2.0,
                                     "metrics_interval_s": 10.0,
                                     "look_back_period_s": 30.0,
                                     "smoothing_factor": 1.0,

diff --git a/python/ray/serve/tests/test_deploy_2.py b/python/ray/serve/tests/test_deploy_2.py
@@ -320,7 +320,6 @@ async def __call__(self):
     assert deployment_config["autoscaling_config"] == {
         # Set by `num_replicas="auto"`
         "target_ongoing_requests": 2.0,
-        "target_num_ongoing_requests_per_replica": 2.0,
         "min_replicas": 1,
         "max_replicas": 100,
         # Untouched defaults
@@ -373,7 +372,6 @@ async def __call__(self):
     assert deployment_config["autoscaling_config"] == {
         # Set by `num_replicas="auto"`
         "target_ongoing_requests": 2.0,
-        "target_num_ongoing_requests_per_replica": 2.0,
         "min_replicas": 1,
         "max_replicas": 100,
         # Overridden by `autoscaling_config`

diff --git a/python/ray/serve/tests/test_deploy_app.py b/python/ray/serve/tests/test_deploy_app.py
@@ -1432,7 +1432,6 @@ def test_num_replicas_auto_api(client: ServeControllerClient):
     assert deployment_config["autoscaling_config"] == {
         # Set by `num_replicas="auto"`
         "target_ongoing_requests": 2.0,
-        "target_num_ongoing_requests_per_replica": 2.0,
         "min_replicas": 1,
         "max_replicas": 100,
         # Untouched defaults
@@ -1484,7 +1483,6 @@ def test_num_replicas_auto_basic(client: ServeControllerClient):
     assert deployment_config["autoscaling_config"] == {
         # Set by `num_replicas="auto"`
         "target_ongoing_requests": 2.0,
-        "target_num_ongoing_requests_per_replica": 2.0,
         "min_replicas": 1,
         "max_replicas": 100,
         # Overridden by `autoscaling_config`