From 66542b2cdf064eb679af218e8571146dfc7e6ca1 Mon Sep 17 00:00:00 2001
From: Sihan Wang
Date: Wed, 1 Mar 2023 12:17:16 -0800
Subject: [PATCH] [Serve] Add http request latency (#32839)

Add http request latency metrics.
---
 .../serve/production-guide/monitoring.md |  3 +++
 python/ray/serve/_private/http_proxy.py  | 18 +++++++++++++++++-
 python/ray/serve/tests/test_metrics.py   |  7 ++++---
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/doc/source/serve/production-guide/monitoring.md b/doc/source/serve/production-guide/monitoring.md
index d9d913a9fd18..d24071a80097 100644
--- a/doc/source/serve/production-guide/monitoring.md
+++ b/doc/source/serve/production-guide/monitoring.md
@@ -279,6 +279,9 @@ The following metrics are exposed by Ray Serve:
        * error_code
        * method
      - The number of non-200 HTTP responses returned by each deployment.
+   * - ``serve_http_request_latency_ms`` [*]
+     - * route_prefix
+     - The end-to-end latency of HTTP requests (measured from the Serve HTTP proxy).
 ```
 [*] - only available when using HTTP calls
 [**] - only available when using Python `ServeHandle` calls
diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py
index b00139d7be28..bc6983ef7278 100644
--- a/python/ray/serve/_private/http_proxy.py
+++ b/python/ray/serve/_private/http_proxy.py
@@ -26,7 +26,11 @@
     set_socket_reuse_port,
 )
 from ray.serve._private.common import EndpointInfo, EndpointTag
-from ray.serve._private.constants import SERVE_LOGGER_NAME, SERVE_NAMESPACE
+from ray.serve._private.constants import (
+    SERVE_LOGGER_NAME,
+    SERVE_NAMESPACE,
+    DEFAULT_LATENCY_BUCKET_MS,
+)
 from ray.serve._private.long_poll import LongPollClient, LongPollNamespace
 from ray.serve._private.logging_utils import access_log_msg, configure_component_logger
 
@@ -291,6 +295,15 @@ def get_handle(name):
                 "method",
             ),
         )
+        self.processing_latency_tracker = metrics.Histogram(
+            "serve_http_request_latency_ms",
+            description=(
+                "The end-to-end latency of HTTP requests "
+                "(measured from the Serve HTTP proxy)."
+            ),
+            boundaries=DEFAULT_LATENCY_BUCKET_MS,
+            tag_keys=("route_prefix",),
+        )
 
     def _update_routes(self, endpoints: Dict[EndpointTag, EndpointInfo]) -> None:
         self.route_info: Dict[str, Tuple[EndpointTag, List[str]]] = dict()
@@ -370,6 +383,9 @@ async def __call__(self, scope, receive, send):
         start_time = time.time()
         status_code = await _send_request_to_handle(handle, scope, receive, send)
         latency_ms = (time.time() - start_time) * 1000.0
+        self.processing_latency_tracker.observe(
+            latency_ms, tags={"route_prefix": route_prefix}
+        )
         logger.info(
             access_log_msg(
                 method=scope["method"],
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index e56e64f91961..3085e2954094 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -56,9 +56,9 @@ def verify_metrics(do_assert=False):
         "serve_deployment_request_counter",
         "serve_deployment_replica_starts",
         # histogram
-        "deployment_processing_latency_ms_bucket",
-        "deployment_processing_latency_ms_count",
-        "deployment_processing_latency_ms_sum",
+        "serve_deployment_processing_latency_ms_bucket",
+        "serve_deployment_processing_latency_ms_count",
+        "serve_deployment_processing_latency_ms_sum",
         "serve_deployment_processing_latency_ms",
         # gauge
         "serve_replica_processing_queries",
@@ -120,6 +120,7 @@ def verify_metrics(expected_metrics, do_assert=False):
     # https://docs.ray.io/en/latest/serve/monitoring.html#metrics
     # Any updates here should be reflected there too.
     expected_metrics.append("serve_num_deployment_http_error_requests")
+    expected_metrics.append("serve_http_request_latency_ms")
 
     @serve.deployment(name="A")
     class A:
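-- 
A minimal sketch for exercising the new histogram locally, not part of the
patch itself. It assumes a pinned metrics export port (9999, mirroring the
test_metrics.py fixture), Serve's default HTTP port 8000, and a hypothetical
`Hello` deployment; none of these names come from the diff.

    import time

    import requests

    import ray
    from ray import serve

    # Assumption: pin the Prometheus export port so it can be scraped below,
    # as the test_metrics.py fixture does; any free port works.
    ray.init(_metrics_export_port=9999)

    # Hypothetical no-op deployment, served at the default route prefix "/".
    @serve.deployment
    class Hello:
        def __call__(self, request):
            return "world"

    serve.run(Hello.bind())

    # Drive a few requests through the HTTP proxy to populate the histogram.
    for _ in range(5):
        requests.get("http://127.0.0.1:8000/")

    # Give the metrics agent a moment to refresh, then look for the new
    # series (exported with the usual _bucket/_count/_sum suffixes).
    time.sleep(2)
    metrics_text = requests.get("http://127.0.0.1:9999").text
    assert "serve_http_request_latency_ms" in metrics_text

Each exported sample should carry the `route_prefix` tag added in this patch,
`/` in this case.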