From 6ac555c50618c501644d2b827406b40623b4cd52 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 3 Sep 2024 17:23:54 +0800 Subject: [PATCH] Support P50, P90, P99 for next token latency (#93) Signed-off-by: lvliang-intel --- evals/benchmark/stresscli/commands/config.ini | 5 ++++- evals/benchmark/stresscli/locust/tokenresponse.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/evals/benchmark/stresscli/commands/config.ini b/evals/benchmark/stresscli/commands/config.ini index e954fd13..16a47653 100644 --- a/evals/benchmark/stresscli/commands/config.ini +++ b/evals/benchmark/stresscli/commands/config.ini @@ -15,7 +15,10 @@ First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+) First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+) First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+) First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+) -Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+) +Next_token_latency_P50 = Next token latency\(ms\),\s+P50:\s+([\d.]+) +Next_token_latency_P90 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+) +Next_token_latency_P99 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+) +Next_token_latency_Avg = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+) Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+) locust_num_requests = \"num_requests\":\s+(\d+) locust_num_failures = \"num_failures\":\s+(\d+) diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py index 75943915..fc3aff6a 100644 --- a/evals/benchmark/stresscli/locust/tokenresponse.py +++ b/evals/benchmark/stresscli/locust/tokenresponse.py @@ -64,7 +64,7 @@ def staticsOutput(environment, reqlist): ) e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}" first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}" - next_msg = "Average Next token latency(ms): {:.2f}" + next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}" average_msg = "Average token latency(ms) : {:.2f}" console_logger.warning("\n=================Total statistics=====================") if tokens_output == 0: @@ -108,7 +108,14 @@ def staticsOutput(environment, reqlist): numpy.average(first_token), ) ) - console_logger.warning(next_msg.format(numpy.average(next_token))) + console_logger.warning( + next_msg.format( + numpy.percentile(next_token, 50), + numpy.percentile(next_token, 90), + numpy.percentile(next_token, 99), + numpy.average(next_token), + ) + ) console_logger.warning(average_msg.format(numpy.average(avg_token))) console_logger.warning("======================================================\n\n") logging.shutdown()