Support P50, P90, P99 for next token latency
Signed-off-by: lvliang-intel <[email protected]>
lvliang-intel committed Aug 31, 2024
1 parent 476a327 commit bb0873e
Showing 2 changed files with 13 additions and 3 deletions.
evals/benchmark/stresscli/commands/config.ini (5 changes: 4 additions & 1 deletion)
@@ -15,7 +15,10 @@ First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+)
 First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
 First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
 First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
-Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+)
+Next_token_latency_P50 = Next token latency\(ms\),\s+P50:\s+([\d.]+)
+Next_token_latency_P90 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
+Next_token_latency_P99 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
+Next_token_latency_Avg = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
 Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+)
 locust_num_requests = \"num_requests\":\s+(\d+)
 locust_num_failures = \"num_failures\":\s+(\d+)
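For reference, a minimal sketch (not part of the commit) of how the new Next_token_latency patterns extract values from the stats line that tokenresponse.py prints. The sample line and values are illustrative; each later pattern anchors on the fields before it so only the intended capture group matches:

import re

# Illustrative stats line, in the format emitted by the updated next_msg.
sample = "Next token latency(ms), P50: 12.34, P90: 23.45, P99: 45.67, Avg: 15.00"

# The four patterns added to config.ini above.
patterns = {
    "Next_token_latency_P50": r"Next token latency\(ms\),\s+P50:\s+([\d.]+)",
    "Next_token_latency_P90": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)",
    "Next_token_latency_P99": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)",
    "Next_token_latency_Avg": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)",
}

for name, pattern in patterns.items():
    match = re.search(pattern, sample)
    print(name, "=", match.group(1) if match else "no match")
# Expected output: 12.34, 23.45, 45.67, 15.00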
evals/benchmark/stresscli/locust/tokenresponse.py (11 changes: 9 additions & 2 deletions)
@@ -64,7 +64,7 @@ def staticsOutput(environment, reqlist):
     )
     e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
-    next_msg = "Average Next token latency(ms): {:.2f}"
+    next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     average_msg = "Average token latency(ms) : {:.2f}"
     console_logger.warning("\n=================Total statistics=====================")
     if tokens_output == 0:
@@ -108,7 +108,14 @@ def staticsOutput(environment, reqlist):
                 numpy.average(first_token),
             )
         )
-        console_logger.warning(next_msg.format(numpy.average(next_token)))
+        console_logger.warning(
+            next_msg.format(
+                numpy.percentile(next_token, 50),
+                numpy.percentile(next_token, 90),
+                numpy.percentile(next_token, 99),
+                numpy.average(next_token),
+            )
+        )
         console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
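A minimal standalone sketch (not part of the commit) of the percentile computation the patch introduces in staticsOutput, using hypothetical latency values:

import numpy

# Hypothetical per-request next-token latencies in milliseconds.
next_token = [11.2, 12.8, 13.1, 14.9, 25.3]

next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"

# numpy.percentile interpolates between samples by default, producing the
# same kind of values the updated staticsOutput logs for real runs.
print(
    next_msg.format(
        numpy.percentile(next_token, 50),
        numpy.percentile(next_token, 90),
        numpy.percentile(next_token, 99),
        numpy.average(next_token),
    )
)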
