Support P50, P90, P99 for next token latency
Signed-off-by: lvliang-intel <[email protected]>
lvliang-intel committed Aug 31, 2024
1 parent 476a327 commit bb0873e
Showing 2 changed files with 13 additions and 3 deletions.
evals/benchmark/stresscli/commands/config.ini (5 changes: 4 additions & 1 deletion)
@@ -15,7 +15,10 @@ First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+)
 First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
 First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
 First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
-Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+)
+Next_token_latency_P50 = Next token latency\(ms\),\s+P50:\s+([\d.]+)
+Next_token_latency_P90 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
+Next_token_latency_P99 = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
+Next_token_latency_Avg = Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
 Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+)
 locust_num_requests = \"num_requests\":\s+(\d+)
 locust_num_failures = \"num_failures\":\s+(\d+)
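For reference, a minimal sketch (not part of the commit) of how the new Next_token_latency patterns extract values from the stats line that tokenresponse.py prints. The sample line and values are illustrative; each later pattern anchors on the fields before it so only the intended capture group matches:

import re

# Illustrative stats line, in the format emitted by the updated next_msg.
sample = "Next token latency(ms), P50: 12.34, P90: 23.45, P99: 45.67, Avg: 15.00"

# The four patterns added to config.ini above.
patterns = {
    "Next_token_latency_P50": r"Next token latency\(ms\),\s+P50:\s+([\d.]+)",
    "Next_token_latency_P90": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)",
    "Next_token_latency_P99": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)",
    "Next_token_latency_Avg": r"Next token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)",
}

for name, pattern in patterns.items():
    match = re.search(pattern, sample)
    print(name, "=", match.group(1) if match else "no match")
# Expected output: 12.34, 23.45, 45.67, 15.00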
evals/benchmark/stresscli/locust/tokenresponse.py (11 changes: 9 additions & 2 deletions)
@@ -64,7 +64,7 @@ def staticsOutput(environment, reqlist):
     )
     e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
-    next_msg = "Average Next token latency(ms): {:.2f}"
+    next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
     average_msg = "Average token latency(ms) : {:.2f}"
     console_logger.warning("\n=================Total statistics=====================")
     if tokens_output == 0:
@@ -108,7 +108,14 @@ def staticsOutput(environment, reqlist):
                 numpy.average(first_token),
             )
         )
-        console_logger.warning(next_msg.format(numpy.average(next_token)))
+        console_logger.warning(
+            next_msg.format(
+                numpy.percentile(next_token, 50),
+                numpy.percentile(next_token, 90),
+                numpy.percentile(next_token, 99),
+                numpy.average(next_token),
+            )
+        )
         console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
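A minimal standalone sketch (not part of the commit) of the percentile computation the patch introduces in staticsOutput, using hypothetical latency values:

import numpy

# Hypothetical per-request next-token latencies in milliseconds.
next_token = [11.2, 12.8, 13.1, 14.9, 25.3]

next_msg = "Next token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"

# numpy.percentile interpolates between samples by default, producing the
# same kind of values the updated staticsOutput logs for real runs.
print(
    next_msg.format(
        numpy.percentile(next_token, 50),
        numpy.percentile(next_token, 90),
        numpy.percentile(next_token, 99),
        numpy.average(next_token),
    )
)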
