add benchmark for fix length input and output #5857

Merged
merged 26 commits into from
Jul 7, 2024
Changes from all commits (26 commits)
afa0e91  add benchmark for fix length input and output (haichuan1221, Jun 26, 2024)
ffcc34c  fix format issue (haichuan1221, Jun 26, 2024)
4f4962e  fix format issue (haichuan1221, Jun 26, 2024)
3c405e0  fix format issue (haichuan1221, Jun 26, 2024)
6b7415c  fix format issue (haichuan1221, Jun 26, 2024)
7c5f1d9  fix format issue (haichuan1221, Jun 26, 2024)
0f8fbf4  fix format issue (haichuan1221, Jun 26, 2024)
cab9aa4  fix format issue (haichuan1221, Jun 26, 2024)
66710c6  fix format issue (haichuan1221, Jun 26, 2024)
ab11c8a  Merge branch 'vllm-project:main' into main (haichuan1221, Jun 26, 2024)
0af62a8  Merge branch 'vllm-project:main' into main (haichuan1221, Jun 26, 2024)
68e25fb  Merge branch 'vllm-project:main' into main (haichuan1221, Jun 27, 2024)
b798b8e  Merge branch 'vllm-project:main' into main (haichuan1221, Jun 28, 2024)
bac04ae  Merge branch 'vllm-project:main' into main (haichuan1221, Jun 30, 2024)
e8fd224  Merge branch 'vllm-project:main' into main (haichuan1221, Jul 1, 2024)
ff9a0b7  Merge branch 'vllm-project:main' into main (haichuan1221, Jul 2, 2024)
13fbce4  Merge branch 'vllm-project:main' into main (haichuan1221, Jul 3, 2024)
e2d4290  Merge branch 'vllm-project:main' into main (haichuan1221, Jul 4, 2024)
80eb2ed  Merge branch 'vllm-project:main' into main (haichuan1221, Jul 7, 2024)
bea16c3  add typing name and help message (haichuan1221, Jul 7, 2024)
1608c7a  Merge branch 'main' of github.com:haichuan1221/vllm into main (haichuan1221, Jul 7, 2024)
32add2a  fix comma issue (haichuan1221, Jul 7, 2024)
e306c5b  fix comma issue (haichuan1221, Jul 7, 2024)
fee6383  update format (ywang96, Jul 7, 2024)
26a67cd  remove unneeded noqa (ywang96, Jul 7, 2024)
5b4897f  move up args (ywang96, Jul 7, 2024)
65 changes: 60 additions & 5 deletions benchmarks/benchmark_serving.py
@@ -17,7 +17,7 @@
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000

when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
@@ -77,7 +77,6 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
@@ -185,6 +184,31 @@ def sample_sonnet_requests(
    return sampled_requests


def sample_random_requests(
        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:

    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    for i in range(num_prompts):
        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append(
            (prompt, int(input_lens[i]), int(output_lens[i])))

    return input_requests
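
The sampler builds each prompt by decoding input_lens[i] consecutive token ids starting from a random offset, so prompt lengths are controlled at the token level. A minimal sketch of exercising it in isolation, assuming it is run from the benchmarks directory so the function is importable, with gpt2 purely as an example tokenizer:

from transformers import AutoTokenizer

from benchmark_serving import sample_random_requests

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# range_ratio=1.0 pins every request to exactly 16 input / 8 output tokens.
requests = sample_random_requests(input_len=16,
                                  output_len=8,
                                  num_prompts=3,
                                  range_ratio=1.0,
                                  tokenizer=tokenizer)
for prompt, in_len, out_len in requests:
    # Decoding arbitrary ids and re-encoding does not always round-trip,
    # so the served prompt may tokenize to slightly more or fewer than
    # in_len tokens.
    print(in_len, out_len, repr(prompt[:40]))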


async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
@@ -196,6 +220,7 @@ async def get_request(
        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue

        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
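
This makes request arrivals a Poisson process: with a finite request_rate, inter-arrival gaps are exponentially distributed with mean 1.0 / request_rate seconds. A self-contained sketch of just the pacing logic, with illustrative names that are not part of the PR:

import asyncio

import numpy as np

async def pace(items, request_rate: float):
    # Yield each item, then sleep for an exponentially distributed interval,
    # which produces Poisson arrivals at `request_rate` items per second.
    # An infinite rate disables pacing entirely.
    for item in items:
        yield item
        if request_rate == float("inf"):
            continue
        await asyncio.sleep(np.random.exponential(1.0 / request_rate))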
@@ -219,7 +244,7 @@ def calculate_metrics(
            # We use the tokenizer to count the number of output tokens for all
            # serving backends instead of looking at len(outputs[i].itl) since
            # multiple output tokens may be bundled together
            # Note: this may inflate the output token count slightly
            # Note : this may inflate the output token count slightly
            output_len = len(
                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
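
Counting tokens by re-tokenizing the final text is backend-independent, whereas counting streamed chunks undercounts whenever a backend bundles several tokens into one chunk. A small illustration of the counting call, with gpt2 as an arbitrary example tokenizer and a stand-in string for outputs[i].generated_text:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
generated_text = "Hello, world!"  # stand-in for outputs[i].generated_text
# Derive the token count from the text itself, independent of how the
# backend chunked its stream.
output_len = len(
    tokenizer(generated_text, add_special_tokens=False).input_ids)
print(output_len)  # 4 with gpt2: 'Hello', ',', ' world', '!'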
@@ -456,6 +481,15 @@ def main(args: argparse.Namespace):
                          for prompt, prompt_formatted, prompt_len,
                          output_len in input_requests]

    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
        )

    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

@@ -549,7 +583,7 @@
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet"],
choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@@ -566,7 +600,7 @@
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.",
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--best-of",
@@ -609,6 +643,27 @@ def main(args: argparse.Namespace):
        help=
        "Number of prefix tokens per request, used only for sonnet dataset.",
    )
    parser.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help=
        "Number of input tokens per request, used only for random sampling.",
    )
    parser.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help=
        "Number of output tokens per request, used only for random sampling.",
    )
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=1.0,
        help="Range of sampled ratio of input/output length, "
        "used only for random sampling.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
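
With the new flags, a hypothetical end-to-end run against an already-running vLLM server could look like the following; the model name and values are placeholders, not taken from the PR:

python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model mistralai/Mistral-7B-v0.1 \
    --dataset-name random \
    --random-input-len 1024 \
    --random-output-len 128 \
    --random-range-ratio 0.8 \
    --num-prompts 500

With --random-range-ratio 0.8, input lengths are drawn uniformly from [819, 1024] tokens and output lengths from [102, 128].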