diff --git a/benchmarks/README.md b/benchmarks/README.md
index b504e3ab..b88501f2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -29,6 +29,18 @@ python benchmark_serving.py \
 ```
 
+### Run Benchmark for Llama 3
+
+```
+python benchmark_serving.py \
+--tokenizer <llama-3 tokenizer path> \
+--num-prompts 10 \
+--dataset sharegpt \
+--dataset-path ~/data/ShareGPT_V3_unfiltered_cleaned_split.json \
+--max-output-length 1024 \
+--model llama-3
+```
+
 ### Save request outputs in Benchmark
 
 Please use `--save-request-outputs` flag to save predictions to a file.
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 32855de6..20351106 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -73,6 +73,7 @@
 from jetstream.core.proto import jetstream_pb2
 from jetstream.core.proto import jetstream_pb2_grpc
 from jetstream.engine.token_utils import load_vocab
+from jetstream.third_party.llama3 import llama3_tokenizer
 import numpy as np
 from tqdm.asyncio import tqdm  # pytype: disable=pyi-error
 import pandas
@@ -130,10 +131,13 @@ def to_dict(self):
     }
 
 
-def get_tokenizer(tokenizer_name: str) -> Any:
+def get_tokenizer(model_id: str, tokenizer_name: str) -> Any:
   """Return a tokenizer or a tokenizer placeholder."""
   if tokenizer_name == "test":
     return "test"
+  elif model_id == "llama-3":
+    # Llama 3 uses a tiktoken tokenizer.
+    return llama3_tokenizer.Tokenizer(tokenizer_name)
   else:
     # Use JetStream tokenizer util. It's using the sentencepiece wrapper in
     # seqio library.
@@ -195,18 +199,14 @@ def tokenize_dataset(
   prompts = []
   outputs = []
   indices = []
-
+  prompt_token_ids = []
+  outputs_token_ids = []
   for prompt, output, idx in dataset:
     prompts.append(prompt)
     outputs.append(output)
     indices.append(idx)
-
-  prompt_token_ids = tokenizer.encode(
-      prompts
-  )  # adjust this code based on tokenizer method
-  outputs_token_ids = tokenizer.encode(
-      outputs
-  )  # adjust this code based on tokenizer method
+    prompt_token_ids.append(tokenizer.encode(prompt))
+    outputs_token_ids.append(tokenizer.encode(output))
 
   tokenized_dataset = []
   for i in range(n):
@@ -549,7 +549,7 @@ def main(args: argparse.Namespace):
 
   api_url = f"{args.server}:{args.port}"
 
-  tokenizer = get_tokenizer(tokenizer_id)
+  tokenizer = get_tokenizer(model_id, tokenizer_id)
   if tokenizer == "test" or args.dataset == "test":
     input_requests = mock_requests(
         args.total_mock_requests
@@ -680,9 +680,10 @@
       type=str,
       default="no_model",
       help=(
-          "Name of the model. (it's just used to label the benchmark, the model"
-          " config is defined in config_lib, and passed as the server config"
-          " flag when we run the JetStream server)"
+          "Name of the model, e.g. llama-2, llama-3, gemma. (It's only used"
+          " to label the benchmark and to pick the tokenizer; the model"
+          " config is defined in config_lib and passed as the server config"
+          " flag when the JetStream server is run.)"
       ),
   )
   parser.add_argument(
diff --git a/jetstream/third_party/llama3/llama3_tokenizer.py b/jetstream/third_party/llama3/llama3_tokenizer.py
index f3d8b0f0..230debe5 100644
--- a/jetstream/third_party/llama3/llama3_tokenizer.py
+++ b/jetstream/third_party/llama3/llama3_tokenizer.py
@@ -107,8 +107,8 @@ def encode(
       self,
       s: str,
       *,
-      bos: bool,
-      eos: bool,
+      bos: bool = False,
+      eos: bool = False,
       allowed_special: Union[Literal["all"], AbstractSet[str]] | None = None,
       disallowed_special: Union[Literal["all"], Collection[str]] = (),
   ) -> List[int]:
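For context on the `get_tokenizer` change, the sketch below shows how the patched function is meant to be called. It is illustrative only: the tokenizer paths are placeholders, and it assumes `benchmark_serving.py` is importable from the benchmarks directory.

```python
# Illustrative sketch, not part of the patch. Paths are placeholders and
# benchmark_serving.py is assumed to be on the import path.
from benchmark_serving import get_tokenizer

# model_id "llama-3" now routes to the tiktoken-based tokenizer vendored
# under jetstream.third_party.llama3.
llama3_tok = get_tokenizer("llama-3", "/path/to/llama3/tokenizer.model")

# Any other model id keeps the existing seqio/sentencepiece path via
# load_vocab.
llama2_tok = get_tokenizer("llama-2", "/path/to/llama2/tokenizer.model")

# Both tokenizers can then be driven uniformly by the benchmark loop.
ids = llama3_tok.encode("The quick brown fox")
```

Threading `model_id` through `get_tokenizer` keeps the CLI unchanged apart from the `--model` value, which the help-text update above documents.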
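The `tokenize_dataset` hunk fixes a bug rather than just restyling: the old code passed the entire `prompts` list to a single `encode` call, but both tokenizer backends encode one string at a time. Below is a self-contained sketch of the corrected shape, with a stub tokenizer standing in for either backend.

```python
# Self-contained sketch of the per-example tokenization pattern from the
# patch. StubTokenizer is a stand-in; both real backends expose
# encode(str) -> list[int].
from typing import List


class StubTokenizer:

  def encode(self, s: str) -> List[int]:
    # Fake token ids: one id per whitespace-separated word.
    return [len(w) for w in s.split()]


dataset = [("hi there", "hello", 0), ("one two three", "four", 1)]
tokenizer = StubTokenizer()

prompt_token_ids = []
outputs_token_ids = []
for prompt, output, _ in dataset:
  prompt_token_ids.append(tokenizer.encode(prompt))
  outputs_token_ids.append(tokenizer.encode(output))

# One token-id list per example, aligned with the dataset.
assert len(prompt_token_ids) == len(dataset)
assert prompt_token_ids[1] == [3, 3, 5]
```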
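Defaulting `bos` and `eos` to `False` in `llama3_tokenizer.encode` is what lets the benchmark call `tokenizer.encode(prompt)` with no keyword arguments on both paths. The sketch below assumes the vendored tokenizer mirrors Meta's Llama 3 reference tokenizer (which exposes `bos_id` and `eos_id`); the model path is a placeholder.

```python
# Assumes the vendored tokenizer matches Meta's Llama 3 reference; the path
# below is a placeholder for a real tokenizer.model file.
from jetstream.third_party.llama3 import llama3_tokenizer

tok = llama3_tokenizer.Tokenizer("/path/to/llama3/tokenizer.model")

plain = tok.encode("hello world")  # no BOS/EOS markers under the new defaults
marked = tok.encode("hello world", bos=True, eos=True)  # opt back in

assert marked[0] == tok.bos_id and marked[-1] == tok.eos_id
assert marked[1:-1] == plain
```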