
Update infer and execute API to take prompts from txt file for BS>=1 #11

Merged
merged 29 commits on May 23, 2024
Changes from 9 commits

Commits (29)
042f2c1
[QEff]: Update infer and execute API to take prompts from txt file fo…
quic-mamta May 16, 2024
0802373
Update infer and execute API
quic-mamta May 17, 2024
bc5ca88
Update infer and execute API
quic-mamta May 20, 2024
8712b87
Update README.md
quic-mamta May 21, 2024
81a3163
Update README.md
quic-mamta May 21, 2024
968dd41
Update README.md
quic-mamta May 21, 2024
0229664
Update infer, execute and text generation interface
quic-mamta May 21, 2024
0a40c9e
Merge branch 'main' into add_inputs_txt_file
quic-mamta May 21, 2024
e51431f
Update execute.py
quic-mamta May 21, 2024
18c973c
Update execute.py
quic-mamta May 21, 2024
cef24ab
Update text generation interface
quic-mamta May 21, 2024
b6920c4
Update Notebooks
quic-mamta May 21, 2024
20cdb52
Update README.md
quic-mamta May 21, 2024
80fb101
Update README.md
quic-mamta May 21, 2024
01999ca
Update text_generation_inference.py
quic-mamta May 21, 2024
94b7ead
Update infer and execute and text generation interface
quic-mamta May 22, 2024
885c07b
Update infer.py
quic-mamta May 22, 2024
bc615b4
Update README.md
quic-mamta May 22, 2024
6303154
Update README.md
quic-mamta May 22, 2024
52e74cb
Update README.md
quic-mamta May 22, 2024
7498451
Update infer.py
quic-mamta May 22, 2024
a6b0480
Update execute.py
quic-mamta May 22, 2024
be88571
Update files
quic-mamta May 22, 2024
0711073
Update files
quic-mamta May 22, 2024
5449fbb
Update README.md
quic-mamta May 22, 2024
17096a3
Update QEfficientGPT2.ipynb
quic-mamta May 22, 2024
107b414
Update QEfficientMPT.ipynb
quic-mamta May 22, 2024
0e567fa
Update README.md
quic-mamta May 22, 2024
ade2c13
Update README.md
quic-mamta May 23, 2024
50 changes: 32 additions & 18 deletions QEfficient/cloud/execute.py
@@ -18,29 +18,38 @@

def main(
model_name: str,
prompt: str,
qpc_path: str,
devices: List[int],
device_group: List[int],
prompt: str = None,
prompts_txt_file_path: str = None,
cache_dir: str = Constants.CACHE_DIR,
hf_token: str = None,
):
"""
API to run the model on Cloud AI 100 platform.
---------
:param model_name: str. Hugging Face Model Card name, Example: [gpt2]
:prompt: str. Sample prompt for the model text generation
:param model_name: str. Hugging Face Model Card name, Example: "gpt2"
:qpc_path: str. Path to the generated binary after compilation.
:devices: List[int]. Device Ids to be used for compilation. if devices > 1. Multiple Card setup is enabled.
:device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
:prompt: str. Sample prompt for the model text generation
:prompts_txt_file_path: str. Path to txt file for multiple input prompts (in case of batch size > 1)
"""

if hf_token is not None:
login(hf_token)

# Download tokenizer along with model if it doesn't exist
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"])
tokenizer = AutoTokenizer.from_pretrained(
model_hf_path, use_cache=True, padding_side="left", trust_remote_code=True
)
model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py"])
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left")

cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt)
# Execute
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_path,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
)


if __name__ == "__main__":
@@ -49,18 +58,23 @@ def main(
"--model_name", "--model-name", required=False, type=str, help="HF model card name for tokenizing the inputs"
)
parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC")
parser.add_argument(
"--prompt",
type=lambda prompt: prompt.split("|"),
default="My name is",
help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol",
)
parser.add_argument(
"--device_group",
"--device-group",
required=True,
type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
help="cloud AI 100 device ids (comma-separated) e.g. [0]",
help="Cloud AI 100 device ids (comma-separated) e.g. [0]",
)
parser.add_argument(
"--prompt",
type=str,
help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag",
)
parser.add_argument(
"--prompts_txt_file_path",
"--prompts-txt-file-path",
type=str,
help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder",
)
parser.add_argument(
"--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods"
@@ -69,4 +83,4 @@
"--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models"
)
args = parser.parse_args()
main(args.model_name, args.prompt, args.qpc_path, args.device_group, args.cache_dir, args.hf_token)
main(**args.__dict__)
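
For orientation, the sketch below drives the updated execute API from Python instead of the CLI. It is a minimal, hedged example: the import path follows the file location QEfficient/cloud/execute.py shown above, while the model name, QPC path, device ids, and prompt file contents are illustrative placeholders rather than values taken from this PR.

```python
# Minimal sketch: call the updated execute API with a prompts file (paths/values are placeholders).
from QEfficient.cloud.execute import main as execute_main

# One prompt per line; for a QPC compiled with batch size > 1 the number of
# lines must match the compiled batch size.
with open("prompts.txt", "w") as f:
    f.write("My name is\n")
    f.write("The capital of France is\n")

execute_main(
    model_name="gpt2",                    # HF model card used only for tokenization here
    qpc_path="qpcs/gpt2_qpc",             # placeholder path to an already compiled QPC
    device_group=[0],                     # Cloud AI 100 device ids
    prompts_txt_file_path="prompts.txt",  # pass either this or prompt=..., not both
)
```

The call maps one-to-one onto the CLI flags defined above (--qpc_path, --device_group, --prompt, --prompts_txt_file_path), since the entry point now forwards the parsed arguments via main(**args.__dict__).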
56 changes: 47 additions & 9 deletions QEfficient/cloud/infer.py
@@ -15,7 +15,11 @@
import QEfficient
from QEfficient.cloud.compile import main as compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.generation.text_generation_inference import (
check_batch_size_and_num_prompts,
cloud_ai_100_exec_kv,
read_prompts_txt_file,
)
from QEfficient.utils import hf_download
from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
from QEfficient.utils.logging_utils import logger
@@ -48,7 +52,8 @@ def onnx_exists(onnx_file_path: str) -> bool:
def main(
model_name: str,
num_cores: int,
prompt: str,
prompt: str = None,
prompts_txt_file_path: str = None,
aic_enable_depth_first: bool = False,
mos: int = -1,
cache_dir: str = Constants.CACHE_DIR,
@@ -76,6 +81,16 @@ def main(
onnx_dir_path = os.path.join(model_card_dir, "onnx")
onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx")

assert (prompt is None and prompts_txt_file_path is not None) or (
prompt is not None and prompts_txt_file_path is None
), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path"

if prompts_txt_file_path is not None:
prompts = read_prompts_txt_file(prompts_txt_file_path)
check_batch_size_and_num_prompts(prompts, batch_size)
else:
check_batch_size_and_num_prompts([prompt], batch_size)

# Get tokenizer
if hf_token is not None:
login(hf_token)
@@ -91,7 +106,13 @@ def main(
if qpc_exists(qpc_dir_path):
# execute
logger.info("Pre-compiled qpc found! Trying to execute with given prompt")
cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt)
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
)
return

if onnx_exists(onnx_model_path):
@@ -112,7 +133,13 @@ def main(
assert (
generated_qpc_path == qpc_dir_path
), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}"
cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt)
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
)
return

#############################################
Expand Down Expand Up @@ -159,12 +186,18 @@ def main(
logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}")

# Execute
cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt)
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on AIC"
description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on Cloud AI 100"
)
parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id")
parser.add_argument(
@@ -193,9 +226,14 @@ def main(
)
parser.add_argument(
"--prompt",
type=lambda prompt: prompt.split("|"),
default="My name is",
help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol",
type=str,
help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag",
)
parser.add_argument(
"--prompts_txt_file_path",
"--prompts-txt-file-path",
type=str,
help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder",
)
parser.add_argument(
"--aic_enable_depth_first",
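
The new validation in infer.py enforces that exactly one prompt source is given and that the number of prompts matches the compiled batch size before any download, export, or compile work starts. Below is a hedged sketch of that same pattern using the two helpers infer.py now imports; the batch size and file name are illustrative, and the XOR-style assert is an equivalent reformulation of the either/or check above.

```python
# Sketch of the prompt-source validation performed in QEfficient/cloud/infer.py (values are illustrative).
from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    read_prompts_txt_file,
)

prompt = None                          # single-prompt path, unused in this example
prompts_txt_file_path = "prompts.txt"  # multi-prompt path: one prompt per line
batch_size = 2                         # illustrative; infer.py receives this via its CLI arguments

# Exactly one of the two inputs must be provided (equivalent to the assert in main()).
assert (prompt is None) != (prompts_txt_file_path is None), (
    "Please pass either --prompt or --prompts_txt_file_path"
)

if prompts_txt_file_path is not None:
    prompts = read_prompts_txt_file(prompts_txt_file_path)  # strips each line into a list entry
    check_batch_size_and_num_prompts(prompts, batch_size)   # asserts len(prompts) == batch_size when batch_size > 1
else:
    check_batch_size_and_num_prompts([prompt], batch_size)
```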
108 changes: 102 additions & 6 deletions QEfficient/generation/text_generation_inference.py
@@ -61,6 +85,85 @@ def write_io_files(
json.dump({"IO-files": io_files}, fp, indent=True)


def get_compilation_batch_size(qpc_path: str):
qpc_base_path = os.path.dirname(qpc_path)
specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
with open(specialization_file_path, "r") as file:
data = json.load(file)
compilation_batch_size = int(data["specializations"][0]["batch_size"])
return compilation_batch_size
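
For context, get_compilation_batch_size above derives the batch size from the specializations.json written next to the QPC directory at compile time. The snippet below fabricates a minimal stand-in file just to exercise that lookup; only the specializations[0]["batch_size"] key is taken from the code above, and any other keys a real file may contain are not shown.

```python
# Minimal sketch: build the directory layout get_compilation_batch_size expects and read it back.
import json
import os
import tempfile

from QEfficient.generation.text_generation_inference import get_compilation_batch_size

base = tempfile.mkdtemp()
qpc_path = os.path.join(base, "qpcs")   # the helper looks one level above qpc_path
os.makedirs(qpc_path, exist_ok=True)

with open(os.path.join(base, "specializations.json"), "w") as f:
    json.dump({"specializations": [{"batch_size": "2"}]}, f)  # value is cast to int by the helper

print(get_compilation_batch_size(qpc_path))  # -> 2
```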


def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int):
num_prompts = len(prompt)
if batch_size > 1:
assert (
batch_size == num_prompts
), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument"


def read_prompts_txt_file(prompts_txt_file_path: str):
prompt = []
with open(prompts_txt_file_path, "r") as file:
for line in file:
prompt.append(line.strip())
return prompt


def cloud_ai_100_exec_kv(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
qpc_path: str,
prompt: str,
prompts_txt_file_path: str,
device_id: List[int] = [0],
):
assert (prompt is None and prompts_txt_file_path is not None) or (
prompt is not None and prompts_txt_file_path is None
), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path"

if prompts_txt_file_path is not None:
prompt = read_prompts_txt_file(prompts_txt_file_path)
if isinstance(prompt, str):
prompt = [prompt]

batch_size = get_compilation_batch_size(qpc_path)
check_batch_size_and_num_prompts(prompt, batch_size)

if batch_size == 1:
prefill_time = []
decode_perf = []
total_perf = []
total_time = []
generated_texts = []
for i in range(len(prompt)):
latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=[prompt[i]])
generated_texts.append(latency_stats[0])
prefill_time.append(latency_stats[1])
decode_perf.append(latency_stats[2])
total_perf.append(latency_stats[3])
total_time.append(latency_stats[4])

prefill_time = np.average(prefill_time)
decode_perf = np.average(decode_perf)
total_perf = np.average(total_perf)
total_time = np.average(total_time)

else:
latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt)
generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats

print_latency_stats_kv(
prompt,
generated_texts,
batch_size,
prefill_time,
decode_perf,
total_perf,
total_time,
automation=False,
)


def latency_stats_bertstyle(
model_name: str,
qpc: str,
@@ -97,25 +176,26 @@ def latency_stats_bertstyle(
print(round((cur_len - init_len) / (end - start), 2), "tok/s")


def cloud_ai_100_exec_kv(
def exec_kv(
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
qpc: str,
prompt: str,
prompt: List[str],
input_len: Optional[int] = None,
generation_len: Optional[int] = None,
device_id: List[int] = [0],
enable_debug_logs: bool = False,
stream: bool = True,
write_io_dir: Optional[str] = None,
automation: bool = False,
):
if tokenizer.padding_side != "left":
logger.warning(f"Please use padding_side='left' while initializing the tokenizer")
logger.warning("Please use padding_side='left' while initializing the tokenizer")
tokenizer.padding_side = "left"
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load QPC
session = QAICInferenceSession(qpc, device_id, enable_debug_logs=enable_debug_logs)

# Read prompt and ctx len from session
prompt_len = max([x[session.binding_index_map["input_ids"]][1][1] for x in session.allowed_shapes])
ctx_len = session.allowed_shapes[0][session.binding_index_map["attention_mask"]][1][1]
@@ -126,11 +206,11 @@ def cloud_ai_100_exec_kv(
num_chunks = -(input_len // -prompt_len) # ceil divide without float
input_len = num_chunks * prompt_len # Convert input_len to a multiple of prompt_len
assert input_len <= ctx_len, "input_len should be less than ctx_len"

# Skip inputs/outputs
session.skip_buffers([x for x in session.input_names if x.startswith("past_")])
session.skip_buffers([x for x in session.output_names if x.endswith("_RetainedState")])
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prepare inputs for first iteration
start = perf_counter()
inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=input_len)
@@ -146,8 +226,10 @@ def cloud_ai_100_exec_kv(
cache_index = np.array([0])
inputs["cache_index"] = cache_index
generated_ids = np.full((batch_size, generation_len - input_len + 1), tokenizer.pad_token_id)

if stream:
print(0, prompt[0], end=" ", flush=True)

# Run prefill
for i in range(num_chunks):
chunk_inputs = inputs.copy()
@@ -159,6 +241,7 @@ def cloud_ai_100_exec_kv(
if write_io_dir:
write_io_files(inputs, outputs, write_io_dir, "prefill", "aic_batch_io", True, False)
cache_index += prompt_len

# Get first token
logits = outputs["logits"]
if len(logits.shape) == 2:
@@ -169,6 +252,7 @@ def cloud_ai_100_exec_kv(
generated_ids[:, cache_index[0] - input_len] = next_token_id.squeeze(1)
if stream:
print(tokenizer.decode(next_token_id[0]), end=" ", flush=True)

# Skip attention_mask from next iteration to use retained attention_mask
session.skip_buffers(["attention_mask"])
loop_start = perf_counter()
@@ -178,6 +262,7 @@ def cloud_ai_100_exec_kv(
if write_io_dir:
write_io_files(inputs, outputs, write_io_dir, "decode", "aic_batch_io", True, False)
write_io_dir = None

# Prepare inputs for next iteration
logits = outputs["logits"]
if len(logits.shape) == 2:
@@ -192,14 +277,24 @@ def cloud_ai_100_exec_kv(
print(tokenizer.decode(next_token_id[0]), end=" ", flush=True)
end = perf_counter()
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

for i in range(1 if stream else 0, batch_size):
print()
print(i, prompt[i], generated_texts[i])

prefill_time = loop_start - start
decode_perf = (cache_index.item() - input_len - 1) / (end - loop_start)
total_perf = (cache_index.item() - input_len) / (end - start)
total_time = end - start
print()

latency_stats = (generated_texts, prefill_time, decode_perf, total_perf, total_time)
return latency_stats


def print_latency_stats_kv(
prompt, generated_texts, batch_size, prefill_time, decode_perf, total_perf, total_time, automation: bool = False
):
if automation:
print()
print("input=", prompt)
@@ -210,6 +305,7 @@ def cloud_ai_100_exec_kv(
print("Total (E2E) inference time is=", round(total_time, 2))
return
print()

print("===================== Performance Stats =====================")
if batch_size > 1:
print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s")
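
Putting the pieces of this PR together, the sketch below calls the refactored cloud_ai_100_exec_kv wrapper directly, mirroring what execute.py does once the tokenizer is loaded. It is a hedged example: the model name and QPC path are placeholders, the tokenizer is pulled straight from the Hub for brevity, and an actual run still requires a Cloud AI 100 device and a compiled QPC whose specializations.json matches the prompt count.

```python
# Minimal sketch of driving the refactored wrapper directly (paths and model are placeholders).
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

# Left padding mirrors the tokenizer setup in execute.py; exec_kv also falls back to
# eos_token_id when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")

cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path="qpcs/gpt2_qpc",      # placeholder path to a compiled QPC
    prompt="My name is",           # single prompt; batch size is read from specializations.json
    prompts_txt_file_path=None,    # pass a txt file here instead for batch size > 1
    device_id=[0],
)
```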