From 042f2c132eb923de8ccdde735b71f833d440d2e7 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Thu, 16 May 2024 11:43:28 +0530 Subject: [PATCH 01/28] [QEff]: Update infer and execute API to take prompts from txt file for bs>1 Signed-off-by: mamtsing Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 44 +++++++++++++++++++++++++++++++++---- QEfficient/cloud/infer.py | 43 +++++++++++++++++++++++++++++++----- examples/prompts.txt | 3 +++ 3 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 examples/prompts.txt diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2bd5626e..9734e3d0 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -19,6 +19,7 @@ def main( model_name: str, prompt: str, + inputs_file_path: str, qpc_path: str, devices: List[int], cache_dir: str = Constants.CACHE_DIR, @@ -38,7 +39,29 @@ def main( model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) + if inputs_file_path is not None: + try: + prompt = [] + with open(inputs_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + except FileNotFoundError: + print("inputs file not found.") + + qpc_dir_name = qpc_path.strip("/").split("/")[-2] + compilation_batch_size = int(qpc_dir_name.split("BS")[0].split("_")[-1]) + + if compilation_batch_size > 1: + assert ( + compilation_batch_size == len(prompt) + ), "Mismatch between number of prompts {len(prompt)} and compilation batch size {compilation_batch_size}; please pass correct input argument" + + # Execute + if compilation_batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) if __name__ == "__main__": @@ -49,9 +72,14 @@ def main( parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC") parser.add_argument( "--prompt", - type=lambda prompt: prompt.split("|"), + type=str, default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + ) + parser.add_argument( + "--inputs_file_path", + type=str, + help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--device_group", @@ -67,4 +95,12 @@ def main( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" ) args = parser.parse_args() - main(args.model_name, args.prompt, args.qpc_path, args.device_group, args.cache_dir, args.hf_token) + main( + args.model_name, + args.prompt, + args.inputs_file_path, + args.qpc_path, + args.device_group, + args.cache_dir, + args.hf_token, + ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 3492874a..de5c2743 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -49,6 +49,7 @@ def main( model_name: str, num_cores: int, prompt: str, + inputs_file_path: str, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, @@ -76,6 +77,20 @@ def main( onnx_dir_path = 
os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + if inputs_file_path is not None: + try: + prompt = [] + with open(inputs_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + except FileNotFoundError: + print("Inputs file not found.") + + if batch_size > 1: + assert ( + batch_size == len(prompt) + ), "Mismatch between number of prompts {len(prompt)} and batch size {batch_size}; please pass correct input argument" + # Get tokenizer if hf_token is not None: login(hf_token) @@ -89,7 +104,11 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) return if onnx_exists(onnx_model_path): @@ -110,7 +129,11 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) return ############################################# @@ -157,7 +180,11 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) if __name__ == "__main__": @@ -191,9 +218,15 @@ def main( ) parser.add_argument( "--prompt", - type=lambda prompt: prompt.split("|"), + type=str, default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + ) + parser.add_argument( + "--inputs_file_path", + "--inputs-file-path", + type=str, + help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/examples/prompts.txt b/examples/prompts.txt new file mode 100644 index 00000000..a91a5151 --- /dev/null +++ b/examples/prompts.txt @@ -0,0 +1,3 @@ +My name is +The sun rises from +The flat earth theory is the belief that \ No newline at end of file From 0802373c94eb2b7d4daa7dc245b1af7be5faca84 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Fri, 17 May 2024 21:07:52 +0530 Subject: [PATCH 02/28] Update infer and execute API Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 1 + QEfficient/cloud/infer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 9734e3d0..1b5c3506 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -78,6 +78,7 @@ def main( ) parser.add_argument( "--inputs_file_path", + "--inputs-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index de5c2743..b2361aff 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -189,7 +189,7 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on AIC" + description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on Cloud AI 100" ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( From bc5ca88c68bcb976fdeef9a1735a9d45e6ad05d8 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 20 May 2024 12:20:23 +0530 Subject: [PATCH 03/28] Update infer and execute API Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 52 ++++----- QEfficient/cloud/infer.py | 75 +++++++------ .../generation/text_generation_inference.py | 103 ++++++++++++++++-- 3 files changed, 163 insertions(+), 67 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 1b5c3506..3896ac8e 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,7 +11,12 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, + read_prompts_txt_file, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants @@ -19,7 +24,7 @@ def main( model_name: str, prompt: str, - inputs_file_path: str, + prompts_txt_file_path: str, qpc_path: str, devices: List[int], cache_dir: str = Constants.CACHE_DIR, @@ -35,33 +40,29 @@ def main( """ if hf_token is not None: login(hf_token) + # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - if inputs_file_path is not None: - try: - prompt = [] - with open(inputs_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - except FileNotFoundError: - print("inputs file not found.") + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - qpc_dir_name = qpc_path.strip("/").split("/")[-2] - compilation_batch_size = int(qpc_dir_name.split("BS")[0].split("_")[-1]) + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) - if compilation_batch_size > 1: - assert ( - compilation_batch_size == len(prompt) - ), "Mismatch between number of prompts {len(prompt)} and compilation batch size {compilation_batch_size}; please pass correct input argument" + compilation_batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, compilation_batch_size) # 
Execute - if compilation_batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc=qpc_path, + device_id=devices, + prompt=prompt, + ) if __name__ == "__main__": @@ -73,12 +74,11 @@ def main( parser.add_argument( "--prompt", type=str, - default="My name is", - help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", ) parser.add_argument( - "--inputs_file_path", - "--inputs-file-path", + "--prompts_txt_file_path", + "--prompts-txt-file-path-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) @@ -99,7 +99,7 @@ def main( main( args.model_name, args.prompt, - args.inputs_file_path, + args.prompts_txt_file_path, args.qpc_path, args.device_group, args.cache_dir, diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index b2361aff..7612460b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -15,7 +15,12 @@ import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, + read_prompts_txt_file, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -48,8 +53,8 @@ def onnx_exists(onnx_file_path: str) -> bool: def main( model_name: str, num_cores: int, - prompt: str, - inputs_file_path: str, + prompt: str = None, + prompts_txt_file_path: str = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, @@ -77,19 +82,12 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - if inputs_file_path is not None: - try: - prompt = [] - with open(inputs_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - except FileNotFoundError: - print("Inputs file not found.") + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - if batch_size > 1: - assert ( - batch_size == len(prompt) - ), "Mismatch between number of prompts {len(prompt)} and batch size {batch_size}; please pass correct input argument" + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) # Get tokenizer if hf_token is not None: @@ -104,13 +102,19 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + compilation_batch_size = get_compilation_batch_size(qpc_dir_path) + check_batch_size_and_num_prompts(prompt, compilation_batch_size) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) return + check_batch_size_and_num_prompts(prompt, batch_size) + if onnx_exists(onnx_model_path): # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation @@ -129,11 +133,13 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) return ############################################# @@ -180,11 +186,13 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) if __name__ == "__main__": @@ -219,12 +227,11 @@ def main( parser.add_argument( "--prompt", type=str, - default="My name is", - help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", ) parser.add_argument( - "--inputs_file_path", - "--inputs-file-path", + "--prompts_txt_file_path", + "--prompts-txt-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index c1f6d190..1be533e3 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -61,6 +61,75 @@ def write_io_files( json.dump({"IO-files": io_files}, fp, indent=True) +def get_compilation_batch_size(qpc_path: str): + qpc_base_path = os.path.dirname(qpc_path) + print(qpc_base_path) + specialization_file_path = os.path.join(qpc_base_path, "specializations.json") + print(specialization_file_path) + with open(specialization_file_path, "r") as file: + data = json.load(file) + compilation_batch_size = int(data["specializations"][0]["batch_size"]) + return compilation_batch_size + + +def check_batch_size_and_num_prompts(prompt: Union[str, List], compilation_batch_size: 
int): + if isinstance(prompt, list): + num_prompts = len(prompt) + elif isinstance(prompt, str): + num_prompts = 1 + else: + print("Input prompt sould be either string for single input or List of string in case of mutliple inputs") + if compilation_batch_size > 1: + assert ( + compilation_batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and compilation batch size {compilation_batch_size}; please pass correct input argument" + + +def read_prompts_txt_file(prompts_txt_file_path: str): + prompt = [] + with open(prompts_txt_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + return prompt + + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Union[str, List], + compilation_batch_size: int, + device_id: List[int] = [0], +): + if compilation_batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt[i]) + if i == len(prompt) - 1: + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + print_latency_stats_kv( + prompt, + generated_texts, + compilation_batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) + else: + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + print_latency_stats_kv( + prompt, + generated_texts, + compilation_batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) + + def latency_stats_bertstyle( model_name: str, qpc: str, @@ -97,25 +166,26 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") -def cloud_ai_100_exec_kv( +def exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, - prompt: str, + prompt: Union[str, List], input_len: Optional[int] = None, generation_len: Optional[int] = None, device_id: List[int] = [0], enable_debug_logs: bool = False, stream: bool = True, write_io_dir: Optional[str] = None, - automation: bool = False, ): if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id + # Load QPC session = QAICInferenceSession(qpc, device_id, enable_debug_logs=enable_debug_logs) + # Read prompt and ctx len from session prompt_len = max([x[session.binding_index_map["input_ids"]][1][1] for x in session.allowed_shapes]) ctx_len = session.allowed_shapes[0][session.binding_index_map["attention_mask"]][1][1] @@ -126,11 +196,11 @@ def cloud_ai_100_exec_kv( num_chunks = -(input_len // -prompt_len) # ceil divide without float input_len = num_chunks * prompt_len # Convert input_len to a multiple of prompt_len assert input_len <= ctx_len, "input_len should be less than ctx_len" + # Skip inputs/outputs session.skip_buffers([x for x in session.input_names if x.startswith("past_")]) session.skip_buffers([x for x in session.output_names if x.endswith("_RetainedState")]) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + # Prepare inputs for first iteration start = perf_counter() inputs = tokenizer(prompt, return_tensors="np", 
padding="max_length", max_length=input_len) @@ -146,8 +216,13 @@ def cloud_ai_100_exec_kv( cache_index = np.array([0]) inputs["cache_index"] = cache_index generated_ids = np.full((batch_size, generation_len - input_len + 1), tokenizer.pad_token_id) + if stream: - print(0, prompt[0], end=" ", flush=True) + if isinstance(prompt, list): + print(0, prompt[0], end=" ", flush=True) + else: + print(0, prompt, end=" ", flush=True) + # Run prefill for i in range(num_chunks): chunk_inputs = inputs.copy() @@ -159,6 +234,7 @@ def cloud_ai_100_exec_kv( if write_io_dir: write_io_files(inputs, outputs, write_io_dir, "prefill", "aic_batch_io", True, False) cache_index += prompt_len + # Get first token logits = outputs["logits"] if len(logits.shape) == 2: @@ -169,6 +245,7 @@ def cloud_ai_100_exec_kv( generated_ids[:, cache_index[0] - input_len] = next_token_id.squeeze(1) if stream: print(tokenizer.decode(next_token_id[0]), end=" ", flush=True) + # Skip attention_mask from next iteration to use retained attention_mask session.skip_buffers(["attention_mask"]) loop_start = perf_counter() @@ -178,6 +255,7 @@ def cloud_ai_100_exec_kv( if write_io_dir: write_io_files(inputs, outputs, write_io_dir, "decode", "aic_batch_io", True, False) write_io_dir = None + # Prepare inputs for next iteration logits = outputs["logits"] if len(logits.shape) == 2: @@ -192,14 +270,24 @@ def cloud_ai_100_exec_kv( print(tokenizer.decode(next_token_id[0]), end=" ", flush=True) end = perf_counter() generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(1 if stream else 0, batch_size): print() print(i, prompt[i], generated_texts[i]) + prefill_time = loop_start - start decode_perf = (cache_index.item() - input_len - 1) / (end - loop_start) total_perf = (cache_index.item() - input_len) / (end - start) total_time = end - start print() + + latency_stats = (generated_texts, prefill_time, decode_perf, total_perf, total_time) + return latency_stats + + +def print_latency_stats_kv( + prompt, generated_texts, batch_size, prefill_time, decode_perf, total_perf, total_time, automation: bool = False +): if automation: print() print("input=", prompt) @@ -210,6 +298,7 @@ def cloud_ai_100_exec_kv( print("Total (E2E) inference time is=", round(total_time, 2)) return print() + print("===================== Performance Stats =====================") if batch_size > 1: print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s") From 8712b87e8681bf009659b56d9630758700b4054b Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 12:15:33 +0530 Subject: [PATCH 04/28] Update README.md Signed-off-by: mamtsing --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index df5a7d12..2c85f3b4 100644 --- a/README.md +++ b/README.md @@ -116,9 +116,9 @@ This is the single e2e python api in the library, which takes model_card name as python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -# If executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol". 
Example below +# If executing for batch size>1, pass path of txt file with input prompts, Example below -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` ### 2. Use of QEfficient.cloud.excute From 81a3163616a55359c8dcee0cdee505ffdda4f892 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 13:39:41 +0530 Subject: [PATCH 05/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2c85f3b4..78291df4 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optinoal [Default-"My name is"]
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | ### 1. Use QEfficient.cloud.infer @@ -116,7 +116,7 @@ This is the single e2e python api in the library, which takes model_card name as python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -# If executing for batch size>1, pass path of txt file with input prompts, Example below +# If executing for batch size>1, pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` @@ -128,7 +128,7 @@ Once we have compiled the QPC, we can now use the precompiled QPC in execute API python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs/ --prompt "Once upon a time in" --device_group [0] ``` -We can also enable MQ, just based on the number of devices. Based on the "--device_group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. +We can also enable MQ, just based on the number of devices. Based on the "--device-group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. ```bash python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first @@ -145,7 +145,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | High Level APIs | Single SoC | Tensor Slicing | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 8 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 8 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | +| QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | | QEfficient.cloud.excute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] 
--qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | :memo: Replace $\color{green}{model}$ , $\color{green}{path}$ and $\color{green}{xyz}$ with preffered model card name, qpc path and hf token respectively. From 968dd414a3abbad90990b43b83f2efbc7bad6681 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 13:42:09 +0530 Subject: [PATCH 06/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 78291df4..6d901cc0 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Use QEfficient.cloud.infer From 0229664b3eeb9b2e398fb9241080582b47ca65e1 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 16:37:26 +0530 Subject: [PATCH 07/28] Update infer, execute and text generation interface Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 60 ++++-------- QEfficient/cloud/infer.py | 16 ++-- .../generation/text_generation_inference.py | 93 ++++++++++--------- 3 files changed, 76 insertions(+), 93 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 3896ac8e..f7076518 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,22 +11,17 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import ( - check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv, - get_compilation_batch_size, - read_prompts_txt_file, -) +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants def main( model_name: str, - prompt: str, - prompts_txt_file_path: str, qpc_path: str, - devices: List[int], + device_group: List[int], + prompt: str = None, + prompts_txt_file_path: str = None, cache_dir: str = Constants.CACHE_DIR, hf_token: str = None, ): @@ -36,32 +31,23 @@ def main( :param model_name: str. Hugging Face Model Card name, Example: [gpt2] :prompt: str. Sample prompt for the model text generation :qpc_path: str. Path to the generated binary after compilation. - :devices: List[int]. Device Ids to be used for compilation. if devices > 1. Multiple Card setup is enabled. + :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. """ + if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) - - compilation_batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, compilation_batch_size) - # Execute cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, - qpc=qpc_path, - device_id=devices, + qpc_path=qpc_path, + device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) @@ -71,6 +57,13 @@ def main( "--model_name", "--model-name", required=False, type=str, help="HF model card name for tokenizing the inputs" ) parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC") + parser.add_argument( + "--device_group", + "--device-group", + required=True, + type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], + help="Cloud AI 100 device ids (comma-separated) e.g. 
[0]", + ) parser.add_argument( "--prompt", type=str, @@ -78,17 +71,10 @@ def main( ) parser.add_argument( "--prompts_txt_file_path", - "--prompts-txt-file-path-file-path", + "--prompts-txt-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) - parser.add_argument( - "--device_group", - "--device-group", - required=True, - type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], - help="cloud AI 100 device ids (comma-separated) e.g. [0]", - ) parser.add_argument( "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" ) @@ -96,12 +82,4 @@ def main( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" ) args = parser.parse_args() - main( - args.model_name, - args.prompt, - args.prompts_txt_file_path, - args.qpc_path, - args.device_group, - args.cache_dir, - args.hf_token, - ) + main(**args.__dict__) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 7612460b..82445cd7 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -18,7 +18,6 @@ from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, - get_compilation_batch_size, read_prompts_txt_file, ) from QEfficient.utils import hf_download @@ -87,7 +86,10 @@ def main( ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) + prompts = read_prompts_txt_file(prompts_txt_file_path) + check_batch_size_and_num_prompts(prompts, batch_size) + else: + check_batch_size_and_num_prompts([prompt], batch_size) # Get tokenizer if hf_token is not None: @@ -102,19 +104,15 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - compilation_batch_size = get_compilation_batch_size(qpc_dir_path) - check_batch_size_and_num_prompts(prompt, compilation_batch_size) cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) return - check_batch_size_and_num_prompts(prompt, batch_size) - if onnx_exists(onnx_model_path): # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation @@ -134,11 +132,11 @@ def main( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) return @@ -187,11 +185,11 @@ def main( # Execute cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 1be533e3..6b730067 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -63,26 +63,19 @@ def write_io_files( def get_compilation_batch_size(qpc_path: str): qpc_base_path = os.path.dirname(qpc_path) - print(qpc_base_path) specialization_file_path = os.path.join(qpc_base_path, "specializations.json") - print(specialization_file_path) with open(specialization_file_path, "r") as file: data = json.load(file) compilation_batch_size = int(data["specializations"][0]["batch_size"]) return compilation_batch_size -def check_batch_size_and_num_prompts(prompt: Union[str, List], compilation_batch_size: int): - if isinstance(prompt, list): - num_prompts = len(prompt) - elif isinstance(prompt, str): - num_prompts = 1 - else: - print("Input prompt sould be either string for single input or List of string in case of mutliple inputs") - if compilation_batch_size > 1: +def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): + num_prompts = len(prompt) + if batch_size > 1: assert ( - compilation_batch_size == num_prompts - ), f"Mismatch between number of prompts {num_prompts} and compilation batch size {compilation_batch_size}; please pass correct input argument" + batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" def read_prompts_txt_file(prompts_txt_file_path: str): @@ -96,38 +89,55 @@ def read_prompts_txt_file(prompts_txt_file_path: str): def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: Union[str, List], - compilation_batch_size: int, + prompt: str, + prompts_txt_file_path: str, device_id: List[int] = [0], ): - if compilation_batch_size == 1 and isinstance(prompt, list): + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" + + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + 
prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + if batch_size == 1: + prefill_time = [] + decode_perf = [] + total_perf = [] + total_time = [] + generated_texts = [] for i in range(len(prompt)): - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt[i]) - if i == len(prompt) - 1: - generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - print_latency_stats_kv( - prompt, - generated_texts, - compilation_batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=False, - ) + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=[prompt[i]]) + generated_texts.append(latency_stats[0]) + prefill_time.append(latency_stats[1]) + decode_perf.append(latency_stats[2]) + total_perf.append(latency_stats[3]) + total_time.append(latency_stats[4]) + + prefill_time = np.average(prefill_time) + decode_perf = np.average(decode_perf) + total_perf = np.average(total_perf) + total_time = np.average(total_time) + else: latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - print_latency_stats_kv( - prompt, - generated_texts, - compilation_batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=False, - ) + + print_latency_stats_kv( + prompt, + generated_texts, + batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) def latency_stats_bertstyle( @@ -169,7 +179,7 @@ def latency_stats_bertstyle( def exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, - prompt: Union[str, List], + prompt: List[str], input_len: Optional[int] = None, generation_len: Optional[int] = None, device_id: List[int] = [0], @@ -218,10 +228,7 @@ def exec_kv( generated_ids = np.full((batch_size, generation_len - input_len + 1), tokenizer.pad_token_id) if stream: - if isinstance(prompt, list): - print(0, prompt[0], end=" ", flush=True) - else: - print(0, prompt, end=" ", flush=True) + print(0, prompt[0], end=" ", flush=True) # Run prefill for i in range(num_chunks): From e51431fa1b60c1d1c433e496da5df3b7c08abfa3 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 16:46:04 +0530 Subject: [PATCH 08/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index f7076518..866926a4 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -28,10 +28,11 @@ def main( """ APi() to run the Model on Cloud AI 100 Platform. --------- - :param model_name: str. Hugging Face Model Card name, Example: [gpt2] - :prompt: str. Sample prompt for the model text generation + :param model_name: str. Hugging Face Model Card name, Example: "gpt2" :qpc_path: str. Path to the generated binary after compilation. :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. + :prompt: str. Sample prompt for the model text generation + :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts (in case of batch size > 1) """ if hf_token is not None: From 18c973ca529dc474573d310d67e602b71a6f6ca0 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:01:22 +0530 Subject: [PATCH 09/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 866926a4..2c344384 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -39,7 +39,7 @@ def main( login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py"]) + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") # Execute From cef24ab62a346093c8e7e0d3af50a951a1f79a50 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 17:33:45 +0530 Subject: [PATCH 10/28] Update text generation interface Signed-off-by: mamtsing --- .../generation/text_generation_inference.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 6b730067..e6ca743e 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -92,6 +92,12 @@ def cloud_ai_100_exec_kv( prompt: str, prompts_txt_file_path: str, device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, ): assert (prompt is None and prompts_txt_file_path is not None) or ( prompt is not None and prompts_txt_file_path is None @@ -112,7 +118,17 @@ def cloud_ai_100_exec_kv( total_time = [] generated_texts = [] for i in range(len(prompt)): - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=[prompt[i]]) + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=[prompt[i]], + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) generated_texts.append(latency_stats[0]) prefill_time.append(latency_stats[1]) decode_perf.append(latency_stats[2]) @@ -125,7 +141,17 @@ def cloud_ai_100_exec_kv( total_time = np.average(total_time) else: - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=prompt, + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats print_latency_stats_kv( @@ -136,7 +162,7 @@ def cloud_ai_100_exec_kv( decode_perf, total_perf, total_time, - automation=False, + automation=automation, ) @@ -176,7 +202,7 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") -def exec_kv( +def 
cloud_ai_100_exec_kv_helper( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, prompt: List[str], From b6920c4ba55b7c88c1ea117a0bb8329596522ce3 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 17:41:37 +0530 Subject: [PATCH 11/28] Update Notebooks Signed-off-by: quic-mamta Signed-off-by: mamtsing --- notebooks/QEfficientGPT2.ipynb | 2 +- notebooks/QEfficientMPT.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 8984aa54..be7e3d44 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -166,7 +166,7 @@ "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index ba3b8b60..d7e45d92 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -165,7 +165,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 20cdb52ed9efd43f11ea41c040907d48f11c4a82 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:47:23 +0530 Subject: [PATCH 12/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Signed-off-by: mamtsing --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4eae5581..6a856b77 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Model download and transform @@ -269,7 +269,7 @@ from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach -cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. From 80fb101b1b2934d0bdb0b8104a2f5d6492425b0a Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:51:43 +0530 Subject: [PATCH 13/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Signed-off-by: mamtsing --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6a856b77..80dd2e76 100644 --- a/README.md +++ b/README.md @@ -111,14 +111,14 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| | QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Use QEfficient.cloud.infer This is the single e2e python api in the library, which takes model_card name as input along with other compile args if necessary and does everything in one go. -* Torch Download → Optimize for Cloud AI 100 → Export to ONNX → Verify (CPU) → Compile on Cloud AI 100 → [Execute](#2-use-of-qefficientcloudexcute) +* Torch Download → Optimize for Cloud AI 100 → Export to ONNX → Verify (CPU) → Compile on Cloud AI 100 → [Execute](#2-use-of-qefficientcloudexecute) * Its skips the ONNX export/compile stage if ONNX file or qpc found on path @@ -131,12 +131,12 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` -### 2. Use of QEfficient.cloud.excute +### 2. Use of QEfficient.cloud.execute Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts, like below: ```bash -python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs/ --prompt "Once upon a time in" --device_group [0] +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] ``` We can also enable MQ, just based on the number of devices. Based on the "--device-group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. @@ -157,7 +157,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | High Level APIs | Single SoC | Tensor Slicing | |-----------------|------------|-------------------| | QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | -| QEfficient.cloud.excute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | +| QEfficient.cloud.execute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | :memo: Replace $\color{green}{model}$ , $\color{green}{path}$ and $\color{green}{xyz}$ with preffered model card name, qpc path and hf token respectively. 
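The `--device-group` note above is what decides whether a tensor-slicing (multi-device) configuration is generated at compile time. A minimal illustrative sketch of that decision rule; the helper name below is hypothetical and not part of the patch:

```python
from typing import List


def needs_tensor_slicing(device_group: List[int]) -> bool:
    """Hypothetical helper: a multi-device (MQ) setup is assumed whenever more than
    one device id is passed, mirroring the --device-group behaviour described above."""
    return len(device_group) > 1


print(needs_tensor_slicing([0]))     # False -> single SoC execution, TS compilation skipped
print(needs_tensor_slicing([0, 1]))  # True  -> TS config generated for 2 devices
```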
From 01999cab6d6d37c01dcb3baa08866de254c29720 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 19:12:48 +0530 Subject: [PATCH 14/28] Update text_generation_inference.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/generation/text_generation_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index e6ca743e..1eb7c27b 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -89,8 +89,8 @@ def read_prompts_txt_file(prompts_txt_file_path: str): def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: str, - prompts_txt_file_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, device_id: List[int] = [0], input_len: Optional[int] = None, generation_len: Optional[int] = None, From 94b7ead01fb36044a091174265964f75ff3850d1 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 15:11:19 +0530 Subject: [PATCH 15/28] Update infer and execute and text generation interface Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 7 +- QEfficient/cloud/infer.py | 17 +- .../generation/text_generation_inference.py | 207 +++++++++--------- 3 files changed, 116 insertions(+), 115 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2c344384..6a1aeb63 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -67,14 +67,15 @@ def main( ) parser.add_argument( "--prompt", - type=str, - help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", + type=lambda prompt: prompt.split("|"), + default="My name is", + help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 39a95418..fe55ceaf 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -81,15 +81,17 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" + print("prompt : ", prompt) + print("prompts_txt_file_path : ", prompts_txt_file_path) if prompts_txt_file_path is not None: + logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") prompts = read_prompts_txt_file(prompts_txt_file_path) check_batch_size_and_num_prompts(prompts, batch_size) else: - check_batch_size_and_num_prompts([prompt], batch_size) + if isinstance(prompt, str): + prompt = [prompt] + 
check_batch_size_and_num_prompts(prompt, batch_size) # Get tokenizer if hf_token is not None: @@ -226,14 +228,15 @@ def main( ) parser.add_argument( "--prompt", - type=str, - help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", + type=lambda prompt: prompt.split("|"), + default="My name is", + help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 1eb7c27b..38eca70c 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -61,111 +61,6 @@ def write_io_files( json.dump({"IO-files": io_files}, fp, indent=True) -def get_compilation_batch_size(qpc_path: str): - qpc_base_path = os.path.dirname(qpc_path) - specialization_file_path = os.path.join(qpc_base_path, "specializations.json") - with open(specialization_file_path, "r") as file: - data = json.load(file) - compilation_batch_size = int(data["specializations"][0]["batch_size"]) - return compilation_batch_size - - -def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): - num_prompts = len(prompt) - if batch_size > 1: - assert ( - batch_size == num_prompts - ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" - - -def read_prompts_txt_file(prompts_txt_file_path: str): - prompt = [] - with open(prompts_txt_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - return prompt - - -def cloud_ai_100_exec_kv( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, - device_id: List[int] = [0], - input_len: Optional[int] = None, - generation_len: Optional[int] = None, - enable_debug_logs: bool = False, - stream: bool = True, - write_io_dir: Optional[str] = None, - automation=False, -): - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) - - if batch_size == 1: - prefill_time = [] - decode_perf = [] - total_perf = [] - total_time = [] - generated_texts = [] - for i in range(len(prompt)): - latency_stats = cloud_ai_100_exec_kv_helper( - tokenizer=tokenizer, - prompt=[prompt[i]], - qpc=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - ) - generated_texts.append(latency_stats[0]) - prefill_time.append(latency_stats[1]) - decode_perf.append(latency_stats[2]) - total_perf.append(latency_stats[3]) - total_time.append(latency_stats[4]) - - prefill_time = 
np.average(prefill_time) - decode_perf = np.average(decode_perf) - total_perf = np.average(total_perf) - total_time = np.average(total_time) - - else: - latency_stats = cloud_ai_100_exec_kv_helper( - tokenizer=tokenizer, - prompt=prompt, - qpc=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - ) - generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - - print_latency_stats_kv( - prompt, - generated_texts, - batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=automation, - ) - - def latency_stats_bertstyle( model_name: str, qpc: str, @@ -202,6 +97,31 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") +def get_compilation_batch_size(qpc_path: str): + qpc_base_path = os.path.dirname(qpc_path) + specialization_file_path = os.path.join(qpc_base_path, "specializations.json") + with open(specialization_file_path, "r") as file: + data = json.load(file) + compilation_batch_size = int(data["specializations"][0]["batch_size"]) + return compilation_batch_size + + +def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): + num_prompts = len(prompt) + if batch_size > 1: + assert ( + batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" + + +def read_prompts_txt_file(prompts_txt_file_path: str): + prompt = [] + with open(prompts_txt_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + return prompt + + def cloud_ai_100_exec_kv_helper( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, @@ -344,3 +264,80 @@ def print_latency_stats_kv( print("E2E:", round(total_perf, 2), "tok/s") print("Total (E2E) inference time is=", round(total_time, 2), "s") print("=============================================================") + + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, + device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, +): + if prompts_txt_file_path is not None: + logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + if batch_size == 1: + prefill_time = [] + decode_perf = [] + total_perf = [] + total_time = [] + generated_texts = [] + for i in range(len(prompt)): + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=[prompt[i]], + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) + generated_texts.append(latency_stats[0]) + prefill_time.append(latency_stats[1]) + decode_perf.append(latency_stats[2]) + total_perf.append(latency_stats[3]) + total_time.append(latency_stats[4]) + + prefill_time = np.average(prefill_time) + decode_perf = np.average(decode_perf) + total_perf = np.average(total_perf) + total_time = 
np.average(total_time) + + else: + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=prompt, + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + + print_latency_stats_kv( + prompt, + generated_texts, + batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=automation, + ) From 885c07bccc7b3d09d39fcc96b8eb57065eb2918d Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:21:41 +0530 Subject: [PATCH 16/28] Update infer.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/infer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index fe55ceaf..934f4e8a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -81,9 +81,6 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - print("prompt : ", prompt) - print("prompts_txt_file_path : ", prompts_txt_file_path) - if prompts_txt_file_path is not None: logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") prompts = read_prompts_txt_file(prompts_txt_file_path) @@ -230,7 +227,7 @@ def main( "--prompt", type=lambda prompt: prompt.split("|"), default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", From bc615b4dbe68c4c7b81086d5e8ecd580863a3a4c Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:52:15 +0530 Subject: [PATCH 17/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 80dd2e76..cb7ee4c4 100644 --- a/README.md +++ b/README.md @@ -110,8 +110,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | ### 1. Use QEfficient.cloud.infer @@ -126,8 +126,14 @@ This is the single e2e python api in the library, which takes model_card name as # Check out the options using the help menu python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first - -# If executing for batch size>1, pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . + +# If executing for batch size>1, + +Either pass input prompts in single string but seperate with pipe (|) symbol". Example below + +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first + +Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` @@ -174,7 +180,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | ### 1. Model download and transform From 6303154e084b14e1ddbfb95cf0c909f94c18f8c7 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:54:34 +0530 Subject: [PATCH 18/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb7ee4c4..b4744c36 100644 --- a/README.md +++ b/README.md @@ -129,11 +129,11 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 # If executing for batch size>1, -Either pass input prompts in single string but seperate with pipe (|) symbol". Example below +#Either pass input prompts in single string but seperate with pipe (|) symbol". Example below python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first -Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . +#Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` From 52e74cb6424a69fb9c1fa5c34298b18c9f6d7c98 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:05:56 +0530 Subject: [PATCH 19/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b4744c36..e82346ce 100644 --- a/README.md +++ b/README.md @@ -129,11 +129,12 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 # If executing for batch size>1, -#Either pass input prompts in single string but seperate with pipe (|) symbol". Example below +# Either pass input prompts in single string but seperate with pipe (|) symbol". Example below -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first -#Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . +# Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . 
python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` From 74984514be2667816c573e6294c9fa65a1cbdd44 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:06:44 +0530 Subject: [PATCH 20/28] Update infer.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 934f4e8a..08b9dfef 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -200,7 +200,7 @@ def main( ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" + "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" From a6b04809d71e2b0e1fe49822b7f01cddbf3de545 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:07:17 +0530 Subject: [PATCH 21/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 6a1aeb63..fecdcc9d 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -78,7 +78,7 @@ def main( help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" + "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" From be8857120f1349510312af2aec320f1e904cbf49 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 17:25:59 +0530 Subject: [PATCH 22/28] Update files Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 3 +- QEfficient/cloud/infer.py | 27 +++++----- .../generation/text_generation_inference.py | 51 +++++++++++++++---- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index fecdcc9d..1715f2d8 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -32,7 +32,7 @@ def main( :qpc_path: str. Path to the generated binary after compilation. :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts (in case of batch size > 1) + :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts """ if hf_token is not None: @@ -68,7 +68,6 @@ def main( parser.add_argument( "--prompt", type=lambda prompt: prompt.split("|"), - default="My name is", help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 08b9dfef..e3ea8b2f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -17,7 +17,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv, + cloud_ai_100_exec_kv_helper_loop, read_prompts_txt_file, ) from QEfficient.utils import hf_download @@ -80,15 +80,17 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - + + assert prompt is not None or prompts_txt_file_path is not None, "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" + if prompts_txt_file_path is not None: - logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompts = read_prompts_txt_file(prompts_txt_file_path) - check_batch_size_and_num_prompts(prompts, batch_size) + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) else: if isinstance(prompt, str): prompt = [prompt] - check_batch_size_and_num_prompts(prompt, batch_size) + check_batch_size_and_num_prompts(prompt, batch_size) # Get tokenizer if hf_token is not None: @@ -105,12 +107,12 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) return @@ -132,12 +134,12 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) return @@ -185,12 +187,12 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) @@ -226,7 +228,6 @@ def main( parser.add_argument( "--prompt", type=lambda prompt: prompt.split("|"), - default="My name is", help="Input prompt, if executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol", ) parser.add_argument( diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 38eca70c..ad96e90a 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -266,11 +266,11 @@ def print_latency_stats_kv( print("=============================================================") -def cloud_ai_100_exec_kv( +def cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, + prompt: Optional[List[str]] = None, device_id: List[int] = [0], input_len: Optional[int] = None, generation_len: Optional[int] = None, @@ -279,14 +279,6 @@ def cloud_ai_100_exec_kv( write_io_dir: Optional[str] = None, automation=False, ): - if prompts_txt_file_path is not None: - logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) if batch_size == 1: prefill_time = [] @@ -341,3 +333,40 @@ def cloud_ai_100_exec_kv( total_time, automation=automation, ) + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, + device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, +): + if prompts_txt_file_path is not None: + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + cloud_ai_100_exec_kv_helper_loop( + batch_size, + tokenizer=tokenizer, + prompt=prompt, + qpc_path=qpc_path, + device_id=device_id, + input_len=input_len, + 
generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + automation=False, + ) \ No newline at end of file From 0711073e4a7fb0d71cd43085a7e9a4858a097c8e Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 22:54:55 +0530 Subject: [PATCH 23/28] Update files Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 17 ++++-- QEfficient/cloud/infer.py | 28 ++++------ .../generation/text_generation_inference.py | 53 +++++-------------- 3 files changed, 38 insertions(+), 60 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 1715f2d8..c1ec39ab 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,7 +11,11 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants @@ -42,13 +46,16 @@ def main( model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + batch_size = get_compilation_batch_size(qpc_path) + prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + # Execute cloud_ai_100_exec_kv( + batch_size=batch_size, tokenizer=tokenizer, qpc_path=qpc_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) @@ -77,7 +84,11 @@ def main( help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" + "--cache-dir", + "--cache_dir", + default=Constants.CACHE_DIR, + required=False, + help="Cache dir to store HF Downloads", ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index e3ea8b2f..23893d3d 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -17,8 +17,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv_helper_loop, - read_prompts_txt_file, + cloud_ai_100_exec_kv, ) from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants @@ -80,17 +79,8 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - assert prompt is not None or prompts_txt_file_path is not None, "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - if prompt is not None: - logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - else: - if isinstance(prompt, str): - prompt = [prompt] - check_batch_size_and_num_prompts(prompt, batch_size) + + prompt = check_batch_size_and_num_prompts(prompt, 
prompts_txt_file_path, batch_size) # Get tokenizer if hf_token is not None: @@ -107,7 +97,7 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -134,7 +124,7 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -187,7 +177,7 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -202,7 +192,11 @@ def main( ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" + "--cache-dir", + "--cache_dir", + default=Constants.CACHE_DIR, + required=False, + help="Cache dir to store HF Downloads", ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index ad96e90a..59b03160 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -106,12 +106,23 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): + assert ( + prompt is not None or prompts_txt_file_path is not None + ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" + if prompts_txt_file_path is not None: + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + num_prompts = len(prompt) if batch_size > 1: assert ( batch_size == num_prompts ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" + return prompt def read_prompts_txt_file(prompts_txt_file_path: str): @@ -266,7 +277,7 @@ def print_latency_stats_kv( print("=============================================================") -def cloud_ai_100_exec_kv_helper_loop( +def cloud_ai_100_exec_kv( batch_size, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, @@ -279,7 +290,6 @@ def cloud_ai_100_exec_kv_helper_loop( write_io_dir: Optional[str] = None, automation=False, ): - if batch_size == 1: prefill_time = [] decode_perf = [] @@ -333,40 +343,3 @@ def cloud_ai_100_exec_kv_helper_loop( total_time, automation=automation, ) - -def cloud_ai_100_exec_kv( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, - device_id: List[int] = [0], - input_len: Optional[int] = None, - generation_len: Optional[int] = None, - enable_debug_logs: bool = False, - stream: bool = True, - write_io_dir: Optional[str] = None, - 
automation=False, -): - if prompts_txt_file_path is not None: - if prompt is not None: - logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) - - cloud_ai_100_exec_kv_helper_loop( - batch_size, - tokenizer=tokenizer, - prompt=prompt, - qpc_path=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - automation=False, - ) \ No newline at end of file From 5449fbb6c663ea6c9c9bee7c381424745fa6c1f2 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:03:40 +0530 Subject: [PATCH 24/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e82346ce..0f3a9501 100644 --- a/README.md +++ b/README.md @@ -110,9 +110,10 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +**One argument, prompt or prompts_txt_file_path must be passed. ### 1. Use QEfficient.cloud.infer From 17096a3e513d8d4e4edbcb39821fec8a0a56c3a6 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:17:05 +0530 Subject: [PATCH 25/28] Update QEfficientGPT2.ipynb Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- notebooks/QEfficientGPT2.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index be7e3d44..668a3b47 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -162,11 +162,12 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv\n", + "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n" + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 107b4145b7716ba2265a552b285a1c4e7614aeef Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:18:51 +0530 Subject: [PATCH 26/28] Update QEfficientMPT.ipynb Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- notebooks/QEfficientMPT.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index d7e45d92..8533eedc 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -160,12 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv\n", + "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)" + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 0e567faa8fd8ac894a90d7c567622a1add9a914b Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:23:15 +0530 Subject: [PATCH 27/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f3a9501..521e8688 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,9 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform 
| [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | + +**One argument, prompt or prompts_txt_file_path must be passed. ### 1. Model download and transform @@ -273,10 +275,11 @@ generated_qpc_path = compile( Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec ```bash -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach +batch_size = get_compilation_batch_size(generated_qpc_path) cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. From ade2c135ae8f2b3c682753b557bf7aa8abcdf9c5 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Thu, 23 May 2024 14:59:24 +0530 Subject: [PATCH 28/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 521e8688..36604912 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach batch_size = get_compilation_batch_size(generated_qpc_path) -cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out.
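Taken together, the series leaves the benchmark step looking roughly like the sketch below: the compilation batch size is read back from the QPC's specializations.json, the prompt inputs are validated against it, and both are passed to `cloud_ai_100_exec_kv`. Only the function names and argument order come from the patches above; the model card, tokenizer setup and QPC path are placeholders.

```python
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    cloud_ai_100_exec_kv,
    get_compilation_batch_size,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")  # placeholder model card
generated_qpc_path = "qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs"  # placeholder

# Batch size is recovered from specializations.json next to the compiled QPC
batch_size = get_compilation_batch_size(generated_qpc_path)

# Validates that exactly one prompt source is given and that the number of prompts
# matches the compiled batch size (a single prompt assumes a batch-size-1 QPC);
# returns the prompts as a list.
prompt = check_batch_size_and_num_prompts("My name is", None, batch_size)

cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=prompt,
)
```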