From 042f2c132eb923de8ccdde735b71f833d440d2e7 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Thu, 16 May 2024 11:43:28 +0530 Subject: [PATCH 01/28] [QEff]: Update infer and execute API to take prompts from txt file for bs>1 Signed-off-by: mamtsing Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 44 +++++++++++++++++++++++++++++++++---- QEfficient/cloud/infer.py | 43 +++++++++++++++++++++++++++++++----- examples/prompts.txt | 3 +++ 3 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 examples/prompts.txt diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2bd5626e..9734e3d0 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -19,6 +19,7 @@ def main( model_name: str, prompt: str, + inputs_file_path: str, qpc_path: str, devices: List[int], cache_dir: str = Constants.CACHE_DIR, @@ -38,7 +39,29 @@ def main( model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) + if inputs_file_path is not None: + try: + prompt = [] + with open(inputs_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + except FileNotFoundError: + print("inputs file not found.") + + qpc_dir_name = qpc_path.strip("/").split("/")[-2] + compilation_batch_size = int(qpc_dir_name.split("BS")[0].split("_")[-1]) + + if compilation_batch_size > 1: + assert ( + compilation_batch_size == len(prompt) + ), "Mismatch between number of prompts {len(prompt)} and compilation batch size {compilation_batch_size}; please pass correct input argument" + + # Execute + if compilation_batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) if __name__ == "__main__": @@ -49,9 +72,14 @@ def main( parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC") parser.add_argument( "--prompt", - type=lambda prompt: prompt.split("|"), + type=str, default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + ) + parser.add_argument( + "--inputs_file_path", + type=str, + help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--device_group", @@ -67,4 +95,12 @@ def main( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" ) args = parser.parse_args() - main(args.model_name, args.prompt, args.qpc_path, args.device_group, args.cache_dir, args.hf_token) + main( + args.model_name, + args.prompt, + args.inputs_file_path, + args.qpc_path, + args.device_group, + args.cache_dir, + args.hf_token, + ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 3492874a..de5c2743 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -49,6 +49,7 @@ def main( model_name: str, num_cores: int, prompt: str, + inputs_file_path: str, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, @@ -76,6 +77,20 @@ def main( onnx_dir_path = 
os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + if inputs_file_path is not None: + try: + prompt = [] + with open(inputs_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + except FileNotFoundError: + print("Inputs file not found.") + + if batch_size > 1: + assert ( + batch_size == len(prompt) + ), "Mismatch between number of prompts {len(prompt)} and batch size {batch_size}; please pass correct input argument" + # Get tokenizer if hf_token is not None: login(hf_token) @@ -89,7 +104,11 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) return if onnx_exists(onnx_model_path): @@ -110,7 +129,11 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) return ############################################# @@ -157,7 +180,11 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + if batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt[i]) + else: + cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) if __name__ == "__main__": @@ -191,9 +218,15 @@ def main( ) parser.add_argument( "--prompt", - type=lambda prompt: prompt.split("|"), + type=str, default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + ) + parser.add_argument( + "--inputs_file_path", + "--inputs-file-path", + type=str, + help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/examples/prompts.txt b/examples/prompts.txt new file mode 100644 index 00000000..a91a5151 --- /dev/null +++ b/examples/prompts.txt @@ -0,0 +1,3 @@ +My name is +The sun rises from +The flat earth theory is the belief that \ No newline at end of file From 0802373c94eb2b7d4daa7dc245b1af7be5faca84 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Fri, 17 May 2024 21:07:52 +0530 Subject: [PATCH 02/28] Update infer and execute API Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 1 + QEfficient/cloud/infer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 9734e3d0..1b5c3506 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -78,6 +78,7 @@ def main( ) parser.add_argument( "--inputs_file_path", + "--inputs-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index de5c2743..b2361aff 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -189,7 +189,7 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on AIC" + description="Inference command, the model will be downloaded from HF, optmized, compiled, executed on Cloud AI 100" ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( From bc5ca88c68bcb976fdeef9a1735a9d45e6ad05d8 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 20 May 2024 12:20:23 +0530 Subject: [PATCH 03/28] Update infer and execute API Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 52 ++++----- QEfficient/cloud/infer.py | 75 +++++++------ .../generation/text_generation_inference.py | 103 ++++++++++++++++-- 3 files changed, 163 insertions(+), 67 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 1b5c3506..3896ac8e 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,7 +11,12 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, + read_prompts_txt_file, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants @@ -19,7 +24,7 @@ def main( model_name: str, prompt: str, - inputs_file_path: str, + prompts_txt_file_path: str, qpc_path: str, devices: List[int], cache_dir: str = Constants.CACHE_DIR, @@ -35,33 +40,29 @@ def main( """ if hf_token is not None: login(hf_token) + # Download tokenizer along with model if it doesn't exist model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - if inputs_file_path is not None: - try: - prompt = [] - with open(inputs_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - except FileNotFoundError: - print("inputs file not found.") + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - qpc_dir_name = qpc_path.strip("/").split("/")[-2] - compilation_batch_size = int(qpc_dir_name.split("BS")[0].split("_")[-1]) + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) - if compilation_batch_size > 1: - assert ( - compilation_batch_size == len(prompt) - ), "Mismatch between number of prompts {len(prompt)} and compilation batch size {compilation_batch_size}; please pass correct input argument" + compilation_batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, compilation_batch_size) # 
Execute - if compilation_batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=devices, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc=qpc_path, + device_id=devices, + prompt=prompt, + ) if __name__ == "__main__": @@ -73,12 +74,11 @@ def main( parser.add_argument( "--prompt", type=str, - default="My name is", - help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", ) parser.add_argument( - "--inputs_file_path", - "--inputs-file-path", + "--prompts_txt_file_path", + "--prompts-txt-file-path-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) @@ -99,7 +99,7 @@ def main( main( args.model_name, args.prompt, - args.inputs_file_path, + args.prompts_txt_file_path, args.qpc_path, args.device_group, args.cache_dir, diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index b2361aff..7612460b 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -15,7 +15,12 @@ import QEfficient from QEfficient.cloud.compile import main as compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, + read_prompts_txt_file, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -48,8 +53,8 @@ def onnx_exists(onnx_file_path: str) -> bool: def main( model_name: str, num_cores: int, - prompt: str, - inputs_file_path: str, + prompt: str = None, + prompts_txt_file_path: str = None, aic_enable_depth_first: bool = False, mos: int = -1, cache_dir: str = Constants.CACHE_DIR, @@ -77,19 +82,12 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - if inputs_file_path is not None: - try: - prompt = [] - with open(inputs_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - except FileNotFoundError: - print("Inputs file not found.") + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - if batch_size > 1: - assert ( - batch_size == len(prompt) - ), "Mismatch between number of prompts {len(prompt)} and batch size {batch_size}; please pass correct input argument" + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) # Get tokenizer if hf_token is not None: @@ -104,13 +102,19 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + compilation_batch_size = get_compilation_batch_size(qpc_dir_path) + check_batch_size_and_num_prompts(prompt, compilation_batch_size) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) return + check_batch_size_and_num_prompts(prompt, batch_size) + if onnx_exists(onnx_model_path): # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation @@ -129,11 +133,13 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=qpc_dir_path, device_id=device_group, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) return ############################################# @@ -180,11 +186,13 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - if batch_size == 1 and isinstance(prompt, list): - for i in range(len(prompt)): - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt[i]) - else: - cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=device_group, prompt=prompt) + cloud_ai_100_exec_kv( + compilation_batch_size=compilation_batch_size, + tokenizer=tokenizer, + qpc_path=qpc_dir_path, + device_id=device_group, + prompt=prompt, + ) if __name__ == "__main__": @@ -219,12 +227,11 @@ def main( parser.add_argument( "--prompt", type=str, - default="My name is", - help="Input prompt, if executing for batch size>1, use inputs_file_path flag", + help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", ) parser.add_argument( - "--inputs_file_path", - "--inputs-file-path", + "--prompts_txt_file_path", + "--prompts-txt-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index c1f6d190..1be533e3 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -61,6 +61,75 @@ def write_io_files( json.dump({"IO-files": io_files}, fp, indent=True) +def get_compilation_batch_size(qpc_path: str): + qpc_base_path = os.path.dirname(qpc_path) + print(qpc_base_path) + specialization_file_path = os.path.join(qpc_base_path, "specializations.json") + print(specialization_file_path) + with open(specialization_file_path, "r") as file: + data = json.load(file) + compilation_batch_size = int(data["specializations"][0]["batch_size"]) + return compilation_batch_size + + +def check_batch_size_and_num_prompts(prompt: Union[str, List], compilation_batch_size: 
int): + if isinstance(prompt, list): + num_prompts = len(prompt) + elif isinstance(prompt, str): + num_prompts = 1 + else: + print("Input prompt sould be either string for single input or List of string in case of mutliple inputs") + if compilation_batch_size > 1: + assert ( + compilation_batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and compilation batch size {compilation_batch_size}; please pass correct input argument" + + +def read_prompts_txt_file(prompts_txt_file_path: str): + prompt = [] + with open(prompts_txt_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + return prompt + + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Union[str, List], + compilation_batch_size: int, + device_id: List[int] = [0], +): + if compilation_batch_size == 1 and isinstance(prompt, list): + for i in range(len(prompt)): + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt[i]) + if i == len(prompt) - 1: + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + print_latency_stats_kv( + prompt, + generated_texts, + compilation_batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) + else: + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + print_latency_stats_kv( + prompt, + generated_texts, + compilation_batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) + + def latency_stats_bertstyle( model_name: str, qpc: str, @@ -97,25 +166,26 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") -def cloud_ai_100_exec_kv( +def exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, - prompt: str, + prompt: Union[str, List], input_len: Optional[int] = None, generation_len: Optional[int] = None, device_id: List[int] = [0], enable_debug_logs: bool = False, stream: bool = True, write_io_dir: Optional[str] = None, - automation: bool = False, ): if tokenizer.padding_side != "left": - logger.warning(f"Please use padding_side='left' while initializing the tokenizer") + logger.warning("Please use padding_side='left' while initializing the tokenizer") tokenizer.padding_side = "left" if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id + # Load QPC session = QAICInferenceSession(qpc, device_id, enable_debug_logs=enable_debug_logs) + # Read prompt and ctx len from session prompt_len = max([x[session.binding_index_map["input_ids"]][1][1] for x in session.allowed_shapes]) ctx_len = session.allowed_shapes[0][session.binding_index_map["attention_mask"]][1][1] @@ -126,11 +196,11 @@ def cloud_ai_100_exec_kv( num_chunks = -(input_len // -prompt_len) # ceil divide without float input_len = num_chunks * prompt_len # Convert input_len to a multiple of prompt_len assert input_len <= ctx_len, "input_len should be less than ctx_len" + # Skip inputs/outputs session.skip_buffers([x for x in session.input_names if x.startswith("past_")]) session.skip_buffers([x for x in session.output_names if x.endswith("_RetainedState")]) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + # Prepare inputs for first iteration start = perf_counter() inputs = tokenizer(prompt, return_tensors="np", 
padding="max_length", max_length=input_len) @@ -146,8 +216,13 @@ def cloud_ai_100_exec_kv( cache_index = np.array([0]) inputs["cache_index"] = cache_index generated_ids = np.full((batch_size, generation_len - input_len + 1), tokenizer.pad_token_id) + if stream: - print(0, prompt[0], end=" ", flush=True) + if isinstance(prompt, list): + print(0, prompt[0], end=" ", flush=True) + else: + print(0, prompt, end=" ", flush=True) + # Run prefill for i in range(num_chunks): chunk_inputs = inputs.copy() @@ -159,6 +234,7 @@ def cloud_ai_100_exec_kv( if write_io_dir: write_io_files(inputs, outputs, write_io_dir, "prefill", "aic_batch_io", True, False) cache_index += prompt_len + # Get first token logits = outputs["logits"] if len(logits.shape) == 2: @@ -169,6 +245,7 @@ def cloud_ai_100_exec_kv( generated_ids[:, cache_index[0] - input_len] = next_token_id.squeeze(1) if stream: print(tokenizer.decode(next_token_id[0]), end=" ", flush=True) + # Skip attention_mask from next iteration to use retained attention_mask session.skip_buffers(["attention_mask"]) loop_start = perf_counter() @@ -178,6 +255,7 @@ def cloud_ai_100_exec_kv( if write_io_dir: write_io_files(inputs, outputs, write_io_dir, "decode", "aic_batch_io", True, False) write_io_dir = None + # Prepare inputs for next iteration logits = outputs["logits"] if len(logits.shape) == 2: @@ -192,14 +270,24 @@ def cloud_ai_100_exec_kv( print(tokenizer.decode(next_token_id[0]), end=" ", flush=True) end = perf_counter() generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(1 if stream else 0, batch_size): print() print(i, prompt[i], generated_texts[i]) + prefill_time = loop_start - start decode_perf = (cache_index.item() - input_len - 1) / (end - loop_start) total_perf = (cache_index.item() - input_len) / (end - start) total_time = end - start print() + + latency_stats = (generated_texts, prefill_time, decode_perf, total_perf, total_time) + return latency_stats + + +def print_latency_stats_kv( + prompt, generated_texts, batch_size, prefill_time, decode_perf, total_perf, total_time, automation: bool = False +): if automation: print() print("input=", prompt) @@ -210,6 +298,7 @@ def cloud_ai_100_exec_kv( print("Total (E2E) inference time is=", round(total_time, 2)) return print() + print("===================== Performance Stats =====================") if batch_size > 1: print("Prefill time a.k.a TTFT (batch) is :", round(prefill_time, 2), "s") From 8712b87e8681bf009659b56d9630758700b4054b Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 12:15:33 +0530 Subject: [PATCH 04/28] Update README.md Signed-off-by: mamtsing --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index df5a7d12..2c85f3b4 100644 --- a/README.md +++ b/README.md @@ -116,9 +116,9 @@ This is the single e2e python api in the library, which takes model_card name as python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -# If executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol". 
Example below +# If executing for batch size>1, pass path of txt file with input prompts, Example below -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` ### 2. Use of QEfficient.cloud.excute From 81a3163616a55359c8dcee0cdee505ffdda4f892 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 13:39:41 +0530 Subject: [PATCH 05/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2c85f3b4..78291df4 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optinoal [Default-"My name is"]
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional [Default-"My name is"]
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | ### 1. Use QEfficient.cloud.infer @@ -116,7 +116,7 @@ This is the single e2e python api in the library, which takes model_card name as python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first -# If executing for batch size>1, pass path of txt file with input prompts, Example below +# If executing for batch size>1, pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` @@ -128,7 +128,7 @@ Once we have compiled the QPC, we can now use the precompiled QPC in execute API python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs/ --prompt "Once upon a time in" --device_group [0] ``` -We can also enable MQ, just based on the number of devices. Based on the "--device_group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. +We can also enable MQ, just based on the number of devices. Based on the "--device-group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. ```bash python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first @@ -145,7 +145,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | High Level APIs | Single SoC | Tensor Slicing | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 8 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 8 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | +| QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | | QEfficient.cloud.excute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] 
--qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | :memo: Replace $\color{green}{model}$ , $\color{green}{path}$ and $\color{green}{xyz}$ with preffered model card name, qpc path and hf token respectively. From 968dd414a3abbad90990b43b83f2efbc7bad6681 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 13:42:09 +0530 Subject: [PATCH 06/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 78291df4..6d901cc0 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *only one argument, prompt or prompts_txt_file_path should be passed*
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Use QEfficient.cloud.infer From 0229664b3eeb9b2e398fb9241080582b47ca65e1 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 16:37:26 +0530 Subject: [PATCH 07/28] Update infer, execute and text generation interface Signed-off-by: mamtsing --- QEfficient/cloud/execute.py | 60 ++++-------- QEfficient/cloud/infer.py | 16 ++-- .../generation/text_generation_inference.py | 93 ++++++++++--------- 3 files changed, 76 insertions(+), 93 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 3896ac8e..f7076518 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,22 +11,17 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import ( - check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv, - get_compilation_batch_size, - read_prompts_txt_file, -) +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants def main( model_name: str, - prompt: str, - prompts_txt_file_path: str, qpc_path: str, - devices: List[int], + device_group: List[int], + prompt: str = None, + prompts_txt_file_path: str = None, cache_dir: str = Constants.CACHE_DIR, hf_token: str = None, ): @@ -36,32 +31,23 @@ def main( :param model_name: str. Hugging Face Model Card name, Example: [gpt2] :prompt: str. Sample prompt for the model text generation :qpc_path: str. Path to the generated binary after compilation. - :devices: List[int]. Device Ids to be used for compilation. if devices > 1. Multiple Card setup is enabled. + :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. """ + if hf_token is not None: login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json"]) + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) - - compilation_batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, compilation_batch_size) - # Execute cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, - qpc=qpc_path, - device_id=devices, + qpc_path=qpc_path, + device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) @@ -71,6 +57,13 @@ def main( "--model_name", "--model-name", required=False, type=str, help="HF model card name for tokenizing the inputs" ) parser.add_argument("--qpc_path", "--qpc-path", required=True, help="Path to generated QPC") + parser.add_argument( + "--device_group", + "--device-group", + required=True, + type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], + help="Cloud AI 100 device ids (comma-separated) e.g. 
[0]", + ) parser.add_argument( "--prompt", type=str, @@ -78,17 +71,10 @@ def main( ) parser.add_argument( "--prompts_txt_file_path", - "--prompts-txt-file-path-file-path", + "--prompts-txt-file-path", type=str, help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", ) - parser.add_argument( - "--device_group", - "--device-group", - required=True, - type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], - help="cloud AI 100 device ids (comma-separated) e.g. [0]", - ) parser.add_argument( "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" ) @@ -96,12 +82,4 @@ def main( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" ) args = parser.parse_args() - main( - args.model_name, - args.prompt, - args.prompts_txt_file_path, - args.qpc_path, - args.device_group, - args.cache_dir, - args.hf_token, - ) + main(**args.__dict__) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 7612460b..82445cd7 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -18,7 +18,6 @@ from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, cloud_ai_100_exec_kv, - get_compilation_batch_size, read_prompts_txt_file, ) from QEfficient.utils import hf_download @@ -87,7 +86,10 @@ def main( ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) + prompts = read_prompts_txt_file(prompts_txt_file_path) + check_batch_size_and_num_prompts(prompts, batch_size) + else: + check_batch_size_and_num_prompts([prompt], batch_size) # Get tokenizer if hf_token is not None: @@ -102,19 +104,15 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - compilation_batch_size = get_compilation_batch_size(qpc_dir_path) - check_batch_size_and_num_prompts(prompt, compilation_batch_size) cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) return - check_batch_size_and_num_prompts(prompt, batch_size) - if onnx_exists(onnx_model_path): # Compile -> execute # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation @@ -134,11 +132,11 @@ def main( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) return @@ -187,11 +185,11 @@ def main( # Execute cloud_ai_100_exec_kv( - compilation_batch_size=compilation_batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, ) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 1be533e3..6b730067 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -63,26 +63,19 @@ def write_io_files( def get_compilation_batch_size(qpc_path: str): qpc_base_path = os.path.dirname(qpc_path) - print(qpc_base_path) specialization_file_path = os.path.join(qpc_base_path, "specializations.json") - print(specialization_file_path) with open(specialization_file_path, "r") as file: data = json.load(file) compilation_batch_size = int(data["specializations"][0]["batch_size"]) return compilation_batch_size -def check_batch_size_and_num_prompts(prompt: Union[str, List], compilation_batch_size: int): - if isinstance(prompt, list): - num_prompts = len(prompt) - elif isinstance(prompt, str): - num_prompts = 1 - else: - print("Input prompt sould be either string for single input or List of string in case of mutliple inputs") - if compilation_batch_size > 1: +def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): + num_prompts = len(prompt) + if batch_size > 1: assert ( - compilation_batch_size == num_prompts - ), f"Mismatch between number of prompts {num_prompts} and compilation batch size {compilation_batch_size}; please pass correct input argument" + batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" def read_prompts_txt_file(prompts_txt_file_path: str): @@ -96,38 +89,55 @@ def read_prompts_txt_file(prompts_txt_file_path: str): def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: Union[str, List], - compilation_batch_size: int, + prompt: str, + prompts_txt_file_path: str, device_id: List[int] = [0], ): - if compilation_batch_size == 1 and isinstance(prompt, list): + assert (prompt is None and prompts_txt_file_path is not None) or ( + prompt is not None and prompts_txt_file_path is None + ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" + + if prompts_txt_file_path is not None: + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + 
prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + if batch_size == 1: + prefill_time = [] + decode_perf = [] + total_perf = [] + total_time = [] + generated_texts = [] for i in range(len(prompt)): - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt[i]) - if i == len(prompt) - 1: - generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - print_latency_stats_kv( - prompt, - generated_texts, - compilation_batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=False, - ) + latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=[prompt[i]]) + generated_texts.append(latency_stats[0]) + prefill_time.append(latency_stats[1]) + decode_perf.append(latency_stats[2]) + total_perf.append(latency_stats[3]) + total_time.append(latency_stats[4]) + + prefill_time = np.average(prefill_time) + decode_perf = np.average(decode_perf) + total_perf = np.average(total_perf) + total_time = np.average(total_time) + else: latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - print_latency_stats_kv( - prompt, - generated_texts, - compilation_batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=False, - ) + + print_latency_stats_kv( + prompt, + generated_texts, + batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=False, + ) def latency_stats_bertstyle( @@ -169,7 +179,7 @@ def latency_stats_bertstyle( def exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, - prompt: Union[str, List], + prompt: List[str], input_len: Optional[int] = None, generation_len: Optional[int] = None, device_id: List[int] = [0], @@ -218,10 +228,7 @@ def exec_kv( generated_ids = np.full((batch_size, generation_len - input_len + 1), tokenizer.pad_token_id) if stream: - if isinstance(prompt, list): - print(0, prompt[0], end=" ", flush=True) - else: - print(0, prompt, end=" ", flush=True) + print(0, prompt[0], end=" ", flush=True) # Run prefill for i in range(num_chunks): From e51431fa1b60c1d1c433e496da5df3b7c08abfa3 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 16:46:04 +0530 Subject: [PATCH 08/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index f7076518..866926a4 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -28,10 +28,11 @@ def main( """ APi() to run the Model on Cloud AI 100 Platform. --------- - :param model_name: str. Hugging Face Model Card name, Example: [gpt2] - :prompt: str. Sample prompt for the model text generation + :param model_name: str. Hugging Face Model Card name, Example: "gpt2" :qpc_path: str. Path to the generated binary after compilation. :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. + :prompt: str. Sample prompt for the model text generation + :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts (in case of batch size > 1) """ if hf_token is not None: From 18c973ca529dc474573d310d67e602b71a6f6ca0 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:01:22 +0530 Subject: [PATCH 09/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 866926a4..2c344384 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -39,7 +39,7 @@ def main( login(hf_token) # Download tokenizer along with model if it doesn't exist - model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py"]) + model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") # Execute From cef24ab62a346093c8e7e0d3af50a951a1f79a50 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 17:33:45 +0530 Subject: [PATCH 10/28] Update text generation interface Signed-off-by: mamtsing --- .../generation/text_generation_inference.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 6b730067..e6ca743e 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -92,6 +92,12 @@ def cloud_ai_100_exec_kv( prompt: str, prompts_txt_file_path: str, device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, ): assert (prompt is None and prompts_txt_file_path is not None) or ( prompt is not None and prompts_txt_file_path is None @@ -112,7 +118,17 @@ def cloud_ai_100_exec_kv( total_time = [] generated_texts = [] for i in range(len(prompt)): - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=[prompt[i]]) + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=[prompt[i]], + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) generated_texts.append(latency_stats[0]) prefill_time.append(latency_stats[1]) decode_perf.append(latency_stats[2]) @@ -125,7 +141,17 @@ def cloud_ai_100_exec_kv( total_time = np.average(total_time) else: - latency_stats = exec_kv(tokenizer=tokenizer, qpc=qpc_path, device_id=device_id, prompt=prompt) + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=prompt, + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats print_latency_stats_kv( @@ -136,7 +162,7 @@ def cloud_ai_100_exec_kv( decode_perf, total_perf, total_time, - automation=False, + automation=automation, ) @@ -176,7 +202,7 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") -def exec_kv( +def 
cloud_ai_100_exec_kv_helper( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, prompt: List[str], From b6920c4ba55b7c88c1ea117a0bb8329596522ce3 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 21 May 2024 17:41:37 +0530 Subject: [PATCH 11/28] Update Notebooks Signed-off-by: quic-mamta Signed-off-by: mamtsing --- notebooks/QEfficientGPT2.ipynb | 2 +- notebooks/QEfficientMPT.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 8984aa54..be7e3d44 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -166,7 +166,7 @@ "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index ba3b8b60..d7e45d92 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -165,7 +165,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 20cdb52ed9efd43f11ea41c040907d48f11c4a82 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:47:23 +0530 Subject: [PATCH 12/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Signed-off-by: mamtsing --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4eae5581..6a856b77 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc : $\color{green} {Mandatory}$
  • prompt : $\color{green} {Mandatory}$
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Model download and transform @@ -269,7 +269,7 @@ from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach -cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. From 80fb101b1b2934d0bdb0b8104a2f5d6492425b0a Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 17:51:43 +0530 Subject: [PATCH 13/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Signed-off-by: mamtsing --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6a856b77..80dd2e76 100644 --- a/README.md +++ b/README.md @@ -111,14 +111,14 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| | QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexcute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | ### 1. Use QEfficient.cloud.infer This is the single e2e python api in the library, which takes model_card name as input along with other compile args if necessary and does everything in one go. -* Torch Download → Optimize for Cloud AI 100 → Export to ONNX → Verify (CPU) → Compile on Cloud AI 100 → [Execute](#2-use-of-qefficientcloudexcute) +* Torch Download → Optimize for Cloud AI 100 → Export to ONNX → Verify (CPU) → Compile on Cloud AI 100 → [Execute](#2-use-of-qefficientcloudexecute) * Its skips the ONNX export/compile stage if ONNX file or qpc found on path @@ -131,12 +131,12 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` -### 2. Use of QEfficient.cloud.excute +### 2. Use of QEfficient.cloud.execute Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts, like below: ```bash -python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs/ --prompt "Once upon a time in" --device_group [0] +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] ``` We can also enable MQ, just based on the number of devices. Based on the "--device-group" as input it will create TS config on the fly. If "--device-group [0,1]" it will create TS config for 2 devices and use it for compilation, if "--device-group 0" then TS compilation is skipped and single soc execution is enabled. @@ -157,7 +157,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | High Level APIs | Single SoC | Tensor Slicing | |-----------------|------------|-------------------| | QEfficient.cloud.infer | python -m QEfficient.cloud.infer --model_name $\color{green} {model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024 --num_cores 16 --device-group [0] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 1 --aic_enable_depth_first | python -m QEfficient.cloud.infer --model_name $\color{green}{model}$ --batch_size 1 --prompt_len 128 --ctx_len 1024--num_cores 16 --device-group [0,1,2,3] --prompt "My name is" --mxfp6 --hf_token $\color{green}{xyz}$ --mos 4 --aic_enable_depth_first | -| QEfficient.cloud.excute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | +| QEfficient.cloud.execute | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | python -m QEfficient.cloud.execute --model_name $\color{green}{model}$ --device_group [0,1,2,3] --qpc_path $\color{green}{path}$ --prompt "My name is" --hf_token $\color{green}{xyz}$ | :memo: Replace $\color{green}{model}$ , $\color{green}{path}$ and $\color{green}{xyz}$ with preffered model card name, qpc path and hf token respectively. 
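The `--device-group` note above is what decides whether a tensor-slicing (multi-device) configuration is generated at compile time. A minimal illustrative sketch of that decision rule; the helper name below is hypothetical and not part of the patch:

```python
from typing import List


def needs_tensor_slicing(device_group: List[int]) -> bool:
    """Hypothetical helper: a multi-device (MQ) setup is assumed whenever more than
    one device id is passed, mirroring the --device-group behaviour described above."""
    return len(device_group) > 1


print(needs_tensor_slicing([0]))     # False -> single SoC execution, TS compilation skipped
print(needs_tensor_slicing([0, 1]))  # True  -> TS config generated for 2 devices
```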
From 01999cab6d6d37c01dcb3baa08866de254c29720 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Tue, 21 May 2024 19:12:48 +0530 Subject: [PATCH 14/28] Update text_generation_inference.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/generation/text_generation_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index e6ca743e..1eb7c27b 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -89,8 +89,8 @@ def read_prompts_txt_file(prompts_txt_file_path: str): def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: str, - prompts_txt_file_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, device_id: List[int] = [0], input_len: Optional[int] = None, generation_len: Optional[int] = None, From 94b7ead01fb36044a091174265964f75ff3850d1 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 15:11:19 +0530 Subject: [PATCH 15/28] Update infer and execute and text generation interface Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 7 +- QEfficient/cloud/infer.py | 17 +- .../generation/text_generation_inference.py | 207 +++++++++--------- 3 files changed, 116 insertions(+), 115 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 2c344384..6a1aeb63 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -67,14 +67,15 @@ def main( ) parser.add_argument( "--prompt", - type=str, - help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", + type=lambda prompt: prompt.split("|"), + default="My name is", + help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 39a95418..fe55ceaf 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -81,15 +81,17 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" + print("prompt : ", prompt) + print("prompts_txt_file_path : ", prompts_txt_file_path) if prompts_txt_file_path is not None: + logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") prompts = read_prompts_txt_file(prompts_txt_file_path) check_batch_size_and_num_prompts(prompts, batch_size) else: - check_batch_size_and_num_prompts([prompt], batch_size) + if isinstance(prompt, str): + prompt = [prompt] + 
check_batch_size_and_num_prompts(prompt, batch_size) # Get tokenizer if hf_token is not None: @@ -226,14 +228,15 @@ def main( ) parser.add_argument( "--prompt", - type=str, - help="Input prompt, if executing for batch size>1, use prompts_txt_file_path flag", + type=lambda prompt: prompt.split("|"), + default="My name is", + help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="for batch size>1, pass input prompts in txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( "--aic_enable_depth_first", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 1eb7c27b..38eca70c 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -61,111 +61,6 @@ def write_io_files( json.dump({"IO-files": io_files}, fp, indent=True) -def get_compilation_batch_size(qpc_path: str): - qpc_base_path = os.path.dirname(qpc_path) - specialization_file_path = os.path.join(qpc_base_path, "specializations.json") - with open(specialization_file_path, "r") as file: - data = json.load(file) - compilation_batch_size = int(data["specializations"][0]["batch_size"]) - return compilation_batch_size - - -def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): - num_prompts = len(prompt) - if batch_size > 1: - assert ( - batch_size == num_prompts - ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" - - -def read_prompts_txt_file(prompts_txt_file_path: str): - prompt = [] - with open(prompts_txt_file_path, "r") as file: - for line in file: - prompt.append(line.strip()) - return prompt - - -def cloud_ai_100_exec_kv( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, - device_id: List[int] = [0], - input_len: Optional[int] = None, - generation_len: Optional[int] = None, - enable_debug_logs: bool = False, - stream: bool = True, - write_io_dir: Optional[str] = None, - automation=False, -): - assert (prompt is None and prompts_txt_file_path is not None) or ( - prompt is not None and prompts_txt_file_path is None - ), "Please pass either single input string using --prompt or multiple inputs using --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) - - if batch_size == 1: - prefill_time = [] - decode_perf = [] - total_perf = [] - total_time = [] - generated_texts = [] - for i in range(len(prompt)): - latency_stats = cloud_ai_100_exec_kv_helper( - tokenizer=tokenizer, - prompt=[prompt[i]], - qpc=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - ) - generated_texts.append(latency_stats[0]) - prefill_time.append(latency_stats[1]) - decode_perf.append(latency_stats[2]) - total_perf.append(latency_stats[3]) - total_time.append(latency_stats[4]) - - prefill_time = 
np.average(prefill_time) - decode_perf = np.average(decode_perf) - total_perf = np.average(total_perf) - total_time = np.average(total_time) - - else: - latency_stats = cloud_ai_100_exec_kv_helper( - tokenizer=tokenizer, - prompt=prompt, - qpc=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - ) - generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats - - print_latency_stats_kv( - prompt, - generated_texts, - batch_size, - prefill_time, - decode_perf, - total_perf, - total_time, - automation=automation, - ) - - def latency_stats_bertstyle( model_name: str, qpc: str, @@ -202,6 +97,31 @@ def latency_stats_bertstyle( print(round((cur_len - init_len) / (end - start), 2), "tok/s") +def get_compilation_batch_size(qpc_path: str): + qpc_base_path = os.path.dirname(qpc_path) + specialization_file_path = os.path.join(qpc_base_path, "specializations.json") + with open(specialization_file_path, "r") as file: + data = json.load(file) + compilation_batch_size = int(data["specializations"][0]["batch_size"]) + return compilation_batch_size + + +def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): + num_prompts = len(prompt) + if batch_size > 1: + assert ( + batch_size == num_prompts + ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" + + +def read_prompts_txt_file(prompts_txt_file_path: str): + prompt = [] + with open(prompts_txt_file_path, "r") as file: + for line in file: + prompt.append(line.strip()) + return prompt + + def cloud_ai_100_exec_kv_helper( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc: str, @@ -344,3 +264,80 @@ def print_latency_stats_kv( print("E2E:", round(total_perf, 2), "tok/s") print("Total (E2E) inference time is=", round(total_time, 2), "s") print("=============================================================") + + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, + device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, +): + if prompts_txt_file_path is not None: + logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + if batch_size == 1: + prefill_time = [] + decode_perf = [] + total_perf = [] + total_time = [] + generated_texts = [] + for i in range(len(prompt)): + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=[prompt[i]], + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) + generated_texts.append(latency_stats[0]) + prefill_time.append(latency_stats[1]) + decode_perf.append(latency_stats[2]) + total_perf.append(latency_stats[3]) + total_time.append(latency_stats[4]) + + prefill_time = np.average(prefill_time) + decode_perf = np.average(decode_perf) + total_perf = np.average(total_perf) + total_time = 
np.average(total_time) + + else: + latency_stats = cloud_ai_100_exec_kv_helper( + tokenizer=tokenizer, + prompt=prompt, + qpc=qpc_path, + device_id=device_id, + input_len=input_len, + generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + ) + generated_texts, prefill_time, decode_perf, total_perf, total_time = latency_stats + + print_latency_stats_kv( + prompt, + generated_texts, + batch_size, + prefill_time, + decode_perf, + total_perf, + total_time, + automation=automation, + ) From 885c07bccc7b3d09d39fcc96b8eb57065eb2918d Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:21:41 +0530 Subject: [PATCH 16/28] Update infer.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/infer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index fe55ceaf..934f4e8a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -81,9 +81,6 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - print("prompt : ", prompt) - print("prompts_txt_file_path : ", prompts_txt_file_path) - if prompts_txt_file_path is not None: logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") prompts = read_prompts_txt_file(prompts_txt_file_path) @@ -230,7 +227,7 @@ def main( "--prompt", type=lambda prompt: prompt.split("|"), default="My name is", - help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", + help="Input prompt, if executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol", ) parser.add_argument( "--prompts_txt_file_path", From bc615b4dbe68c4c7b81086d5e8ecd580863a3a4c Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:52:15 +0530 Subject: [PATCH 17/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 80dd2e76..cb7ee4c4 100644 --- a/README.md +++ b/README.md @@ -110,8 +110,8 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | ### 1. Use QEfficient.cloud.infer @@ -126,8 +126,14 @@ This is the single e2e python api in the library, which takes model_card name as # Check out the options using the help menu python -m QEfficient.cloud.infer --help python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first - -# If executing for batch size>1, pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . + +# If executing for batch size>1, + +Either pass input prompts in single string but seperate with pipe (|) symbol". Example below + +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first + +Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` @@ -174,7 +180,7 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform | [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • *One argument, prompt or prompts_txt_file_path must be passed*
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | ### 1. Model download and transform From 6303154e084b14e1ddbfb95cf0c909f94c18f8c7 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 15:54:34 +0530 Subject: [PATCH 18/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb7ee4c4..b4744c36 100644 --- a/README.md +++ b/README.md @@ -129,11 +129,11 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 # If executing for batch size>1, -Either pass input prompts in single string but seperate with pipe (|) symbol". Example below +#Either pass input prompts in single string but seperate with pipe (|) symbol". Example below python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first -Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . +#Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` From 52e74cb6424a69fb9c1fa5c34298b18c9f6d7c98 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:05:56 +0530 Subject: [PATCH 19/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b4744c36..e82346ce 100644 --- a/README.md +++ b/README.md @@ -129,11 +129,12 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 # If executing for batch size>1, -#Either pass input prompts in single string but seperate with pipe (|) symbol". Example below +# Either pass input prompts in single string but seperate with pipe (|) symbol". Example below -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first -#Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . +# Or pass path of txt file with input prompts, Example below, sample txt file(prompts.txt) is present in examples folder . 
python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` From 74984514be2667816c573e6294c9fa65a1cbdd44 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:06:44 +0530 Subject: [PATCH 20/28] Update infer.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 934f4e8a..08b9dfef 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -200,7 +200,7 @@ def main( ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" + "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" From a6b04809d71e2b0e1fe49822b7f01cddbf3de545 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 16:07:17 +0530 Subject: [PATCH 21/28] Update execute.py Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/cloud/execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 6a1aeb63..fecdcc9d 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -78,7 +78,7 @@ def main( help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downlods" + "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" From be8857120f1349510312af2aec320f1e904cbf49 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 17:25:59 +0530 Subject: [PATCH 22/28] Update files Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 3 +- QEfficient/cloud/infer.py | 27 +++++----- .../generation/text_generation_inference.py | 51 +++++++++++++++---- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index fecdcc9d..1715f2d8 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -32,7 +32,7 @@ def main( :qpc_path: str. Path to the generated binary after compilation. :device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled. :prompt: str. Sample prompt for the model text generation - :prompts_txt_file_path: str. Path to txt file for multiple input prompts (in case of batch size > 1) + :prompts_txt_file_path: str. 
Path to txt file for multiple input prompts """ if hf_token is not None: @@ -68,7 +68,6 @@ def main( parser.add_argument( "--prompt", type=lambda prompt: prompt.split("|"), - default="My name is", help="Input prompt, if executing for batch size>1, pass input promprs in single string but seperate with pipe (|) symbol", ) parser.add_argument( diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 08b9dfef..e3ea8b2f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -17,7 +17,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv, + cloud_ai_100_exec_kv_helper_loop, read_prompts_txt_file, ) from QEfficient.utils import hf_download @@ -80,15 +80,17 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - + + assert prompt is not None or prompts_txt_file_path is not None, "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" + if prompts_txt_file_path is not None: - logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompts = read_prompts_txt_file(prompts_txt_file_path) - check_batch_size_and_num_prompts(prompts, batch_size) + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) else: if isinstance(prompt, str): prompt = [prompt] - check_batch_size_and_num_prompts(prompt, batch_size) + check_batch_size_and_num_prompts(prompt, batch_size) # Get tokenizer if hf_token is not None: @@ -105,12 +107,12 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! 
Trying to execute with given prompt") - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) return @@ -132,12 +134,12 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) return @@ -185,12 +187,12 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv( + cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) @@ -226,7 +228,6 @@ def main( parser.add_argument( "--prompt", type=lambda prompt: prompt.split("|"), - default="My name is", help="Input prompt, if executing for batch size>1, pass input prompts in single string but seperate with pipe (|) symbol", ) parser.add_argument( diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 38eca70c..ad96e90a 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -266,11 +266,11 @@ def print_latency_stats_kv( print("=============================================================") -def cloud_ai_100_exec_kv( +def cloud_ai_100_exec_kv_helper_loop( + batch_size, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, + prompt: Optional[List[str]] = None, device_id: List[int] = [0], input_len: Optional[int] = None, generation_len: Optional[int] = None, @@ -279,14 +279,6 @@ def cloud_ai_100_exec_kv( write_io_dir: Optional[str] = None, automation=False, ): - if prompts_txt_file_path is not None: - logger.info("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) if batch_size == 1: prefill_time = [] @@ -341,3 +333,40 @@ def cloud_ai_100_exec_kv( total_time, automation=automation, ) + +def cloud_ai_100_exec_kv( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + prompt: Optional[str] = None, + prompts_txt_file_path: Optional[str] = None, + device_id: List[int] = [0], + input_len: Optional[int] = None, + generation_len: Optional[int] = None, + enable_debug_logs: bool = False, + stream: bool = True, + write_io_dir: Optional[str] = None, + automation=False, +): + if prompts_txt_file_path is not None: + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + + batch_size = get_compilation_batch_size(qpc_path) + check_batch_size_and_num_prompts(prompt, batch_size) + + cloud_ai_100_exec_kv_helper_loop( + batch_size, + tokenizer=tokenizer, + prompt=prompt, + qpc_path=qpc_path, + device_id=device_id, + input_len=input_len, + 
generation_len=generation_len, + enable_debug_logs=enable_debug_logs, + stream=stream, + write_io_dir=write_io_dir, + automation=False, + ) \ No newline at end of file From 0711073e4a7fb0d71cd43085a7e9a4858a097c8e Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Wed, 22 May 2024 22:54:55 +0530 Subject: [PATCH 23/28] Update files Signed-off-by: Mamta Singh --- QEfficient/cloud/execute.py | 17 ++++-- QEfficient/cloud/infer.py | 28 ++++------ .../generation/text_generation_inference.py | 53 +++++-------------- 3 files changed, 38 insertions(+), 60 deletions(-) diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 1715f2d8..c1ec39ab 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -11,7 +11,11 @@ from huggingface_hub import login from transformers import AutoTokenizer -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import ( + check_batch_size_and_num_prompts, + cloud_ai_100_exec_kv, + get_compilation_batch_size, +) from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants @@ -42,13 +46,16 @@ def main( model_hf_path = hf_download(repo_id=model_name, cache_dir=cache_dir, allow_patterns=["*.json", "*.py", "*token*"]) tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True, padding_side="left") + batch_size = get_compilation_batch_size(qpc_path) + prompt = check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) + # Execute cloud_ai_100_exec_kv( + batch_size=batch_size, tokenizer=tokenizer, qpc_path=qpc_path, device_id=device_group, prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, ) @@ -77,7 +84,11 @@ def main( help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", ) parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" + "--cache-dir", + "--cache_dir", + default=Constants.CACHE_DIR, + required=False, + help="Cache dir to store HF Downloads", ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index e3ea8b2f..23893d3d 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -17,8 +17,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import ( check_batch_size_and_num_prompts, - cloud_ai_100_exec_kv_helper_loop, - read_prompts_txt_file, + cloud_ai_100_exec_kv, ) from QEfficient.utils import hf_download from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants @@ -80,17 +79,8 @@ def main( onnx_dir_path = os.path.join(model_card_dir, "onnx") onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - - assert prompt is not None or prompts_txt_file_path is not None, "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" - - if prompts_txt_file_path is not None: - if prompt is not None: - logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - else: - if isinstance(prompt, str): - prompt = [prompt] - check_batch_size_and_num_prompts(prompt, batch_size) + + prompt = check_batch_size_and_num_prompts(prompt, 
prompts_txt_file_path, batch_size) # Get tokenizer if hf_token is not None: @@ -107,7 +97,7 @@ def main( if qpc_exists(qpc_dir_path): # execute logger.info("Pre-compiled qpc found! Trying to execute with given prompt") - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -134,7 +124,7 @@ def main( assert ( generated_qpc_path == qpc_dir_path ), f"QPC files were generated at an unusual location, expected {qpc_dir_path}; got {generated_qpc_path}" - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -187,7 +177,7 @@ def main( logger.info(f"Compiled qpc files can be found at : {generated_qpc_path}") # Execute - cloud_ai_100_exec_kv_helper_loop( + cloud_ai_100_exec_kv( batch_size, tokenizer=tokenizer, qpc_path=qpc_dir_path, @@ -202,7 +192,11 @@ def main( ) parser.add_argument("--model-name", "--model_name", required=True, help="HF Model card name/id") parser.add_argument( - "--cache-dir", "--cache_dir", default=Constants.CACHE_DIR, required=False, help="Cache dir to store HF Downloads" + "--cache-dir", + "--cache_dir", + default=Constants.CACHE_DIR, + required=False, + help="Cache dir to store HF Downloads", ) parser.add_argument( "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index ad96e90a..59b03160 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -106,12 +106,23 @@ def get_compilation_batch_size(qpc_path: str): return compilation_batch_size -def check_batch_size_and_num_prompts(prompt: List[str], batch_size: int): +def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size): + assert ( + prompt is not None or prompts_txt_file_path is not None + ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path" + if prompts_txt_file_path is not None: + if prompt is not None: + logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") + prompt = read_prompts_txt_file(prompts_txt_file_path) + if isinstance(prompt, str): + prompt = [prompt] + num_prompts = len(prompt) if batch_size > 1: assert ( batch_size == num_prompts ), f"Mismatch between number of prompts {num_prompts} and batch size {batch_size}; please pass correct input argument" + return prompt def read_prompts_txt_file(prompts_txt_file_path: str): @@ -266,7 +277,7 @@ def print_latency_stats_kv( print("=============================================================") -def cloud_ai_100_exec_kv_helper_loop( +def cloud_ai_100_exec_kv( batch_size, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], qpc_path: str, @@ -279,7 +290,6 @@ def cloud_ai_100_exec_kv_helper_loop( write_io_dir: Optional[str] = None, automation=False, ): - if batch_size == 1: prefill_time = [] decode_perf = [] @@ -333,40 +343,3 @@ def cloud_ai_100_exec_kv_helper_loop( total_time, automation=automation, ) - -def cloud_ai_100_exec_kv( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - prompt: Optional[str] = None, - prompts_txt_file_path: Optional[str] = None, - device_id: List[int] = [0], - input_len: Optional[int] = None, - generation_len: Optional[int] = None, - enable_debug_logs: bool = False, - stream: bool = True, - write_io_dir: Optional[str] = None, - 
automation=False, -): - if prompts_txt_file_path is not None: - if prompt is not None: - logger.warning("Found inputs passed using txt file as well as CLI, taking inputs from given txt file") - prompt = read_prompts_txt_file(prompts_txt_file_path) - if isinstance(prompt, str): - prompt = [prompt] - - batch_size = get_compilation_batch_size(qpc_path) - check_batch_size_and_num_prompts(prompt, batch_size) - - cloud_ai_100_exec_kv_helper_loop( - batch_size, - tokenizer=tokenizer, - prompt=prompt, - qpc_path=qpc_path, - device_id=device_id, - input_len=input_len, - generation_len=generation_len, - enable_debug_logs=enable_debug_logs, - stream=stream, - write_io_dir=write_io_dir, - automation=False, - ) \ No newline at end of file From 5449fbb6c663ea6c9c9bee7c381424745fa6c1f2 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:03:40 +0530 Subject: [PATCH 24/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e82346ce..0f3a9501 100644 --- a/README.md +++ b/README.md @@ -110,9 +110,10 @@ In summary: | High Level APIs | Sample use | Arguments | |-----------------|------------|-------------------| -| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | -| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.infer | [click here](#1-use-qefficientcloudinfer) |
  • model_name : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional
  • hf_token : Optional
  • cache_dir : Optional ["cache_dir" in current working directory]
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +| QEfficient.cloud.execute | [click here](#2-use-of-qefficientcloudexecute) |
  • model_name : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • cache_dir : Optional ["cache_dir" in current working directory]
  • hf_token : Optional
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • | +**One argument, prompt or prompts_txt_file_path must be passed. ### 1. Use QEfficient.cloud.infer From 17096a3e513d8d4e4edbcb39821fec8a0a56c3a6 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:17:05 +0530 Subject: [PATCH 25/28] Update QEfficientGPT2.ipynb Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- notebooks/QEfficientGPT2.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index be7e3d44..668a3b47 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -162,11 +162,12 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv\n", + "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)\n" + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 107b4145b7716ba2265a552b285a1c4e7614aeef Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:18:51 +0530 Subject: [PATCH 26/28] Update QEfficientMPT.ipynb Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- notebooks/QEfficientMPT.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index d7e45d92..8533eedc 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -160,12 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv\n", + "from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size\n", "\n", "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" + "batch_size = get_compilation_batch_size(generated_qpc_path)" + "cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt=\"My name is\")" ] } ], From 0e567faa8fd8ac894a90d7c567622a1add9a914b Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 22 May 2024 23:23:15 +0530 Subject: [PATCH 27/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f3a9501..521e8688 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,9 @@ python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 3 | QEfficient.transform 
| [click here](#1-model-download-and-transform) |
  • model : $\color{green} {Mandatory}$
  • Type : Optional [Default- "Transformers"]
  • form_factor : Optional [Default-"cloud"]
  • | | qualcomm_efficient_converter | [click here](#2-onnx-export-of-transformed-model) |
  • model_name : $\color{green} {Mandatory}$
  • model_kv : $\color{green} {Mandatory}$ [Optional when model_class passed]
  • model_class : $\color{green} {Mandatory}$ [Optional when model_kv passed]
  • tokenizer : Optional
  • onnx_path : Optional
  • hf_token : Optional
  • seq_length : Optional [Default-128]
  • input_str : Optional [Default-"My name is"]
  • kv : Optional [Default-$\color{green} {True}$]
  • return_path : Optional [Default-False]
  • form_factor : Optional [Default-"cloud"]
  • save_fp32_onnx : Optional [Default-False]
  • save_fp16_onnx : Optional [Default-True]
  • *Both save_fp32_onnx and save_fp16_onnx can't be false*
  • | | compile | [click here](#3-compile-on-cloud-ai-100) |
  • onnx_path : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • num_cores : $\color{green} {Mandatory}$
  • device_group : $\color{green} {Mandatory}$
  • batch_size : Optional [Default-1]
  • prompt_len : Optional [Default-32]
  • ctx_len : Optional [Default-128]
  • mxfp6 : Optional [Default-True]
  • | -|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • prompt : Optional
  • prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | +|cloud_ai_100_exec_kv | [click here](#4-run-benchmark) |
  • batch_size : $\color{green} {Mandatory}$
  • tokenizer : $\color{green} {Mandatory}$
  • qpc_path : $\color{green} {Mandatory}$
  • **prompt : Optional
  • **prompts_txt_file_path : Optional
  • input_len : Optional [Default-None]
  • generation_len : Optional [Default-None]
  • device_id : Optional [Default-[0]]
  • enable_debug_logs : Optional [Default-False]
  • stream : Optional [Default-True]
  • write_io_dir : Optional
  • automation : Optional [Default-False]
  • | + +**One argument, prompt or prompts_txt_file_path must be passed. ### 1. Model download and transform @@ -273,10 +275,11 @@ generated_qpc_path = compile( Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/sec ```bash -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv, get_compilation_batch_size # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach +batch_size = get_compilation_batch_size(generated_qpc_path) cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. From ade2c135ae8f2b3c682753b557bf7aa8abcdf9c5 Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Thu, 23 May 2024 14:59:24 +0530 Subject: [PATCH 28/28] Update README.md Signed-off-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 521e8688..36604912 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach batch_size = get_compilation_batch_size(generated_qpc_path) -cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") +cloud_ai_100_exec_kv(batch_size=batch_size, tokenizer=tokenizer, qpc_path=generated_qpc_path, device_id=[0], prompt="My name is") ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out.
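Taken together, the series leaves the benchmark step looking roughly like the sketch below: the compilation batch size is read back from the QPC's specializations.json, the prompt inputs are validated against it, and both are passed to `cloud_ai_100_exec_kv`. Only the function names and argument order come from the patches above; the model card, tokenizer setup and QPC path are placeholders.

```python
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import (
    check_batch_size_and_num_prompts,
    cloud_ai_100_exec_kv,
    get_compilation_batch_size,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")  # placeholder model card
generated_qpc_path = "qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs"  # placeholder

# Batch size is recovered from specializations.json next to the compiled QPC
batch_size = get_compilation_batch_size(generated_qpc_path)

# Validates that exactly one prompt source is given and that the number of prompts
# matches the compiled batch size (a single prompt assumes a batch-size-1 QPC);
# returns the prompts as a list.
prompt = check_batch_size_and_num_prompts("My name is", None, batch_size)

cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=prompt,
)
```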