
Commit: Update infer and execute API to take prompts from txt file for BS>=1 (quic#11)

* [QEff]: Update infer and execute API to take prompts from txt file for bs>1

Signed-off-by: mamtsing <[email protected]>

* Update infer and execute API

Signed-off-by: mamtsing <[email protected]>

* Update infer and execute API

Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update infer, execute and text generation interface

Signed-off-by: mamtsing <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update text generation interface

Signed-off-by: mamtsing <[email protected]>

* Update Notebooks

Signed-off-by: quic-mamta <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update text_generation_inference.py

Signed-off-by: Mamta Singh <[email protected]>

* Update infer and execute and text generation interface

Signed-off-by: Mamta Singh <[email protected]>

* Update infer.py

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update infer.py

Signed-off-by: Mamta Singh <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update files

Signed-off-by: Mamta Singh <[email protected]>

* Update files

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update QEfficientGPT2.ipynb

Signed-off-by: Mamta Singh <[email protected]>

* Update QEfficientMPT.ipynb

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

---------

Signed-off-by: mamtsing <[email protected]>
Signed-off-by: mamtsing <[email protected]>
Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: quic-mamta <[email protected]>
Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>
Signed-off-by: quic-amitraj <[email protected]>
Signed-off-by: amitraj <[email protected]>
quic-mamta authored and quic-amitraj committed Jul 19, 2024
1 parent a7efd1e commit 44934c4
Showing 4 changed files with 75 additions and 3 deletions.
4 changes: 2 additions & 2 deletions QEfficient/cloud/execute.py
@@ -33,10 +33,10 @@ def main(
:param model_name: str. Hugging Face Model Card name, Example: [gpt2].
:prompt: str. Sample prompt for the model text generation.
:qpc_path: str. Path to the save generated binary file after compilation.
:devices: List[int]. Device Ids to be used for compilation. if devices > 1, it enable multiple card setup.
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: Huggingface token to access gated models.
:device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)
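To make the prompts_txt_file_path flow above concrete, here is a minimal sketch, under the assumption of one prompt per line, of reading prompts from a txt file and checking them against the compiled batch size. The helper names read_prompts_txt_file and pick_prompts are hypothetical and only illustrate the behaviour the commit adds; they are not the repository's implementation.

from typing import List, Optional


def read_prompts_txt_file(prompts_txt_file_path: str) -> List[str]:
    # One prompt per line; blank lines are ignored.
    with open(prompts_txt_file_path, "r") as f:
        return [line.strip() for line in f if line.strip()]


def pick_prompts(prompt: Optional[str], prompts_txt_file_path: Optional[str], batch_size: int) -> List[str]:
    # At least one of prompt / prompts_txt_file_path must be supplied, and the
    # number of prompts should be compatible with the compiled batch size.
    assert prompt is not None or prompts_txt_file_path is not None, (
        "Pass at least one of --prompt or --prompts_txt_file_path"
    )
    prompts = read_prompts_txt_file(prompts_txt_file_path) if prompts_txt_file_path else [prompt]
    if len(prompts) < batch_size:
        raise ValueError(f"{len(prompts)} prompts given, but the QPC was compiled with batch_size={batch_size}")
    return prompts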
13 changes: 13 additions & 0 deletions QEfficient/generation/text_generation_inference.py
@@ -117,18 +117,31 @@ def latency_stats_bertstyle(
print(round((cur_len - init_len) / (end - start), 2), "tok/s")


def get_compilation_dims(qpc_path: str) -> Tuple[int, int]:
    qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
    specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
    logger.info(f"specialization_file_path : {specialization_file_path}")
    with open(specialization_file_path, "r") as file:
        data = json.load(file)
    compilation_batch_size = int(data["specializations"][0]["batch_size"])
    compilation_ctx_len = int(data["specializations"][0]["ctx_len"])
    return compilation_batch_size, compilation_ctx_len


def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]:
    assert (
        prompt is not None or prompts_txt_file_path is not None
    ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path"
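For reference, get_compilation_dims reads the batch size and context length back from the specializations.json written next to the compiled QPC directory. The sketch below shows the minimal file layout implied by the field accesses above; the paths and values are illustrative only.

import json
import os
import tempfile

qpc_base_path = tempfile.mkdtemp()               # stands in for the QPC parent directory
qpc_path = os.path.join(qpc_base_path, "qpcs")   # the path that would be handed to get_compilation_dims

example = {
    "specializations": [
        {"batch_size": "1", "ctx_len": "128"},   # only the fields read above; other fields omitted
    ]
}
with open(os.path.join(qpc_base_path, "specializations.json"), "w") as f:
    json.dump(example, f, indent=2)

# get_compilation_dims(qpc_path) would return (1, 128) for this file.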
34 changes: 34 additions & 0 deletions notebooks/QEfficientGPT2.ipynb
@@ -115,10 +115,44 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_dims\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"batch_size, ctx_len = get_compilation_dims(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(\n",
"    batch_size=batch_size,\n",
"    tokenizer=tokenizer,\n",
"    qpc_path=generated_qpc_path,\n",
"    device_id=[0],\n",
"    prompt=[\"My name is\"],\n",
"    ctx_len=ctx_len,\n",
")"
]
}
],
27 changes: 26 additions & 1 deletion notebooks/QEfficientMPT.ipynb
@@ -114,13 +114,38 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_dims\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"batch_size, ctx_len = get_compilation_dims(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(\n",
"    batch_size=batch_size,\n",
"    tokenizer=tokenizer,\n",
"    qpc_path=generated_qpc_path,\n",
"    device_id=[0],\n",
"    prompt=[\"My name is\"],\n",
"    ctx_len=ctx_len,\n",
")"
]
}
],
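Building on the notebook cells above, a multi-prompt run against a QPC compiled with batch_size > 1 would look roughly like the sketch below. The prompt strings are placeholders, and tokenizer, generated_qpc_path and the QEfficient import are assumed to exist exactly as in the notebook.

batch_size, ctx_len = get_compilation_dims(generated_qpc_path)
prompts = ["My name is", "The capital of France is"][:batch_size]  # placeholder prompts
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=prompts,
    ctx_len=ctx_len,
)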
