
Commit: Update infer and execute API to take prompts from txt file for BS>=1 (quic#11)

* [QEff]: Update infer and execute API to take prompts from txt file for bs>1

Signed-off-by: mamtsing <[email protected]>

* Update infer and execute API

Signed-off-by: mamtsing <[email protected]>

* Update infer and execute API

Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update infer, execute and text generation interface

Signed-off-by: mamtsing <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update text generation interface

Signed-off-by: mamtsing <[email protected]>

* Update Notebooks

Signed-off-by: quic-mamta <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: mamtsing <[email protected]>

* Update text_generation_inference.py

Signed-off-by: Mamta Singh <[email protected]>

* Update infer and execute and text generation interface

Signed-off-by: Mamta Singh <[email protected]>

* Update infer.py

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update infer.py

Signed-off-by: Mamta Singh <[email protected]>

* Update execute.py

Signed-off-by: Mamta Singh <[email protected]>

* Update files

Signed-off-by: Mamta Singh <[email protected]>

* Update files

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update QEfficientGPT2.ipynb

Signed-off-by: Mamta Singh <[email protected]>

* Update QEfficientMPT.ipynb

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

* Update README.md

Signed-off-by: Mamta Singh <[email protected]>

---------

Signed-off-by: mamtsing <[email protected]>
Signed-off-by: mamtsing <[email protected]>
Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: quic-mamta <[email protected]>
Signed-off-by: Mamta Singh <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>
Signed-off-by: quic-amitraj <[email protected]>
Signed-off-by: amitraj <[email protected]>
quic-mamta authored and quic-amitraj committed Jul 19, 2024
1 parent a7efd1e commit 44934c4
Showing 4 changed files with 75 additions and 3 deletions.
4 changes: 2 additions & 2 deletions QEfficient/cloud/execute.py
@@ -33,10 +33,10 @@ def main(
:param model_name: str. Hugging Face Model Card name, Example: [gpt2].
:prompt: str. Sample prompt for the model text generation.
:qpc_path: str. Path to the save generated binary file after compilation.
:devices: List[int]. Device Ids to be used for compilation. if devices > 1, it enable multiple card setup.
:cache_dir: str. Cache dir to store the downloaded huggingface files.
:hf_token: Huggingface token to access gated models.
:device_group: List[int]. Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
:prompts_txt_file_path: str. Path to txt file for multiple input prompts
"""

tokenizer = load_hf_tokenizer(model_name, cache_dir, hf_token)
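To make the prompts_txt_file_path flow above concrete, here is a minimal sketch, under the assumption of one prompt per line, of reading prompts from a txt file and checking them against the compiled batch size. The helper names read_prompts_txt_file and pick_prompts are hypothetical and only illustrate the behaviour the commit adds; they are not the repository's implementation.

from typing import List, Optional


def read_prompts_txt_file(prompts_txt_file_path: str) -> List[str]:
    # One prompt per line; blank lines are ignored.
    with open(prompts_txt_file_path, "r") as f:
        return [line.strip() for line in f if line.strip()]


def pick_prompts(prompt: Optional[str], prompts_txt_file_path: Optional[str], batch_size: int) -> List[str]:
    # At least one of prompt / prompts_txt_file_path must be supplied, and the
    # number of prompts should be compatible with the compiled batch size.
    assert prompt is not None or prompts_txt_file_path is not None, (
        "Pass at least one of --prompt or --prompts_txt_file_path"
    )
    prompts = read_prompts_txt_file(prompts_txt_file_path) if prompts_txt_file_path else [prompt]
    if len(prompts) < batch_size:
        raise ValueError(f"{len(prompts)} prompts given, but the QPC was compiled with batch_size={batch_size}")
    return prompts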
13 changes: 13 additions & 0 deletions QEfficient/generation/text_generation_inference.py
@@ -117,18 +117,31 @@ def latency_stats_bertstyle(
print(round((cur_len - init_len) / (end - start), 2), "tok/s")


def get_compilation_dims(qpc_path: str) -> Tuple[int, int]:
    qpc_base_path = os.path.dirname(os.path.normpath(qpc_path))
    specialization_file_path = os.path.join(qpc_base_path, "specializations.json")
    logger.info(f"specialization_file_path : {specialization_file_path}")
    with open(specialization_file_path, "r") as file:
        data = json.load(file)
    compilation_batch_size = int(data["specializations"][0]["batch_size"])
    compilation_ctx_len = int(data["specializations"][0]["ctx_len"])
    return compilation_batch_size, compilation_ctx_len


def check_batch_size_and_num_prompts(prompt, prompts_txt_file_path, batch_size) -> List[str]:
    assert (
        prompt is not None or prompts_txt_file_path is not None
    ), "Please pass atleast one argument either using --prompt or --prompts_txt_file_path"
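For reference, get_compilation_dims reads the batch size and context length back from the specializations.json written next to the compiled QPC directory. The sketch below shows the minimal file layout implied by the field accesses above; the paths and values are illustrative only.

import json
import os
import tempfile

qpc_base_path = tempfile.mkdtemp()               # stands in for the QPC parent directory
qpc_path = os.path.join(qpc_base_path, "qpcs")   # the path that would be handed to get_compilation_dims

example = {
    "specializations": [
        {"batch_size": "1", "ctx_len": "128"},   # only the fields read above; other fields omitted
    ]
}
with open(os.path.join(qpc_base_path, "specializations.json"), "w") as f:
    json.dump(example, f, indent=2)

# get_compilation_dims(qpc_path) would return (1, 128) for this file.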
34 changes: 34 additions & 0 deletions notebooks/QEfficientGPT2.ipynb
@@ -115,10 +115,44 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_dims\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"batch_size, ctx_len = get_compilation_dims(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(\n",
"    batch_size=batch_size,\n",
"    tokenizer=tokenizer,\n",
"    qpc_path=generated_qpc_path,\n",
"    device_id=[0],\n",
"    prompt=[\"My name is\"],\n",
"    ctx_len=ctx_len,\n",
")"
]
}
],
27 changes: 26 additions & 1 deletion notebooks/QEfficientMPT.ipynb
@@ -114,13 +114,38 @@
"metadata": {},
"outputs": [],
"source": [
"from QEfficient.generation.text_generation_inference import get_compilation_dims\n",
"\n",
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"batch_size, ctx_len = get_compilation_dims(generated_qpc_path)\n",
"QEfficient.cloud_ai_100_exec_kv(\n",
"    batch_size=batch_size,\n",
"    tokenizer=tokenizer,\n",
"    qpc_path=generated_qpc_path,\n",
"    device_id=[0],\n",
"    prompt=[\"My name is\"],\n",
"    ctx_len=ctx_len,\n",
")"
]
}
],
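Building on the notebook cells above, a multi-prompt run against a QPC compiled with batch_size > 1 would look roughly like the sketch below. The prompt strings are placeholders, and tokenizer, generated_qpc_path and the QEfficient import are assumed to exist exactly as in the notebook.

batch_size, ctx_len = get_compilation_dims(generated_qpc_path)
prompts = ["My name is", "The capital of France is"][:batch_size]  # placeholder prompts
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=[0],
    prompt=prompts,
    ctx_len=ctx_len,
)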
