Added SplitTensorsTransform & Auto-device-picker (quic#89)
* fix: Transform names

Signed-off-by: Ilango Rajagopal <[email protected]>

* OnnxTransforms need **kwargs

Signed-off-by: Ilango Rajagopal <[email protected]>

* Added `SplitTensorsTransform`

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix: `onnx_base_dir` should be passed as kwarg

Signed-off-by: Ilango Rajagopal <[email protected]>

* Auto-device-picker for QAICInferenceSession

Signed-off-by: Ilango Rajagopal <[email protected]>

* Make device_id optional

Signed-off-by: Ilango Rajagopal <[email protected]>

* Use auto-device-picker for tests

Signed-off-by: Ilango Rajagopal <[email protected]>

* Remove LoraAdapters placeholder

Signed-off-by: Ilango Rajagopal <[email protected]>

* Fix compile API to use None device_group

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix: get_qpc_dir when device_group=None

Signed-off-by: Ilango Rajagopal <[email protected]>

* Parallelizing pytests

Signed-off-by: Onkar Chougule <[email protected]>

* fixed parallel tests

Signed-off-by: Onkar Chougule <[email protected]>

* fixing parallel tests

Signed-off-by: Onkar Chougule <[email protected]>

* linter

Signed-off-by: Onkar Chougule <[email protected]>

* Update docstring for optional `device_group`

Signed-off-by: Ilango Rajagopal <[email protected]>

* Add docstrings to new transforms

Signed-off-by: Ilango Rajagopal <[email protected]>

* fixed parallelizing tests

Signed-off-by: Onkar Chougule <[email protected]>

* Parallelizing CLI tests too

Signed-off-by: Onkar Chougule <[email protected]>

* Extra attempt to reduce test time

Signed-off-by: Onkar Chougule <[email protected]>

* Fix docstring for compile function

Signed-off-by: Ilango Rajagopal <[email protected]>

* fixed junit xml files

Signed-off-by: Onkar Chougule <[email protected]>

* bugfix

Signed-off-by: Onkar Chougule <[email protected]>

* fix: typo on pytest-xdist

Signed-off-by: Ilango Rajagopal <[email protected]>

* Move junit_logging init option to pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

* Move standard pytest flags to pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

---------

Signed-off-by: Ilango Rajagopal <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>
Co-authored-by: Onkar Chougule <[email protected]>
irajagop and ochougul authored Aug 26, 2024
1 parent 771d983 commit 643bb2c
Showing 20 changed files with 229 additions and 220 deletions.
50 changes: 41 additions & 9 deletions QEfficient/base/onnx_transforms.py
@@ -20,25 +20,28 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[ModelProto, bool]:
+    def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]:
         """
         Override this class to apply a transformation.
 
         :param model: The model's ONNX graph to transform
-        :param onnx_base_dir: Directory where the model and external files are present
+        :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs.
 
         :returns: ONNX graph after applying the transform
         :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for ONNX transform")
 
 
-class FP16Clip(OnnxTransform):
+class FP16ClipTransform(OnnxTransform):
     """
     Clips the tensor values to be in FP16 range.
     """
 
     @classmethod
-    def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[ModelProto, bool]:
+    def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]:
+        """
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        """
         finfo = np.finfo(np.float16)
         fp16_max = finfo.max
         fp16_min = finfo.min
@@ -53,9 +56,38 @@ def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[
         return model, transformed
 
 
-class SplitWeights(OnnxTransform):
-    pass
-
-
-class LoraAdapters(OnnxTransform):
-    pass
+class SplitTensorsTransform(OnnxTransform):
+    """
+    Split external tensors file
+    """
+
+    @classmethod
+    def apply(
+        cls,
+        model: ModelProto,
+        *,
+        model_name: str,
+        onnx_base_dir: Optional[str] = None,
+        file_chunk_size: int = 10 * 2**30,  # 10 GiB
+        size_threshold: int = 1024,
+        **kwargs,
+    ) -> Tuple[ModelProto, bool]:
+        """
+        :param model_name: Used for naming external files, e.g. {model_name}_0.onnx.data
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        :param file_chunk_size: Chunk size to split external files into.
+        :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally.
+        """
+        file_num = 0
+        current_file_size = 0
+        transformed = False
+        external_data_helper.load_external_data_for_model(model, onnx_base_dir)
+        for tensor in external_data_helper._get_all_tensors(model):
+            if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
+                transformed = True
+                current_file_size += tsize
+                if current_file_size > file_chunk_size:
+                    file_num += 1
+                    current_file_size = tsize
+                external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
+        return model, transformed
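For context, a minimal usage sketch of the renamed and new transforms; the model path and name below are illustrative, not part of this PR:

```python
import onnx

from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform

# Hypothetical layout: models/gpt2/gpt2.onnx with external weight files alongside.
model = onnx.load("models/gpt2/gpt2.onnx", load_external_data=False)

# Clip out-of-range constants to the FP16 representable range.
model, clipped = FP16ClipTransform.apply(model, onnx_base_dir="models/gpt2")

# Re-shard external tensors into ~10 GiB files named gpt2_0.onnx.data, gpt2_1.onnx.data, ...
model, split = SplitTensorsTransform.apply(model, model_name="gpt2", onnx_base_dir="models/gpt2")

onnx.save(model, "models/gpt2/gpt2_transformed.onnx")
```

Because every transform now accepts `**kwargs` and ignores what it doesn't need, the same keyword set can be passed to each transform in a pipeline.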
5 changes: 2 additions & 3 deletions QEfficient/cloud/execute.py
@@ -16,7 +16,7 @@
 def main(
     model_name: str,
     qpc_path: str,
-    device_group: List[int],
+    device_group: Optional[List[int]] = None,
     local_model_dir: Optional[str] = None,
     prompt: Optional[str] = None,  # type: ignore
     prompts_txt_file_path: Optional[str] = None,
@@ -30,8 +30,8 @@ def main(
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
         :qpc_path (str): Path to the generated binary after compilation.
-        :device_group (List[int]): Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
     ``Optional`` Args:
+        :device_group (List[int]): Device Ids to be used for execution. If ``len(device_group) > 1``, multiple-card setup is enabled. ``Defaults to None.``
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
         :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
@@ -69,7 +69,6 @@ def main(
     parser.add_argument(
         "--device_group",
         "--device-group",
-        required=True,
         type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
         help="Cloud AI 100 device ids (comma-separated) e.g. [0]",
     )
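The `--device_group` flag keeps its inline parser; with `required=True` dropped, omitting the flag leaves `device_group` as `None` and defers to the auto-device-picker. A quick sketch of what the lambda accepts (illustrative values):

```python
parse_device_group = lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")]

assert parse_device_group("[0]") == [0]
assert parse_device_group("[0,1,2,3]") == [0, 1, 2, 3]
# Flag omitted entirely -> argparse stores None -> auto-device-picker downstream.
```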
5 changes: 2 additions & 3 deletions QEfficient/cloud/infer.py
@@ -20,7 +20,7 @@
 def main(
     model_name: str,
     num_cores: int,
-    device_group: List[int],
+    device_group: Optional[List[int]] = None,
     prompt: Optional[str] = None,  # type: ignore
     prompts_txt_file_path: Optional[str] = None,
     aic_enable_depth_first: bool = False,
@@ -39,8 +39,8 @@ def main(
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
         :num_cores (int): Number of cores to compile model on.
-        :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled.
     ``Optional`` Args:
+        :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple-card setup is enabled. ``Defaults to None.``
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
         :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
@@ -147,7 +147,6 @@ def main(
     parser.add_argument(
         "--device_group",
         "--device-group",
-        required=True,
         type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
         help="Cloud AI 100 device ids (comma-separated) e.g. [0,1]",
     )
8 changes: 4 additions & 4 deletions QEfficient/compile/compile_helper.py
@@ -40,7 +40,7 @@ def compile_kv_model_on_cloud_ai_100(
     custom_io_path: str,
     aic_enable_depth_first: bool,
     mos: int = -1,
-    device_group: List[int] = [0],
+    device_group: Optional[List[int]] = None,
     **kwargs,
 ) -> Tuple[bool, str]:
     if kwargs:
@@ -74,7 +74,7 @@ def compile_kv_model_on_cloud_ai_100(
         command.append(f"-mos={mos}")
     if aic_enable_depth_first:
         command.append("-aic-enable-depth-first")
-    if len(device_group) > 1:
+    if device_group is not None and len(device_group) > 1:
         mdp_ts_config = {
             "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}],
             "partitions": [
@@ -101,7 +101,7 @@ def compile(
     onnx_path: str,
     qpc_path: str,
     num_cores: int,
-    device_group: List[int],  # FIXME: use num_devices instead
+    device_group: Optional[List[int]] = None,  # FIXME: use num_devices instead
     aic_enable_depth_first: bool = False,
     mos: int = -1,
     batch_size: int = 1,
@@ -122,8 +122,8 @@
     ``Mandatory`` Args:
         :onnx_path (str): Generated ``ONNX`` Model Path.
         :qpc_path (str): Path for saving compiled qpc binaries.
         :num_cores (int): Number of cores to compile the model on.
-        :device_group (List[int]): Used for finding the number of devices to compile for.
     ``Optional`` Args:
+        :device_group (List[int]): Used for finding the number of devices to compile for. ``Defaults to None.``
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
         :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
         :batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
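With `device_group` optional end to end, a single-device compile can omit it. A hedged sketch (paths are illustrative, and the remaining keyword arguments keep their defaults per the visible signature):

```python
from QEfficient.compile.compile_helper import compile

# device_group=None (the new default) compiles for one device.
qpc_dir = compile(
    onnx_path="models/gpt2/gpt2.onnx",  # illustrative path
    qpc_path="qpc/gpt2",
    num_cores=14,
)

# Passing explicit ids still takes the multi-device (MDP) branch shown above.
qpc_dir = compile(
    onnx_path="models/gpt2/gpt2.onnx",
    qpc_path="qpc/gpt2_2dev",
    num_cores=14,
    device_group=[0, 1],
)
```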
4 changes: 2 additions & 2 deletions QEfficient/exporter/export_utils.py
@@ -17,7 +17,7 @@
 import torch
 from onnx import external_data_helper
 
-from QEfficient.base.onnx_transforms import FP16Clip
+from QEfficient.base.onnx_transforms import FP16ClipTransform
 
 
 def export_onnx(
@@ -215,7 +215,7 @@ def fix_onnx_fp16(
     model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
     # TODO: Remove this `fix_onnx_fp16` function and replace with this transform
     # as we're not utilizing the validations done in this function
-    model, fp16_fix = FP16Clip.apply(model, gen_models_path)
+    model, fp16_fix = FP16ClipTransform.apply(model, onnx_base_dir=gen_models_path)
 
     if fp16_fix:
         # Save FP16 model
22 changes: 13 additions & 9 deletions QEfficient/generation/cloud_infer.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Dict, List
+from typing import Dict, List, Optional
 from warnings import warn
 
 import numpy as np
@@ -44,7 +44,7 @@ class QAICInferenceSession:
     def __init__(
         self,
         qpc_path: str,
-        device_ids: List[int] = [0],
+        device_ids: Optional[List[int]] = None,
         activate: bool = True,
         enable_debug_logs: bool = False,
     ):
@@ -58,9 +58,13 @@ def __init__(
         :enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
         """
         # Load QPC
-        devices = qaicrt.QIDList(device_ids)
-        self.context = qaicrt.Context(devices)
-        self.queue = qaicrt.Queue(self.context, device_ids[0])  # Async API
+        if device_ids is not None:
+            devices = qaicrt.QIDList(device_ids)
+            self.context = qaicrt.Context(devices)
+            self.queue = qaicrt.Queue(self.context, device_ids[0])
+        else:
+            self.context = qaicrt.Context()
+            self.queue = qaicrt.Queue(self.context, 0)  # Async API
         if enable_debug_logs:
             assert (
                 self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) == qaicrt.QStatus.QS_SUCCESS
@@ -80,7 +84,7 @@ def __init__(
         # Create and load Program
         prog_properties = qaicrt.QAicProgramProperties()
         prog_properties.SubmitRetryTimeoutMs = 60_000
-        if len(device_ids) > 1:
+        if device_ids and len(device_ids) > 1:
             prog_properties.devMapping = ":".join(map(str, device_ids))
         self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
         assert self.program.load() == qaicrt.QStatus.QS_SUCCESS, "Failed to load program"
@@ -170,14 +174,14 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             for binding, (elemsize, shape), (_, passed_shape) in zip(
                 self.bindings, allowed_shape, self.buf_dims
             ):
-                if passed_shape[0] == 0:
+                if passed_shape == [0]:
                     if not binding.is_partial_buf_allowed:
                         warn(f"Partial buffer not allowed for: {binding.name}")
                     continue
                 error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
             error_message += "\n\nPassed shapes:\n"
             for binding, (elemsize, shape) in zip(self.bindings, self.buf_dims):
-                if shape[0] == 0:
+                if shape == [0]:
                     continue
                 error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
             raise ValueError(error_message)
@@ -188,7 +192,7 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
         outputs = {}
         for output_name in self.output_names:
             buffer_index = self.binding_index_map[output_name]
-            if self.buf_dims[buffer_index][1][0] == 0:
+            if self.qbuffers[buffer_index].size == 0:
                 continue
             outputs[output_name] = np.frombuffer(
                 bytes(output_qbuffers[buffer_index]),
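With `device_ids` now optional, a session can be opened without pinning a device up front. A hedged usage sketch (the QPC path and input dict are illustrative):

```python
import numpy as np

from QEfficient.generation.cloud_infer import QAICInferenceSession

# device_ids omitted -> the runtime context picks a free device automatically.
session = QAICInferenceSession("qpc/gpt2/qpcs")

# Explicit pinning and multi-device mapping still work as before:
# session = QAICInferenceSession("qpc/gpt2_2dev/qpcs", device_ids=[0, 1])

inputs = {"input_ids": np.zeros((1, 32), dtype=np.int64)}  # hypothetical binding set
outputs = session.run(inputs)
```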
8 changes: 4 additions & 4 deletions QEfficient/generation/text_generation_inference.py
@@ -99,7 +99,7 @@ def latency_stats_bertstyle(
     qpc_path: str,
     seq_len: int,
     prompt: str,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
 ):
     """
     Function to execute Bertstyle ONNX model on Cloud AI 100.
@@ -196,7 +196,7 @@ def cloud_ai_100_exec_kv_helper(
     prompt: List[str],
     ctx_len: int,
     generation_len: Optional[int] = None,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
     enable_debug_logs: bool = False,
     stream: bool = True,
     write_io_dir: Optional[str] = None,
@@ -342,7 +342,7 @@ def cloud_ai_100_exec_kv(
     qpc_path: str,
     prompt: Optional[str] = None,
     prompts_txt_file_path: Optional[str] = None,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
     generation_len: Optional[int] = None,
     enable_debug_logs: bool = False,
     stream: bool = True,
@@ -362,7 +362,7 @@
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None``.
         :prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``.
         :generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``.
-        :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``.
+        :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
         :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``.
         :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
         :write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
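A hedged end-user sketch of the new default in action; the leading tokenizer argument is assumed from surrounding code (it is elided from the visible hunk), and the paths are illustrative:

```python
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# device_id omitted -> None -> QAICInferenceSession's auto-device-picker chooses a device.
cloud_ai_100_exec_kv(
    tokenizer=tokenizer,       # assumed parameter, not shown in this diff
    qpc_path="qpc/gpt2/qpcs",  # illustrative
    prompt="My name is",
)
```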
2 changes: 1 addition & 1 deletion QEfficient/utils/_utils.py
@@ -157,7 +157,7 @@ def get_qpc_dir_path(
 ) -> str:
     qpc_base_dir_name = (
         f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_"
-        + f"{len(device_group)}"
+        + f"{len(device_group) if device_group is not None else 1}"
         + "devices"
         + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16")
     )
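To make the fallback concrete, a small self-contained reproduction of the naming expression with hypothetical settings:

```python
def qpc_base_dir_name(num_cores, batch_size, prompt_len, ctx_len, mos, device_group, mxfp6, mxint8):
    # Mirrors the expression in get_qpc_dir_path, for illustration only.
    return (
        f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_"
        + f"{len(device_group) if device_group is not None else 1}"
        + "devices"
        + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16")
    )

# device_group=None now counts as one device instead of raising a TypeError:
assert qpc_base_dir_name(14, 1, 32, 128, -1, None, True, False) == "qpc_14cores_1BS_32PL_128CL_-1MOS_1devices_mxfp6"
assert qpc_base_dir_name(14, 1, 32, 128, -1, [0, 1], False, False) == "qpc_14cores_1BS_32PL_128CL_-1MOS_2devices_fp16"
```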
2 changes: 1 addition & 1 deletion QEfficient/utils/run_utils.py
@@ -173,7 +173,7 @@ def run_kv_model_on_ort(self, model_path):
         print("Completion:", repr(predicted_string))
         return generated_ids
 
-    def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group):
+    def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group=None):
         """
         Function responsible for running ``ONNX`` model on Cloud AI 100 and return the output tokens
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -56,4 +56,6 @@ line-length = 120
 lint.extend-select = ["I"]
 
 [tool.pytest.ini_options]
+addopts = "-W ignore -s -v"
+junit_logging = "all"
 doctest_optionflags = "NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
5 changes: 4 additions & 1 deletion scripts/Jenkinsfile
@@ -20,6 +20,7 @@ pipeline
                 . preflight_qeff/bin/activate
                 pip install --upgrade pip setuptools
                 pip install .[test]
+                pip install junitparser pytest-xdist
                 rm -rf QEfficient
                 '''
             }
@@ -35,7 +36,9 @@
                 sh '''
                 . preflight_qeff/bin/activate
                 export TOKENIZERS_PARALLELISM=false
-                pytest -W ignore -s -v tests -o junit_logging=all --junitxml=tests/tests_log.xml
+                pytest tests --ignore tests/cloud -n 4 --junitxml=tests/tests_log1.xml
+                pytest tests/cloud --junitxml=tests/tests_log2.xml
+                junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log.xml
                 deactivate
                 exit
                 '''