Added SplitTensorsTransform & Auto-device-picker (quic#89)
* fix: Transform names

Signed-off-by: Ilango Rajagopal <[email protected]>

* OnnxTransforms need **kwargs

Signed-off-by: Ilango Rajagopal <[email protected]>

* Added `SplitTensorsTransform`

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix: `onnx_base_dir` should be passed as kwarg

Signed-off-by: Ilango Rajagopal <[email protected]>

* Auto-device-picker for QAICInferenceSession

Signed-off-by: Ilango Rajagopal <[email protected]>

* Make device_id optional

Signed-off-by: Ilango Rajagopal <[email protected]>

* Use auto-device-picker for tests

Signed-off-by: Ilango Rajagopal <[email protected]>

* Remove LoraAdapters placeholder

Signed-off-by: Ilango Rajagopal <[email protected]>

* Fix compile API to use None device_group

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix: get_qpc_dir when device_group=None

Signed-off-by: Ilango Rajagopal <[email protected]>

* Parallelizing pytests

Signed-off-by: Onkar Chougule <[email protected]>

* fixed parallel tests

Signed-off-by: Onkar Chougule <[email protected]>

* fixing parallel tests

Signed-off-by: Onkar Chougule <[email protected]>

* linter

Signed-off-by: Onkar Chougule <[email protected]>

* Update docstring for optional `device_group`

Signed-off-by: Ilango Rajagopal <[email protected]>

* Add docstrings to new transforms

Signed-off-by: Ilango Rajagopal <[email protected]>

* fixed parallelizing tests

Signed-off-by: Onkar Chougule <[email protected]>

* Parallelizing CLI tests too

Signed-off-by: Onkar Chougule <[email protected]>

* Extra attempt to reduce test time

Signed-off-by: Onkar Chougule <[email protected]>

* Fix docstring for compile function

Signed-off-by: Ilango Rajagopal <[email protected]>

* fixed junit xml files

Signed-off-by: Onkar Chougule <[email protected]>

* bugfix

Signed-off-by: Onkar Chougule <[email protected]>

* fix: typo on pytest-xdist

Signed-off-by: Ilango Rajagopal <[email protected]>

* Move junit_logging init option to pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

* Move standard pytest flags to pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

* fix pyproject.toml

Signed-off-by: Ilango Rajagopal <[email protected]>

---------

Signed-off-by: Ilango Rajagopal <[email protected]>
Signed-off-by: Onkar Chougule <[email protected]>
Co-authored-by: Onkar Chougule <[email protected]>
irajagop and ochougul authored Aug 26, 2024
1 parent 771d983 commit 643bb2c
Showing 20 changed files with 229 additions and 220 deletions.
50 changes: 41 additions & 9 deletions QEfficient/base/onnx_transforms.py
@@ -20,25 +20,28 @@ def __init__(self):
         raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.")
 
     @classmethod
-    def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[ModelProto, bool]:
+    def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]:
         """
         Override this class to apply a transformation.
 
         :param model: The model's ONNX graph to transform
-        :param onnx_base_dir: Directory where the model and external files are present
+        :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs.
 
         :returns: ONNX graph after applying the transform
         :returns: Boolean indicating whether transform was applied
         """
         raise NotImplementedError("Use subclasses for ONNX transform")
 
 
-class FP16Clip(OnnxTransform):
+class FP16ClipTransform(OnnxTransform):
     """
     Clips the tensor values to be in FP16 range.
     """
 
     @classmethod
-    def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[ModelProto, bool]:
+    def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]:
+        """
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        """
         finfo = np.finfo(np.float16)
         fp16_max = finfo.max
         fp16_min = finfo.min
@@ -53,9 +56,38 @@ def apply(cls, model: ModelProto, onnx_base_dir: Optional[str] = None) -> Tuple[
         return model, transformed
 
 
-class SplitWeights(OnnxTransform):
-    pass
-
-
-class LoraAdapters(OnnxTransform):
-    pass
+class SplitTensorsTransform(OnnxTransform):
+    """
+    Split external tensors file
+    """
+
+    @classmethod
+    def apply(
+        cls,
+        model: ModelProto,
+        *,
+        model_name: str,
+        onnx_base_dir: Optional[str] = None,
+        file_chunk_size: int = 10 * 2**30,  # 10 GiB
+        size_threshold: int = 1024,
+        **kwargs,
+    ) -> Tuple[ModelProto, bool]:
+        """
+        :param model_name: Used for naming external files, e.g. {model_name}_0.onnx.data
+        :param onnx_base_dir: Base directory to load tensors (if not already loaded).
+        :param file_chunk_size: Chunk size to split external files into.
+        :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally.
+        """
+        file_num = 0
+        current_file_size = 0
+        transformed = False
+        external_data_helper.load_external_data_for_model(model, onnx_base_dir)
+        for tensor in external_data_helper._get_all_tensors(model):
+            if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold):
+                transformed = True
+                current_file_size += tsize
+                if current_file_size > file_chunk_size:
+                    file_num += 1
+                    current_file_size = tsize
+                external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data")
+        return model, transformed
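For context, a minimal usage sketch of the renamed and new transforms; the model path and name below are illustrative, not part of this PR:

```python
import onnx

from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform

# Hypothetical layout: models/gpt2/gpt2.onnx with external weight files alongside.
model = onnx.load("models/gpt2/gpt2.onnx", load_external_data=False)

# Clip out-of-range constants to the FP16 representable range.
model, clipped = FP16ClipTransform.apply(model, onnx_base_dir="models/gpt2")

# Re-shard external tensors into ~10 GiB files named gpt2_0.onnx.data, gpt2_1.onnx.data, ...
model, split = SplitTensorsTransform.apply(model, model_name="gpt2", onnx_base_dir="models/gpt2")

onnx.save(model, "models/gpt2/gpt2_transformed.onnx")
```

Because every transform now accepts `**kwargs` and ignores what it doesn't need, the same keyword set can be passed to each transform in a pipeline.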
5 changes: 2 additions & 3 deletions QEfficient/cloud/execute.py
@@ -16,7 +16,7 @@
 def main(
     model_name: str,
     qpc_path: str,
-    device_group: List[int],
+    device_group: Optional[List[int]] = None,
     local_model_dir: Optional[str] = None,
     prompt: Optional[str] = None,  # type: ignore
     prompts_txt_file_path: Optional[str] = None,
@@ -30,8 +30,8 @@ def main(
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
         :qpc_path (str): Path to the generated binary after compilation.
-        :device_group (List[int]): Device Ids to be used for compilation. if len(device_group) > 1. Multiple Card setup is enabled.
     ``Optional`` Args:
+        :device_group (List[int]): Device Ids to be used for execution. If ``len(device_group) > 1``, multiple-card setup is enabled. ``Defaults to None.``
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
         :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
@@ -69,7 +69,6 @@ def main(
     parser.add_argument(
         "--device_group",
         "--device-group",
-        required=True,
         type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
         help="Cloud AI 100 device ids (comma-separated) e.g. [0]",
     )
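The `--device_group` flag keeps its inline parser; with `required=True` dropped, omitting the flag leaves `device_group` as `None` and defers to the auto-device-picker. A quick sketch of what the lambda accepts (illustrative values):

```python
parse_device_group = lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")]

assert parse_device_group("[0]") == [0]
assert parse_device_group("[0,1,2,3]") == [0, 1, 2, 3]
# Flag omitted entirely -> argparse stores None -> auto-device-picker downstream.
```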
5 changes: 2 additions & 3 deletions QEfficient/cloud/infer.py
@@ -20,7 +20,7 @@
 def main(
     model_name: str,
     num_cores: int,
-    device_group: List[int],
+    device_group: Optional[List[int]] = None,
     prompt: Optional[str] = None,  # type: ignore
     prompts_txt_file_path: Optional[str] = None,
     aic_enable_depth_first: bool = False,
@@ -39,8 +39,8 @@ def main(
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
         :num_cores (int): Number of cores to compile model on.
-        :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled.
     ``Optional`` Args:
+        :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple-card setup is enabled. ``Defaults to None.``
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
         :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
@@ -147,7 +147,6 @@ def main(
     parser.add_argument(
         "--device_group",
         "--device-group",
-        required=True,
         type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")],
         help="Cloud AI 100 device ids (comma-separated) e.g. [0,1]",
     )
8 changes: 4 additions & 4 deletions QEfficient/compile/compile_helper.py
@@ -40,7 +40,7 @@ def compile_kv_model_on_cloud_ai_100(
     custom_io_path: str,
     aic_enable_depth_first: bool,
     mos: int = -1,
-    device_group: List[int] = [0],
+    device_group: Optional[List[int]] = None,
     **kwargs,
 ) -> Tuple[bool, str]:
     if kwargs:
@@ -74,7 +74,7 @@ def compile_kv_model_on_cloud_ai_100(
         command.append(f"-mos={mos}")
     if aic_enable_depth_first:
         command.append("-aic-enable-depth-first")
-    if len(device_group) > 1:
+    if device_group is not None and len(device_group) > 1:
         mdp_ts_config = {
             "connections": [{"devices": list(range(len(device_group))), "type": "p2p"}],
             "partitions": [
@@ -101,7 +101,7 @@ def compile(
     onnx_path: str,
     qpc_path: str,
     num_cores: int,
-    device_group: List[int],  # FIXME: use num_devices instead
+    device_group: Optional[List[int]] = None,  # FIXME: use num_devices instead
     aic_enable_depth_first: bool = False,
     mos: int = -1,
     batch_size: int = 1,
@@ -122,8 +122,8 @@
     ``Mandatory`` Args:
         :onnx_path (str): Generated ``ONNX`` Model Path.
         :qpc_path (str): Path for saving compiled qpc binaries.
         :num_cores (int): Number of cores to compile the model on.
-        :device_group (List[int]): Used for finding the number of devices to compile for.
     ``Optional`` Args:
+        :device_group (List[int]): Used for finding the number of devices to compile for. ``Defaults to None.``
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
         :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
         :batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
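With `device_group` optional end to end, a single-device compile can omit it. A hedged sketch (paths are illustrative, and the remaining keyword arguments keep their defaults per the visible signature):

```python
from QEfficient.compile.compile_helper import compile

# device_group=None (the new default) compiles for one device.
qpc_dir = compile(
    onnx_path="models/gpt2/gpt2.onnx",  # illustrative path
    qpc_path="qpc/gpt2",
    num_cores=14,
)

# Passing explicit ids still takes the multi-device (MDP) branch shown above.
qpc_dir = compile(
    onnx_path="models/gpt2/gpt2.onnx",
    qpc_path="qpc/gpt2_2dev",
    num_cores=14,
    device_group=[0, 1],
)
```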
4 changes: 2 additions & 2 deletions QEfficient/exporter/export_utils.py
@@ -17,7 +17,7 @@
 import torch
 from onnx import external_data_helper
 
-from QEfficient.base.onnx_transforms import FP16Clip
+from QEfficient.base.onnx_transforms import FP16ClipTransform
 
 
 def export_onnx(
@@ -215,7 +215,7 @@ def fix_onnx_fp16(
     model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
     # TODO: Remove this `fix_onnx_fp16` function and replace with this transform
     # as we're not utilizing the validations done in this function
-    model, fp16_fix = FP16Clip.apply(model, gen_models_path)
+    model, fp16_fix = FP16ClipTransform.apply(model, onnx_base_dir=gen_models_path)
 
     if fp16_fix:
         # Save FP16 model
22 changes: 13 additions & 9 deletions QEfficient/generation/cloud_infer.py
@@ -5,7 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
-from typing import Dict, List
+from typing import Dict, List, Optional
 from warnings import warn
 
 import numpy as np
@@ -44,7 +44,7 @@ class QAICInferenceSession:
     def __init__(
         self,
         qpc_path: str,
-        device_ids: List[int] = [0],
+        device_ids: Optional[List[int]] = None,
         activate: bool = True,
         enable_debug_logs: bool = False,
     ):
@@ -58,9 +58,13 @@ def __init__(
         :enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
         """
         # Load QPC
-        devices = qaicrt.QIDList(device_ids)
-        self.context = qaicrt.Context(devices)
-        self.queue = qaicrt.Queue(self.context, device_ids[0])  # Async API
+        if device_ids is not None:
+            devices = qaicrt.QIDList(device_ids)
+            self.context = qaicrt.Context(devices)
+            self.queue = qaicrt.Queue(self.context, device_ids[0])
+        else:
+            self.context = qaicrt.Context()
+            self.queue = qaicrt.Queue(self.context, 0)  # Async API
         if enable_debug_logs:
             assert (
                 self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) == qaicrt.QStatus.QS_SUCCESS
@@ -80,7 +84,7 @@ def __init__(
         # Create and load Program
         prog_properties = qaicrt.QAicProgramProperties()
         prog_properties.SubmitRetryTimeoutMs = 60_000
-        if len(device_ids) > 1:
+        if device_ids and len(device_ids) > 1:
             prog_properties.devMapping = ":".join(map(str, device_ids))
         self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
         assert self.program.load() == qaicrt.QStatus.QS_SUCCESS, "Failed to load program"
@@ -170,14 +174,14 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             for binding, (elemsize, shape), (_, passed_shape) in zip(
                 self.bindings, allowed_shape, self.buf_dims
             ):
-                if passed_shape[0] == 0:
+                if passed_shape == [0]:
                     if not binding.is_partial_buf_allowed:
                         warn(f"Partial buffer not allowed for: {binding.name}")
                     continue
                 error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
             error_message += "\n\nPassed shapes:\n"
             for binding, (elemsize, shape) in zip(self.bindings, self.buf_dims):
-                if shape[0] == 0:
+                if shape == [0]:
                     continue
                 error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
             raise ValueError(error_message)
@@ -188,7 +192,7 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
         outputs = {}
         for output_name in self.output_names:
             buffer_index = self.binding_index_map[output_name]
-            if self.buf_dims[buffer_index][1][0] == 0:
+            if self.qbuffers[buffer_index].size == 0:
                 continue
             outputs[output_name] = np.frombuffer(
                 bytes(output_qbuffers[buffer_index]),
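With `device_ids` now optional, a session can be opened without pinning a device up front. A hedged usage sketch (the QPC path and input dict are illustrative):

```python
import numpy as np

from QEfficient.generation.cloud_infer import QAICInferenceSession

# device_ids omitted -> the runtime context picks a free device automatically.
session = QAICInferenceSession("qpc/gpt2/qpcs")

# Explicit pinning and multi-device mapping still work as before:
# session = QAICInferenceSession("qpc/gpt2_2dev/qpcs", device_ids=[0, 1])

inputs = {"input_ids": np.zeros((1, 32), dtype=np.int64)}  # hypothetical binding set
outputs = session.run(inputs)
```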
8 changes: 4 additions & 4 deletions QEfficient/generation/text_generation_inference.py
@@ -99,7 +99,7 @@ def latency_stats_bertstyle(
     qpc_path: str,
     seq_len: int,
     prompt: str,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
 ):
     """
     Function to execute Bertstyle ONNX model on Cloud AI 100.
@@ -196,7 +196,7 @@ def cloud_ai_100_exec_kv_helper(
     prompt: List[str],
     ctx_len: int,
     generation_len: Optional[int] = None,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
     enable_debug_logs: bool = False,
     stream: bool = True,
     write_io_dir: Optional[str] = None,
@@ -342,7 +342,7 @@ def cloud_ai_100_exec_kv(
     qpc_path: str,
     prompt: Optional[str] = None,
     prompts_txt_file_path: Optional[str] = None,
-    device_id: List[int] = [0],
+    device_id: Optional[List[int]] = None,
     generation_len: Optional[int] = None,
     enable_debug_logs: bool = False,
     stream: bool = True,
@@ -362,7 +362,7 @@
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None``.
         :prompts_txt_file_path (str): Path of the prompt text file. ``Defaults to None``.
         :generation_len (int): Maximum context length for the model during compilation. ``Defaults to None``.
-        :device_id (List[int]): Device IDs to be used for compilation. If ``len(device_id) > 1``, it enables multiple card setup. ``Defaults to [0]``.
+        :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``.
         :enable_debug_logs (bool): If True, it enables debugging logs. ``Defaults to False``.
         :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
         :write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
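A hedged end-user sketch of the new default in action; the leading tokenizer argument is assumed from surrounding code (it is elided from the visible hunk), and the paths are illustrative:

```python
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# device_id omitted -> None -> QAICInferenceSession's auto-device-picker chooses a device.
cloud_ai_100_exec_kv(
    tokenizer=tokenizer,       # assumed parameter, not shown in this diff
    qpc_path="qpc/gpt2/qpcs",  # illustrative
    prompt="My name is",
)
```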
2 changes: 1 addition & 1 deletion QEfficient/utils/_utils.py
@@ -157,7 +157,7 @@ def get_qpc_dir_path(
 ) -> str:
     qpc_base_dir_name = (
         f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_"
-        + f"{len(device_group)}"
+        + f"{len(device_group) if device_group is not None else 1}"
         + "devices"
         + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16")
     )
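To make the fallback concrete, a small self-contained reproduction of the naming expression with hypothetical settings:

```python
def qpc_base_dir_name(num_cores, batch_size, prompt_len, ctx_len, mos, device_group, mxfp6, mxint8):
    # Mirrors the expression in get_qpc_dir_path, for illustration only.
    return (
        f"qpc_{num_cores}cores_{batch_size}BS_{prompt_len}PL_{ctx_len}CL_{mos}MOS_"
        + f"{len(device_group) if device_group is not None else 1}"
        + "devices"
        + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16")
    )

# device_group=None now counts as one device instead of raising a TypeError:
assert qpc_base_dir_name(14, 1, 32, 128, -1, None, True, False) == "qpc_14cores_1BS_32PL_128CL_-1MOS_1devices_mxfp6"
assert qpc_base_dir_name(14, 1, 32, 128, -1, [0, 1], False, False) == "qpc_14cores_1BS_32PL_128CL_-1MOS_2devices_fp16"
```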
2 changes: 1 addition & 1 deletion QEfficient/utils/run_utils.py
@@ -173,7 +173,7 @@ def run_kv_model_on_ort(self, model_path):
         print("Completion:", repr(predicted_string))
         return generated_ids
 
-    def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group):
+    def run_kv_model_on_cloud_ai_100(self, qpc_path, device_group=None):
         """
         Function responsible for running ``ONNX`` model on Cloud AI 100 and return the output tokens
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -56,4 +56,6 @@ line-length = 120
 lint.extend-select = ["I"]
 
 [tool.pytest.ini_options]
+addopts = "-W ignore -s -v"
+junit_logging = "all"
 doctest_optionflags = "NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
5 changes: 4 additions & 1 deletion scripts/Jenkinsfile
@@ -20,6 +20,7 @@ pipeline
                 . preflight_qeff/bin/activate
                 pip install --upgrade pip setuptools
                 pip install .[test]
+                pip install junitparser pytest-xdist
                 rm -rf QEfficient
                 '''
             }
@@ -35,7 +36,9 @@
                 sh '''
                 . preflight_qeff/bin/activate
                 export TOKENIZERS_PARALLELISM=false
-                pytest -W ignore -s -v tests -o junit_logging=all --junitxml=tests/tests_log.xml
+                pytest tests --ignore tests/cloud -n 4 --junitxml=tests/tests_log1.xml
+                pytest tests/cloud --junitxml=tests/tests_log2.xml
+                junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log.xml
                 deactivate
                 exit
                 '''