diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index a2eaa5fb5662..5d11241c1a34 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -Relay Arm :sup:`®` Compute Library Integration +Relay Arm:sup:`®` Compute Library Integration ============================================== **Author**: `Luke Hutton `_ @@ -195,12 +195,14 @@ Operator support | | Simple: nn.conv2d | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? | | | | -| | (only groups = 1 supported) | +| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | +| | or 2x2) convolution supported. Grouped convolution is not supported. | +----------------------+-------------------------------------------------------------------------+ | qnn.conv2d | uint8: | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | | | | -| | (only groups = 1 supported) | +| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | +| | or 2x2) convolution supported. Grouped convolution is not supported. | +----------------------+-------------------------------------------------------------------------+ | nn.dense | fp32: | | | Simple: nn.dense | diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index a78ad294b770..8a03cb173612 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -19,12 +19,15 @@ import numpy as np import tvm +from tvm._ffi import register_func from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name +from tvm.relay.testing.temp_op_attr import TempOpAttr from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr from .register import register_pattern_table +from ..strategy.generic import is_depthwise_conv2d def is_arm_compute_runtime_enabled(): @@ -71,6 +74,61 @@ def partition_for_arm_compute_lib(mod, params=None): return seq(mod) +@register_func("relay.ext.arm_compute_lib.optimize") +def preprocess_module(mod): + """ + Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI + kernel layout and fold the transforms away. + + Parameters + ---------- + mod : Module + The module to run passes on. + + Returns + ------- + preprocessed_mod : The processed module. 
+ """ + + def convert_layout_conv2d(conv2d_function): + def convert_conv(attrs, inputs, tinfos, desired_layouts): + new_attrs = dict(attrs) + data_info = tinfos[0] + weight_info = tinfos[1] + desired_data_layout, desired_kernel_layout = map(str, desired_layouts) + new_attrs["data_layout"] = desired_data_layout + new_attrs["kernel_layout"] = desired_kernel_layout + + if is_depthwise_conv2d( + data_info.shape, + attrs["data_layout"], + weight_info.shape, + attrs["kernel_layout"], + attrs["groups"], + ): + dkl = desired_kernel_layout + new_attrs["kernel_layout"] = dkl[3] + dkl[1:3] + dkl[0] + return conv2d_function(*inputs, **new_attrs) + + return convert_conv + + with TempOpAttr( + "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d) + ), TempOpAttr( + "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d) + ): + seq = tvm.transform.Sequential( + [ + transform.ConvertLayout( + {"nn.conv2d": ["NHWC", "OHWI"], "qnn.conv2d": ["NHWC", "OHWI"]} + ), + transform.FoldConstant(), + ] + ) + preprocessed_mod = seq(mod) + return preprocessed_mod + + @register_pattern_table("arm_compute_lib") def arm_compute_lib_pattern_table(): """Get the ACL pattern table.""" @@ -236,8 +294,6 @@ def _func_wrapper(expr): def conv2d(expr): """Check if the external ACL codegen for conv2d should be used.""" attrs, args = expr.attrs, expr.args - if attrs.groups != 1: - return False if attrs.data_layout != "NHWC": return False if attrs.out_dtype != "float32" and attrs.out_dtype != "": @@ -248,14 +304,25 @@ def conv2d(expr): kernel_typ = args[1].checked_type if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": return False + is_depthwise = is_depthwise_conv2d( + data_typ.shape, + attrs["data_layout"], + kernel_typ.shape, + attrs["kernel_layout"], + attrs["groups"], + ) + if is_depthwise: + return depthwise_conv2d(attrs, args) + # ACL doesn't support grouped convolution + if attrs.groups != 1 and not is_depthwise: + return False return True def qnn_conv2d(expr): """Check if the external ACL codegen for qnn.conv2d should be used.""" attrs, args = expr.attrs, expr.args - if attrs.groups != 1: - return False + if attrs.data_layout != "NHWC": return False if attrs.out_dtype != "int32" and attrs.out_dtype != "": @@ -266,6 +333,40 @@ def qnn_conv2d(expr): kernel_typ = args[1].checked_type if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8": return False + is_depthwise = is_depthwise_conv2d( + data_typ.shape, + attrs["data_layout"], + kernel_typ.shape, + attrs["kernel_layout"], + attrs["groups"], + ) + if is_depthwise: + return depthwise_conv2d(attrs, args) + # ACL doesn't support grouped convolution + if attrs.groups != 1 and not is_depthwise: + return False + return True + + +def depthwise_conv2d(attrs, args): + """Check if the external ACL codegen for depthwise convolution should be used. + + Note + ---- + Relay does not have a depthwise conv2d operator whilst ACL does. We simply + separate the checks for depthwise for clarity. 
+ """ + kernel_typ = args[1].checked_type + # Only supports 3x3, 5x5 depthwise + if ( + kernel_typ.shape[0] not in [3, 5] + or kernel_typ.shape[1] not in [3, 5] + or kernel_typ.shape[0] != kernel_typ.shape[1] + ): + return False + # Stride must be (1, 1) or (2, 2) + if (attrs.strides[0], attrs.strides[1]) not in [(1, 1), (2, 2)]: + return False return True diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 0b81cb9c7ec6..f0c79bed1218 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -22,9 +22,9 @@ import tvm from tvm import te -import tvm.relay as relay -import tvm.relay.op as op -from tvm.relay import Prelude +from tvm import relay +from tvm.relay import op +from tvm.relay.prelude import Prelude from tvm.testing import enabled_targets from . import mlp diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index a963242f82d5..e0669ae64bdb 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -126,7 +127,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { nodes.activation = current_call; current_call = current_call->args[0].as(); } - if (backend::IsOp(current_call, "nn.bias_add")) { + if (backend::IsOp(current_call, "add")) { nodes.bias = current_call; current_call = current_call->args[0].as(); } @@ -154,19 +155,32 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { */ std::shared_ptr CreateCompositeConvJSONNode(const CallNode* cn) { CompositeConvNode nodes = UnpackCompositeConvolution(cn); - std::string name = "nn.conv2d"; const auto* conv_attr = nodes.conv->attrs.as(); ICHECK(conv_attr); - ICHECK(conv_attr->kernel_layout == "OHWI") - << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + + std::string name; + std::string name_prefix = "nn"; + + // Distinguish between normal and depth-wise convolution + if (conv_attr->channels.defined() && + tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) && + conv_attr->groups != 1) { + name = "depthwise_conv2d"; + ICHECK(conv_attr->kernel_layout == "IHWO") + << "Kernel layout must be IHWO, has the module been pre-processed correctly?"; + } else { + name = "conv2d"; + ICHECK(conv_attr->kernel_layout == "OHWI") + << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + } // Inputs must be added in the same order they appear in the relay graph. std::vector inputs; inputs.push_back(VisitExpr(cn->args[0])[0]); inputs.push_back(VisitExpr(nodes.conv->args[1])[0]); if (nodes.requantize) { - name = "qnn.conv2d"; + name_prefix = "qnn"; inputs.push_back(VisitExpr(nodes.conv->args[2])[0]); // input zero-point inputs.push_back(VisitExpr(nodes.conv->args[3])[0]); // kernel zero-point inputs.push_back(VisitExpr(nodes.conv->args[4])[0]); // input scale @@ -180,7 +194,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]); // output zero-point } - auto json_node = std::make_shared(name, "kernel", inputs, 1); + auto json_node = std::make_shared(name_prefix + "." 
+ name, "kernel", inputs, 1); SetCallNodeAttribute(json_node, nodes.conv); // Override attributes @@ -224,10 +238,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { nodes.requantize = current_call; current_call = current_call->args[0].as(); } - if (backend::IsOp(current_call, "nn.bias_add")) { + if (backend::IsOp(current_call, "add")) { nodes.bias = current_call; current_call = current_call->args[0].as(); } + // Enforce a dense node exists at this point during traversal if (nodes.requantize) { ICHECK(backend::IsOp(current_call, "qnn.dense")); @@ -329,25 +344,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { } }; -/*! - * \brief Pre-process a module containing functions ready for ACL codegen. - * - * For now we enforce OHWI kernel layout and fold the transforms away. - * - * \param mod The module to be pre-processed. - * \return The processed module. - */ -IRModule PreProcessModule(const IRModule& mod) { - IRModule preprocessed_module; - tvm::Map> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}}, - {"qnn.conv2d", {"NHWC", "OHWI"}}}; - preprocessed_module = transform::ConvertLayout(desired_layouts)(mod); - preprocessed_module = transform::FoldConstant()(preprocessed_module); - return preprocessed_module; -} - -TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProcessModule); - /*! * \brief Create a runtime module for ACL. * diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 09879bdc6e95..ed8f6adbd083 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,9 @@ class ACLRuntime : public JSONRuntimeBase { if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) { CreateConvolution2DLayer(&layer_, node, mm); num_pools++; + } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) { + CreateDepthwiseConvolution2DLayer(&layer_, node, mm); + num_pools++; } else if ("nn.dense" == op_name || "qnn.dense" == op_name) { CreateFullyConnectedLayer(&layer_, node, mm); num_pools++; @@ -227,12 +231,7 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::ActivationLayerInfo act_info; if (node.HasAttr("activation_type")) { std::string activation_type = node.GetAttr>("activation_type")[0]; - if (activation_type == "relu") { - act_info = arm_compute::ActivationLayerInfo( - arm_compute::ActivationLayerInfo::ActivationFunction::RELU); - } else { - LOG(FATAL) << "Unsupported activation function"; - } + act_info = MakeACLActivationInfo(activation_type); } arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1])); @@ -269,6 +268,64 @@ class ACLRuntime : public JSONRuntimeBase { layer->function = function; } + /*! + * \brief Create a 2D depthwise convolution layer. + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. + * \param mm The ACL conv2d layer can request auxiliary memory from TVM. 
+ */ + void CreateDepthwiseConvolution2DLayer( + CachedLayer* layer, const JSONGraphNode& node, + const std::shared_ptr& mm) { + std::vector padding = node.GetAttr>("padding"); + std::vector strides = node.GetAttr>("strides"); + std::vector dilation = node.GetAttr>("dilation"); + arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides); + + arm_compute::ActivationLayerInfo act_info; + if (node.HasAttr("activation_type")) { + std::string activation_type = node.GetAttr>("activation_type")[0]; + act_info = MakeACLActivationInfo(activation_type); + } + + arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1])); + + // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases. + std::vector inputs = node.GetInputs(); + size_t num_inputs = inputs.size(); + bool has_bias; + if (node.GetOpName() == "qnn.depthwise_conv2d") { + ICHECK(num_inputs >= 8U && num_inputs <= 9U) + << "Quantized convolution requires 9 inputs with a bias, 8 inputs without."; + has_bias = num_inputs == 9; + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3])); + if (has_bias) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6])); + } + layer->outputs.push_back( + MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); + } else { + ICHECK(num_inputs >= 2U && num_inputs <= 3U) + << "Convolution requires 3 inputs with a bias, 2 inputs without."; + has_bias = num_inputs == 3; + for (const auto& i : inputs) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(i)); + } + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + } + + // Depth multiplier is the final dimension in acl weights tensor (IWH*M*) + int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3]; + + auto function = std::make_shared(mm); + function->configure(&layer->inputs[0], &layer->inputs[1], + has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info, + depth_multiplier, act_info, dilation_2d); + layer->function = function; + } + /*! * \brief Create a fully connected (dense) layer. * diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index 604c619bf49c..3b2620987ab0 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -134,6 +134,16 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) { } } +arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type) { + auto act_func = arm_compute::ActivationLayerInfo::ActivationFunction::IDENTITY; + if (activation_type == "relu") { + act_func = arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + } else { + LOG(FATAL) << "Activation " << activation_type << " unsupported by ACL runtime"; + } + return {act_func}; +} + template std::vector GetVectorFromDLTensor(const DLTensor* tensor) { ICHECK(tensor) << "Cannot convert a nullptr"; diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h index 576ed916ff60..dbb006fbb347 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.h +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h @@ -108,6 +108,15 @@ arm_compute::PadStrideInfo MakeACLPadStride(const std::vector& pad, */ arm_compute::DataType MakeACLDataType(const DLDataType& data_type); +/*! 
+ * \brief Convert string to arm_compute::ActivationLayerInfo + * + * \param activation_type A string representing activation function. + * Currently supports the following options: "relu". + * \return arm_compute::ActivationLayerInfo. + */ +arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type); + /*! * \brief Get a vector from DLTensor data. * \note Performs a copy of data. diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index c5d711d7afa3..80cd5847440e 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -303,45 +303,3 @@ def verify_codegen( f"Actual={codegen_str} \n" f"Expected={known_good_codegen_str}" ) - - -def generate_trials(space, r_factor=3): - """Generates a series of trials. - - This algorithm generates a series of non-deterministic trials given a - space of options to test. A trial is generated by pulling a value from - each option in the space. On some occasions the values are shuffled to - ensure a different trial on each r_factor iteration. The algorithm ensures - that each value from an option is used at least once. The total number of - trials is determined by the r_factor * the option with the largest number - of values. - - Parameters - ---------- - space: List[List[Any]] - A list of different options with varying values to test. - r_factor: (optional) int - The repeat factor. - - Returns - ------- - A list of trials specifying values for each option. - - """ - np.random.seed(0) - max_len = 1 - for option in space: - max_len = max(max_len, len(option)) - - num_trials = r_factor * max_len - trials = [] - for i in range(num_trials): - trial = [] - for option in space: - if i % len(option) == 0: - np.random.shuffle(option) - trial.append(option[i % len(option)]) - - trials.append(trial) - - return trials diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py index 4496a2a1afa9..cc5bbfec7c69 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py +++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py @@ -21,15 +21,14 @@ import tvm from tvm import relay -from .infrastructure import ( +from test_arm_compute_lib.infrastructure import ( skip_runtime_test, skip_codegen_test, build_and_run, verify, verify_codegen, - generate_trials, ) -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import Device def _get_model( @@ -57,7 +56,12 @@ def _get_model( if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3]) - weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + is_depthwise = shape[3] == channels == groups + weight_format = "HWOI" if is_depthwise else "HWIO" + if weight_format == "HWIO": + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + else: + weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups) w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) weights = relay.const(w, dtype) out = relay.nn.conv2d( @@ -65,7 +69,7 @@ def _get_model( weights, kernel_size=(kernel_h, kernel_w), data_layout="NHWC", - kernel_layout="HWIO", + kernel_layout=weight_format, dilation=dilation, strides=strides, padding=padding, @@ -75,7 +79,8 @@ def _get_model( ) params = {"w": w} if has_bias: - b = 
tvm.nd.array(np.random.uniform(-128, 127, weight_shape[3]).astype(dtype)) + bias_shape = weight_shape[2] if is_depthwise else weight_shape[3] + b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype(dtype)) biasc = relay.const(b, dtype) out = relay.nn.bias_add(out, biasc, axis=3) params["b"] = b @@ -134,7 +139,12 @@ def _get_qnn_model( if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3]) - weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + is_depthwise = shape[3] == channels == groups + weight_format = "HWOI" if is_depthwise else "HWIO" + if weight_format == "HWIO": + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + else: + weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups) w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype)) weights = relay.const(w, dtype) out = relay.qnn.op.conv2d( @@ -146,7 +156,7 @@ def _get_qnn_model( kernel_scale=relay.const(kernel_sc, "float32"), kernel_size=(kernel_h, kernel_w), data_layout="NHWC", - kernel_layout="HWIO", + kernel_layout=weight_format, dilation=dilation, strides=strides, padding=padding, @@ -156,7 +166,8 @@ def _get_qnn_model( ) params = {"w": w} if has_bias: - b = tvm.nd.array(np.random.uniform(0, 255, weight_shape[3]).astype("int32")) + bias_shape = weight_shape[2] if is_depthwise else weight_shape[3] + b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype("int32")) biasc = relay.const(b, "int32") out = relay.nn.bias_add(out, biasc, axis=3) params["b"] = b @@ -188,21 +199,30 @@ def _get_expected_codegen( ): if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) - weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups) output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1 output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1 output_shape = (1, int(output_height), int(output_width), channels) out_dtype = "int32" if dtype == "uint8" else "float32" + is_depthwise = shape[3] == channels == groups + weight_format = "IHWO" if is_depthwise else "OHWI" + if weight_format == "IHWO": + weight_shape = (shape[3] // groups, kernel_h, kernel_w, channels) + else: + weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups) + if is_depthwise: + name = "nn.depthwise_conv2d" + else: + name = "nn.conv2d" node = { "op": "kernel", - "name": "nn.conv2d", + "name": name, "inputs": [], "attrs": { - "groups": [["1"]], + "groups": [[str(groups)]], "num_outputs": "1", "data_layout": [["NHWC"]], - "kernel_layout": [["OHWI"]], + "kernel_layout": [[weight_format]], "channels": [[str(channels)]], "dilation": [[str(dilation[0]), str(dilation[1])]], "out_layout": [[""]], @@ -229,7 +249,7 @@ def _get_expected_codegen( # qnn.conv2d params, input and kernel if dtype == "uint8": - node["name"] = "qnn.conv2d" + node["name"] = "qnn." 
+ node["name"].split(".")[1] for param_dtype in ["int32", "float32"]: for _ in range(2): inputs.append( @@ -246,7 +266,10 @@ def _get_expected_codegen( { "op": "const", "name": "", - "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [[bias_dtype]]}, + "attrs": { + "shape": [[[1, 1, 1, weight_shape[3] if is_depthwise else weight_shape[0]]]], + "dtype": [[bias_dtype]], + }, } ) @@ -275,29 +298,43 @@ def test_conv2d(): device = Device() np.random.seed(0) - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "float32" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 outputs = [] inputs = { "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), @@ -338,31 +375,43 @@ def test_codegen_conv2d(): if skip_codegen_test(): return - np.random.seed(0) - - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "float32" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), 
(False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 inputs = {"a"} args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) @@ -389,29 +438,43 @@ def test_qnn_conv2d(): device = Device() np.random.seed(0) - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "uint8" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - 
groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} @@ -463,36 +526,52 @@ def test_qnn_conv2d(): "output scale": output_sc, "output zero point": output_zp, } - verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True) + + atol = 2 if is_depthwise else 1 + verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True) def test_codegen_qnn_conv2d(): if skip_codegen_test(): return - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "uint8" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 inputs = {"a"} input_zp = 100 diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 0279aa72eaf7..dba7be67a012 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -28,7 +28,6 @@ build_and_run, verify, verify_codegen, - generate_trials, ) @@ -184,17 +183,19 @@ def test_dense(): device = Device() np.random.seed(0) - dtype = ["float32"] - shape = [ - (1, (1, 128), (16, 128), 16), - (1, (32, 32), (32, 32), 32), - (0, (1, 64), (1, 64), 1), - (0, (11, 2), (2, 2), 2), + dtype = "float32" + trials = [ + [(1, 128), (16, 128), 
16, True, 1], + [(1, 128), (16, 128), 16, False, 1], + [(32, 32), (32, 32), 32, True, 1], + [(32, 32), (32, 32), 32, False, 1], + [(1, 64), (1, 64), 1, True, 0], + [(1, 64), (1, 64), 1, False, 0], + [(11, 2), (2, 2), 2, True, 0], + [(11, 2), (2, 2), 2, False, 0], ] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite, acl_partitions in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} func, params = _get_model( @@ -230,19 +231,26 @@ def test_codegen_dense(): np.random.seed(0) - dtype = ["float32"] - shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) + dtype = "float32" + trials = [ + [(1, 128), (16, 128), 16, True, 1], + [(1, 128), (16, 128), 16, False, 1], + [(32, 32), (32, 32), 32, True, 1], + [(32, 32), (32, 32), 32, False, 1], + [(1, 64), (1, 64), 1, True, 0], + [(1, 64), (1, 64), 1, False, 0], + ] - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite, acl_partitions in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions) + verify_codegen( + func, exp_codegen, acl_partitions, (1 - acl_partitions) * (2 - int(not composite)) + ) def test_qnn_dense(): @@ -254,19 +262,21 @@ def test_qnn_dense(): device = Device() np.random.seed(0) - dtype = ["uint8"] - shape = [ - (0, (4, 4), (4, 4), 4), - (1, (16, 16), (4, 16), 4), - (1, (1, 128), (16, 128), 16), - (1, (32, 32), (32, 32), 32), - (0, (1, 64), (1, 64), 1), + dtype = "uint8" + trials = [ + [(4, 4), (4, 4), 4, True, 0], + [(4, 4), (4, 4), 4, False, 0], + [(16, 16), (4, 16), 4, True, 1], + [(16, 16), (4, 16), 4, False, 1], + [(1, 128), (16, 128), 16, True, 1], + [(1, 128), (16, 128), 16, False, 1], + [(32, 32), (32, 32), 32, True, 1], + [(32, 32), (32, 32), 32, False, 1], + [(1, 64), (1, 64), 1, True, 0], + [(1, 64), (1, 64), 1, False, 0], ] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite, acl_partitions in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} input_zp = 100 @@ -328,12 +338,17 @@ def test_codegen_qnn_dense(): np.random.seed(0) - dtype = ["uint8"] - shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) + dtype = "uint8" + trials = [ + [(1, 128), (16, 128), 16, True, 1], + [(1, 128), (16, 128), 16, False, 1], + [(32, 32), (32, 32), 32, True, 1], + [(32, 32), (32, 32), 32, False, 1], + [(1, 64), (1, 64), 1, True, 0], + [(1, 64), (1, 64), 1, False, 0], + ] - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite, acl_partitions in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) @@ -357,7 +372,9 @@ def test_codegen_qnn_dense(): has_bias=composite, ) exp_codegen = _get_expected_codegen(*args, 
has_bias=composite) - verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions) + verify_codegen( + func, exp_codegen, acl_partitions, (1 - acl_partitions) * (3 - int(not composite)) + ) if __name__ == "__main__": diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 898446b32ed9..462df143b447 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -123,7 +123,7 @@ def get_model(): return mod, params, inputs _build_and_run_network( - *get_model(), device=device, tvm_ops=73, acl_partitions=18, atol=0.002, rtol=0.01 + *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01 ) @@ -148,7 +148,7 @@ def get_model(): return mod, params, inputs _build_and_run_network( - *get_model(), device=device, tvm_ops=42, acl_partitions=17, atol=8, rtol=0 + *get_model(), device=device, tvm_ops=3, acl_partitions=30, atol=9, rtol=0 )
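
Taken together, these changes teach the ACL integration to offload depth-wise convolutions in addition to ordinary ones, while still rejecting general grouped convolutions. The pattern-table predicates above decide which case they are looking at via relay's is_depthwise_conv2d helper; the updated tests apply the equivalent shape rule directly. A small restatement of that rule follows (plain illustrative Python, helper name is mine, not part of the patch):

def is_depthwise_case(input_shape_nhwc, out_channels, groups):
    # Same rule the updated tests use to choose the depth-wise path:
    # input channels, output channels and groups all coincide.
    return input_shape_nhwc[3] == out_channels == groups

# Two of the new trials:
assert is_depthwise_case((1, 20, 20, 20), 20, 20)      # depth-wise convolution
assert not is_depthwise_case((1, 10, 10, 14), 4, 1)    # normal convolution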
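
On top of that, the new depthwise_conv2d predicate (and the updated documentation table) restricts offloading to square 3x3 or 5x5 kernels with strides of (1, 1) or (2, 2). A stand-alone mirror of that check, again with an illustrative function name:

def acl_accepts_depthwise(kernel_h, kernel_w, strides):
    # Mirrors the checks in the new depthwise_conv2d predicate.
    if kernel_h != kernel_w or kernel_h not in (3, 5):
        return False
    return (strides[0], strides[1]) in [(1, 1), (2, 2)]

assert acl_accepts_depthwise(3, 3, (1, 1))
assert acl_accepts_depthwise(5, 5, (2, 2))
assert not acl_accepts_depthwise(3, 5, (1, 1))   # non-square kernel
assert not acl_accepts_depthwise(3, 3, (3, 3))   # unsupported stride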
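
The preprocessing pass, now registered from Python as relay.ext.arm_compute_lib.optimize, still asks ConvertLayout for OHWI kernels, but permutes the desired layout string to IHWO for depth-wise convolutions (dkl[3] + dkl[1:3] + dkl[0]), which is what the codegen's ICHECK later expects. A NumPy sketch of what that permutation means for a test-style HWOI kernel (the relayout_kernel helper is illustrative only):

import numpy as np

dkl = "OHWI"
assert dkl[3] + dkl[1:3] + dkl[0] == "IHWO"

def relayout_kernel(weights, src_layout, dst_layout):
    # Generic 4-D transpose between kernel layout strings.
    axes = [src_layout.index(dim) for dim in dst_layout]
    return np.transpose(weights, axes)

# The tests build depth-wise weights as HWOI = (kernel_h, kernel_w, channels, in_channels // groups);
# in all the new trials the last dimension (the channel multiplier) is 1.
w_hwoi = np.zeros((3, 3, 16, 1), dtype="float32")
w_ihwo = relayout_kernel(w_hwoi, "HWOI", "IHWO")
assert w_ihwo.shape == (1, 3, 3, 16)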
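
For the quantized case, CreateDepthwiseConvolution2DLayer consumes the same flattened input list the serializer emits for qnn.conv2d. Reading the index arithmetic back out of the runtime code, the expected ordering is the following (listed here for reference only, not an API):

QNN_DEPTHWISE_CONV2D_INPUTS = [
    "data",                 # inputs[0], paired with input_scale / input_zero_point
    "weights",              # inputs[1], paired with kernel_scale / kernel_zero_point
    "input_zero_point",     # inputs[2]
    "kernel_zero_point",    # inputs[3]
    "input_scale",          # inputs[4]
    "kernel_scale",         # inputs[5]
    # "bias",               # optional inputs[6]; 9 inputs with a bias, 8 without
    "output_scale",
    "output_zero_point",
]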
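
The expected-codegen helper in the conv2d tests derives the output spatial size with the usual padded-convolution formula. Restated as a small function (name is mine), using one of the new depth-wise trials as a worked example:

def conv_output_hw(height, width, kernel_h, kernel_w, padding, strides):
    # padding is (top, left, bottom, right), as produced when the
    # two-element form is expanded in the tests.
    out_h = (height - kernel_h + padding[0] + padding[2]) // strides[0] + 1
    out_w = (width - kernel_w + padding[1] + padding[3]) // strides[1] + 1
    return out_h, out_w

# Trial: 5x5 kernel, padding (2, 2), strides (1, 1), 20x20x20 input.
assert conv_output_hw(20, 20, 5, 5, (2, 2, 2, 2), (1, 1)) == (20, 20)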
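
The dense codegen tests replace the old fixed fallback-op counts with (1 - acl_partitions) * (2 - int(not composite)) for fp32 and (3 - int(not composite)) for the quantized variant. My reading of that arithmetic: when ACL declines the partition, TVM keeps the dense op, plus the bias add when composite is set, plus the requantize in the qnn case. A quick restatement under that assumption:

def expected_fallback_ops(acl_partitions, has_bias, qnn=False):
    # Presumed meaning: operators left for TVM when ACL does not take the graph.
    per_graph = (3 if qnn else 2) - int(not has_bias)
    return (1 - acl_partitions) * per_graph

assert expected_fallback_ops(1, has_bias=True) == 0             # fully offloaded
assert expected_fallback_ops(0, has_bias=False) == 1            # dense only
assert expected_fallback_ops(0, has_bias=True, qnn=True) == 3   # dense + add + requantize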
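
Finally, a usage sketch showing how a depth-wise convolution that satisfies the new constraints would be partitioned for ACL. Shapes and values are arbitrary illustration choices; actually compiling and running the partitioned module additionally requires a TVM build with the ACL codegen and runtime enabled.

import numpy as np
import tvm
from tvm import relay
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

dtype = "float32"
data = relay.var("data", shape=(1, 20, 20, 16), dtype=dtype)
w = tvm.nd.array(np.random.uniform(-1, 1, (3, 3, 16, 1)).astype(dtype))
weights = relay.const(w, dtype)
out = relay.nn.conv2d(
    data,
    weights,
    kernel_size=(3, 3),
    data_layout="NHWC",
    kernel_layout="HWOI",
    padding=(1, 1),
    strides=(1, 1),
    groups=16,       # groups == channels == input channels, i.e. depth-wise
    channels=16,
    out_dtype=dtype,
)
mod = tvm.IRModule.from_expr(out)
mod = partition_for_arm_compute_lib(mod)
print(mod)  # the conv2d should now sit inside an arm_compute_lib composite function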