
[BYOC][ACL] Add support for dense (fully connected) layer #6254

Merged · 2 commits · Aug 13, 2020
7 changes: 7 additions & 0 deletions docs/deploy/arm_compute_lib.rst
@@ -181,6 +181,13 @@ Operator support
|              |                                                                         |
|              | (only groups = 1 supported)                                             |
+--------------+-------------------------------------------------------------------------+
| nn.dense     | fp32:                                                                   |
|              |   Simple: nn.dense                                                      |
|              |   Composite: nn.dense, nn.bias_add?                                     |
+--------------+-------------------------------------------------------------------------+
| qnn.dense    | uint8:                                                                  |
|              |   Composite: qnn.dense, nn.bias_add?, qnn.requantize                    |
+--------------+-------------------------------------------------------------------------+
| nn.maxpool2d | fp32, uint8                                                             |
+--------------+-------------------------------------------------------------------------+
| reshape      | fp32, uint8                                                             |
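For context, a minimal sketch of offloading a graph that matches the fp32 composite pattern above, assuming the standard BYOC flow and the partition_for_arm_compute_lib helper exposed by tvm.relay.op.contrib.arm_compute_lib:

import numpy as np
import tvm
from tvm import relay
from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

# Build a graph matching the composite fp32 pattern: nn.dense -> nn.bias_add.
data = relay.var("data", shape=(1, 512), dtype="float32")
weight = relay.const(np.random.uniform(size=(10, 512)).astype("float32"))
bias = relay.const(np.random.uniform(size=(10,)).astype("float32"))
out = relay.nn.bias_add(relay.nn.dense(data, weight), bias)
mod = tvm.IRModule.from_expr(out)

# Regions supported by ACL are merged, annotated and split out for the
# external codegen; unsupported operators stay on the default target.
mod = partition_for_arm_compute_lib(mod)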
74 changes: 73 additions & 1 deletion python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -98,6 +98,33 @@ def qnn_conv_pattern():
            pattern, wildcard(), wildcard(), is_constant(), is_constant())
        return pattern

    def dense_pattern():
        """Create a dense (fully-connected) pattern.

        Returns
        -------
        pattern : dataflow_pattern.AltPattern
            Denotes the dense pattern.
        """
        pattern = is_op('nn.dense')(wildcard(), is_constant())
        pattern = pattern.optional(lambda x: is_op('nn.bias_add')(x, is_constant()))
        return pattern

    def qnn_dense_pattern():
        """Create a quantized dense (fully-connected) pattern.

        Returns
        -------
        pattern : dataflow_pattern.AltPattern
            Denotes the qnn.dense pattern.
        """
        pattern = is_op('qnn.dense')(
            wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant())
        pattern = pattern.optional(lambda x: is_op('nn.bias_add')(x, is_constant()))
        pattern = is_op('qnn.requantize')(
            pattern, wildcard(), wildcard(), is_constant(), is_constant())
        return pattern

    def check_conv(extract):
        """Check conv pattern is supported by ACL."""
        call = extract
@@ -114,8 +141,26 @@ def check_qnn_conv(extract):
            call = call.args[0]
        return qnn_conv2d(call.attrs, call.args)

    def check_dense(extract):
        """Check dense pattern is supported by ACL."""
        call = extract
Review comment (Contributor): After reading check_qnn_dense, do you need to check the out_dtype here?

Reply (@lhutton1, Contributor Author, Aug 13, 2020): The out_dtype attribute check is actually on the requantize node. The first node in the extract for fp32 would be nn.bias_add, which doesn't have the out_dtype attribute. The out_dtype is checked, though, in the dense function below. Hope that makes sense :)
        while call.op.name != "nn.dense":
            call = call.args[0]
        return dense(call.attrs, call.args)

    def check_qnn_dense(extract):
        """Check qnn.dense pattern is supported by ACL."""
        if extract.attrs.out_dtype != "uint8":
            return False
        call = extract
        while call.op.name != "qnn.dense":
            call = call.args[0]
        return qnn_dense(call.attrs, call.args)

    return [('arm_compute_lib.conv2d', conv_pattern(), check_conv),
            ('arm_compute_lib.qnn_conv2d', qnn_conv_pattern(), check_qnn_conv),
            ('arm_compute_lib.dense', dense_pattern(), check_dense),
            ('arm_compute_lib.qnn_dense', qnn_dense_pattern(), check_qnn_dense)]
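As a reading aid, a rough sketch of how a pattern table like the one above is typically consumed downstream (pass names assume mainline TVM at the time of this PR; the ACL partition helper wraps a sequence along these lines):

import tvm
from tvm.relay import transform
from tvm.relay.op.contrib.register import get_pattern_table

# Given an existing tvm.IRModule 'mod' containing dense layers:
seq = tvm.transform.Sequential([
    # Fuse matched operator groups (e.g. nn.dense + nn.bias_add) into
    # composite functions named after the pattern table entries.
    transform.MergeComposite(get_pattern_table("arm_compute_lib")),
    # Mark composite functions and singly-registered ops for the ACL target.
    transform.AnnotateTarget("arm_compute_lib"),
    # Split annotated regions into external functions for the ACL codegen.
    transform.PartitionGraph(),
])
mod = seq(mod)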


def _register_external_op_helper(op_name, supported=True):
@@ -164,6 +209,33 @@ def qnn_conv2d(attrs, args):
    return True


@tvm.ir.register_op_attr("nn.dense", "target.arm_compute_lib")
def dense(attrs, args):
    """Check if the external ACL codegen for dense should be used."""
    data_typ = args[0].checked_type
    if data_typ.dtype != "float32":
        return False
    kernel_typ = args[1].checked_type
    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32":
        return False
    if attrs.out_dtype != "float32" and attrs.out_dtype != "":
        return False
    return True


def qnn_dense(attrs, args):
    """Check if the external ACL codegen for qnn.dense should be used."""
    data_typ = args[0].checked_type
    if data_typ.dtype != "uint8":
        return False
    kernel_typ = args[1].checked_type
    if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "uint8":
        return False
    if attrs.out_dtype != "int32":
        return False
    return True


@tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
def max_pool2d(attrs, args):
"""Check if the external ACL codegen for maxpool2d should be used."""
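To complement the predicates above, a hedged sketch of a quantized graph the qnn.dense composite pattern is meant to match (scales and zero-points below are placeholder values):

import numpy as np
from tvm import relay

data = relay.var("data", shape=(1, 512), dtype="uint8")
weight = relay.const(np.random.randint(0, 255, size=(10, 512)).astype("uint8"))
bias = relay.const(np.random.randint(0, 255, size=(10,)).astype("int32"))

# qnn.dense -> nn.bias_add -> qnn.requantize, mirroring the composite pattern.
dense = relay.qnn.op.dense(data, weight,
                           input_zero_point=relay.const(0, "int32"),
                           kernel_zero_point=relay.const(0, "int32"),
                           input_scale=relay.const(0.5, "float32"),
                           kernel_scale=relay.const(0.5, "float32"),
                           units=10, out_dtype="int32")
out = relay.nn.bias_add(dense, bias)
out = relay.qnn.op.requantize(out,
                              input_scale=relay.const(0.25, "float32"),
                              input_zero_point=relay.const(0, "int32"),
                              output_scale=relay.const(0.1, "float32"),
                              output_zero_point=relay.const(0, "int32"),
                              out_dtype="uint8")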
77 changes: 77 additions & 0 deletions src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -61,6 +61,16 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
    const CallNode* requantize = nullptr;
  };

  /*!
   * \brief A series of operators that form a composite
   * dense layer. Supports both nn.dense and qnn.dense.
   */
  struct CompositeDenseNode {
    const CallNode* dense = nullptr;
    const CallNode* bias = nullptr;
    const CallNode* requantize = nullptr;
  };

  /*!
   * \brief Visit call nodes and generate appropriate JSON node.
   *
@@ -82,6 +92,8 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
    std::shared_ptr<JSONGraphNode> json_node;
    if (name == "arm_compute_lib.conv2d" || name == "arm_compute_lib.qnn_conv2d") {
      json_node = CreateCompositeConvJSONNode(cn);
    } else if (name == "arm_compute_lib.dense" || name == "arm_compute_lib.qnn_dense") {
      json_node = CreateCompositeDenseJSONNode(cn);
    } else {
      LOG(FATAL) << "Unrecognized Arm Compute Library pattern: " << name;
    }
@@ -190,6 +202,71 @@
    }
    return json_node;
  }

  /*!
   * \brief Extract dense nodes from a composite function.
   *
   * \param cn The call node of the composite function.
   * \return Extracted composite dense nodes.
   */
  static CompositeDenseNode UnpackCompositeDense(const CallNode* cn) {
    CompositeDenseNode nodes{};
    const auto* fn = cn->op.as<FunctionNode>();
    CHECK(fn);

    // Traverse composite dense function from child to parent
    const auto* current_call = fn->body.as<CallNode>();
    if (backend::IsOp(current_call, "qnn.requantize")) {
      nodes.requantize = current_call;
      current_call = current_call->args[0].as<CallNode>();
    }
    if (backend::IsOp(current_call, "nn.bias_add")) {
      nodes.bias = current_call;
      current_call = current_call->args[0].as<CallNode>();
    }
    // Enforce a dense node exists at this point during traversal
    if (nodes.requantize) {
      CHECK(backend::IsOp(current_call, "qnn.dense"));
    } else {
      CHECK(backend::IsOp(current_call, "nn.dense"));
    }
    nodes.dense = current_call;
    return nodes;
  }

  /*!
   * \brief Create a JSON representation of a composite dense (fully-connected) operator.
   *
   * \param cn The call to be represented.
   * \return A JSON representation of a specific operator.
   */
  std::shared_ptr<JSONGraphNode> CreateCompositeDenseJSONNode(const CallNode* cn) {
    CompositeDenseNode nodes = UnpackCompositeDense(cn);
    std::string name = "nn.dense";

    // Inputs must be added in the same order they appear in the relay graph.
    std::vector<JSONGraphNodeEntry> inputs;
    inputs.push_back(VisitExpr(cn->args[0])[0]);
    inputs.push_back(VisitExpr(nodes.dense->args[1])[0]);
    if (nodes.requantize) {
      name = "qnn.dense";
      inputs.push_back(VisitExpr(nodes.dense->args[2])[0]);  // input zero-point
      inputs.push_back(VisitExpr(nodes.dense->args[3])[0]);  // weight zero-point
      inputs.push_back(VisitExpr(nodes.dense->args[4])[0]);  // input scale
      inputs.push_back(VisitExpr(nodes.dense->args[5])[0]);  // weight scale
    }
    if (nodes.bias) {
      inputs.push_back(VisitExpr(nodes.bias->args[1])[0]);
    }
    if (nodes.requantize) {
      inputs.push_back(VisitExpr(nodes.requantize->args[3])[0]);  // output scale
      inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]);  // output zero-point
    }

    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
    SetCallNodeAttribute(json_node, nodes.dense);
    return json_node;
  }
};

/*!
48 changes: 48 additions & 0 deletions src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -31,6 +31,7 @@
#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>

@@ -128,6 +129,9 @@ class ACLRuntime : public JSONRuntimeBase {
if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) {
CreateConvolution2DLayer(&layer_, node, mm);
num_pools++;
} else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
CreateFullyConnectedLayer(&layer_, node, mm);
num_pools++;
} else if ("nn.max_pool2d" == op_name) {
CreatePoolingLayer(&layer_, node);
} else if ("reshape" == op_name) {
@@ -257,6 +261,50 @@ class ACLRuntime : public JSONRuntimeBase {
    layer->function = function;
  }

  /*!
   * \brief Create a fully connected (dense) layer.
   *
   * \param layer The ACL layer to build, containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The memory manager through which the ACL fully connected layer can request
   * auxiliary memory from TVM.
   */
  void CreateFullyConnectedLayer(CachedLayer* layer, const JSONGraphNode& node,
                                 const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    arm_compute::FullyConnectedLayerInfo fc_info;
    fc_info.set_weights_trained_layout(arm_compute::DataLayout::NHWC);

    // Collect inputs and outputs, handling both nn.dense and qnn.dense cases.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    bool has_bias;
    if (node.GetOpName() == "qnn.dense") {
      CHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized fully connected (dense) layer requires 9 inputs with a bias, 8 inputs "
             "without.";
      has_bias = num_inputs == 9;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
    } else {
      CHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Fully connected (dense) layer requires 3 inputs with a bias, 2 inputs without.";
      has_bias = num_inputs == 3;
      for (const auto& i : inputs) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    auto function = std::make_shared<arm_compute::NEFullyConnectedLayer>(mm);
    function->configure(&layer->inputs[0], &layer->inputs[1],
                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], fc_info);
    layer->function = function;
  }

  /*!
   * \brief Create a pooling layer.
   *
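To make the inputs[6 + has_bias] indexing easier to follow, here is the JSON input ordering for qnn.dense as the serializer emits it and the runtime consumes it (a Python reading aid, not TVM API; positions after the bias shift down by one when the optional bias is absent):

QNN_DENSE_INPUT_ORDER = [
    "data",               # inputs[0]
    "weight",             # inputs[1]
    "input zero-point",   # inputs[2]
    "weight zero-point",  # inputs[3]
    "input scale",        # inputs[4]
    "weight scale",       # inputs[5]
    "bias",               # inputs[6], present only when has_bias
    "output scale",       # inputs[6 + has_bias]
    "output zero-point",  # inputs[7 + has_bias]
]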
2 changes: 1 addition & 1 deletion tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -392,7 +392,7 @@ def test_qnn_conv2d():
"output scale": output_sc,
"output zero point": output_zp
}
verify(outputs, atol=1, rtol=0, params=params)
verify(outputs, atol=1, rtol=0, params=params, verify_saturation=True)


def test_codegen_qnn_conv2d():