Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Topi] Tensorcore support for Conv3D #5284

Merged
merged 8 commits
Apr 13, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 49 additions & 3 deletions python/tvm/relay/op/nn/_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from .. import strategy
from ..op import OpPattern
from .._tensor import elemwise_shape_func
from ..strategy.generic import is_depthwise_conv2d

# relu
reg.register_broadcast_schedule("nn.relu")
Expand Down Expand Up @@ -139,13 +140,22 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layout):
# pylint: disable=import-outside-toplevel
from tvm import relay
data, weight = inputs
assert desired_layout == 'NCHW', \
"Currently only transformation to NCHW layout is supported."
new_attrs = dict(attrs)
if desired_layout == 'NCHW':
new_attrs = dict(attrs)
new_attrs['data_layout'] = desired_layout
new_attrs['kernel_layout'] = 'OIHW'
return relay.nn.conv2d(data, weight, **new_attrs)
elif desired_layout == 'NHWC':
new_attrs['data_layout'] = desired_layout
# Check for depthwise convolution.
if is_depthwise_conv2d(data.shape, attrs['data_layout'], weight.shape,
attrs['kernel_layout'], attrs['groups']):
new_attrs['kernel_layout'] = 'HWOI'
else:
new_attrs['kernel_layout'] = 'HWIO'
return relay.nn.conv2d(data, weight, **new_attrs)
else:
assert "Layout %s is not yet supported." % (desired_layout)
return None


Expand Down Expand Up @@ -183,6 +193,42 @@ def alter_op_layout_conv3d(attrs, inputs, tinfos, out_type):
"""Alternate the layout of conv3d"""
return topi.nn.conv3d_alter_layout(attrs, inputs, tinfos, out_type)

@reg.register_convert_op_layout("nn.conv3d")
def convert_conv3d(attrs, inputs, tinfos, desired_layout):
    """Convert Layout pass registration for conv3d op.

    Parameters
    ----------
    attrs : tvm.ir.Attrs
        Attributes of current convolution
    inputs : list of tvm.relay.Expr
        The args of the Relay expr to be legalized
    tinfos : list of types
        List of input and output types
    desired_layout : str
        The desired layout

    Returns
    -------
    result : tvm.relay.Expr
        The transformed expr

    Raises
    ------
    ValueError
        If `desired_layout` is neither 'NCDHW' nor 'NDHWC'.
    """
    data, weight = inputs
    new_attrs = dict(attrs)
    # Each supported data layout pairs with exactly one kernel layout.
    kernel_layouts = {'NCDHW': 'OIDHW', 'NDHWC': 'DHWIO'}
    if desired_layout not in kernel_layouts:
        # The original `assert "..." % desired_layout` was always true
        # (a non-empty string is truthy), so unsupported layouts fell
        # through and silently returned None. Fail loudly instead.
        raise ValueError("Layout %s is not yet supported." % desired_layout)
    new_attrs['data_layout'] = desired_layout
    new_attrs['kernel_layout'] = kernel_layouts[desired_layout]
    # pylint: disable=import-outside-toplevel
    # Imported lazily (and after validation) to keep the error path free
    # of heavyweight imports; matches the file's existing local-import style.
    from tvm import relay
    return relay.nn.conv3d(data, weight, **new_attrs)

# conv3d_winograd related operators
reg.register_strategy("nn.contrib_conv3d_winograd_without_weight_transform",
strategy.conv3d_winograd_without_weight_transfrom_strategy)
Expand Down
47 changes: 31 additions & 16 deletions python/tvm/relay/op/strategy/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,15 +138,16 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
name="conv2d_nhwc.cuda")
N, _, _, _ = get_const_tuple(data.shape)
_, _, CI, CO = get_const_tuple(kernel.shape)
if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
(N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
(N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
name="conv2d_nhwc_tensorcore.cuda",
plevel=20)
if target.target_name == "cuda":
if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
(N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
(N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
name="conv2d_nhwc_tensorcore.cuda",
plevel=20)
elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
assert kernel_layout == "OIHW4o4i"
strategy.add_implementation(
Expand All @@ -170,7 +171,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
name="dpethwise_nchw.cuda")
name="depthwise_conv2d_nchw.cuda")
elif layout == "NHWC":
assert kernel_layout == "HWOI"
strategy.add_implementation(
Expand Down Expand Up @@ -249,7 +250,7 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target):
def conv3d_strategy_cuda(attrs, inputs, out_type, target):
"""conv3d cuda strategy"""
strategy = _op.OpStrategy()
_, kernel = inputs
data, kernel = inputs
layout = attrs.data_layout
_, stride_h, stride_w = attrs.get_int_tuple("strides")
_, dilation_h, dilation_w = attrs.get_int_tuple("dilation")
Expand All @@ -268,11 +269,25 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd),
name="conv3d_ncdhw_winograd.cuda",
plevel=5)
else: # layout == "NDHWC":
strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.cuda",
plevel=10)
else: # layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.cuda",
plevel=10)
N, _, _, _, _ = get_const_tuple(data.shape)
_, _, _, CI, CO = get_const_tuple(kernel.shape)
if target.target_name == "cuda":
if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
(N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
(N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
strategy.add_implementation(
wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore),
name="conv3d_ndhwc_tensorcore.cuda",
plevel=20)

if target.target_name == "cuda" and "cudnn" in target.libs:
strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True),
wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn),
Expand Down
1 change: 1 addition & 0 deletions topi/python/topi/cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,5 @@
from .rcnn import *
from .sort import *
from .conv2d_nhwc_tensorcore import *
from .conv3d_ndhwc_tensorcore import *
from .dense_tensorcore import *
2 changes: 1 addition & 1 deletion topi/python/topi/cuda/conv2d_nhwc_tensorcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def nhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtyp
# convert data type of input feature maps and weights
TransPaddedInput = te.compute(
PaddedInput.shape,
lambda h, w, i, o: PaddedInput[h, w, i, o].astype('float16'))
lambda n, h, w, c: PaddedInput[n, h, w, c].astype('float16'))
TransFilter = te.compute(
Filter.shape, lambda h, w, i, o: Filter[h, w, i, o].astype('float16'))
Output = te.compute(
Expand Down
Loading