From a2bf430a8ad598a3999c812fc73e59062f8abf4e Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 22 Sep 2021 19:11:46 -0500 Subject: [PATCH 01/37] [Hexagon] Pytestify Hexagon unit test (#8955) * [Hexagon] Pytestify Hexagon unit test * Fix formatting * Convert linker registration into pytest fixture * Use yield in pytest fixture * Restart CI --- python/tvm/testing/plugin.py | 3 ++ python/tvm/testing/utils.py | 20 +++++++++ .../unittest/test_target_codegen_hexagon.py | 42 +++++++------------ 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index 95875acbd82c..0413c44208b0 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -48,6 +48,7 @@ "metal": "mark a test as requiring metal", "llvm": "mark a test as requiring llvm", "ethosn": "mark a test as requiring ethosn", + "hexagon": "mark a test as requiring hexagon", } @@ -258,6 +259,8 @@ def _target_to_requirement(target): return utils.requires_opencl() if target.kind.name == "llvm": return utils.requires_llvm() + if target.kind.name == "hexagon": + return utils.requires_hexagon() return [] diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 62531ff7c194..39c759c7cd69 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -442,6 +442,7 @@ def _get_targets(target_str=None): "opencl -device=intel_graphics", "metal", "rocm", + "hexagon", ] @@ -818,6 +819,25 @@ def requires_ethosn(*args): return _compose(args, marks) +def requires_hexagon(*args): + """Mark a test as requiring Hexagon to run. + + Parameters + ---------- + f : function + Function to mark + """ + _requires_hexagon = [ + pytest.mark.hexagon, + pytest.mark.skipif(not device_enabled("hexagon"), reason="Hexagon support not enabled"), + *requires_llvm(), + pytest.mark.skipif( + tvm.target.codegen.llvm_version_major() < 7, reason="Hexagon requires LLVM 7 or later" + ), + ] + return _compose(args, _requires_hexagon) + + def requires_package(*packages): """Mark a test as requiring python packages to run. diff --git a/tests/python/unittest/test_target_codegen_hexagon.py b/tests/python/unittest/test_target_codegen_hexagon.py index c8b48993967b..79f2fb06a1ec 100644 --- a/tests/python/unittest/test_target_codegen_hexagon.py +++ b/tests/python/unittest/test_target_codegen_hexagon.py @@ -17,30 +17,27 @@ import numpy as np import os +import pytest import re +import sys import tvm import tvm.relay +import tvm.testing import tvm.contrib.hexagon as hexagon -def check_prereq_and_setup(): - if tvm.target.codegen.llvm_version_major() <= 7: - print("Skipping test: need LLVM 7 or later for codegen") - return False - if os.name != "posix": - print("Skipping test on non-POSIX platforms") - return False - if not tvm.runtime.enabled("hexagon"): - print("Hexagon runtime not enabled") - return False +@pytest.fixture(autouse=True) +def register_linker(): + original_linker = tvm.contrib.hexagon.hexagon_link() # Register a phony linker, so that we can test codegen without a Hexagon toolchain. hexagon.register_linker(lambda: "/bin/true") - return True + yield None + # Restore registration. 
+ hexagon.register_linker(original_linker) +@tvm.testing.requires_hexagon def test_basic(): - if not check_prereq_and_setup(): - return target = tvm.target.hexagon("v66", hvx=128) def check_add(offload): @@ -67,9 +64,8 @@ def check_add(offload): check_add(False) +@tvm.testing.requires_hexagon def test_llvm_target_features(): - if not check_prereq_and_setup(): - return target = tvm.target.hexagon("v66", hvx=128) # Define some trivial compute A = tvm.te.placeholder((128,), dtype="uint8", name="A") @@ -82,9 +78,8 @@ def test_llvm_target_features(): assert fs # Check that it's non-empty +@tvm.testing.requires_hexagon def test_alloc_vtcm(): - if not check_prereq_and_setup(): - return target = tvm.target.hexagon("v66") buf_len = 2048 @@ -109,9 +104,8 @@ def test_alloc_vtcm(): assert "HexagonBackendFreeVTCM" in calls +@tvm.testing.requires_hexagon def test_llvm_options(): - if not check_prereq_and_setup(): - return target = tvm.target.hexagon("v66", llvm_options="-hexagon-noopt") Zero = tvm.te.compute((10,), lambda _: tvm.tir.const(0, "int32")) s = tvm.te.create_schedule(Zero.op) @@ -121,10 +115,8 @@ def test_llvm_options(): assert re.search("-hexagon-noopt", str(target)) +@tvm.testing.requires_hexagon def test_linked_params_codegen(): - if not check_prereq_and_setup(): - return - # A simple model (a single conv2d) to trigger parameter separation: mod_lines = [ '#[version = "0.0.5"]', @@ -185,8 +177,4 @@ def test_linked_params_codegen(): if __name__ == "__main__": - test_basic() - test_llvm_target_features() - test_alloc_vtcm() - test_llvm_options() - test_linked_params_codegen() + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 7cdd4667aa8fd1384da632c6d44f8ba78ccd8847 Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Thu, 23 Sep 2021 00:25:43 -0700 Subject: [PATCH 02/37] [ONNX] Add Compress Support (#9067) * compress impl * unit tests * docstring Co-authored-by: Andrew Zhao Luo --- python/tvm/relay/frontend/onnx.py | 21 +++++++++++++++++++++ tests/python/frontend/onnx/test_forward.py | 4 ---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index bcaa60929e49..d6fdaf5c2a47 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1666,6 +1666,26 @@ def _impl_v12(cls, inputs, attr, params): return cls._impl_common(inputs[0], inputs[1], batch_dims) +class Compress(OnnxOpConverter): + """Operator converter for compress""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + input_tensor, condition_tensor = inputs + + axis = attr.get("axis", None) + + # Change one hot tensor to indices e.g. 
[0, 1, 1, 0, 1] -> [1, 2, 4] + condition_tensor = _op.reshape(_op.argwhere(condition_tensor), (-1,)) + + if axis is not None: + return _op.take(input_tensor, condition_tensor, axis=axis) + + # if axis is None, flatten input tensor before selection + input_tensor = _op.reshape(input_tensor, (-1,)) + return _op.take(input_tensor, condition_tensor, axis=0) + + class Scatter(OnnxOpConverter): """Operator converter for Scatter.""" @@ -4093,6 +4113,7 @@ def _get_convert_map(opset): "Gather": Gather.get_converter(opset), "GatherElements": GatherElements.get_converter(opset), "GatherND": GatherND.get_converter(opset), + "Compress": Compress.get_converter(opset), "Size": AttrCvt("ndarray_size", extras={"dtype": "int64"}), "Scatter": Scatter.get_converter(opset), "ScatterElements": Scatter.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 7a071678405b..e9e1c00c1a9f 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4871,10 +4871,6 @@ def verify_eyelike(indata): "test_cast_FLOAT_to_BFLOAT16", "test_cast_FLOAT_to_STRING", "test_cast_STRING_to_FLOAT", - "test_compress_0", - "test_compress_1", - "test_compress_default_axis", - "test_compress_negative_axis", "test_convtranspose_dilations", "test_convtranspose_output_shape", "test_cumsum_1d", From dfd5e4de5bd5496ae775b9c4a9986ef249086b88 Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Thu, 23 Sep 2021 00:26:38 -0700 Subject: [PATCH 03/37] [ONNX] enable more `*_expanded` tests (#9051) * enable more sce tests * more tests Co-authored-by: Andrew Zhao Luo --- python/tvm/relay/frontend/onnx.py | 21 +++++++++- tests/python/frontend/onnx/test_forward.py | 49 ---------------------- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index d6fdaf5c2a47..dfc0298979e6 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1800,6 +1800,11 @@ class Reduce(OnnxOpConverter): name = "" + @classmethod + def run_calculation(cls, inputs, axis, keepdims): + attr = {"axis": axis, "keepdims": keepdims} + return AttrCvt(cls.name)(inputs, attr) + @classmethod def _impl_v1(cls, inputs, attr, params): if "axes" in attr: @@ -1807,8 +1812,20 @@ def _impl_v1(cls, inputs, attr, params): else: axis_len = len(infer_shape(inputs[0])) axis = list(range(axis_len)) - attr = {"axis": axis, "keepdims": attr.get("keepdims", True)} - return AttrCvt(cls.name)(inputs, attr) + + return cls.run_calculation(inputs, axis, attr.get("keepdims", True)) + + @classmethod + def _impl_v12(cls, inputs, attr, params): + if len(inputs) == 2: + if isinstance(inputs[1], _expr.Constant): + # Get axis and unpack scalar + constant_axis = int(inputs[1].data.numpy()[0]) + return cls.run_calculation([inputs[0]], constant_axis, attr.get("keepdims", True)) + + raise ValueError("Dynamic Reduce is not supported yet!") + + return cls._impl_v1(inputs, attr, params) class ReduceMax(Reduce): diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index e9e1c00c1a9f..e049a195dc17 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4887,13 +4887,6 @@ def verify_eyelike(indata): "test_dropout_default_mask_ratio", "test_dropout_default_ratio", "test_if_seq", - "test_logsoftmax_axis_0_expanded", - "test_logsoftmax_axis_1_expanded", - "test_logsoftmax_axis_2_expanded", - 
"test_logsoftmax_default_axis_expanded", - "test_logsoftmax_example_1_expanded", - "test_logsoftmax_large_number_expanded", - "test_logsoftmax_negative_axis_expanded", "test_loop11", "test_loop13_seq", "test_matmulinteger", @@ -4949,52 +4942,10 @@ def verify_eyelike(indata): "test_round", "test_scan9_sum", "test_scan_sum", - # With reduce_sum supported fully, these expanded tests should pass - "test_sce_NCd1_mean_weight_negative_ii_expanded", - "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", - "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", - "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", - "test_sce_NCd1d2d3_sum_weight_high_ii_expanded", - "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", - "test_sce_NCd1d2d3d4d5_mean_weight_expanded", - "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", - "test_sce_NCd1d2d3d4d5_none_no_weight_expanded", - "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", - "test_sce_mean_3d_expanded", - "test_sce_mean_3d_log_prob_expanded", - "test_sce_mean_expanded", - "test_sce_mean_log_prob_expanded", - "test_sce_mean_no_weight_ii_3d_expanded", - "test_sce_mean_no_weight_ii_3d_log_prob_expanded", - "test_sce_mean_no_weight_ii_4d_expanded", - "test_sce_mean_no_weight_ii_4d_log_prob_expanded", - "test_sce_mean_no_weight_ii_expanded", - "test_sce_mean_no_weight_ii_log_prob_expanded", - "test_sce_mean_weight_expanded", - "test_sce_mean_weight_ii_3d_expanded", - "test_sce_mean_weight_ii_3d_log_prob_expanded", - "test_sce_mean_weight_ii_4d_expanded", - "test_sce_mean_weight_ii_4d_log_prob_expanded", - "test_sce_mean_weight_ii_expanded", - "test_sce_mean_weight_ii_log_prob_expanded", - "test_sce_mean_weight_log_prob_expanded", - "test_sce_none_expanded", - "test_sce_none_log_prob_expanded", - "test_sce_none_weights_expanded", - "test_sce_none_weights_log_prob_expanded", - "test_sce_sum_expanded", - "test_sce_sum_log_prob_expanded", "test_sequence_insert_at_back", "test_sequence_insert_at_front", "test_simple_rnn_defaults", "test_simple_rnn_with_initial_bias", - "test_softmax_axis_0_expanded", - "test_softmax_axis_1_expanded", - "test_softmax_axis_2_expanded", - "test_softmax_default_axis_expanded", - "test_softmax_example_expanded", - "test_softmax_large_number_expanded", - "test_softmax_negative_axis_expanded", "test_split_variable_parts_1d", "test_split_variable_parts_2d", "test_split_variable_parts_default_axis", From 98eab45d52911662ff760fb0bbdccc11f3a31b9f Mon Sep 17 00:00:00 2001 From: Jason <928090362@qq.com> Date: Thu, 23 Sep 2021 18:18:18 +0800 Subject: [PATCH 04/37] Update ubuntu_install_paddle.sh (#9082) --- docker/install/ubuntu_install_paddle.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_paddle.sh b/docker/install/ubuntu_install_paddle.sh index 267d59105c06..c7f9d43a3bd4 100644 --- a/docker/install/ubuntu_install_paddle.sh +++ b/docker/install/ubuntu_install_paddle.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip install paddlepaddle==2.1.2 +pip install paddlepaddle==2.1.3 From 1b595c0c1e190a3176b1b2df8ae4e276408b6993 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 24 Sep 2021 00:18:59 +0900 Subject: [PATCH 05/37] [CUDA] Swap block x and z dimension for conv2d NHWC schedule (#9087) --- python/tvm/topi/cuda/conv2d_nhwc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/cuda/conv2d_nhwc.py b/python/tvm/topi/cuda/conv2d_nhwc.py index e4361e30b5c3..f8115830ce50 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc.py 
+++ b/python/tvm/topi/cuda/conv2d_nhwc.py @@ -86,14 +86,14 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv): # Schedule for output ni, hi, wi, fi = s[output].op.axis - bz = s[output].fuse(hi, wi) + bx = s[output].fuse(hi, wi) tx, fi = s[output].split(fi, factor=tile_c) txz, tx = s[output].split(tx, factor=num_thread_c) - bx, txz = s[output].split(txz, factor=vthread_c) + bz, txz = s[output].split(txz, factor=vthread_c) ty, ni = s[output].split(ni, factor=tile_n) tyz, ty = s[output].split(ty, factor=num_thread_n) by, tyz = s[output].split(tyz, factor=vthread_n) - s[output].reorder(bz, by, bx, tyz, txz, ty, tx, ni, fi) + s[output].reorder(bx, by, bz, tyz, txz, ty, tx, ni, fi) s[output].bind(bz, block_z) s[output].bind(by, block_y) s[output].bind(bx, block_x) From 73c28455d30df8167f3a596326fddeb294e7d11f Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 23 Sep 2021 08:54:38 -0700 Subject: [PATCH 06/37] [microTVM][Zephyr] Add 'config_main_stack_size' option to API server (#9026) --- .../zephyr/template_project/boards.json | 56 ++++++++ .../template_project/microtvm_api_server.py | 77 +++-------- python/tvm/micro/project.py | 18 ++- python/tvm/micro/project_api/client.py | 1 - tests/micro/zephyr/conftest.py | 36 +----- tests/micro/zephyr/test_utils.py | 62 +++++++++ tests/micro/zephyr/test_zephyr.py | 122 ++++++------------ tests/micro/zephyr/test_zephyr_aot.py | 25 +--- 8 files changed, 201 insertions(+), 196 deletions(-) create mode 100644 apps/microtvm/zephyr/template_project/boards.json create mode 100644 tests/micro/zephyr/test_utils.py diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json new file mode 100644 index 000000000000..bdfa51109ff7 --- /dev/null +++ b/apps/microtvm/zephyr/template_project/boards.json @@ -0,0 +1,56 @@ +{ + "mps2_an521": { + "board": "mps2_an521", + "model": "mps2_an521", + "is_qemu": true, + "fpu": false + }, + "nrf5340dk_nrf5340_cpuapp": { + "board": "nrf5340dk_nrf5340_cpuapp", + "model": "nrf5340dk", + "is_qemu": false, + "fpu": true + }, + "nucleo_f746zg": { + "board": "nucleo_f746zg", + "model": "stm32f746xx", + "is_qemu": false, + "fpu": true + }, + "nucleo_l4r5zi": { + "board": "nucleo_l4r5zi", + "model": "stm32l4r5zi", + "is_qemu": false, + "fpu": true + }, + "qemu_cortex_r5": { + "board": "qemu_cortex_r5", + "model": "zynq_mp_r5", + "is_qemu": true, + "fpu": true + }, + "qemu_riscv32": { + "board": "qemu_riscv32", + "model": "host", + "is_qemu": true, + "fpu": true + }, + "qemu_riscv64": { + "board": "qemu_riscv64", + "model": "host", + "is_qemu": true, + "fpu": true + }, + "qemu_x86": { + "board": "qemu_x86", + "model": "host", + "is_qemu": true, + "fpu": true + }, + "stm32f746g_disco": { + "board": "stm32f746g_disco", + "model": "stm32f746xx", + "is_qemu": false, + "fpu": true + } +} diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index f2e091b2f5b5..ed275e610912 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -35,6 +35,7 @@ import tempfile import threading import time +import json import serial import serial.tools.list_ports @@ -57,46 +58,16 @@ IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists() + +BOARDS = API_SERVER_DIR / "boards.json" + # Data structure to hold the information microtvm_api_server.py needs # to communicate with each of these boards. 
-BOARD_PROPERTIES = { - "qemu_x86": { - "board": "qemu_x86", - "model": "host", - }, - "qemu_riscv32": { - "board": "qemu_riscv32", - "model": "host", - }, - "qemu_riscv64": { - "board": "qemu_riscv64", - "model": "host", - }, - "mps2_an521": { - "board": "mps2_an521", - "model": "mps2_an521", - }, - "nrf5340dk_nrf5340_cpuapp": { - "board": "nrf5340dk_nrf5340_cpuapp", - "model": "nrf5340dk", - }, - "stm32f746g_disco": { - "board": "stm32f746g_disco", - "model": "stm32f746xx", - }, - "nucleo_f746zg": { - "board": "nucleo_f746zg", - "model": "stm32f746xx", - }, - "nucleo_l4r5zi": { - "board": "nucleo_l4r5zi", - "model": "stm32l4r5zi", - }, - "qemu_cortex_r5": { - "board": "qemu_cortex_r5", - "model": "zynq_mp_r5", - }, -} +try: + with open(BOARDS) as boards: + BOARD_PROPERTIES = json.load(boards) +except FileNotFoundError: + raise FileNotFoundError(f"Board file {{{BOARDS}}} does not exist.") def check_call(cmd_args, *args, **kwargs): @@ -290,9 +261,8 @@ def _get_nrf_device_args(options): help="Name of the Zephyr board to build for.", ), server.ProjectOption( - "zephyr_model", - choices=[board["model"] for _, board in BOARD_PROPERTIES.items()], - help="Name of the model for each Zephyr board.", + "config_main_stack_size", + help="Sets CONFIG_MAIN_STACK_SIZE for Zephyr board.", ), ] @@ -351,13 +321,9 @@ def _create_prj_conf(self, project_dir, options): if self._has_fpu(options["zephyr_board"]): f.write("# For models with floating point.\n" "CONFIG_FPU=y\n" "\n") - main_stack_size = None - if self._is_qemu(options) and options["project_type"] == "host_driven": - main_stack_size = 1536 - # Set main stack size, if needed. - if main_stack_size is not None: - f.write(f"CONFIG_MAIN_STACK_SIZE={main_stack_size}\n") + if options.get("config_main_stack_size") is not None: + f.write(f"CONFIG_MAIN_STACK_SIZE={options['config_main_stack_size']}\n") f.write("# For random number generation.\n" "CONFIG_TEST_RANDOM_GENERATOR=y\n") @@ -384,6 +350,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # by launching the copy. shutil.copy2(__file__, project_dir / os.path.basename(__file__)) + # Copy boards.json file to generated project. + shutil.copy2(BOARDS, project_dir / BOARDS.name) + # Place Model Library Format tarball in the special location, which this script uses to decide # whether it's being invoked in a template or generated project. project_model_library_format_tar_path = project_dir / MODEL_LIBRARY_FORMAT_RELPATH @@ -471,20 +440,10 @@ def _is_qemu(cls, options): or options["zephyr_board"] in cls._KNOWN_QEMU_ZEPHYR_BOARDS ) - _KNOWN_FPU_ZEPHYR_BOARDS = ( - "nucleo_f746zg", - "nucleo_l4r5zi", - "nrf5340dk_nrf5340_cpuapp", - "qemu_cortex_r5", - "qemu_riscv32", - "qemu_riscv64", - "qemu_x86", - "stm32f746g_disco", - ) - @classmethod def _has_fpu(cls, zephyr_board): - return zephyr_board in cls._KNOWN_FPU_ZEPHYR_BOARDS + fpu_boards = [name for name, board in BOARD_PROPERTIES.items() if board["fpu"]] + return zephyr_board in fpu_boards def flash(self, options): if self._is_qemu(options): diff --git a/python/tvm/micro/project.py b/python/tvm/micro/project.py index 8a62c9b5f9ba..5393096b5df3 100644 --- a/python/tvm/micro/project.py +++ b/python/tvm/micro/project.py @@ -18,7 +18,7 @@ """Defines glue wrappers around the Project API which mate to TVM interfaces.""" import pathlib -import typing +from typing import Union from .. 
import __version__ from ..contrib import utils @@ -64,7 +64,7 @@ class GeneratedProject: """Defines a glue interface to interact with a generated project through the API server.""" @classmethod - def from_directory(cls, project_dir: typing.Union[pathlib.Path, str], options: dict): + def from_directory(cls, project_dir: Union[pathlib.Path, str], options: dict): return cls(client.instantiate_from_dir(project_dir), options) def __init__(self, api_client, options): @@ -101,7 +101,17 @@ def __init__(self, api_client): if not self._info["is_template"]: raise NotATemplateProjectError() + def _check_project_options(self, options: dict): + """Check if options are valid ProjectOptions""" + available_options = [option["name"] for option in self.info()["project_options"]] + if options and not set(options.keys()).issubset(available_options): + raise ValueError( + f"""options:{list(options)} include non valid ProjectOptions. + Here is a list of available options:{list(available_options)}.""" + ) + def generate_project_from_mlf(self, model_library_format_path, project_dir, options): + self._check_project_options(options) self._api_client.generate_project( model_library_format_path=str(model_library_format_path), standalone_crt_dir=get_standalone_crt_dir(), @@ -124,9 +134,9 @@ def generate_project(self, graph_executor_factory, project_dir, options): def generate_project( - template_project_dir: typing.Union[pathlib.Path, str], + template_project_dir: Union[pathlib.Path, str], module: ExportableModule, - generated_project_dir: typing.Union[pathlib.Path, str], + generated_project_dir: Union[pathlib.Path, str], options: dict = None, ): """Generate a project for an embedded platform that contains the given model. diff --git a/python/tvm/micro/project_api/client.py b/python/tvm/micro/project_api/client.py index ac8ff629a718..f1eb115cfbbe 100644 --- a/python/tvm/micro/project_api/client.py +++ b/python/tvm/micro/project_api/client.py @@ -205,7 +205,6 @@ def instantiate_from_dir(project_dir: typing.Union[pathlib.Path, str], debug: bo """Launch server located in project_dir, and instantiate a Project API Client connected to it.""" args = None - project_dir = pathlib.Path(project_dir) python_script = project_dir / SERVER_PYTHON_FILENAME diff --git a/tests/micro/zephyr/conftest.py b/tests/micro/zephyr/conftest.py index 7c19b62ac63d..177ca8aa269e 100644 --- a/tests/micro/zephyr/conftest.py +++ b/tests/micro/zephyr/conftest.py @@ -20,44 +20,16 @@ import pytest -from tvm.micro import project -import tvm.contrib.utils -import tvm.target.target +import test_utils -TEMPLATE_PROJECT_DIR = ( - pathlib.Path(__file__).parent - / ".." - / ".." - / ".." 
- / "apps" - / "microtvm" - / "zephyr" - / "template_project" -).resolve() - - -def zephyr_boards() -> dict: - """Returns a dict mapping board to target model""" - template = project.TemplateProject.from_directory(TEMPLATE_PROJECT_DIR) - project_options = template.info()["project_options"] - for option in project_options: - if option["name"] == "zephyr_board": - boards = option["choices"] - if option["name"] == "zephyr_model": - models = option["choices"] - - arduino_boards = {boards[i]: models[i] for i in range(len(boards))} - return arduino_boards - - -ZEPHYR_BOARDS = zephyr_boards() +from tvm.contrib.utils import tempdir def pytest_addoption(parser): parser.addoption( "--zephyr-board", required=True, - choices=ZEPHYR_BOARDS.keys(), + choices=test_utils.ZEPHYR_BOARDS.keys(), help=("Zephyr board for test."), ) parser.addoption( @@ -104,4 +76,4 @@ def temp_dir(board): if not os.path.exists(board_workspace.parent): os.makedirs(board_workspace.parent) - return tvm.contrib.utils.tempdir(board_workspace) + return tempdir(board_workspace) diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py new file mode 100644 index 000000000000..54c3de252f8a --- /dev/null +++ b/tests/micro/zephyr/test_utils.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import json +import pathlib + + +TEMPLATE_PROJECT_DIR = ( + pathlib.Path(__file__).parent + / ".." + / ".." + / ".." + / "apps" + / "microtvm" + / "zephyr" + / "template_project" +).resolve() + +BOARDS = TEMPLATE_PROJECT_DIR / "boards.json" + + +def zephyr_boards() -> dict: + """Returns a dict mapping board to target model""" + with open(BOARDS) as f: + board_properties = json.load(f) + + boards_model = {board: info["model"] for board, info in board_properties.items()} + return boards_model + + +ZEPHYR_BOARDS = zephyr_boards() + + +def qemu_boards(board: str): + """Returns True if board is QEMU.""" + with open(BOARDS) as f: + board_properties = json.load(f) + + qemu_boards = [name for name, board in board_properties.items() if board["is_qemu"]] + return board in qemu_boards + + +def has_fpu(board: str): + """Returns True if board has FPU.""" + with open(BOARDS) as f: + board_properties = json.load(f) + + fpu_boards = [name for name, board in board_properties.items() if board["fpu"]] + return board in fpu_boards diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index b6396ce53315..be1f231156ad 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -14,14 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- import logging import os import pathlib import subprocess import sys import logging -import json import pytest import numpy as np @@ -29,19 +27,12 @@ from PIL import Image import tvm -import tvm.rpc -import tvm.micro -import tvm.testing import tvm.relay as relay from tvm.relay.testing import byoc - from tvm.contrib import utils -from tvm.relay.expr_functor import ExprMutator -from tvm.relay.op.annotation import compiler_begin, compiler_end - from tvm.micro.testing import check_tune_log -import conftest +import test_utils _LOG = logging.getLogger(__name__) @@ -58,16 +49,24 @@ def _make_sess_from_op( def _make_session(temp_dir, zephyr_board, west_cmd, mod, build_config): + config_main_stack_size = None + if test_utils.qemu_boards(zephyr_board): + config_main_stack_size = 1536 + + project_options = { + "project_type": "host_driven", + "west_cmd": west_cmd, + "verbose": bool(build_config.get("debug")), + "zephyr_board": zephyr_board, + } + if config_main_stack_size is not None: + project_options["config_main_stack_size"] = config_main_stack_size + project = tvm.micro.generate_project( - str(conftest.TEMPLATE_PROJECT_DIR), + str(test_utils.TEMPLATE_PROJECT_DIR), mod, temp_dir / "project", - { - "project_type": "host_driven", - "west_cmd": west_cmd, - "verbose": bool(build_config.get("debug")), - "zephyr_board": zephyr_board, - }, + project_options, ) project.build() project.flash() @@ -89,7 +88,7 @@ def _make_add_sess(temp_dir, model, zephyr_board, west_cmd, build_config, dtype= def test_add_uint(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -109,22 +108,12 @@ def test_basic_add(sess): test_basic_add(sess) -def has_fpu(zephyr_board): - sys.path.insert(0, str(conftest.TEMPLATE_PROJECT_DIR)) - try: - import microtvm_api_server - finally: - sys.path.pop(0) - - return microtvm_api_server.Handler._has_fpu(zephyr_board) - - # The same test code can be executed on both the QEMU simulation and on real hardware. @tvm.testing.requires_micro def test_add_float(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model = conftest.ZEPHYR_BOARDS[board] - if not has_fpu(board): + model = test_utils.ZEPHYR_BOARDS[board] + if not test_utils.has_fpu(board): pytest.skip(f"FPU not enabled for {board}") build_config = {"debug": tvm_debug} @@ -150,7 +139,7 @@ def test_basic_add(sess): def test_platform_timer(temp_dir, board, west_cmd, tvm_debug): """Test compiling the on-device runtime.""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
@@ -178,7 +167,7 @@ def test_basic_add(sess): @tvm.testing.requires_micro def test_relay(temp_dir, board, west_cmd, tvm_debug): """Testing a simple relay graph""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} shape = (10,) dtype = "int8" @@ -209,7 +198,7 @@ def test_relay(temp_dir, board, west_cmd, tvm_debug): @tvm.testing.requires_micro def test_onnx(temp_dir, board, west_cmd, tvm_debug): """Testing a simple ONNX model.""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} this_dir = pathlib.Path(os.path.dirname(__file__)) @@ -286,7 +275,7 @@ def check_result( @tvm.testing.requires_micro def test_byoc_microtvm(temp_dir, board, west_cmd, tvm_debug): """This is a simple test case to check BYOC capabilities of microTVM""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -366,7 +355,7 @@ def _make_add_sess_with_shape(temp_dir, model, zephyr_board, west_cmd, shape, bu @tvm.testing.requires_micro def test_rpc_large_array(temp_dir, board, west_cmd, tvm_debug, shape): """Test large RPC array transfer.""" - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -385,9 +374,8 @@ def test_tensors(sess): @tvm.testing.requires_micro def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): """Test AutoTune for microTVM Zephyr""" - import tvm.relay as relay - - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] + build_config = {"debug": tvm_debug} # Create a Relay model data_shape = (1, 3, 16, 16) @@ -420,18 +408,22 @@ def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target) assert len(tasks) > 0 - repo_root = pathlib.Path( - subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding="utf-8").strip() - ) - template_project_dir = repo_root / "apps" / "microtvm" / "zephyr" / "template_project" + config_main_stack_size = None + if test_utils.qemu_boards(board): + config_main_stack_size = 1536 + + project_options = { + "zephyr_board": board, + "west_cmd": west_cmd, + "verbose": 1, + "project_type": "host_driven", + } + if config_main_stack_size is not None: + project_options["config_main_stack_size"] = config_main_stack_size + module_loader = tvm.micro.AutoTvmModuleLoader( - template_project_dir=template_project_dir, - project_options={ - "zephyr_board": board, - "west_cmd": west_cmd, - "verbose": 1, - "project_type": "host_driven", - }, + template_project_dir=test_utils.TEMPLATE_PROJECT_DIR, + project_options=project_options, ) timeout = 200 @@ -473,21 +465,7 @@ def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): lowered = tvm.relay.build(mod, target=target, params=params) temp_dir = utils.tempdir() - project = tvm.micro.generate_project( - str(template_project_dir), - lowered, - temp_dir / "project", - { - "zephyr_board": board, - "west_cmd": west_cmd, - "verbose": 1, - "project_type": "host_driven", - }, - ) - project.build() - project.flash() - - with tvm.micro.Session(project.transport()) as session: + with _make_session(temp_dir, board, west_cmd, lowered, build_config) as session: graph_mod = 
tvm.micro.create_local_graph_executor( lowered.get_graph_json(), session.get_system_lib(), session.device ) @@ -502,21 +480,7 @@ def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): lowered_tuned = tvm.relay.build(mod, target=target, params=params) temp_dir = utils.tempdir() - project = tvm.micro.generate_project( - str(template_project_dir), - lowered_tuned, - temp_dir / "project", - { - "zephyr_board": board, - "west_cmd": west_cmd, - "verbose": 1, - "project_type": "host_driven", - }, - ) - project.build() - project.flash() - - with tvm.micro.Session(project.transport()) as session: + with _make_session(temp_dir, board, west_cmd, lowered_tuned, build_config) as session: graph_mod = tvm.micro.create_local_graph_executor( lowered_tuned.get_graph_json(), session.get_system_lib(), session.device ) diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index 6c72d3d7becf..f03b8ecce6d0 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - import io import logging import os @@ -28,35 +27,19 @@ import numpy as np import tvm -import tvm.rpc -import tvm.micro from tvm.micro.project_api import server -import tvm.testing import tvm.relay as relay -from tvm.contrib import utils from tvm.contrib.download import download_testdata from tvm.micro.interface_api import generate_c_interface_header -import conftest - -_LOG = logging.getLogger(__name__) +import test_utils def _build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): - template_project_dir = ( - pathlib.Path(__file__).parent - / ".." - / ".." - / ".." - / "apps" - / "microtvm" - / "zephyr" - / "template_project" - ).resolve() project_dir = temp_dir / "project" project = tvm.micro.generate_project( - str(template_project_dir), + str(test_utils.TEMPLATE_PROJECT_DIR), mod, project_dir, { @@ -145,7 +128,7 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): ]: pytest.skip(msg="Model does not fit.") - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] input_shape = (1, 32, 32, 3) output_shape = (1, 10) build_config = {"debug": tvm_debug} @@ -227,7 +210,7 @@ def test_qemu_make_fail(temp_dir, board, west_cmd, tvm_debug): if board not in ["qemu_x86", "mps2_an521"]: pytest.skip(msg="Only for QEMU targets.") - model = conftest.ZEPHYR_BOARDS[board] + model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} shape = (10,) dtype = "float32" From e8872866acd82e24fb20cd86bcc6fbdb409eec47 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Thu, 23 Sep 2021 09:41:13 -0700 Subject: [PATCH 07/37] BUG #8013: Remove register_alter_op_layout example from dev/use_pass_infra.py (#9076) * BUG #8013: Remove register_alter_op_layout example from dev/use_pass_infra.py This tutorial registers a global layout transformation for conv2d for all targets which is not well-formed. Later uses of conv2d in the tutorials pick that layout up then assert fail in the conv2d type-relation. Better would be to register a transform for an entirely fake target, but that is beyond my current level of expertise. 
In general our use of sphinx/sphinx_gallery for running and rendering the tutorials is highly suspect since there is no inter-example isolation: - Examples using tensorflow will gobble up GPU memory and not give it back. - Any examples which use any of the (many!) global registration mechanisms need to ensure the registrant is safe across all tutorials. I recall seeing a thread with the sphinx_gallery where they said they'd prefer not to work on process-level isolation, but it's probably worth pinging again. While digging into this I noticed we had a slicing cast in AlterOpLayout due to a derived class of ObjectRef introducing virtuals. I moved the virtuals to the corresponding Node classes. In this case we got away with it since the ObjectRef happened to not get copied but we were on very thin ice. * [checkpoint] Woops, forgot there was an extra AlterOpLayout I should have run locally, there goes 6hrs of CI. --- src/relay/transforms/alter_op_layout.cc | 41 ++++++++++++++----------- src/relay/transforms/convert_layout.cc | 39 ++++++++++++----------- src/relay/transforms/transform_layout.h | 36 +++++++++++----------- tutorials/dev/use_pass_infra.py | 30 ------------------ 4 files changed, 62 insertions(+), 84 deletions(-) diff --git a/src/relay/transforms/alter_op_layout.cc b/src/relay/transforms/alter_op_layout.cc index 9afdb7210cba..f347eddae760 100644 --- a/src/relay/transforms/alter_op_layout.cc +++ b/src/relay/transforms/alter_op_layout.cc @@ -50,19 +50,6 @@ namespace alter_op_layout { class AlterTransformMemorizerNode : public TransformMemorizerNode { public: static constexpr const char* _type_key = "relay.alter_op_layout.AlterTransformMemorizerNode"; -}; - -/*! - * \brief Container that provides the transformation function for alter layout.. - */ -class AlterTransformMemorizer : public TransformMemorizer { - public: - AlterTransformMemorizer() {} - explicit AlterTransformMemorizer(ObjectPtr n) : TransformMemorizer(n) {} - - AlterTransformMemorizerNode* operator->() { - return static_cast(get_mutable()); - } /*! * \brief Defines the call transformation for AlterOpLayout pass. The new layouts are defined by @@ -102,7 +89,23 @@ class AlterTransformMemorizer : public TransformMemorizer { return GetRef(new_call); } - using TransformMemorizer::CallWithNewLayouts; + Call CallWithNewLayouts(const Call& ref_call, const std::vector& new_args) override { + return CallWithNewLayouts(ref_call, ref_call->attrs, new_args); + } +}; + +/*! + * \brief Container that provides the transformation function for alter layout.. + */ +class AlterTransformMemorizer : public TransformMemorizer { + public: + AlterTransformMemorizer() = default; + explicit AlterTransformMemorizer(ObjectPtr n) : TransformMemorizer(n) {} + + AlterTransformMemorizerNode* operator->() { + return static_cast(get_mutable()); + } + using ContainerType = AlterTransformMemorizerNode; }; @@ -113,10 +116,12 @@ class AlterTransformMemorizer : public TransformMemorizer { */ Expr AlterOpLayout(const Expr& expr) { // TODO(@icemelon9): need to rerun type inference after applying an alter op. 
- AlterTransformMemorizer alterMemorizer(make_object()); - auto fcontext = [&](const Call& call) -> ObjectRef { return alterMemorizer; }; - - return ForwardRewrite(expr, LayoutRewriter, fcontext); + AlterTransformMemorizer alter_memorizer(make_object()); + std::function fcontext = [=](const Call& call) -> ObjectRef { + return alter_memorizer; + }; + FForwardRewrite rewrite_func = LayoutRewriter; + return ForwardRewrite(expr, rewrite_func, fcontext); } } // namespace alter_op_layout diff --git a/src/relay/transforms/convert_layout.cc b/src/relay/transforms/convert_layout.cc index e74ea0115857..e10be508529e 100644 --- a/src/relay/transforms/convert_layout.cc +++ b/src/relay/transforms/convert_layout.cc @@ -58,22 +58,6 @@ class ConvertTransformMemorizerNode : public TransformMemorizerNode { explicit ConvertTransformMemorizerNode(Map> desired_layouts) : desired_layouts_(std::move(desired_layouts)) {} - /*! \brief A mapping of op_name to array of desired layouts for each input. */ - Map> desired_layouts_; -}; - -/*! - * \brief Container that provides the transformation function for convert layout. - */ -class ConvertTransformMemorizer : public TransformMemorizer { - public: - ConvertTransformMemorizer() {} - explicit ConvertTransformMemorizer(ObjectPtr n) : TransformMemorizer(n) {} - - ConvertTransformMemorizerNode* operator->() { - return static_cast(get_mutable()); - } - /*! * \brief Defines the call transformation for ConvertLayout pass. The new layouts should be the * desired layout as specified by the user. @@ -89,7 +73,7 @@ class ConvertTransformMemorizer : public TransformMemorizer { Expr new_e; bool modified = false; if (fconvert_layout.count(op)) { - auto desired_layouts = operator->()->desired_layouts_; + auto desired_layouts = desired_layouts_; if (desired_layouts.find(op->name) != desired_layouts.end()) { tvm::Array tinfos; for (auto& expr : ref_call->args) { @@ -124,7 +108,26 @@ class ConvertTransformMemorizer : public TransformMemorizer { return Call(new_call->op, new_call->args, new_call->attrs, new_call->type_args, ref_call->span); } - using TransformMemorizer::CallWithNewLayouts; + Call CallWithNewLayouts(const Call& ref_call, const std::vector& new_args) override { + return CallWithNewLayouts(ref_call, ref_call->attrs, new_args); + } + + /*! \brief A mapping of op_name to array of desired layouts for each input. */ + Map> desired_layouts_; +}; + +/*! + * \brief Container that provides the transformation function for convert layout. + */ +class ConvertTransformMemorizer : public TransformMemorizer { + public: + ConvertTransformMemorizer() = default; + explicit ConvertTransformMemorizer(ObjectPtr n) : TransformMemorizer(n) {} + + ConvertTransformMemorizerNode* operator->() { + return static_cast(get_mutable()); + } + using ContainerType = ConvertTransformMemorizerNode; }; diff --git a/src/relay/transforms/transform_layout.h b/src/relay/transforms/transform_layout.h index fbb7bc9cd985..7bfb31a299ad 100644 --- a/src/relay/transforms/transform_layout.h +++ b/src/relay/transforms/transform_layout.h @@ -57,6 +57,21 @@ class TransformMemorizerNode : public Object { } }; + /*! + * \brief Defines the call transformation for derived passes. The new layouts are defined by + * used for different targets using a packed func. + * \param ref_call The original call. + * \param new_attrs Updated attributes consistent with new layouts. + * \param new_args The traversed/recursed args to the call. + * \return The new Call after calling the packed func. 
+ */ + virtual Call CallWithNewLayouts(const Call& ref_call, Attrs new_attrs, + const std::vector& new_args) = 0; + + virtual Call CallWithNewLayouts(const Call& ref_call, const std::vector& new_args) { + return CallWithNewLayouts(ref_call, ref_call->attrs, new_args); + } + /*! \brief The memorizer map. */ std::unordered_map memo; @@ -69,11 +84,9 @@ class TransformMemorizerNode : public Object { */ class TransformMemorizer : public ObjectRef { public: - TransformMemorizer() {} + TransformMemorizer() = default; explicit TransformMemorizer(ObjectPtr n) : ObjectRef(n) {} - virtual ~TransformMemorizer() {} - TransformMemorizerNode* operator->() { return static_cast(get_mutable()); } @@ -146,19 +159,6 @@ class TransformMemorizer : public ObjectRef { return MakeLayoutTransform(input_expr, new_src_layout.name(), dst_layout.name()); } - /*! - * \brief Defines the call transformation for derived passes. The new layouts are defined by - * used for different targets using a packed func. - * \param ref_call The original call. - * \param new_attrs Updated attributes consistent with new layouts. - * \param new_args The traversed/recursed args to the call. - * \return The new Call after calling the packed func. - */ - virtual Call CallWithNewLayouts(const Call& ref_call, Attrs new_attrs, - const std::vector& new_args) = 0; - virtual Call CallWithNewLayouts(const Call& ref_call, const std::vector& new_args) { - return CallWithNewLayouts(ref_call, ref_call->attrs, new_args); - } using ContainerType = TransformMemorizerNode; }; @@ -312,7 +312,7 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj if (ref_call->op.as()) { Op op = Downcast(ref_call->op); if (falter_layout.count(op) && !finfer_layout.count(op)) { - return memorizer.CallWithNewLayouts(ref_call, normal_new_args); + return memorizer->CallWithNewLayouts(ref_call, normal_new_args); } } } @@ -349,7 +349,7 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj } // new_op = alter(op) - Call new_call = memorizer.CallWithNewLayouts(ref_call, infer_out->new_attrs, normal_new_args); + Call new_call = memorizer->CallWithNewLayouts(ref_call, infer_out->new_attrs, normal_new_args); // new_in2, new_out = op.infer(new_in) if (new_call->op->IsInstance()) { diff --git a/tutorials/dev/use_pass_infra.py b/tutorials/dev/use_pass_infra.py index 468c4d40b942..67cdfdedce0e 100644 --- a/tutorials/dev/use_pass_infra.py +++ b/tutorials/dev/use_pass_infra.py @@ -69,20 +69,6 @@ def example(): return relay.Function([x, weight], z2) -############################################################################### -# Let us register layout alteration for a conv2d op so that we can apply the -# layout alteration pass on the example. How alter layout pass works is out -# the scope of this tutorial. - - -@relay.op.register_alter_op_layout("nn.conv2d", level=101) -def alter_conv2d(attrs, inputs, tinfos, out_type): - data, weight = inputs - new_attrs = dict(attrs) - new_attrs["data_layout"] = "NCHW16c" - return relay.nn.conv2d(data, weight, **new_attrs) - - ############################################################################### # Optimize the Program # -------------------- @@ -188,21 +174,6 @@ def alter_conv2d(attrs, inputs, tinfos, out_type): mod3 = seq(mod) print(mod3) -############################################################################### -# The passes applied so far are target independent. The pass infra also -# provides a means to make pass target-aware. 
For example, the layout -# alteration pass falls in such category. - -with tvm.transform.PassContext(opt_level=3): - mod4 = seq(mod) -print(mod4) - -seq1 = tvm.transform.Sequential([relay.transform.AlterOpLayout()]) -with tvm.transform.PassContext(opt_level=3): - with tvm.target.Target("llvm"): - mod5 = seq1(mod) -print(mod5) - ############################################################################## # Implement a Pass Using Python Decorator # ------------------------------------------ @@ -257,7 +228,6 @@ def visit_constant(self, c): tvm.transform.PrintIR(), relay.transform.EliminateCommonSubexpr(), relay.transform.FuseOps(), - relay.transform.AlterOpLayout(), ] ) From ee6fe122c6a51fc6ae7a82cc535c474d060b3189 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 23 Sep 2021 20:49:44 +0300 Subject: [PATCH 08/37] [Auto-Schedule][Fix] Fix hang while tune model through rpc (#9032) * [Auto-Schedule][Fix] Fix hang while tune model through rpc * Fix problem with hang by using deep copy * Fix with local args * Update python/tvm/auto_scheduler/measure.py Co-authored-by: Wuwei Lin --- python/tvm/auto_scheduler/measure.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index c58aeea57d14..8c6fd5f1a949 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -909,6 +909,7 @@ def _timed_eval_func( random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True) assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake" assert len(args) == len(build_res.args) + loc_args = [] # pylint: disable=consider-using-enumerate for idx in range(len(args)): if args[idx] is None: @@ -917,11 +918,11 @@ def _timed_eval_func( get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev ) random_fill(empty_array) - args[idx] = empty_array + loc_args.append(empty_array) else: - args[idx] = ndarray.array(args[idx], dev) + loc_args.append(ndarray.array(args[idx], dev)) dev.sync() - costs = time_f(*args).results + costs = time_f(*loc_args).results # pylint: disable=broad-except except Exception: costs = (MAX_FLOAT,) @@ -1112,6 +1113,7 @@ def _rpc_run( ), "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices" assert len(args) == len(build_res.args) + loc_args = [] # pylint: disable=consider-using-enumerate for idx in range(len(args)): if args[idx] is None: @@ -1120,16 +1122,16 @@ def _rpc_run( get_const_tuple(build_res_arg.shape), build_res_arg.dtype, dev ) random_fill(empty_array) - args[idx] = empty_array + loc_args.append(empty_array) else: - args[idx] = ndarray.array(args[idx], dev) + loc_args.append(ndarray.array(args[idx], dev)) dev.sync() # First run for check that the kernel is correct - func.entry_func(*args) + func.entry_func(*loc_args) dev.sync() - costs = time_f(*args).results + costs = time_f(*loc_args).results # clean up remote files remote.remove(build_res.filename) From 0d866218e7d1a7af354212839703be9df1fcf29f Mon Sep 17 00:00:00 2001 From: sunway Date: Fri, 24 Sep 2021 04:46:34 +0800 Subject: [PATCH 09/37] [BYOC] Fix DNNL Conv2D in JSON runtime (#9043) * dnnl memory dim is wrong for conv2d * change test case for dnnl conv2d * trigger CI Co-authored-by: sunway --- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 12 ++++++------ tests/python/relay/test_json_runtime.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc 
b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index eef67a702d9c..e52009d7add7 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -163,16 +163,16 @@ class DNNLJSONRuntime : public JSONRuntimeBase { dnnl::memory::dim N = input_shape[0], // batch size IC = input_shape[1], // input channels IH = input_shape[2], // input height - IW = input_shape[2], // input width + IW = input_shape[3], // input width OC = weight_shape[0], // output channels KH = weight_shape[2], // weight height KW = weight_shape[3], // weight width - PH_L = std::stoi(str_padding[1]), // height padding: left - PH_R = std::stoi(str_padding[3]), // height padding: right - PW_L = std::stoi(str_padding[0]), // width padding: left - PW_R = std::stoi(str_padding[2]), // width padding: right + PW_L = std::stoi(str_padding[1]), // width padding: left + PW_R = std::stoi(str_padding[3]), // width padding: right + PH_L = std::stoi(str_padding[0]), // height padding: top + PH_R = std::stoi(str_padding[2]), // height padding: bottom SH = std::stoi(str_strides[0]), // height-wise stride - SW = std::stoi(str_strides[0]), // weight-wise stride + SW = std::stoi(str_strides[1]), // weight-wise stride OH = (IH - KH + PH_L + PH_R) / SH + 1, // output height OW = (IW - KW + PW_L + PW_R) / SW + 1; // output width diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index f5674dbf5fb3..8107dc231adb 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -96,12 +96,14 @@ def test_conv2d(): def conv2d_direct(): dtype = "float32" - ishape = (1, 32, 14, 14) - w1shape = (32, 32, 3, 3) + ishape = (1, 1, 99, 12) + w1shape = (54, 1, 3, 3) data0 = relay.var("data", shape=ishape, dtype=dtype) weight0 = relay.var("weight", shape=w1shape, dtype=dtype) - out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1)) + out = relay.nn.conv2d( + data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1) + ) func = relay.Function([data0, weight0], out) func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0") @@ -118,7 +120,9 @@ def conv2d_direct(): data0 = relay.var("data", shape=ishape, dtype=dtype) weight0 = relay.var("weight", shape=w1shape, dtype=dtype) - out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1)) + out = relay.nn.conv2d( + data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1) + ) main_f = relay.Function([data0, weight0], out) ref_mod = tvm.IRModule() ref_mod["main"] = main_f @@ -127,7 +131,7 @@ def conv2d_direct(): i_data = np.random.uniform(0, 1, ishape).astype(dtype) w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) - return mod, ref_mod, {"data": i_data, "weight": w1_data}, (1, 32, 14, 14) + return mod, ref_mod, {"data": i_data, "weight": w1_data}, (1, 54, 50, 6) def group_conv2d(): dtype = "float32" From f6aa2f3c6708757a7b894422f8b53e03c27d6a95 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 23 Sep 2021 15:46:51 -0500 Subject: [PATCH 10/37] [Relay] Fix compiler warning in ExtractOperators (#9075) * [Relay] Fix compiler warning in ExtractOperators Fix clang warning: 'OperatorExtractorWrapper::VisitExpr_' hides overloaded virtual functions * Restart CI --- src/relay/analysis/extract_operators.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/relay/analysis/extract_operators.cc b/src/relay/analysis/extract_operators.cc index 8fd0f87239ff..f150453ba0b6 100644 --- 
a/src/relay/analysis/extract_operators.cc +++ b/src/relay/analysis/extract_operators.cc @@ -40,6 +40,8 @@ class OperatorExtractorWrapper : private MixedModeVisitor { } private: + using MixedModeVisitor::VisitExpr_; + const IRModule mod_; /*! \brief Map of operator to frequency. */ Map operator_freqs_; From 04ae0287e9cc4b8b4cf1cb5caf68af22a518ba76 Mon Sep 17 00:00:00 2001 From: Jason Date: Fri, 24 Sep 2021 16:38:49 +0800 Subject: [PATCH 11/37] add paddlepaddle to python/gen_requirements.py (#9098) --- python/gen_requirements.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/gen_requirements.py b/python/gen_requirements.py index 7470ccc92496..d6dd094f6a5b 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -115,6 +115,10 @@ ], ), ), + ( + "importer-paddle", + ("Requirements for the PaddlePaddle importer", ["paddlepaddle"]), + ), ( "importer-pytorch", ( @@ -235,6 +239,7 @@ ("onnx", None), ("onnxruntime", None), ("opencv-python", None), + ("paddlepaddle", None), ("pillow", None), ("progressbar", None), ("psutil", None), From af3c8192286ebc7f5d64097b629c38d2fc6e2194 Mon Sep 17 00:00:00 2001 From: Jason Knight Date: Fri, 24 Sep 2021 01:51:17 -0700 Subject: [PATCH 12/37] Support colons inside of TVMC input shape name arguments (#9080) --- python/tvm/driver/tvmc/common.py | 11 ++++++++--- tests/python/driver/tvmc/test_tvmc_common.py | 8 ++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index 15c09753d46f..9ef2f6f1fbfa 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -387,7 +387,8 @@ def parse_shape_string(inputs_string): ---------- inputs_string: str A string of the form "input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]" that - indicates the desired shape for specific model inputs. + indicates the desired shape for specific model inputs. Colons and forward slashes + within input_names are supported. Spaces are supported inside of dimension arrays. Returns ------- @@ -396,7 +397,11 @@ def parse_shape_string(inputs_string): """ # Create a regex pattern that extracts each separate input mapping. - pattern = r"(?:\w+\/)?\w+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]" + # We want to be able to handle: + # * Spaces inside arrays + # * forward slashes inside names (but not at the beginning or end) + # * colons inside names (but not at the beginning or end) + pattern = r"(?:\w+\/)?[:\w]+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]" input_mappings = re.findall(pattern, inputs_string) if not input_mappings: raise argparse.ArgumentTypeError( @@ -408,7 +413,7 @@ def parse_shape_string(inputs_string): # Remove whitespace. mapping = mapping.replace(" ", "") # Split mapping into name and shape. - name, shape_string = mapping.split(":") + name, shape_string = mapping.rsplit(":", 1) # Convert shape string into a list of integers or Anys if negative. shape = [int(x) if int(x) > 0 else relay.Any() for x in shape_string.strip("][").split(",")] # Add parsed mapping to shape dictionary. 
diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py index 31fa688ad717..779611a7a345 100644 --- a/tests/python/driver/tvmc/test_tvmc_common.py +++ b/tests/python/driver/tvmc/test_tvmc_common.py @@ -165,6 +165,10 @@ def test_shape_parser(): shape_string = "input:[10,10,10] input2:[20,20,20,20]" shape_dict = tvmc.common.parse_shape_string(shape_string) assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} + # Check that multiple valid input shapes with colons are parse correctly + shape_string = "input:0:[10,10,10] input2:[20,20,20,20]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input:0": [10, 10, 10], "input2": [20, 20, 20, 20]} # Check that alternate syntax parses correctly shape_string = "input: [10, 10, 10] input2: [20, 20, 20, 20]" shape_dict = tvmc.common.parse_shape_string(shape_string) @@ -193,6 +197,10 @@ def test_shape_parser(): tvmc.common.parse_shape_string(shape_string) # Check that input with a invalid slash raises error. shape_string = "gpu_0/data_0:5,10 /:10,10" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + # Check that input with a invalid colon raises error. + shape_string = "gpu_0/data_0:5,10 :test:10,10" with pytest.raises(argparse.ArgumentTypeError): tvmc.common.parse_shape_string(shape_string) # Check that input with a invalid slash raises error. From 83ba9912860ec252cc874cc64051c0c4369313b8 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Fri, 24 Sep 2021 09:52:15 +0100 Subject: [PATCH 13/37] Add `extern "C"` to C Interface API header (#9094) This is to provide the hint to C++ compilers that these functions are C linkage. New header looks similar to: ```c++ extern "C" { /*! * \brief Input tensor pointers for TVM module "default" */ struct tvmgen_default_inputs { void* y; }; /*! * \brief Output tensor pointers for TVM module "default" */ struct tvmgen_default_outputs { void* output; }; /*! 
* \brief entrypoint function for TVM module "default" * \param inputs Input tensors for the module * \param outputs Output tensors for the module */ int32_t tvmgen_default_run( struct tvmgen_default_inputs* inputs, struct tvmgen_default_outputs* outputs ); } ``` --- python/tvm/micro/interface_api.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/tvm/micro/interface_api.py b/python/tvm/micro/interface_api.py index d9961e9de3f9..5a4841f39f7c 100644 --- a/python/tvm/micro/interface_api.py +++ b/python/tvm/micro/interface_api.py @@ -57,9 +57,12 @@ def generate_c_interface_header(module_name, inputs, outputs, output_path): metadata_header = os.path.join(output_path, f"{mangled_name}.h") with open(metadata_header, "w") as header_file: header_file.write( - "#include \n" f"#ifndef {mangled_name.upper()}_H_\n" - f"#define {mangled_name.upper()}_H_\n" + f"#define {mangled_name.upper()}_H_\n\n" + "#include \n\n" + "#ifdef __cplusplus\n" + 'extern "C" {\n' + "#endif\n\n" ) _emit_brief(header_file, module_name, "Input tensor pointers") @@ -91,6 +94,8 @@ def generate_c_interface_header(module_name, inputs, outputs, output_path): ");\n" ) - header_file.write(f"#endif // {mangled_name.upper()}_H_\n") + header_file.write( + "\n#ifdef __cplusplus\n}\n#endif\n\n" f"#endif // {mangled_name.upper()}_H_\n" + ) return metadata_header From 7c6a334bed261eeb9228bd9f7182bd1febcea094 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Fri, 24 Sep 2021 13:14:27 +0100 Subject: [PATCH 14/37] Move the allocates of AoT codegen to be TVMBAWs (#9065) * Move the allocates of AoT codegen to be TVMBAWs This commit introduces changes to aot_executor_codegen.cc to place tir.allocate to use storage_scope = 'global.workspace'. The lower_tvm_builtin pass is modified slightly to generate TVMBAW calls. Change-Id: Iba4ba437c1431c5197bf11abb826e03807bbcf66 * Move the allocates of AoT codegen to be TVMBAWs *Adding more comments and descriptions *Modified the test case to use primitive relay Change-Id: Ia18a169d94bded3f81af7b3081c7d1ac29c669bc --- src/relay/backend/aot_executor_codegen.cc | 7 +- src/tir/transforms/lower_tvm_builtin.cc | 7 +- src/tir/transforms/storage_rewrite.cc | 11 ++- tests/python/relay/aot/aot_test_utils.py | 84 +++++++++++++++++------ tests/python/relay/aot/test_crt_aot.py | 41 +++++++++++ 5 files changed, 125 insertions(+), 25 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index f1398786b93b..deca3b5a4c5a 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -625,8 +625,13 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Define the storage allocator ids for (auto kv : storage_device_map_) { for (auto sid : kv.second->storage_ids) { + // The buffer_var is created with storage_scope to be global.workspace to be serviced by + // TVMBackendAllocWorkspace(TVMBAW) calls, explicitly. The reasoning being the executor + // allocates should be serviced by TVMBAWs as the data could be accessed by many devices and + // should not be lowered to the stack. 
For more details please refer to the discussion here: + // https://github.com/apache/tvm/issues/9022 te::Var buffer_var(MakeString("sid_", sid), - PointerType(PrimType(DataType::Int(8)), "global")); + PointerType(PrimType(DataType::Int(8)), "global.workspace")); sids_table_[sid] = buffer_var; } } diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 99d71ebe15bd..062d67eef165 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -113,9 +113,14 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. int64_t nbytes = GetVectorBytes(op->dtype); + // If the buffers are for CPU and have global scope, + // and less than runtime::kMaxStackAlloca heuristic + // they are not serviced with TVMBackendWorkspaceAlloc calls + // to be placed on stack. if (device_type_.defined()) { if (const auto* dev_type = device_type_.as()) { - if (dev_type->value == kDLCPU) { + auto storage_scope = Downcast(op->buffer_var->type_annotation)->storage_scope; + if (dev_type->value == kDLCPU && storage_scope == "global") { int32_t constant_size = op->constant_allocation_size(); if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { return stmt; diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 592a6a33375e..409b7c262954 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -478,6 +478,11 @@ class StoragePlanRewriter : public StmtExprMutator { uint64_t bits_offset{0}; }; + // Checks whether the storage_scope is especially tagged for a specific memory. + bool IsSpecialTaggedMemory(const StorageScope& scope) { + return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace"; + } + // Alllocate entry of node. 
// Event entry in liveness analysis struct EventEntry { @@ -516,7 +521,7 @@ class StoragePlanRewriter : public StmtExprMutator { // try to find merge, for tagged memory for (size_t i = 0; i < vec.size(); ++i) { StorageEntry* e = vec[i]; - if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") { + if (IsSpecialTaggedMemory(e->scope)) { ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size"; for (size_t j = 0; j < i; ++j) { if (e->scope == vec[j]->scope) { @@ -550,7 +555,7 @@ class StoragePlanRewriter : public StmtExprMutator { make_const(DataType::Int(32), 1), e->allocs[0]->extents); e->new_alloc = Allocate(e->alloc_var, alloc_type, {sz}, e->allocs[0]->condition, Evaluate(0)); - if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") { + if (IsSpecialTaggedMemory(e->scope)) { MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_elem = e->const_nbits / e->elem_type.bits(); ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) @@ -591,7 +596,7 @@ class StoragePlanRewriter : public StmtExprMutator { combo_size = analyzer_.Simplify(combo_size); e->new_alloc = Allocate(e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate(0)); - if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") { + if (IsSpecialTaggedMemory(e->scope)) { MemoryInfo info = GetMemoryInfo(e->scope.to_string()); uint64_t total_elem = e->const_nbits / e->elem_type.bits(); ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 490257ac66da..0935c0c16e99 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -471,37 +471,67 @@ def extract_main_workspace_size_bytes(extract_dir): return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] -def compile_and_run( +def compile_models( models: Union[List[AOTTestModel], AOTTestModel], - runner: AOTTestRunner, interface_api, use_unpacked_api, - debug_calculated_workspaces=False, workspace_byte_alignment=8, enable_op_fusion=True, ): """ - This method verifies the generated source + This method generates runtime.Modules for the tests """ + base_target = "c -runtime=c --link-params --executor=aot" extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}" target = f"{base_target} {extra_target}" - cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " if not isinstance(models, list): models = [models] - # The calculated workspaces will not account for stack allocator tags used for debugging - if debug_calculated_workspaces: - cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " - config = {"tir.disable_vectorize": True} if not enable_op_fusion: config["relay.FuseOps.max_depth"] = 1 + compiled_runtime_mods = list() + for model in models: + with tvm.transform.PassContext(opt_level=3, config=config): + compiled_runtime_mods.append( + tvm.relay.build( + model.module, + target, + target_host=target, + params=model.params, + mod_name=model.name, + ) + ) + return compiled_runtime_mods + + +def run_and_check( + models: Union[List[AOTTestModel], AOTTestModel], + runner: AOTTestRunner, + interface_api, + compiled_runtime_mods: List[tvm.runtime.Module], + debug_calculated_workspaces=False, + workspace_byte_alignment=8, +): + """ + This method uses the original test data and compiled runtime.Modules + to run in the test runner to verify the results. 
+ """ + + if not isinstance(models, list): + models = [models] + tmp_path = utils.tempdir() tmp_dir = tmp_path.temp_dir + cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " + # The calculated workspaces will not account for stack allocator tags used for debugging + if debug_calculated_workspaces: + cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " + base_path = os.path.join(tmp_dir, "test") build_path = os.path.join(base_path, "build") os.makedirs(build_path, exist_ok=True) @@ -515,18 +545,9 @@ def compile_and_run( ) workspace_bytes = 0 - for model in models: - with tvm.transform.PassContext(opt_level=3, config=config): - lib = tvm.relay.build( - model.module, - target, - target_host=target, - params=model.params, - mod_name=model.name, - ) - + for runtime_module, model in zip(compiled_runtime_mods, models): tar_file = os.path.join(base_path, f"{model.name}.tar") - export_model_library_format(lib, tar_file) + export_model_library_format(runtime_module, tar_file) t = tarfile.open(tar_file) t.extractall(base_path) @@ -592,6 +613,29 @@ def compile_and_run( assert AOT_SUCCESS_TOKEN in run_log.read() +def compile_and_run( + models: Union[List[AOTTestModel], AOTTestModel], + runner: AOTTestRunner, + interface_api, + use_unpacked_api, + debug_calculated_workspaces=False, + workspace_byte_alignment=8, + enable_op_fusion=True, +): + """This is a wrapper API to compile and run models as test for AoT""" + compiled_runtime_mods = compile_models( + models, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion + ) + run_and_check( + models, + runner, + interface_api, + compiled_runtime_mods, + debug_calculated_workspaces, + workspace_byte_alignment, + ) + + def generate_ref_data(mod, input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" compile_engine.get().clear() diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 73aa385161f6..9961cd567fbe 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -33,6 +33,7 @@ generate_ref_data, convert_to_relay, compile_and_run, + compile_models, parametrize_aot_options, ) @@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work ) +def test_aot_codegen_backend_alloc_workspace_calls(): + """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls""" + + # The %data and %weight shapes in the following primitive Relay should create + # small tensors that would get lowered to stack allocations in the CPU PrimFuncs. 
+ # However, the AoT executor codegen should retain them as TVMBAW calls + relay_mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] { + %0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] { + layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */ + }; + %1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] { + layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */ + }; + %2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */; + %3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */; + %4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] { + nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */ + }; + %5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */; + %6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] { + layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */ + }; + %6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */ + } + """ + ) + compiled_runtime_modules = compile_models( + AOTTestModel(module=relay_mod, inputs=None, outputs=None), + "c", + True, + ) + source = compiled_runtime_modules[0].lib.imported_modules[0].get_source() + # There should be three allocates created for three primitive relay function + # calls in the main for the above relay snippet. 
+ assert source.count("TVMBackendAllocWorkspace") == 3 + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From d3d7e8eb6c201506dc706a055e16eed189dcdb0b Mon Sep 17 00:00:00 2001 From: wangxiang2713 <49302617+wangxiang2713@users.noreply.github.com> Date: Fri, 24 Sep 2021 21:22:55 +0800 Subject: [PATCH 15/37] [PYTHON][FFI] Skip numpy.ascontiguousarray if C_CONTIGUOUS == True (#9073) --- python/tvm/runtime/ndarray.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 27811a963993..2b9f7f9446ba 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -165,9 +165,18 @@ def copyfrom(self, source_array): source_array.shape, shape ) ) - source_array = np.ascontiguousarray( - source_array, dtype="uint16" if dtype == "bfloat16" else dtype + numpy_str_map = DataType.NUMPY2STR + np_dtype_str = ( + numpy_str_map[source_array.dtype] + if source_array.dtype in numpy_str_map + else str(source_array.dtype) ) + if (not source_array.flags["C_CONTIGUOUS"]) or ( + dtype == "bfloat16" or dtype != np_dtype_str + ): + source_array = np.ascontiguousarray( + source_array, dtype="uint16" if dtype == "bfloat16" else dtype + ) assert source_array.flags["C_CONTIGUOUS"] data = source_array.ctypes.data_as(ctypes.c_void_p) nbytes = ctypes.c_size_t(source_array.size * source_array.dtype.itemsize) From 8a62f1097fc47ece59051cfd218a4bca19a0f62a Mon Sep 17 00:00:00 2001 From: Euntaik Date: Sat, 25 Sep 2021 05:18:09 +0900 Subject: [PATCH 16/37] [TFLite] Fix padding calculation in Transpose Conv (#9089) * [TFLite] Fix padding caculation in Transpose Conv * [TFLite] Fix padding calculation in Transpose Conv * [TFLite] Fix padding calculation in Transpose Conv * remove unused variables --- python/tvm/relay/frontend/tflite.py | 7 ++++--- tests/python/frontend/tflite/test_forward.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 4d607e46c97f..250e9c4eb117 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2858,7 +2858,7 @@ def convert_transpose_conv(self, op): # Input (data) Tensor. NHWC layout input_tensor = input_tensors[2] - _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor)) + _, _, _, input_c = to_int_list(self.get_tensor_shape(input_tensor)) # Weights tensor. 
TFLite uses OHWI layout weights_tensor = input_tensors[1] out_channels, kernel_h, kernel_w, in_channels = to_int_list( @@ -2919,8 +2919,9 @@ def convert_transpose_conv(self, op): ), "Output channel in the filter should match to channel in the output_shape" if padding == Padding.SAME: - pad_top, pad_bottom = get_pad_value(input_h, kernel_h, stride_h) - pad_left, pad_right = get_pad_value(input_w, kernel_w, stride_w) + output_h, output_w = output_shape_value[1], output_shape_value[2] + pad_top, pad_bottom = get_pad_value(output_h, kernel_h, stride_h) + pad_left, pad_right = get_pad_value(output_w, kernel_w, stride_w) padding = (pad_top, pad_left, pad_bottom, pad_right) else: padding = (0, 0, 0, 0) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index f2941030f0ab..d4c0b28e4e14 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1263,6 +1263,26 @@ def _test_transpose_conv( def test_forward_transpose_conv(): for quantized in [True, False]: for fp16_quantized in [True, False]: + # odd size input, padding VALID + _test_transpose_conv( + [1, 5, 6, 16], + [2, 2, 16, 16], + [1, 10, 12, 16], + [2, 2], + "VALID", + quantized, + fp16_quantized, + ) + # odd size input, padding SAME + _test_transpose_conv( + [1, 5, 6, 16], + [2, 2, 16, 16], + [1, 10, 12, 16], + [2, 2], + "SAME", + quantized, + fp16_quantized, + ) # kernel 3x3, padding VALID _test_transpose_conv( [4, 32, 32, 16], From 58716081637fdf21bdae44122086f1dbc35f7c35 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Fri, 24 Sep 2021 13:19:13 -0700 Subject: [PATCH 17/37] [Relay] Prepare for merging context_analysis.cc and device_annotation.cc (#9077) * [Relay] Prepare for merging context_analysis.cc and device_annotation.cc - Improve construction and deconstruction of "on_device" and "device_copy" calls since they will be center stage. - Move "device_copy" support out of memory.h into own module to mirror "on_device". - Clearing out some DLOG -> VLOG changes I found helped me debug. - Clearing out some whitespace-only changes I accumulated. * [checkpoint] Address Christopher's comments. Some stray py formatting changes snuck in since I just run black . at the root. 
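Since the construction helpers are now center stage, a condensed sketch of the Python-level surface, following the new tests/python/relay/op/annotation/test_annotation.py added at the end of this patch (device types follow DLDeviceType, so kDLCPU is 1 and kDLCUDA is 2):

```python
# Condensed from the new test file in this patch; only constructs IR, no GPU needed.
import tvm
from tvm import relay

x = relay.Var("x")
y = relay.Var("y")

# "on_device" records the device *type* for an expression's result, plus the new
# is_fixed flag telling device planning not to insert a "device_copy" for it.
call = relay.annotation.on_device(x, "cuda", is_fixed=True)
assert call.attrs.device_type == 2  # kDLCUDA
assert call.attrs.is_fixed

# "function_on_device" records per-parameter and result device types on a Function
# via FunctionOnDeviceAttrs, stored under the "on_device" function attribute.
f = relay.Function([x, y], relay.add(x, y))
f = relay.annotation.function_on_device(f, ["cpu", "cuda"], "cuda")
assert f.attrs["on_device"].param_device_types[0] == 1  # kDLCPU
assert f.attrs["on_device"].result_device_type == 2  # kDLCUDA
```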
--- include/tvm/relay/attrs/annotation.h | 28 +++- include/tvm/relay/attrs/device_copy.h | 1 + include/tvm/relay/attrs/function.h | 66 +++++++++ include/tvm/relay/expr_functor.h | 3 +- include/tvm/relay/transform.h | 4 +- include/tvm/runtime/container/array.h | 2 +- include/tvm/runtime/ndarray.h | 26 ++-- python/tvm/relay/op/annotation/annotation.py | 56 ++++++-- python/tvm/relay/transform/transform.py | 2 +- python/tvm/target/target.py | 8 +- src/node/structural_equal.cc | 7 +- src/relay/backend/te_compiler.cc | 8 +- src/relay/backend/vm/inline_primitives.cc | 4 +- src/relay/op/annotation/annotation.cc | 136 +++++++++++++++++- src/relay/op/annotation/annotation.h | 116 +++++++++++++++ src/relay/op/memory/device_copy.cc | 117 +++++++++++++++ src/relay/op/memory/device_copy.h | 79 ++++++++++ src/relay/op/memory/memory.cc | 44 ++---- src/relay/op/memory/memory.h | 1 - src/relay/quantize/partition.cc | 2 +- src/relay/quantize/realize.cc | 2 +- src/relay/transforms/device_annotation.cc | 2 +- src/relay/transforms/memory_alloc.cc | 14 +- src/runtime/ndarray.cc | 4 +- src/tir/analysis/verify_memory.cc | 5 +- .../relay/op/annotation/test_annotation.py | 71 +++++++++ 26 files changed, 709 insertions(+), 99 deletions(-) create mode 100644 include/tvm/relay/attrs/function.h create mode 100644 src/relay/op/annotation/annotation.h create mode 100644 src/relay/op/memory/device_copy.cc create mode 100644 src/relay/op/memory/device_copy.h create mode 100644 tests/python/relay/op/annotation/test_annotation.py diff --git a/include/tvm/relay/attrs/annotation.h b/include/tvm/relay/attrs/annotation.h index 8379e6471561..bc55965ee852 100644 --- a/include/tvm/relay/attrs/annotation.h +++ b/include/tvm/relay/attrs/annotation.h @@ -32,15 +32,37 @@ namespace tvm { namespace relay { /*! - * \brief Options for the device annotation operators. + * \brief Attributes for the "on_device" operator. + * + * The relay call + * \code + * on_device(expr, device_type=2) + * \endcode + * denotes that the result of \p expr should be stored on the device with \p DLDeviceType 2 + * (i.e. \p kDLCuda). Semantically the operator is the identity function. + * + * See also FunctionOnDeviceAttrs in include/relay/attrs/function.h for the function-level + * companion. */ struct OnDeviceAttrs : public tvm::AttrsNode { - int device_type; + // TODO(mbs): Replace device types with TargetDevice. + /*! \brief Device type on which argument expression should be evaluated. */ + int device_type = kInvalidDeviceType; + /*! + * \brief If true, the result device must also be \p device_type and device planning should + * not insert any "device_copy" calls to respect this annotation. + * + * This is used by the device planning pass itself when annotating the planned program. + */ + bool is_fixed = false; TVM_DECLARE_ATTRS(OnDeviceAttrs, "relay.attrs.OnDeviceAttrs") { TVM_ATTR_FIELD(device_type) - .describe("The virutal device/context type that an expression is annotated with.") + .describe("The type of the virtual device which should hold the expression result.") .set_default(0); + TVM_ATTR_FIELD(is_fixed) + .describe("If true, do not insert a \"device_copy\" call to respect this annotation.") + .set_default(false); } }; diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index 7da92b3ff763..f7b0a04f45fa 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -35,6 +35,7 @@ namespace relay { * \brief Options for the device copy operators. 
*/ struct DeviceCopyAttrs : public tvm::AttrsNode { + // TODO(mbs): Should be TargetDevice. int dst_dev_type; int src_dev_type; diff --git a/include/tvm/relay/attrs/function.h b/include/tvm/relay/attrs/function.h new file mode 100644 index 000000000000..f4f94131da1f --- /dev/null +++ b/include/tvm/relay/attrs/function.h @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/function.h + * \brief Attributes for Relay Functions which don't make sense on PrimFuncs. + */ +#ifndef TVM_RELAY_ATTRS_FUNCTION_H_ +#define TVM_RELAY_ATTRS_FUNCTION_H_ + +namespace tvm { +namespace relay { +/*! + * \brief Attributes for Relay function definitions which capture the devices for the + * function parameters and result. + * + * See also OnDeviceAttrs in include/tvm/relay/attrs/annotation.h for the companion "on_device" + * call attributes. + */ +struct FunctionOnDeviceAttrs : public tvm::AttrsNode { + /*! \brief Device type on which each of the function's arguments already resides. */ + Array param_device_types; + // TODO(mbs): Replace device types with TargetDevice. + /*! \brief Device type on which function body should be evaluated. */ + int result_device_type = kInvalidDeviceType; + + TVM_DECLARE_ATTRS(FunctionOnDeviceAttrs, "relay.attrs.FunctionOnDeviceAttrs") { + TVM_ATTR_FIELD(param_device_types) + .describe("The type of the virtual device which holds each function parameters."); + TVM_ATTR_FIELD(result_device_type) + .describe("The type of the virtual device which will hold the function's result.") + .set_default(0); + } +}; + +namespace attr { + +/*! + * \brief Device annotations for function parameters and results. 
+ * + * Type: FunctionOnDeviceAttrs + */ +constexpr static const char* kFunctionAttrsKey = "on_device"; + +} // namespace attr + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_ATTRS_FUNCTION_H_ diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index 688ad8254fa8..f96faffb24f4 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -37,6 +37,7 @@ #include #include #include + namespace tvm { namespace relay { @@ -227,7 +228,7 @@ class ExprMutator : public ::tvm::relay::ExprFunctor { * * MixedModeVisitor provides the same recursive API as ExprVisitor, and uses * recursion to traverse most forms of the IR, but under the hood it expands nested dataflow regions - * of the graph and processes them iteratatively to prevent stack overflows + * of the graph and processes them iteratively to prevent stack overflows */ class MixedModeVisitor : public ::tvm::relay::ExprVisitor { public: diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 912879dc8a4b..cdd4c9c1dbd2 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -437,8 +437,8 @@ TVM_DLL Pass RelayToTIRTargetHook(); * \brief A pass for manifesting explicit memory allocations and rewriting * specific dialects. * - * \param target_host The target used by the host for compliation. - * \param targets The device type and target pairs for compliation. + * \param target_host The target used by the host for compilation. + * \param targets The device type and target pairs for compilation. * * \return The pass. */ diff --git a/include/tvm/runtime/container/array.h b/include/tvm/runtime/container/array.h index 8830653da88c..26f4e545deb7 100644 --- a/include/tvm/runtime/container/array.h +++ b/include/tvm/runtime/container/array.h @@ -249,7 +249,7 @@ class ArrayNode : public Object, public InplaceArrayBase { }; /*! - * \brief Array, container representing a contigious sequence of ObjectRefs. + * \brief Array, container representing a contiguous sequence of ObjectRefs. * * Array implements in-place copy-on-write semantics. * diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 1127a9ae732c..a4c285e3dd08 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -38,9 +38,19 @@ #include namespace tvm { -namespace runtime { -typedef DLDevice Device; +// alias DLDevice +using Device = DLDevice; + +// A 'null' device type, does not correspond to any DLDeviceType enum. +// TODO(mbs): This is to help us as we transition away from representing the 'homogenous' case +// as a singleton target map indexed by the invalid DLDeviceType '0'. +constexpr DLDeviceType kNullDeviceType = static_cast(0); + +// An 'invalid' device type, does not correspond to any DLDeviceType enum. +constexpr DLDeviceType kInvalidDeviceType = static_cast(-1); + +namespace runtime { /*! * \brief Managed NDArray. 
@@ -481,23 +491,19 @@ inline bool NDArray::Load(dmlc::Stream* strm) { } } // namespace runtime - -// alias Device -using tvm::runtime::Device; - } // namespace tvm namespace std { template <> -struct hash { - std::size_t operator()(const tvm::runtime::Device& dev) const { +struct hash { + std::size_t operator()(const tvm::Device& dev) const { return ((dev.device_id << 8) | dev.device_type); } }; template <> -struct equal_to { - bool operator()(const tvm::runtime::Device& lhs, const tvm::runtime::Device& rhs) const { +struct equal_to { + bool operator()(const tvm::Device& lhs, const tvm::Device& rhs) const { return (lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id); } }; diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 809b6369b085..f5f8870ab015 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -22,8 +22,16 @@ from .. import op as reg -def on_device(data, device): - """Annotate an expression with a certain device type. +def _device_to_int(device): + if isinstance(device, _Device): + return device.device_type + if isinstance(device, str): + return _nd.device(device).device_type + raise ValueError("expecting a Device or device name, but received a %s" % (type(device))) + + +def on_device(data, device, is_fixed=False): + """Annotates an expression with the device type on which its result should be stored. Parameters ---------- @@ -31,23 +39,45 @@ def on_device(data, device): The expression to be annotated. device : Union[:py:class:`Device`, str] - The device type to annotate. + The device to annotate with. Only the device's type is significant. + + is_fixed : bool + If false (the default), a device_copy + If true, the annotation does not imply a device_copy may be inserted to + reconcile the device of the data argument with the device for the context of the + annotated expression. Returns ------- result : tvm.relay.Expr The annotated expression. """ - if isinstance(device, _Device): - device = device.device_type - elif isinstance(device, str): - device = _nd.device(device).device_type - else: - raise ValueError( - "device is expected to be the type of Device or " - "str, but received %s" % (type(device)) - ) - return _make.on_device(data, device) + return _make.on_device(data, _device_to_int(device), is_fixed) + + +def function_on_device(function, param_devices, result_device): + """Annotates a Relay function with the device types on which its parameters and result should + be stored. + + Parameters + ---------- + function : tvm.relay.Function + The function to be annotated. + + param_devices : Array[Union[:py:class:`Device`, str]] + The devices for each parameter. Only the device types are significant. + + result_device: Union[:py:class:`Device`, str] + The device for the function result. Only the device type is significant. + + Returns + ------- + result : tvm.rleay.Function + The annotated function. + """ + return _make.function_on_device( + function, [_device_to_int(d) for d in param_devices], _device_to_int(result_device) + ) def stop_fusion(data): diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 688422284c0f..7c79464bdd30 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -546,7 +546,7 @@ def MergeCompilerRegions(): def RewriteAnnotatedOps(fallback_device): """Rewrite the annotated program where annotation operators, e.g. 
- `on_deivce`, mark which device an expression should be scheduled to. + `on_device`, mark which device an expression should be scheduled to. This pass helps heterogeneous execution where different operators may need to be allocated on various devices. diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index af2f5d857293..4ce888170134 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -525,7 +525,7 @@ def hexagon(cpu_ver="v66", **kwargs): # LLVM target string def create_llvm_target(cpu_ver, config): - """ Create LLVM target string. """ + """Create LLVM target string.""" target = " -mtriple=hexagon" mcpu = " -mcpu=hexagon" + cpu_ver @@ -547,7 +547,7 @@ def create_target_features(config): # Simulator options string def create_sim_options(cpu_ver, config): - """ Create simulator option string. """ + """Create simulator option string.""" def validate_hvx_length(codegen_hvx, sim_options): if sim_options and "--hvx_length" in sim_options: @@ -606,7 +606,7 @@ def validate_hvx_length(codegen_hvx, sim_options): # LLVM options string def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument - """ Create LLVM options string. """ + """Create LLVM options string.""" llvm_options = config["llvm_options"] @@ -620,7 +620,7 @@ def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument # TVM target attributes string def create_tvm_options(cpu_ver, config): # pylint: disable=unused-argument - """ Create TVM target features string. """ + """Create TVM target features string.""" features = { "link_params": "link-params", diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc index 1fa72c92b6fc..8e52af60d235 100644 --- a/src/node/structural_equal.cc +++ b/src/node/structural_equal.cc @@ -19,6 +19,7 @@ /*! * \file src/node/structural_equal.cc */ +#include #include #include #include @@ -119,8 +120,10 @@ class RemapVarSEqualHandler : public SEqualReducer::Handler { // Check the result. bool CheckResult(bool result, const ObjectRef& lhs, const ObjectRef& rhs) { if (assert_mode_ && !result) { - LOG(FATAL) << "ValueError: StructuralEqual check failed, caused by\n" - << "lhs = " << lhs << "\nrhs = " << rhs; + LOG(FATAL) << "ValueError: StructuralEqual check failed, caused by lhs:" << std::endl + << PrettyPrint(lhs) << std::endl + << "and rhs:" << std::endl + << PrettyPrint(rhs); } return result; } diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 0d32cc61e2e6..d37fbeabc277 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -439,11 +439,10 @@ class LowerTensorExprMutator : public ExprMutator { } // Non-External Relay Function - DLOG(INFO) << "lowering to target '" << target->str() << "' for primitive:\n" - << PrettyPrint(func); + VLOG(1) << "lowering to target '" << target->str() << "' for primitive:\n" << PrettyPrint(func); CCacheKey key = CCacheKey(func, target); CachedFunc lowered_func = compiler_->Lower(key, module_name_); - DLOG(INFO) << "lowered primitive bound to '" << PrettyPrint(lowered_func->prim_fn_var) << "'"; + VLOG(1) << "lowered primitive bound to '" << PrettyPrint(lowered_func->prim_fn_var) << "'"; // Collect all the lowered functions produced for this primitive function. 
Map prim_fns; @@ -452,8 +451,7 @@ class LowerTensorExprMutator : public ExprMutator { CHECK(prim_fn.second.as()) << "must be a prim fn"; prim_fns.Set(prim_fn.first, Downcast(prim_fn.second)); all_prim_fn_vars.push_back(prim_fn.first); - DLOG(INFO) << "lowered primitive includes bindings for '" << PrettyPrint(prim_fn.first) - << "'"; + VLOG(1) << "lowered primitive includes bindings for '" << PrettyPrint(prim_fn.first) << "'"; } // TODO(@areusch, @jroesch): this metadata is for AOT, this should be our interface for AOT diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index 05fb2a120620..6924f2598f6f 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -136,13 +136,13 @@ struct PrimitiveInliner : ExprMutator { if (n->GetAttr(attr::kCompiler).defined()) continue; auto func = GetRef(n); - DLOG(INFO) << "Before inlining primitives: " << global << std::endl << AsText(func, false); + VLOG(1) << "Before inlining primitives: " << global << std::endl << PrettyPrint(func); func = Function(func->params, VisitExpr(func->body), func->ret_type, func->type_params, func->attrs); module_->Add(global, func, true); - DLOG(INFO) << "After inlining primitives: " << global << std::endl << AsText(func, false); + VLOG(1) << "After inlining primitives: " << global << std::endl << PrettyPrint(func); } } return module_; diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index b59c5a3e9ff3..4eda15937f3a 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -20,10 +20,13 @@ /*! * * \file src/relay/op/annotation/annotation.cc - * \brief Registration of annotation operators. + * \brief Helpers for working with various 'annotations' attributes. */ +#include "./annotation.h" + #include +#include #include #include #include @@ -36,15 +39,51 @@ namespace tvm { namespace relay { -// relay.annotation.on_device TVM_REGISTER_NODE_TYPE(OnDeviceAttrs); +const Op& OnDeviceOp() { + static const Op& op = Op::Get("on_device"); + return op; +} + +Expr OnDevice(Expr expr, DLDeviceType device_type, bool is_fixed) { + auto attrs = make_object(); + attrs->device_type = device_type; + attrs->is_fixed = is_fixed; + Span span = expr->span; + return Call(OnDeviceOp(), {std::move(expr)}, Attrs(std::move(attrs)), /*type_args=*/{}, span); +} + +Expr OptOnDevice(Expr expr, DLDeviceType device_type, bool is_fixed) { + if (device_type == kInvalidDeviceType) { + // Undefined signals no annotation is required. + return expr; + } + if (expr->IsInstance() || expr->IsInstance()) { + // These operators are device polymorphic so no annotation is required. + // TODO(mbs): The device planning pass does NOT currently support device polymorphism for + // constructors, so we could remove them from this condition. However most constructors + // accept type parameters, and it is not well-formed Relay to simply wrap such a + // constructor in an "on_device" call. So we'll pretend they are device polymorphic to + // avoid that difficultly. Overall ADTs need more work to be fully supported. + return expr; + } + if (expr->IsInstance() || expr->IsInstance()) { + // The device can be recovered from the binding site of the global or local variable. + return expr; + } + if (const auto* function_node = expr.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + // Primitive functions are device polymorphic, matching our interpretation for OpNode above. 
+ return expr; + } + } + return OnDevice(expr, device_type, is_fixed); +} + TVM_REGISTER_GLOBAL("relay.op.annotation._make.on_device") - .set_body_typed([](Expr data, int device_type) { - auto attrs = make_object(); - attrs->device_type = device_type; - static const Op& op = Op::Get("on_device"); - return Call(op, {data}, Attrs(attrs), {}); + .set_body_typed([](Expr expr, int device_type, bool is_fixed) { + return OnDevice(expr, static_cast(device_type), is_fixed); }); RELAY_REGISTER_OP("on_device") @@ -56,12 +95,95 @@ RELAY_REGISTER_OP("on_device") .set_attr("TOpPattern", kOpaque) .set_attr("TOpIsStateful", false) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("TNonComputational", true) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) -> Array { return {topi::identity(inputs[0])}; }); +OnDeviceProps GetOnDeviceProps(const CallNode* call_node) { + if (call_node->op == OnDeviceOp()) { + ICHECK_EQ(call_node->args.size(), 1) << "on_device expects one argument"; + ICHECK(call_node->attrs.defined()) << "on_device requires attributes"; + const auto* on_device_attrs = call_node->attrs.as(); + ICHECK(on_device_attrs != nullptr) << "on_device requires OnDeviceAttrs"; + auto device_type = static_cast(on_device_attrs->device_type); + // Follow nesting: + // on_device(on_device(expr, device_type=1), device_type=2) == {expr, 1} + auto inner = GetOnDeviceProps(call_node->args[0]); + if (inner.body.defined()) { + return {inner.body, inner.device_type, on_device_attrs->is_fixed || inner.is_fixed}; + } else { + return {call_node->args[0], device_type, on_device_attrs->is_fixed}; + } + } + return {}; +} + +OnDeviceProps GetOnDeviceProps(const Expr& expr) { + if (const auto* call_node = expr.as()) { + return GetOnDeviceProps(call_node); + } + return {}; +} + +TVM_REGISTER_NODE_TYPE(FunctionOnDeviceAttrs); + +Function FunctionOnDevice(Function function, Array param_device_types, + DLDeviceType result_device_type) { + auto attrs = make_object(); + attrs->param_device_types = std::move(param_device_types); + attrs->result_device_type = result_device_type; + return WithAttr(std::move(function), attr::kFunctionAttrsKey, Attrs(std::move(attrs))); +} + +Function FunctionOnDevice(Function function, const std::vector& param_device_types, + DLDeviceType result_device_type) { + Array arr; + arr.reserve(param_device_types.size()); + for (const auto device_type : param_device_types) { + arr.push_back(static_cast(device_type)); + } + return FunctionOnDevice(function, arr, result_device_type); +} + +TVM_REGISTER_GLOBAL("relay.op.annotation._make.function_on_device") + .set_body_typed([](Function function, Array param_device_types, + int result_device_type) { + return FunctionOnDevice(function, param_device_types, + static_cast(result_device_type)); + }); + +DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node) { + auto opt_attrs = function_node->GetAttr(attr::kFunctionAttrsKey); + if (!opt_attrs) { + // No annotation. 
+ return kInvalidDeviceType; + } + const auto* opt_function_on_device_attrs = opt_attrs.value().as(); + ICHECK(opt_function_on_device_attrs != nullptr) + << "function '" << attr::kFunctionAttrsKey << "' annotation must be a FunctionOnDeviceAttrs"; + return static_cast(opt_function_on_device_attrs->result_device_type); +} + +DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i) { + ICHECK_LT(i, function_node->params.size()) + << "param index " << i << " out of range for function of arity " + << function_node->params.size(); + auto opt_attrs = function_node->GetAttr(attr::kFunctionAttrsKey); + if (!opt_attrs) { + // No annotation. + return kInvalidDeviceType; + } + const auto* opt_function_on_device_attrs = opt_attrs.value().as(); + ICHECK(opt_function_on_device_attrs != nullptr) + << "function '" << attr::kFunctionAttrsKey << "' annotation must be a FunctionOnDeviceAttrs"; + ICHECK_EQ(opt_function_on_device_attrs->param_device_types.size(), function_node->params.size()) + << "annotation parameters do not match function arity"; + return static_cast(opt_function_on_device_attrs->param_device_types[i]->value); +} + Expr StopFusion(Expr data) { static const Op& op = Op::Get("annotation.stop_fusion"); return Call(op, {data}, Attrs{}, {}); diff --git a/src/relay/op/annotation/annotation.h b/src/relay/op/annotation/annotation.h new file mode 100644 index 000000000000..e3a4aea4708c --- /dev/null +++ b/src/relay/op/annotation/annotation.h @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/op/annotation/annotation.h + * \brief Helpers for working with various 'annotation' attributes. + */ +#ifndef TVM_RELAY_OP_ANNOTATION_ANNOTATION_H_ +#define TVM_RELAY_OP_ANNOTATION_ANNOTATION_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace relay { + +/*! \brief Returns the "on_device" operator. */ +const Op& OnDeviceOp(); + +/*! + * \brief Wraps \p expr in an "on_device" CallNode for \p device_type and \p is_fixed. + */ +Expr OnDevice(Expr expr, DLDeviceType device_type, bool is_fixed); + +/*! + * \brief Wraps \p expr in an "on_device" CallNode for \p device_type and \p is_fixed. However + * returns \p expr directly if: + * - \p device_type is \p kInvalidDeviceType, which signals there are no device annotations + * already in play. + * - \p expr is an operator or primitive function literal. These are device polymorphic. + * - \p expr is a global or local var. These already have an implied device. + * - \p expr is a constructor. There should probably be device polymorphic but are in an + * in-between state at the moment. + */ +Expr OptOnDevice(Expr expr, DLDeviceType device_type, bool is_fixed); + +/*! \brief Result of \p GetOnDeviceProps. 
*/ +struct OnDeviceProps { + Expr body; // = null + DLDeviceType device_type = kInvalidDeviceType; + bool is_fixed = false; + + OnDeviceProps() = default; + + OnDeviceProps(const Expr& body, DLDeviceType deviceType, bool isFixed) + : body(body), device_type(deviceType), is_fixed(isFixed) {} +}; + +/*! + * \brief Returns the body expression, device type and is_fixed field for \p call_node if it is + * an "on_device" CallNode. Otherwise returns the null expression, \p kInvalidDeviceType and \p + * false. + */ +OnDeviceProps GetOnDeviceProps(const CallNode* call_node); + +/*! + * \brief Returns the body expression, device type and is_fixed field for \p expr if it is an + * "on_device" CallNode. Otherwise returns the null expression, \p kInvalidDeviceType and \p false. + */ +OnDeviceProps GetOnDeviceProps(const Expr& expr); + +/*! \brief Returns true if \p expr is an on_device CallNode. */ +inline bool IsOnDeviceCall(const Expr& expr) { return GetOnDeviceProps(expr).body.defined(); } + +/*! + * \brief Returns \p function annotated with "on_device" attributes capturing parameter and result + * devices types. However returns \p function directly if all device types are \p + * kInvalidDeviceType. + */ +Function FunctionOnDevice(Function function, Array param_device_types, + DLDeviceType body_device_type); +Function FunctionOnDevice(Function function, const std::vector& param_device_types, + DLDeviceType body_device_type); + +/*! + * \brief Returns the device type for the resut of \p function_node, or \p kInvalidDeviceType + * if function does not have "on_device" annotation. + */ +DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node); + +/*! + * \brief Returns the device type for the \p i'th parameter of \p function_node, or + * \p kInvalidDeviceType if function does not have "on_device" annotation. + */ +DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i); + +/*! \brief Wraps \p data in a "stop_fusion" annotation. */ +Expr StopFusion(Expr data); + +/*! \brief Wraps \p data in a "cast_hint" annotation for \p dtype. */ +Expr CastHint(Expr data, DataType dtype); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_OP_ANNOTATION_ANNOTATION_H_ diff --git a/src/relay/op/memory/device_copy.cc b/src/relay/op/memory/device_copy.cc new file mode 100644 index 000000000000..b94caac2c3d9 --- /dev/null +++ b/src/relay/op/memory/device_copy.cc @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/op/memory/device_copy.cc + * \brief Helpers for working with "device_copy" attributes. 
+ */ + +#include "./device_copy.h" + +#include +#include +#include +#include +#include + +#include "../../transforms/infer_layout_utils.h" +#include "../type_relations.h" + +namespace tvm { +namespace relay { + +// relay.device_copy +TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs); + +const Op& DeviceCopyOp() { + static const Op& op = Op::Get("device_copy"); + return op; +} + +Expr DeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { + auto attrs = make_object(); + attrs->src_dev_type = src_dev_type; + attrs->dst_dev_type = dst_dev_type; + Span span = expr->span; + return Call(DeviceCopyOp(), {std::move(expr)}, Attrs(attrs), /*type_args=*/{}, span); +} + +Expr OptDeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { + if (src_dev_type == dst_dev_type) { + return expr; + } + ICHECK_NE(src_dev_type, kInvalidDeviceType); + ICHECK_NE(dst_dev_type, kInvalidDeviceType); + return DeviceCopy(expr, src_dev_type, dst_dev_type); +} + +TVM_REGISTER_GLOBAL("relay.op._make.device_copy") + .set_body_typed([](Expr expr, int src_dev_type, int dst_dev_type) { + return DeviceCopy(expr, static_cast(src_dev_type), + static_cast(dst_dev_type)); + }); + +RELAY_REGISTER_OP("device_copy") + .describe(R"code( +Copy data from one tensor to another. The source and destination might be +on different devices. +)code" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input data.") + .set_support_level(10) + .add_type_rel("Identity", IdentityRel) + .set_attr("TOpPattern", kOpaque) + .set_attr("TOpIsStateful", false) + .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype) -> Array { + return {topi::identity(inputs[0])}; + }); + +DeviceCopyProps GetDeviceCopyProps(const CallNode* call_node) { + if (call_node->op == DeviceCopyOp()) { + ICHECK_EQ(call_node->args.size(), 1) << "device_copy expects one argument"; + ICHECK(call_node->attrs.defined()) << "device_copy requires attributes"; + const auto* device_copy_attrs = call_node->attrs.as(); + ICHECK(device_copy_attrs != nullptr) << "device_copy requires DeviceCopyAttrs"; + auto src_dev_type = static_cast(device_copy_attrs->src_dev_type); + auto dst_dev_type = static_cast(device_copy_attrs->dst_dev_type); + // Follow nesting: + // device_copy(device_copy(expr, src_dev_type=1, dst_dev_type=2), + // src_dev_type=2, dst_dev_type=3) ==> {expr, 1, 3} + auto inner = GetDeviceCopyProps(call_node->args[0]); + if (inner.body.defined()) { + return {inner.body, inner.src_dev_type, inner.dst_dev_type}; + } else { + return {call_node->args[0], src_dev_type, dst_dev_type}; + } + } + return {}; +} + +DeviceCopyProps GetDeviceCopyProps(const Expr& expr) { + if (const auto* call_node = expr.as()) { + return GetDeviceCopyProps(call_node); + } + return {}; +} + +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/memory/device_copy.h b/src/relay/op/memory/device_copy.h new file mode 100644 index 000000000000..d590d8510f17 --- /dev/null +++ b/src/relay/op/memory/device_copy.h @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file relay/op/memory/device_copy.h + * \brief Helpers for working with "device_copy" attributes. + */ + +#ifndef TVM_RELAY_OP_MEMORY_DEVICE_COPY_H_ +#define TVM_RELAY_OP_MEMORY_DEVICE_COPY_H_ + +#include +#include + +namespace tvm { +namespace relay { + +/*! \brief Returns the "device_copy" operator. */ +const Op& DeviceCopyOp(); + +/*! + * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated on + * a device of type \p src_dev_type but then copied to a device of type \p dst_dev_type. + */ +Expr DeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type); + +/*! + * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated on + * a device of type \p src_dev_type but then copied to a device of type \p dst_dev_type. + * However, return \p expr directly if \p src_dev_type equals \p dst_dev_type. + */ +Expr OptDeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type); + +/*! \brief Result of \p GetDeviceCopyProps. */ +struct DeviceCopyProps { + Expr body; // = null + DLDeviceType src_dev_type = kInvalidDeviceType; + DLDeviceType dst_dev_type = kInvalidDeviceType; + + DeviceCopyProps() = default; + + DeviceCopyProps(const Expr& body, DLDeviceType srcDevType, DLDeviceType dstDevType) + : body(body), src_dev_type(srcDevType), dst_dev_type(dstDevType) {} +}; + +/*! + * \brief Returns the body expression, source, and destination device types for \p call_node if it + * is a "device_copy" CallNode. Otherwise returns the null expression and \p kInvalidDeviceType + * device types. + */ +DeviceCopyProps GetDeviceCopyProps(const CallNode* call_node); + +/*! + * \brief Returns the body expression, source, and destination device types for \p expr if it + * is a "device_copy" CallNode. Otherwise returns the null expression and \p kInvalidDeviceType + * device types. + */ +DeviceCopyProps GetDeviceCopyProps(const Expr& expr); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_OP_MEMORY_DEVICE_COPY_H_ diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index c2997fb6cf95..68a83ebba1fe 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -35,9 +35,9 @@ #include #include "../../transforms/infer_layout_utils.h" +#include "../annotation/annotation.h" #include "../op_common.h" #include "../type_relations.h" -#include "tvm/relay/attrs/device_copy.h" namespace tvm { namespace relay { @@ -97,14 +97,21 @@ RELAY_REGISTER_OP("memory.alloc_storage") return {topi::identity(inputs[0])}; }); -Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, +Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype, Array assert_shape) { auto attrs = make_object(); attrs->dtype = dtype; if (assert_shape.defined()) { attrs->assert_shape = assert_shape; } else { - attrs->const_shape = Downcast(shape); + // Look through any on_device for the shape argument expression. + Expr literal_shape = shape; + auto props = GetOnDeviceProps(literal_shape); + if (props.body.defined()) { + // See through on_device calls. 
+ literal_shape = props.body; + } + attrs->const_shape = Downcast(literal_shape); } static const Op& op = Op::Get("memory.alloc_tensor"); return Call(op, {storage, offset, shape}, Attrs(attrs), {}); @@ -307,36 +314,5 @@ TVM_REGISTER_GLOBAL("relay.op.memory._make.ToTupleType") return ToTupleType(t, std::vector(array.begin(), array.end())); }); -// relay.device_copy -TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs); - -Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type) { - auto attrs = make_object(); - attrs->src_dev_type = src_dev_type; - attrs->dst_dev_type = dst_dev_type; - static const Op& op = Op::Get("device_copy"); - return Call(op, {data}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op._make.device_copy").set_body_typed(DeviceCopy); - -RELAY_REGISTER_OP("device_copy") - .describe(R"code( -Copy data from one tensor to another. The source and destination might be -on different devices. -)code" TVM_ADD_FILELINE) - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input data.") - .set_support_level(10) - .add_type_rel("Identity", IdentityRel) - .set_attr("TOpPattern", kOpaque) - .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) - .set_attr("FTVMCompute", - [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype) -> Array { - return {topi::identity(inputs[0])}; - }); - } // namespace relay } // namespace tvm diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h index bbbd11867549..558c409782f5 100644 --- a/src/relay/op/memory/memory.h +++ b/src/relay/op/memory/memory.h @@ -33,7 +33,6 @@ namespace tvm { namespace relay { Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint); -Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type); Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, Array assert_shape); Expr ToTupleType(const Type& ty, const std::vector& exprs); diff --git a/src/relay/quantize/partition.cc b/src/relay/quantize/partition.cc index c65cc1879932..6cd596a814ac 100644 --- a/src/relay/quantize/partition.cc +++ b/src/relay/quantize/partition.cc @@ -26,7 +26,7 @@ #include -#include "../transforms/pattern_utils.h" +#include "../op/annotation/annotation.h" #include "./quantize.h" namespace tvm { diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 968628fbfe39..e636130f8553 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -29,8 +29,8 @@ #include #include +#include "../op/annotation/annotation.h" #include "../qnn/utils.h" -#include "../transforms/pattern_utils.h" #include "./quantize.h" namespace tvm { diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 02f9d474411a..7457457e4c5c 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -18,7 +18,7 @@ */ /*! - * \file deivce_annotation.cc + * \file device_annotation.cc * \brief Passes to rewrite annotated program and retrieve the device allocation * of expression. 
* diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 657e2c392455..31d3b2c8991a 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -43,14 +43,15 @@ #include "../backend/te_compiler.h" #include "../backend/te_compiler_cache.h" +#include "../op/annotation/annotation.h" +#include "../op/memory/device_copy.h" #include "../op/memory/memory.h" #include "../op/vm/vm.h" +#include "./let_list.h" #include "./pass_utils.h" -#include "let_list.h" -#include "pattern_utils.h" +#include "./pattern_utils.h" using namespace tvm::runtime; -using namespace tvm::relay::tec; namespace tvm { namespace relay { @@ -193,7 +194,8 @@ class DialectRewriter : public ExprMutator { private: // Insert a device copy node. Expr DeviceCopy(const Expr& inp, int src_dev, int dst_dev) { - return ExprMutator::Mutate(relay::DeviceCopy(inp, src_dev, dst_dev)); + return ExprMutator::Mutate(relay::DeviceCopy(inp, static_cast(src_dev), + static_cast(dst_dev))); } // Check if a call invokes a primitive function. @@ -274,9 +276,9 @@ class DialectRewriter : public ExprMutator { const std::vector& new_args) { Array shape_func_ins; - TECompiler compiler; + tec::TECompiler compiler; - CCacheKey key(func, target_host_); + tec::CCacheKey key(func, target_host_); auto cfunc = compiler->LowerShapeFunc(key); auto input_states = cfunc->shape_func_param_states; diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 968a4488bbcf..8db89c59a85d 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -272,7 +272,7 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ dtype.code = static_cast(dtype_code); dtype.bits = static_cast(dtype_bits); dtype.lanes = static_cast(dtype_lanes); - Device dev; + tvm::Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; auto ndarray = NDArray::Empty(ShapeTuple(shape, shape + ndim), dtype, dev); @@ -286,7 +286,7 @@ TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, int ndim = args[1]; ShapeTuple shape(shape_ptr, shape_ptr + ndim); DataType dtype = args[2]; - Device dev = args[3]; + tvm::Device dev = args[3]; Optional mem_scope = args[4]; auto ndarray = NDArray::Empty(shape, dtype, dev, mem_scope); *ret = ndarray; diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc index 0382b8071de7..b6c41b958c31 100644 --- a/src/tir/analysis/verify_memory.cc +++ b/src/tir/analysis/verify_memory.cc @@ -172,8 +172,9 @@ std::vector VerifyMemory_(const PrimFunc& func) { auto target = func->GetAttr(tvm::attr::kTarget); ICHECK(target.defined()) << "VerifyMemory: Require the target attribute"; - DLOG(INFO) << "verifying memory for target '" << target.value()->str() << "' for primitive\n" - << PrettyPrint(func); + VLOG(1) << "verifying memory for target '" << target.value()->str() + << "' for primitive:" << std::endl + << PrettyPrint(func); if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == CallingConv::kDefault) { diff --git a/tests/python/relay/op/annotation/test_annotation.py b/tests/python/relay/op/annotation/test_annotation.py new file mode 100644 index 000000000000..51daa9aaa06a --- /dev/null +++ b/tests/python/relay/op/annotation/test_annotation.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for annotations.""" +import tvm +from tvm import relay +import pytest + + +def test_on_device_via_string(): + x = relay.Var("x") + call = relay.annotation.on_device(x, "cuda") + assert isinstance(call, relay.Call) + assert len(call.args) == 1 + assert call.args[0] == x + assert call.attrs.device_type == 2 # ie kDLCUDA + assert not call.attrs.is_fixed + + +def test_on_device_via_device(): + x = relay.Var("x") + call = relay.annotation.on_device(x, tvm.device("llvm")) + assert call.attrs.device_type == 1 # ie kDLCPU + + +def test_on_device_invalid_device(): + x = relay.Var("x") + pytest.raises(ValueError, lambda: relay.annotation.on_device(x, "bogus")) + + +def test_on_device_is_fixed(): + x = relay.Var("x") + call = relay.annotation.on_device(x, "cuda", True) + assert call.attrs.device_type == 2 + assert call.attrs.is_fixed + + +def test_function_on_device(): + x = relay.Var("x") + y = relay.Var("y") + f = relay.Function([x, y], relay.add(x, y)) + func = relay.annotation.function_on_device(f, ["cpu", "cuda"], "cuda") + assert isinstance(func, relay.Function) + assert len(func.attrs["on_device"].param_device_types) == 2 + assert func.attrs["on_device"].param_device_types[0] == 1 + # ie kDLCPU + assert func.attrs["on_device"].param_device_types[1] == 2 + # ie kDLCUDA + assert func.attrs["on_device"].result_device_type == 2 + # ie KDLCUDA + + +if __name__ == "__main__": + test_on_device_via_string() + test_on_device_via_device() + test_on_device_invalid_device() + test_on_device_is_fixed() + test_function_on_device() From 774ff12b7db38f4b467538f0b7ce0a74c2f82736 Mon Sep 17 00:00:00 2001 From: sunway Date: Sat, 25 Sep 2021 04:21:10 +0800 Subject: [PATCH 18/37] [Frontend][TFLite] fix #9078 (#9099) Co-authored-by: sunway --- python/tvm/relay/frontend/tflite.py | 2 +- tests/python/frontend/tflite/test_forward.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 250e9c4eb117..93a1dba233f2 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -775,7 +775,7 @@ def convert_softmax(self, op): assert len(output_tensors) == 1, "output tensors length should be 1" output_tensor = output_tensors[0] - params = {"axis": 1} # 1 is channel + params = {"axis": -1} # -1 is channel in_expr = self.get_expr(input_tensor_idx) # TODO - Naive softmax int8 implementation leads to bad accuracy. 
Currently, we can diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index d4c0b28e4e14..c073681dcbf5 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -3286,6 +3286,7 @@ def _test_softmax(data): def test_forward_softmax(): """Softmax""" _test_softmax(np.arange(6.0, dtype=np.float32).reshape((1, 6))) + _test_softmax(np.arange(6.0, dtype=np.float32).reshape((1, 2, 3))) ###################################################################### From 70f2297191d0d2b9efb6b1b6257e6f9755f516ca Mon Sep 17 00:00:00 2001 From: sunway Date: Sat, 25 Sep 2021 04:24:47 +0800 Subject: [PATCH 19/37] [BYOC] Fix incorrect conv2d padding handling of `dnnl with c source runtime` (#9097) Co-authored-by: sunway --- src/relay/backend/contrib/dnnl/codegen.cc | 4 ++- src/runtime/contrib/dnnl/dnnl.cc | 37 +++++++++++--------- src/runtime/contrib/dnnl/dnnl_kernel.h | 15 ++++---- tests/python/relay/test_external_codegen.py | 33 +++++++++++++++++ tests/python/relay/utils/external_codegen.py | 2 +- 5 files changed, 66 insertions(+), 25 deletions(-) diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index e96255e976e9..f0d360ae8b6d 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -67,11 +67,13 @@ std::vector Conv2d(const CallNode* call) { args.push_back(std::to_string(s)); } - // Args: O, G, Ph, Pw, Kh, Kw, Sh, Sw + // Args: O, G, Ph0, Pw0, Ph1, Pw1, Kh, Kw, Sh, Sw args.push_back(std::to_string(wshape[0])); args.push_back(std::to_string(conv2d_attr->groups)); args.push_back(std::to_string(conv2d_attr->padding[0].as()->value)); args.push_back(std::to_string(conv2d_attr->padding[1].as()->value)); + args.push_back(std::to_string(conv2d_attr->padding[2].as()->value)); + args.push_back(std::to_string(conv2d_attr->padding[3].as()->value)); args.push_back(std::to_string(wshape[2])); args.push_back(std::to_string(wshape[3])); args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc index 5b9f5e17232c..19b3f796fd33 100644 --- a/src/runtime/contrib/dnnl/dnnl.cc +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -53,8 +53,9 @@ inline void read_from_dnnl_memory(void* handle, const memory& mem) { } void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, int p_N_, int p_C_, - int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, - int p_Kw_, int p_Sh_, int p_Sw_, primitive_attr attr) { + int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph0_, int p_Pw0_, int p_Ph1_, + int p_Pw1_, int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_, + primitive_attr attr) { using tag = memory::format_tag; using dt = memory::data_type; engine eng(engine::kind::cpu, 0); @@ -64,10 +65,11 @@ void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, in memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; memory::dims conv2d_bias_tz = {p_O_}; - memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, - (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; + memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + p_Ph0_ + p_Ph1_ + p_Sh_) / p_Sh_, + (p_W_ - p_Kw_ + p_Pw0_ + p_Pw1_ + p_Sw_) / p_Sw_}; memory::dims conv2d_strides = {p_Sh_, p_Sw_}; - memory::dims conv2d_padding = {p_Ph_, p_Pw_}; + memory::dims 
conv2d_padding0 = {p_Ph0_, p_Pw0_}; + memory::dims conv2d_padding1 = {p_Ph1_, p_Pw1_}; auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); auto user_weights_memory = @@ -81,7 +83,7 @@ void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, in auto conv2d_desc = convolution_forward::desc( prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md, - conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding, conv2d_padding); + conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding0, conv2d_padding1); auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, attr, eng); auto conv2d_src_memory = user_src_memory; @@ -98,12 +100,12 @@ void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, in } extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, - int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, - int p_Kw_, int p_Sh_, int p_Sw_) { + int p_W_, int p_O_, int p_G_, int p_Ph0_, int p_Pw0_, int p_Ph1_, + int p_Pw1_, int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_) { primitive_attr attr; std::vector bias(p_O_, 0); return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, - p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, attr); + p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, attr); } primitive_attr create_attr_with_relu_post_op() { @@ -117,20 +119,23 @@ primitive_attr create_attr_with_relu_post_op() { } extern "C" void dnnl_fused_conv2d_relu(float* data, float* weights, float* out, int p_N_, int p_C_, - int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, - int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_) { + int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph0_, + int p_Pw0_, int p_Ph1_, int p_Pw1_, int p_Kh_, int p_Kw_, + int p_Sh_, int p_Sw_) { std::vector bias(p_O_, 0); return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, - p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, + p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, create_attr_with_relu_post_op()); } extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias, float* out, int p_N_, int p_C_, int p_H_, int p_W_, int p_O_, - int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, - int p_Sh_, int p_Sw_) { - return dnnl_conv2d_common(data, weights, bias, out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph_, - p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, create_attr_with_relu_post_op()); + int p_G_, int p_Ph0_, int p_Pw0_, int p_Ph1_, + int p_Pw1_, int p_Kh_, int p_Kw_, int p_Sh_, + int p_Sw_) { + return dnnl_conv2d_common(data, weights, bias, out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph0_, + p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, + create_attr_with_relu_post_op()); } extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_) { diff --git a/src/runtime/contrib/dnnl/dnnl_kernel.h b/src/runtime/contrib/dnnl/dnnl_kernel.h index dbc064a6bc99..f5f28fccd8e7 100644 --- a/src/runtime/contrib/dnnl/dnnl_kernel.h +++ b/src/runtime/contrib/dnnl/dnnl_kernel.h @@ -36,19 +36,20 @@ namespace contrib { using namespace dnnl; extern "C" TVM_DLL void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, - int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, - int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_); + int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph0_, int p_Pw0_, + int p_Ph1_, int p_Pw1_, int 
p_Kh_, int p_Kw_, int p_Sh_, + int p_Sw_); extern "C" TVM_DLL void dnnl_fused_conv2d_relu(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, - int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, - int p_Sh_, int p_Sw_); + int p_Ph0_, int p_Pw0_, int p_Ph1_, int p_Pw1_, + int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_); extern "C" TVM_DLL void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias, float* out, int p_N_, int p_C_, int p_H_, - int p_W_, int p_O_, int p_G_, int p_Ph_, - int p_Pw_, int p_Kh_, int p_Kw_, int p_Sh_, - int p_Sw_); + int p_W_, int p_O_, int p_G_, int p_Ph0_, + int p_Pw0_, int p_Ph1_, int p_Pw1_, int p_Kh_, + int p_Kw_, int p_Sh_, int p_Sw_); extern "C" TVM_DLL void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_); diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index ad5f2aa9d4fa..41c113684f0a 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -213,6 +213,39 @@ def constant_updater(expr, symbol): tvm._ffi.registry.remove_global_func("relay.ext.ccompiler.constant_updater") +@pytest.mark.skipif( + not tvm.get_global_func("relay.ext.dnnl", True), + reason="skip because DNNL codegen is not available", +) +@parametrize_external_json_codegen_checks +def test_extern_dnnl_padding(check_result): + dtype = "float32" + ishape = (1, 1, 99, 12) + w1shape = (54, 1, 3, 3) + data0 = relay.var("data0", shape=(ishape), dtype=dtype) + weight0 = relay.var("weight0", shape=(w1shape), dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), strides=(2, 2), padding=(1, 0, 1, 1)) + f = relay.Function([data0, weight0], out) + ref_mod = tvm.IRModule() + ref_mod["main"] = f + + data1 = relay.var("data0", shape=(ishape), dtype=dtype) + weight1 = relay.var("weight0", shape=(w1shape), dtype=dtype) + f = set_external_func_attr(f, "dnnl", "dnnl_0") + call = relay.Call(f, [data1, weight1]) + mod = tvm.IRModule.from_expr(call) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w_data = np.random.uniform(0, 1, w1shape).astype(dtype) + + ref_res = relay.create_executor("graph", mod=ref_mod, device=tvm.cpu()).evaluate()( + i_data, w_data + ) + check_result( + mod, {"data0": i_data, "weight0": w_data}, (1, 54, 50, 6), ref_res.numpy(), tol=1e-5 + ) + + @pytest.mark.skipif( not tvm.get_global_func("relay.ext.dnnl", True), reason="skip because DNNL codegen is not available", diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py index 85583f6ccc5d..2d73ef85be28 100644 --- a/tests/python/relay/utils/external_codegen.py +++ b/tests/python/relay/utils/external_codegen.py @@ -59,7 +59,7 @@ def parametrize_external_json_codegen_checks(test): def update_lib(lib): test_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - source_dir = os.path.join(test_dir, "..", "..", "..") + source_dir = os.path.join(test_dir, "..", "..", "..", "..") contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") kwargs = {} From 8f5abaad030551e3c978c43e7655f201134f4b3b Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 24 Sep 2021 23:14:24 -0700 Subject: [PATCH 20/37] adding Jorn to reviewers list (#9105) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 14f8191707c8..b9ef0479c72f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -136,6 +136,7 @@ We do encourage 
everyone to work anything they are interested in. - [Jon Soifer](https://github.com/soiferj): @soiferj - [Zhixun Tan](https://github.com/phisiart): @phisiart - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch +- [Jorn Tuyls](https://github.com/jtuyls): @jtuyls - [Luis Vega](https://github.com/vegaluisjose): @vegaluisjose - [Thomas Viehmann](https://github.com/t-vi): @t-vi - [Yao Wang](https://github.com/kevinthesun): @kevinthesun From b33a1a795ff4fa46c791e83688adb5cc10d8942c Mon Sep 17 00:00:00 2001 From: Hongyi Jin <3231950289@qq.com> Date: Sat, 25 Sep 2021 20:29:38 +0800 Subject: [PATCH 21/37] [TensorIR][Bugfix] Disallow fusing loops with dependency (#9112) * check dependency for fuse * blank line --- include/tvm/tir/schedule/schedule.h | 1 + python/tvm/tir/schedule/schedule.py | 1 + src/tir/schedule/primitive.h | 1 + .../schedule/primitive/loop_transformation.cc | 39 +++++++++++++++---- .../unittest/test_tir_schedule_split_fuse.py | 18 +++++++++ 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 66dd5375eaf9..9f48d9ab9b1f 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -216,6 +216,7 @@ class ScheduleNode : public runtime::Object { * 1) The loops can't have annotations or thread bindings. * 2) The (i+1)-th loop must be the only child of the i-th loop. * 3) All loops must start with 0. + * 4) The domain of a loop to be fused cannot depend on another loop to be fused. * \param loop_rvs The loops to be fused * \return The new loop after fusion */ diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 7545c09b020d..d26ffc0b1efa 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -373,6 +373,7 @@ def fuse(self, *loops: List[LoopRV]) -> LoopRV: 1) The loops can't have annotations or thread bindings. 2) The (i+1)-th loop must be the only child of the i-th loop. 3) All loops must start with 0. + 4) The domain of a loop to be fused cannot depend on another loop to be fused. Parameters ---------- diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 05eefaca8a11..8ad6bdf7d37f 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -72,6 +72,7 @@ TVM_DLL Array Split(ScheduleState self, const StmtSRef& loop_sref, * 1) The loops can't have annotations or thread bindings. * 2) The inner loop must be the only child of the outer loop. * 3) All loops must start with 0. + * 4) The domain of a loop to be fused cannot depend on another loop to be fused. 
* \param self The state of the schedule * \param loop_srefs An array of srefs to the loops to be fused * \return The sref to the fused loop diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index 95c92aa0a322..7b9ac488b8b9 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -358,17 +358,26 @@ class LoopsNotAChainError : public ScheduleError { class DependentLoopError : public ScheduleError { public: - explicit DependentLoopError(IRModule mod, For loop, String inner_var) - : mod_(mod), loop_(std::move(loop)), inner_var_(std::move(inner_var)) {} + enum class PrimitiveKind { kFuse, kReorder }; + explicit DependentLoopError(IRModule mod, For loop, String inner_var, PrimitiveKind kind) + : mod_(mod), loop_(std::move(loop)), inner_var_(std::move(inner_var)), kind_(kind) {} String FastErrorString() const final { - return "ScheduleError: An outer loop's `min` or `extent` is dependent on an inner loop " - "in the new order"; + if (kind_ == PrimitiveKind::kReorder) { + return "ScheduleError: An outer loop's `min` or `extent` is dependent on an inner loop " + "in the new order"; + } else { + return "ScheduleError: A loop's `extent` is dependent on another loop"; + } } String DetailRenderTemplate() const final { - return "Outer Loop {0}'s `min` or `extent` is dependent on an inner loop " + inner_var_ + - " in the new order"; + if (kind_ == PrimitiveKind::kReorder) { + return "Outer Loop {0}'s `min` or `extent` is dependent on an inner loop " + inner_var_ + + " in the new order"; + } else { + return "A loop {0}'s `extent` is dependent on another loop " + inner_var_; + } } IRModule mod() const final { return mod_; } @@ -377,6 +386,7 @@ class DependentLoopError : public ScheduleError { IRModule mod_; For loop_; String inner_var_; + PrimitiveKind kind_; }; Array Split(ScheduleState self, const StmtSRef& loop_sref, @@ -450,6 +460,7 @@ StmtSRef Fuse(ScheduleState self, const Array& loop_srefs) { StmtSRef outer_loop_sref{nullptr}; const ForNode* outer_loop = nullptr; arith::Analyzer analyzer; + std::unordered_set outer_loop_vars; // Step 1. check correctness for (const StmtSRef& sref : loop_srefs) { const ForNode* loop = TVM_SREF_TO_FOR(loop, sref); @@ -469,6 +480,19 @@ StmtSRef Fuse(ScheduleState self, const Array& loop_srefs) { if (!analyzer.CanProve(loop->min == 0)) { throw LoopNotStartWithZeroError(self->mod, GetRef(loop)); } + const VarNode* used_var = nullptr; + auto f_contain = [&outer_loop_vars, &used_var](const VarNode* var) { + if (outer_loop_vars.count(var)) { + used_var = var; + return true; + } + return false; + }; + if (UsesVar(loop->extent, f_contain)) { + throw DependentLoopError(self->mod, GetRef(loop), used_var->name_hint, + DependentLoopError::PrimitiveKind::kFuse); + } + outer_loop_vars.insert(loop->loop_var.get()); loops.push_back(loop); } // Step 2. 
Create fused loop var and replace the original loop vars @@ -651,7 +675,8 @@ For ConstructNewLoopChain(const ScheduleState& self, std::vectormin, f_contain) || UsesVar(copy->extent, f_contain)) { - throw DependentLoopError(self->mod, GetRef(copy), used_var->name_hint); + throw DependentLoopError(self->mod, GetRef(copy), used_var->name_hint, + DependentLoopError::PrimitiveKind::kReorder); } inner_vars.insert(copy->loop_var.get()); new_loop = For(std::move(n)); diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index 2284f9d996b1..d11e7f877ccc 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -34,6 +34,16 @@ def elementwise(a: ty.handle, b: ty.handle) -> None: B[vi, vj, vk] = A[vi, vj, vk] * 2.0 +@tvm.script.tir +def elementwise_dependent_loops(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for i in tir.serial(0, 128): + for j, k in tir.grid(i, 128): + with tir.block([128, i, 128], "B") as [vi, vj, vk]: + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + @tvm.script.tir def elementwise_symbolic(a: ty.handle, b: ty.handle, n: ty.int32) -> None: A = tir.match_buffer(a, (128, 128, n)) @@ -462,5 +472,13 @@ def test_split_symbolic(): verify_trace_roundtrip(sch=sch, mod=elementwise_symbolic) +def test_fuse_fail_with_dependent_loops(): + sch = tir.Schedule(elementwise_dependent_loops, debug_mask="all") + block_b = sch.get_block("B") + i, j, _ = sch.get_loops(block_b) + with pytest.raises(tvm.tir.ScheduleError): + sch.fuse(i, j) + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From 80de1239e2c206b78f12e61da171340085b17ebc Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Sat, 25 Sep 2021 10:04:46 -0700 Subject: [PATCH 22/37] [Meta Schedule][M3a] SpaceGenerator (#9079) * Add meta shedule space generator. Co-authored-by: Junru Shao Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Ruihang Lai Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Wuwei Lin Co-authored-by: Siyuan Feng * Clean up. * Minor fix. * Move utils.h. 
Co-authored-by: Junru Shao Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Ruihang Lai Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Wuwei Lin Co-authored-by: Siyuan Feng --- include/tvm/meta_schedule/space_generator.h | 122 ++++++++++++++++++ .../tvm}/meta_schedule/tune_context.h | 6 + python/tvm/meta_schedule/__init__.py | 1 + .../meta_schedule/space_generator/__init__.py | 25 ++++ .../space_generator/schedule_fn.py | 90 +++++++++++++ .../space_generator/space_generator.py | 93 +++++++++++++ .../space_generator/space_generator_union.py | 41 ++++++ python/tvm/meta_schedule/tune_context.py | 9 +- .../space_generator/space_generator.cc | 44 +++++++ .../space_generator/space_generator_union.cc | 70 ++++++++++ src/meta_schedule/tune_context.cc | 10 +- src/meta_schedule/utils.h | 4 +- .../test_meta_schedule_space_generator.py | 88 +++++++++++++ .../test_meta_schedule_tune_context.py | 2 +- 14 files changed, 599 insertions(+), 6 deletions(-) create mode 100644 include/tvm/meta_schedule/space_generator.h rename {src => include/tvm}/meta_schedule/tune_context.h (89%) create mode 100644 python/tvm/meta_schedule/space_generator/__init__.py create mode 100644 python/tvm/meta_schedule/space_generator/schedule_fn.py create mode 100644 python/tvm/meta_schedule/space_generator/space_generator.py create mode 100644 python/tvm/meta_schedule/space_generator/space_generator_union.py create mode 100644 src/meta_schedule/space_generator/space_generator.cc create mode 100644 src/meta_schedule/space_generator/space_generator_union.cc create mode 100644 tests/python/unittest/test_meta_schedule_space_generator.py diff --git a/include/tvm/meta_schedule/space_generator.h b/include/tvm/meta_schedule/space_generator.h new file mode 100644 index 000000000000..9528be2a85ad --- /dev/null +++ b/include/tvm/meta_schedule/space_generator.h @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_META_SCHEDULE_SPACE_GENERATOR_H_ +#define TVM_META_SCHEDULE_SPACE_GENERATOR_H_ + +#include +#include + +namespace tvm { +namespace meta_schedule { + +// Forward declaration +class TuneContext; + +/*! \brief The abstract class for design space generation. */ +class SpaceGeneratorNode : public Object { + public: + /*! \brief Default destructor */ + virtual ~SpaceGeneratorNode() = default; + + /*! + * \brief Initialize the design space generator with tuning context. + * \param tune_context The tuning context for initialization. + */ + virtual void InitializeWithTuneContext(const TuneContext& tune_context) = 0; + + /*! + * \brief Generate design spaces given a module. + * \param mod The module used for design space generation. + * \return The generated design spaces, i.e., schedules. 
+   */
+  virtual Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) = 0;
+
+  static constexpr const char* _type_key = "meta_schedule.SpaceGenerator";
+  TVM_DECLARE_BASE_OBJECT_INFO(SpaceGeneratorNode, Object);
+};
+
+/*! \brief The design space generator with customized methods on the python-side. */
+class PySpaceGeneratorNode : public SpaceGeneratorNode {
+ public:
+  /*!
+   * \brief The function type of `InitializeWithTuneContext` method.
+   * \param tune_context The tuning context for initialization.
+   */
+  using FInitializeWithTuneContext = runtime::TypedPackedFunc<void(const TuneContext&)>;
+  /*!
+   * \brief The function type of `GenerateDesignSpace` method.
+   * \param mod The module used for design space generation.
+   * \return The generated design spaces, i.e., schedules.
+   */
+  using FGenerateDesignSpace = runtime::TypedPackedFunc<Array<tir::Schedule>(const IRModule&)>;
+
+  /*! \brief The packed function to the `InitializeWithTuneContext` function. */
+  FInitializeWithTuneContext f_initialize_with_tune_context;
+  /*! \brief The packed function to the `GenerateDesignSpace` function. */
+  FGenerateDesignSpace f_generate_design_space;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `f_initialize_with_tune_context` is not visited
+    // `f_generate_design_space` is not visited
+  }
+
+  void InitializeWithTuneContext(const TuneContext& tune_context) final {
+    f_initialize_with_tune_context(tune_context);
+  }
+
+  Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) final {
+    return f_generate_design_space(mod);
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode);
+};
+
+/*!
+ * \brief Managed reference to SpaceGeneratorNode.
+ * \sa SpaceGeneratorNode
+ */
+class SpaceGenerator : public ObjectRef {
+ protected:
+  SpaceGenerator() = default;
+
+ public:
+  /*!
+   * \brief Create a design space generator with customized methods on the python-side.
+   * \param initialize_with_tune_context_func The packed function of `InitializeWithTuneContext`.
+   * \param generate_design_space_func The packed function of `GenerateDesignSpace`.
+   * \return The design space generator created.
+   */
+  TVM_DLL static SpaceGenerator PySpaceGenerator(
+      PySpaceGeneratorNode::FInitializeWithTuneContext initialize_with_tune_context_func,
+      PySpaceGeneratorNode::FGenerateDesignSpace generate_design_space_func);
+
+  /*!
+   * \brief Create a design space generator that is union of multiple design space generators.
+   * \param space_generators An array of design space generators to be unioned.
+   * \return The design space generator created.
+   */
+  TVM_DLL static SpaceGenerator SpaceGeneratorUnion(Array<SpaceGenerator> space_generators);
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(SpaceGenerator, ObjectRef, SpaceGeneratorNode);
+};
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_SPACE_GENERATOR_H_
diff --git a/src/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h
similarity index 89%
rename from src/meta_schedule/tune_context.h
rename to include/tvm/meta_schedule/tune_context.h
index 454b8095aabc..87a3a491c8f3 100644
--- a/src/meta_schedule/tune_context.h
+++ b/include/tvm/meta_schedule/tune_context.h
@@ -20,6 +20,7 @@
 #define TVM_META_SCHEDULE_TUNE_CONTEXT_H_
 #include
+#include <tvm/meta_schedule/space_generator.h>
 #include
 #include
@@ -33,6 +34,8 @@ class TuneContextNode : public runtime::Object {
   Optional<IRModule> mod;
   /*! \brief The target to be tuned for. */
   Optional<Target> target;
+  /*! \brief The design space generator. */
+  Optional<SpaceGenerator> space_generator;
   /*!
\brief The name of the tuning task. */ Optional task_name; /*! \brief The random state. */ @@ -43,6 +46,7 @@ class TuneContextNode : public runtime::Object { void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("mod", &mod); v->Visit("target", &target); + v->Visit("space_generator", &space_generator); v->Visit("task_name", &task_name); v->Visit("rand_state", &rand_state); v->Visit("num_threads", &num_threads); @@ -62,12 +66,14 @@ class TuneContext : public runtime::ObjectRef { * \brief Constructor. * \param mod The workload to be tuned. * \param target The target to be tuned for. + * \param space_generator The design space generator. * \param task_name The name of the tuning task. * \param rand_state The random state. * \param num_threads The number of threads to be used. */ TVM_DLL explicit TuneContext(Optional mod, // Optional target, // + Optional space_generator, // Optional task_name, // support::LinearCongruentialEngine::TRandState rand_state, // int num_threads); diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py index f0e8af223511..c07b28b4fc9f 100644 --- a/python/tvm/meta_schedule/__init__.py +++ b/python/tvm/meta_schedule/__init__.py @@ -17,4 +17,5 @@ """Package `tvm.meta_schedule`. The meta schedule infrastructure.""" from . import builder from . import arg_info +from . import space_generator from .tune_context import TuneContext diff --git a/python/tvm/meta_schedule/space_generator/__init__.py b/python/tvm/meta_schedule/space_generator/__init__.py new file mode 100644 index 000000000000..af759d43b34a --- /dev/null +++ b/python/tvm/meta_schedule/space_generator/__init__.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +The tvm.meta_schedule.space_generator package. +Meta Schedule design space generators that generates design +space for generation of measure candidates. +""" + +from .space_generator import SpaceGenerator, PySpaceGenerator +from .space_generator_union import SpaceGeneratorUnion +from .schedule_fn import ScheduleFn diff --git a/python/tvm/meta_schedule/space_generator/schedule_fn.py b/python/tvm/meta_schedule/space_generator/schedule_fn.py new file mode 100644 index 000000000000..64edd9e0bf8c --- /dev/null +++ b/python/tvm/meta_schedule/space_generator/schedule_fn.py @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Meta schedule design space generators that generates design +space via a schedule function. +""" +from typing import TYPE_CHECKING, Callable, List, Union + +from tvm.ir import IRModule +from tvm.ir.container import Array +from tvm.tir.schedule import Schedule + +from .space_generator import PySpaceGenerator + +if TYPE_CHECKING: + from ..tune_context import TuneContext + + +class ScheduleFn(PySpaceGenerator): + """A design space generator with design spaces specified by a schedule function.""" + + # Multiple cases of schedule functions supported + SCH_FN_TYPE = Union[ + Callable[[IRModule], None], # No output + Callable[[IRModule], Schedule], # Single output + Callable[[IRModule], List[Schedule]], # Multiple outputs + ] + + def __init__(self, sch_fn: SCH_FN_TYPE): + """Constructor. + + Parameters + ---------- + sch_fn : SCH_FN_TYPE + The schedule function. + """ + super().__init__() + self.sch_fn = sch_fn + + def initialize_with_tune_context(self, tune_context: "TuneContext") -> None: + """Initialize the design space generator with tuning context. + + Parameters + ---------- + tune_context : TuneContext + The tuning context for initializing the design space generator. + """ + + def generate_design_space(self, mod: IRModule) -> List[Schedule]: + """Generate design spaces given a module. + + Parameters + ---------- + mod : IRModule + The module used for design space generation. + + Returns + ------- + design_spaces : List[Schedule] + The generated design spaces, i.e., schedules. + """ + sch = Schedule(mod) # Make sure the schedule is traced + result = self.sch_fn(sch) # Call the schedule function + if result is None: # Case 1. No output + return [sch] + if isinstance(result, Schedule): # Case 2. Single output + return [result] + if isinstance(result, (list, tuple, Array)): # Case 3. Multiple outputs + for ret in result: # enumerate the outputs + if not isinstance(ret, Schedule): + raise TypeError( + "Wrong type of element in the list, expected Schedule got " + + f"'{type(ret)}': {ret}" + ) + return result + raise TypeError(f"Unexpected return type {type(result)}: {result}") diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py new file mode 100644 index 000000000000..798753d91345 --- /dev/null +++ b/python/tvm/meta_schedule/space_generator/space_generator.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Meta Schedule design space generators that generates design +space for generation of measure candidates. +""" + +from typing import TYPE_CHECKING, List + +from tvm._ffi import register_object +from tvm.ir import IRModule +from tvm.runtime import Object +from tvm.tir.schedule import Schedule + +from .. import _ffi_api + +if TYPE_CHECKING: + from ..tune_context import TuneContext + + +@register_object("meta_schedule.SpaceGenerator") +class SpaceGenerator(Object): + """The abstract design space generator interface.""" + + def initialize_with_tune_context( + self, + tune_context: "TuneContext", + ) -> None: + """Initialize the design space generator with tuning context. + + Parameters + ---------- + tune_context : TuneContext + The tuning context for initializing the design space generator. + """ + _ffi_api.SpaceGeneratorInitializeWithTuneContext( # type: ignore # pylint: disable=no-member + self, tune_context + ) + + def generate_design_space(self, mod: IRModule) -> List[Schedule]: + """Generate design spaces given a module. + + Parameters + ---------- + mod : IRModule + The module used for design space generation. + + Returns + ------- + design_spaces : List[Schedule] + The generated design spaces, i.e., schedules. + """ + return _ffi_api.SpaceGeneratorGenerateDesignSpace(self, mod) # type: ignore # pylint: disable=no-member + + +@register_object("meta_schedule.PySpaceGenerator") +class PySpaceGenerator(SpaceGenerator): + """An abstract design space generator with customized methods on the python-side.""" + + def __init__(self): + """Constructor.""" + + def f_initialize_with_tune_context(tune_context: "TuneContext") -> None: + self.initialize_with_tune_context(tune_context) + + def f_generate_design_space(mod: IRModule) -> List[Schedule]: + return self.generate_design_space(mod) + + self.__init_handle_by_constructor__( + _ffi_api.SpaceGeneratorPySpaceGenerator, # type: ignore # pylint: disable=no-member + f_initialize_with_tune_context, + f_generate_design_space, + ) + + def initialize_with_tune_context(self, tune_context: "TuneContext") -> None: + raise NotImplementedError + + def generate_design_space(self, mod: IRModule) -> List[Schedule]: + raise NotImplementedError diff --git a/python/tvm/meta_schedule/space_generator/space_generator_union.py b/python/tvm/meta_schedule/space_generator/space_generator_union.py new file mode 100644 index 000000000000..5541ab0b5026 --- /dev/null +++ b/python/tvm/meta_schedule/space_generator/space_generator_union.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Union of meta Schedule design space generators.""" +from typing import List + +from tvm._ffi import register_object + +from .. 
import _ffi_api +from .space_generator import SpaceGenerator + + +@register_object("meta_schedule.SpaceGeneratorUnion") +class SpaceGeneratorUnion(SpaceGenerator): + """Union of design space generators.""" + + def __init__(self, space_generators: List[SpaceGenerator]): + """Constructor. + + Parameters + ---------- + space_generators : List[SpaceGenerator] + The list of design space generators to be unioned. + """ + self.__init_handle_by_constructor__( + _ffi_api.SpaceGeneratorSpaceGeneratorUnion, # type: ignore # pylint: disable=no-member + space_generators, + ) diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py index b2fee178ebd6..4c83b9afa289 100644 --- a/python/tvm/meta_schedule/tune_context.py +++ b/python/tvm/meta_schedule/tune_context.py @@ -16,7 +16,7 @@ # under the License. """Meta Schedule tuning context.""" -from typing import Optional +from typing import Optional, TYPE_CHECKING from tvm import IRModule from tvm.runtime import Object @@ -26,6 +26,9 @@ from . import _ffi_api +if TYPE_CHECKING: + from .space_generator import SpaceGenerator + @register_object("meta_schedule.TuneContext") class TuneContext(Object): @@ -68,6 +71,7 @@ def __init__( self, mod: Optional[IRModule] = None, target: Optional[Target] = None, + space_generator: Optional["SpaceGenerator"] = None, task_name: Optional[str] = None, rand_state: int = -1, num_threads: Optional[int] = None, @@ -80,6 +84,8 @@ def __init__( The workload to be optimized. target : Optional[Target] = None The target to be optimized for. + space_generator : Optional[SpaceGenerator] = None + The design space generator. task_name : Optional[str] = None The name of the tuning task. rand_state : int = -1 @@ -95,6 +101,7 @@ def __init__( _ffi_api.TuneContext, # type: ignore # pylint: disable=no-member mod, target, + space_generator, task_name, rand_state, num_threads, diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc new file mode 100644 index 000000000000..6df8da2f7aa1 --- /dev/null +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+SpaceGenerator SpaceGenerator::PySpaceGenerator(
+    PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context,
+    PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space) {
+  ObjectPtr<PySpaceGeneratorNode> n = make_object<PySpaceGeneratorNode>();
+  n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context);
+  n->f_generate_design_space = std::move(f_generate_design_space);
+  return SpaceGenerator(n);
+}
+
+TVM_REGISTER_OBJECT_TYPE(SpaceGeneratorNode);
+TVM_REGISTER_NODE_TYPE(PySpaceGeneratorNode);
+
+TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorInitializeWithTuneContext")
+    .set_body_method<SpaceGenerator>(&SpaceGeneratorNode::InitializeWithTuneContext);
+TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorGenerateDesignSpace")
+    .set_body_method<SpaceGenerator>(&SpaceGeneratorNode::GenerateDesignSpace);
+TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorPySpaceGenerator")
+    .set_body_typed(SpaceGenerator::PySpaceGenerator);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/space_generator/space_generator_union.cc b/src/meta_schedule/space_generator/space_generator_union.cc
new file mode 100644
index 000000000000..9c2e3eeabe09
--- /dev/null
+++ b/src/meta_schedule/space_generator/space_generator_union.cc
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+/*! \brief The union of design space generators. */
+class SpaceGeneratorUnionNode : public SpaceGeneratorNode {
+ public:
+  /*! \brief The array of design space generators unioned, could be recursive. */
+  Array<SpaceGenerator> space_generators;
+
+  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("space_generators", &space_generators); }
+
+  void InitializeWithTuneContext(const TuneContext& tune_context) final {
+    // Initialize each space generator.
+    for (const SpaceGenerator& space_generator : space_generators) {
+      space_generator->InitializeWithTuneContext(tune_context);
+    }
+  }
+
+  Array<tir::Schedule> GenerateDesignSpace(const IRModule& mod) final {
+    Array<tir::Schedule> design_spaces;
+    for (const SpaceGenerator& space_generator : space_generators) {
+      // Generate partial design spaces from each design space generator.
+      Array<tir::Schedule> partial = space_generator->GenerateDesignSpace(mod);
+      // Merge the partial design spaces.
+      design_spaces.insert(design_spaces.end(), partial.begin(), partial.end());
+    }
+    return design_spaces;
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.SpaceGeneratorUnion";
+  TVM_DECLARE_FINAL_OBJECT_INFO(SpaceGeneratorUnionNode, SpaceGeneratorNode);
+};
+
+/*!
+ * \brief Create a design space generator as union of given design space generators.
+ * \param space_generators Array of the design space generators to be unioned. + * \return The design space generator created. + */ +SpaceGenerator SpaceGenerator::SpaceGeneratorUnion(Array space_generators) { + ObjectPtr n = make_object(); + n->space_generators = std::move(space_generators); + return SpaceGenerator(n); +} + +TVM_REGISTER_NODE_TYPE(SpaceGeneratorUnionNode); +TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorSpaceGeneratorUnion") + .set_body_typed(SpaceGenerator::SpaceGeneratorUnion); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 6e80081c1ec2..ad82b6f514a2 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -16,11 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -#include "./tune_context.h" - #include #include +#include "./utils.h" + namespace tvm { namespace meta_schedule { @@ -28,6 +28,7 @@ namespace meta_schedule { * \brief Constructor function of TuneContext class. * \param mod The mod to be optimized. * \param target The target to be optimized for. + * \param space_generator The design space generator. * \param task_name The name of the tuning task. * \param rand_state The random state. * \param num_threads The number of threads to be used. @@ -35,12 +36,14 @@ namespace meta_schedule { */ TuneContext::TuneContext(Optional mod, // Optional target, // + Optional space_generator, // Optional task_name, // support::LinearCongruentialEngine::TRandState rand_state, // int num_threads) { ObjectPtr n = make_object(); n->mod = mod; n->target = target; + n->space_generator = space_generator; n->task_name = task_name; if (rand_state == -1) { rand_state = std::random_device()(); @@ -55,10 +58,11 @@ TVM_REGISTER_NODE_TYPE(TuneContextNode); TVM_REGISTER_GLOBAL("meta_schedule.TuneContext") .set_body_typed([](Optional mod, // Optional target, // + Optional space_generator, // Optional task_name, // support::LinearCongruentialEngine::TRandState rand_state, // int num_threads) -> TuneContext { - return TuneContext(mod, target, task_name, rand_state, num_threads); + return TuneContext(mod, target, space_generator, task_name, rand_state, num_threads); }); } // namespace meta_schedule } // namespace tvm diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index e6eae4d0d915..a2b5ac4d3184 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -21,8 +21,10 @@ #include #include +#include +#include -#include "../src/support/array.h" +#include "../support/array.h" namespace tvm { namespace meta_schedule {} // namespace meta_schedule diff --git a/tests/python/unittest/test_meta_schedule_space_generator.py b/tests/python/unittest/test_meta_schedule_space_generator.py new file mode 100644 index 000000000000..3ab60aced197 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_space_generator.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Test Meta Schedule SpaceGenerator """ +# pylint: disable=missing-function-docstring + +import sys +import math + +import pytest + +import tvm +from tvm import tir +from tvm.script import ty + +from tvm.tir.schedule import Schedule, Trace +from tvm.meta_schedule.space_generator import ScheduleFn, SpaceGeneratorUnion + + +# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument +# fmt: off + +@tvm.script.tir +class Matmul: + def main(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + tir.func_attr({"global_symbol": "main"}) + A = tir.match_buffer(a, (1024, 1024), "float32") + B = tir.match_buffer(b, (1024, 1024), "float32") + C = tir.match_buffer(c, (1024, 1024), "float32") + with tir.block([1024, 1024, tir.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + +# fmt: on +# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument + + +def schedule_matmul(sch: Schedule): + block = sch.get_block("matmul") + i, j, k = sch.get_loops(block=block) + # TODO(@zxybazh): Change to `sample_perfect_tile` after upstreaming + i_0, i_1, i_2, i_3 = sch.split(loop=i, factors=[2, 4, 64, 2]) + j_0, j_1, j_2, j_3 = sch.split(loop=j, factors=[4, 64, 2, 2]) + k_0, k_1 = sch.split(loop=k, factors=[32, 32]) + sch.reorder(i_0, j_0, i_1, j_1, k_0, i_2, j_2, k_1, i_3, j_3) + + +def _check_correct(schedule: Schedule): + trace = schedule.trace + for inst in trace.decisions: + assert math.prod(trace.decisions[inst]) == 1024 + + +def test_meta_schedule_space_generator_schedule_fn(): + mod = Matmul() + space_generator = ScheduleFn(sch_fn=schedule_matmul) + design_spaces = space_generator.generate_design_space(mod) + assert len(design_spaces) == 1 + (schedule,) = design_spaces + _check_correct(schedule) + + +def test_meta_schedule_design_space_generator_union(): + mod = Matmul() + space_generator = ScheduleFn(sch_fn=schedule_matmul) + space_generator_union = SpaceGeneratorUnion([space_generator, space_generator]) + design_spaces = space_generator_union.generate_design_space(mod) + assert len(design_spaces) == 2 + for design_space in design_spaces: + _check_correct(design_space) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_meta_schedule_tune_context.py b/tests/python/unittest/test_meta_schedule_tune_context.py index a6c2101928d7..2da4c85ab421 100644 --- a/tests/python/unittest/test_meta_schedule_tune_context.py +++ b/tests/python/unittest/test_meta_schedule_tune_context.py @@ -46,7 +46,7 @@ def main(a: ty.handle, b: ty.handle, c: ty.handle) -> None: # pylint: disable=n def test_tune_context_create(): mod = Matmul() - context = TuneContext(mod, Target("llvm"), "Test Task") + context = TuneContext(mod=mod, target=Target("llvm"), task_name="Test Task") assert context.num_threads > 0 assert context.rand_state != -1 assert context.task_name == "Test Task" From f573007777825b3df96bb069c80da57334e4b07a Mon Sep 17 00:00:00 2001 From: sunway Date: Mon, 27 Sep 2021 03:46:29 +0800 Subject: 
[PATCH 23/37] relu of dnnl json runtime only support 4-dims input (#9122) --- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 5 ++-- tests/python/relay/test_json_runtime.py | 28 +++++++++++-------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index e52009d7add7..66378d74f5d7 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -338,7 +338,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto data_entry = node.GetInputs()[0]; dnnl::memory::dims shape = nodes_[data_entry.id_].GetOpShape()[data_entry.index_]; - auto data_md = dnnl::memory::desc{{shape}, dt::f32, tag::abcd}; + dnnl::memory::desc data_md = GenDNNLMemDescByShape(shape, dt::f32); auto relu_desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_relu, data_md, 0); @@ -349,9 +349,8 @@ class DNNLJSONRuntime : public JSONRuntimeBase { net_.push_back(relu); auto data_memory = BindDNNLMemory(data_entry, data_md); - auto out_md = dnnl::memory::desc(shape, dt::f32, tag::abcd); JSONGraphNodeEntry out_entry(nid, 0); - auto out_memory = BindDNNLMemory(out_entry, out_md); + auto out_memory = BindDNNLMemory(out_entry, data_md); net_args_.push_back({{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, out_memory}}); } diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index 8107dc231adb..721271ac70f1 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -225,7 +225,7 @@ def test_relu(): dtype = "float32" shape = (1, 32, 14, 14) - def gen_relu(): + def gen_relu(shape): data0 = relay.var("data0", shape=shape, dtype=dtype) out = relay.nn.relu(data0) @@ -250,18 +250,22 @@ def gen_relu(): return mod, ref_mod - mod, ref_mod = gen_relu() + def check(shape): + mod, ref_mod = gen_relu(shape) + + data0 = np.random.uniform(-1, 1, shape).astype(dtype) + check_result( + mod, + ref_mod, + { + "data0": data0, + }, + shape, + tol=1e-5, + ) - data0 = np.random.uniform(-1, 1, shape).astype(dtype) - check_result( - mod, - ref_mod, - { - "data0": data0, - }, - (1, 32, 14, 14), - tol=1e-5, - ) + check(shape=(1, 32, 14, 14)) + check(shape=(1, 32)) def test_dense(): From 0564d38e7965152f94d61528020aa19fac064b0a Mon Sep 17 00:00:00 2001 From: sunway Date: Mon, 27 Sep 2021 03:47:16 +0800 Subject: [PATCH 24/37] add `multiply` and remove `subtract` for dnnl json runtime (#9120) --- python/tvm/relay/op/contrib/dnnl.py | 1 - src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 15 ++++--- tests/python/relay/test_json_runtime.py | 45 +++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 79bd02db164b..a2fdc19badab 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -64,7 +64,6 @@ def _func_wrapper(expr): _register_external_op_helper("nn.dense") _register_external_op_helper("nn.relu") _register_external_op_helper("add") -_register_external_op_helper("subtract") _register_external_op_helper("multiply") diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index 66378d74f5d7..b32d137a2566 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -113,7 +113,9 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } else if ("nn.relu" 
== op_name) { Relu(nid); } else if ("add" == op_name) { - Add(nid); + Binary(nid, dnnl::algorithm::binary_add); + } else if ("multiply" == op_name) { + Binary(nid, dnnl::algorithm::binary_mul); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -355,7 +357,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { net_args_.push_back({{DNNL_ARG_SRC, data_memory}, {DNNL_ARG_DST, out_memory}}); } - void Add(const size_t& nid) { + void Binary(const size_t& nid, dnnl::algorithm algo) { auto node = nodes_[nid]; // Memory and compute description. @@ -377,11 +379,10 @@ class DNNLJSONRuntime : public JSONRuntimeBase { JSONGraphNodeEntry out_entry(nid, 0); auto out_memory = BindDNNLMemory(out_entry, out_md); - auto add_desc = - dnnl::binary::desc(dnnl::algorithm::binary_add, data_mds[0], data_mds[1], out_md); - auto add_prim_desc = dnnl::binary::primitive_desc(add_desc, engine_); - auto add = dnnl::binary(add_prim_desc); - net_.push_back(add); + auto binary_desc = dnnl::binary::desc(algo, data_mds[0], data_mds[1], out_md); + auto binary_prim_desc = dnnl::binary::primitive_desc(binary_desc, engine_); + auto binary = dnnl::binary(binary_prim_desc); + net_.push_back(binary); net_args_.push_back({{DNNL_ARG_SRC_0, data_memories[0]}, {DNNL_ARG_SRC_1, data_memories[1]}, diff --git a/tests/python/relay/test_json_runtime.py b/tests/python/relay/test_json_runtime.py index 721271ac70f1..ca792204c835 100644 --- a/tests/python/relay/test_json_runtime.py +++ b/tests/python/relay/test_json_runtime.py @@ -216,6 +216,50 @@ def gen_add(): check_result(mod, ref_mod, {"data0": data0, "data1": data1}, shape, tol=1e-5) +def test_multiply(): + """Test a subgraph with a single add operator.""" + if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True): + print("skip because DNNL codegen is not available") + return + + dtype = "float32" + shape = (10, 10) + + def gen_multiply(): + data0 = relay.var("data0", shape=shape, dtype=dtype) + data1 = relay.var("data1", shape=shape, dtype=dtype) + out = relay.multiply(data0, data1) + + func = relay.Function([data0, data1], out) + func = set_func_attr(func, "dnnl", "tvmgen_default_dnnl_0") + glb_var = relay.GlobalVar("tvmgen_default_dnnl_0") + mod = tvm.IRModule() + mod[glb_var] = func + mod = transform.InferType()(mod) + + data0 = relay.var("data0", shape=shape, dtype=dtype) + data1 = relay.var("data1", shape=shape, dtype=dtype) + main_f = relay.Function([data0, data1], glb_var(data0, data1)) + mod["main"] = main_f + mod = transform.InferType()(mod) + + data0 = relay.var("data0", shape=shape, dtype=dtype) + data1 = relay.var("data1", shape=shape, dtype=dtype) + out = relay.multiply(data0, data1) + main_f = relay.Function([data0, data1], out) + ref_mod = tvm.IRModule() + ref_mod["main"] = main_f + ref_mod = transform.InferType()(ref_mod) + + return mod, ref_mod + + mod, ref_mod = gen_multiply() + + data0 = np.random.uniform(0, 1, shape).astype(dtype) + data1 = np.random.uniform(0, 1, shape).astype(dtype) + check_result(mod, ref_mod, {"data0": data0, "data1": data1}, shape, tol=1e-5) + + def test_relu(): """Test a subgraph with a single ReLU operator.""" if not tvm.get_global_func("runtime.DNNLJSONRuntimeCreate", True): @@ -672,6 +716,7 @@ def test_partial_constant(): if __name__ == "__main__": test_conv2d() test_add() + test_multiply() test_relu() test_dense() test_bn() From db8864b6540e8997a17ed2143ae9c1511afcda02 Mon Sep 17 00:00:00 2001 From: Yuxiang Wei Date: Mon, 27 Sep 2021 13:37:29 +0800 Subject: [PATCH 25/37] Fix the missing `dtype` attribute of `tir.Shuffle` in 
Python level (#9131) --- include/tvm/tir/expr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index 8ea48dd592d5..f6741112f269 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -947,6 +947,7 @@ class ShuffleNode : public PrimExprNode { Array indices; void VisitAttrs(AttrVisitor* v) { + v->Visit("dtype", &dtype); v->Visit("vectors", &vectors); v->Visit("indices", &indices); v->Visit("span", &span); From 44d5d7a434d17d96a1b803645638cb3a9832ddbc Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 27 Sep 2021 21:29:27 +0900 Subject: [PATCH 26/37] [Relay] Register layout conversion function to more reduce ops (#9048) * Register layout conversion function to more reduce ops * bug fix for exclude=True case, the original code compute wrong axes * properly handle variance op, which has two inputs * update test expected output --- src/relay/op/tensor/reduce.cc | 48 ++++++++--- .../python/relay/test_pass_alter_op_layout.py | 8 +- .../relay/test_pass_convert_op_layout.py | 80 +++++++++++-------- 3 files changed, 86 insertions(+), 50 deletions(-) diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index 693589fecfb4..c9f14c91c7b1 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -116,13 +116,14 @@ Array GetExcludeAxes(size_t indim, const Array& inaxis) { } // Return the modified layout for AlterOpLayout pass. +template InferCorrectLayoutOutput ReduceInferCorrectLayout(const Attrs& attrs, const Array& new_in_layouts, const Array& old_in_layouts, const Array& old_in_types) { - const auto* attrs_ptr = attrs.as(); + const auto* attrs_ptr = attrs.as(); ICHECK(attrs_ptr); - ObjectPtr params = make_object(*attrs_ptr); + ObjectPtr params = make_object(*attrs_ptr); // Get the reduce axes. Array> old_in_shapes; @@ -152,11 +153,14 @@ InferCorrectLayoutOutput ReduceInferCorrectLayout(const Attrs& attrs, for (auto iter_var : layout->axes) { const auto& layout_axis = LayoutAxis::Get(iter_var); const std::string& layout_dim = layout_axis.name(); - if (old_r_dims.count(layout_dim)) { - new_r_axes.push_back(tvm::Integer(axis_index)); - } // Collect only the primal axis. if (layout_axis.IsPrimal()) { + if (old_r_dims.count(layout_dim) && !params->exclude) { + new_r_axes.push_back(tvm::Integer(axis_index)); + } + if (!old_r_dims.count(layout_dim) && params->exclude) { + new_r_axes.push_back(tvm::Integer(axis_index)); + } if (!old_r_dims.count(layout_dim) || params->keepdims) { inferred_out_string += layout_dim; } @@ -171,18 +175,24 @@ InferCorrectLayoutOutput ReduceInferCorrectLayout(const Attrs& attrs, std::string new_layout_string; Array new_r_axes; + Array new_input_layouts; + + auto check_num_input_layouts = [](Array in_layouts) { + // The second case is for variance op + ICHECK(in_layouts.size() == 1 || in_layouts.size() == 2); + }; if (new_in_layouts.defined() && r_axes.size()) { // Adapt to new layout. The axis has to change. Record original reduce axes. Convert to the // modified layout axes. - ICHECK_EQ(new_in_layouts.size(), 1); - ICHECK_EQ(old_in_layouts.size(), 1); + check_num_input_layouts(new_in_layouts); + check_num_input_layouts(old_in_layouts); // Get inferred_in and inferred_out from new_in_layout. 
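    // For example (mirroring the ConvertLayout test added in this patch):
    // a reduction over axes {2, 3} of an NCHW tensor is re-inferred as a
    // reduction over axes {1, 2} once the input layout is converted to NHWC.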
std::tie(inferred_in, inferred_out, new_r_axes) = infer(new_in_layouts[0]); params->axis = new_r_axes; } else if (old_in_layouts.defined()) { - ICHECK_EQ(old_in_layouts.size(), 1); + check_num_input_layouts(old_in_layouts); // If the new layout is undefined, get inferred_in and inferred_out from old_in_layout. if (old_in_layouts[0].defined()) { @@ -190,7 +200,13 @@ InferCorrectLayoutOutput ReduceInferCorrectLayout(const Attrs& attrs, } } - return InferCorrectLayoutOutput({inferred_in}, {inferred_out}, Attrs(params)); + new_input_layouts.push_back(inferred_in); + + if (old_in_layouts.size() == 2) { + new_input_layouts.push_back(inferred_in); + } + + return InferCorrectLayoutOutput(new_input_layouts, {inferred_out}, Attrs(params)); } template @@ -389,6 +405,7 @@ values over a given axis. .set_support_level(4) .add_type_rel("ArgReduce", GenericReduceRel) .set_attr("FTVMCompute", ArgMaxCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array ArgMinCompute(const Attrs& attrs, const Array& inputs, @@ -405,6 +422,7 @@ values over a given axis. .set_support_level(4) .add_type_rel("ArgReduce", GenericReduceRel) .set_attr("FTVMCompute", ArgMinCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array SumCompute(const Attrs& attrs, const Array& inputs, @@ -433,7 +451,7 @@ Example:: .set_attrs_type() .set_support_level(4) .add_type_rel("Reduce", ReduceRel) - .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("FTVMCompute", SumCompute) .set_attr("TOpPattern", kCommReduce); @@ -468,6 +486,7 @@ Example:: .set_support_level(4) .add_type_rel("Reduce", ReduceRel) .set_attr("FTVMCompute", AllCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array AnyCompute(const Attrs& attrs, const Array& inputs, @@ -516,6 +535,7 @@ RELAY_REGISTER_REDUCE_OP("max") .set_support_level(4) .add_type_rel("Reduce", ReduceRel) .set_attr("FTVMCompute", MaxCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array MinCompute(const Attrs& attrs, const Array& inputs, @@ -531,6 +551,7 @@ RELAY_REGISTER_REDUCE_OP("min") .set_support_level(4) .add_type_rel("Reduce", ReduceRel) .set_attr("FTVMCompute", MinCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array ProdCompute(const Attrs& attrs, const Array& inputs, @@ -551,10 +572,10 @@ Example:: [[1,4],[4,3],[5,2]], [[7,1],[7,2],[7,3]]] - mean(data, axis=1) + prod(data, axis=1) [35562240] - mean(data, axis=[1,2]) + prod(data, axis=[1,2]) [ 36 480 2058] )code" TVM_ADD_FILELINE) @@ -562,6 +583,7 @@ Example:: .set_support_level(4) .add_type_rel("Reduce", ReduceRel) .set_attr("FTVMCompute", ProdCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); Array MeanCompute(const Attrs& attrs, const Array& inputs, @@ -600,6 +622,7 @@ Example:: .set_support_level(4) .add_type_rel("Reduce", ReduceRel) .set_attr("FTVMCompute", MeanCompute) + .set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); bool VarianceRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -675,6 +698,7 @@ RELAY_REGISTER_OP("variance") .add_argument("mean", "Tensor", "The mean tensor.") .add_type_rel("Variance", VarianceRel) .set_attr("FTVMCompute", VarianceCompute) + 
.set_attr("FInferCorrectLayout", ReduceInferCorrectLayout) .set_attr("TOpPattern", kCommReduce); } // namespace relay diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index ef5824c957e8..3310b6b2ed69 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -486,8 +486,7 @@ def before(): beta = relay.var("beta") y = relay.nn.batch_norm(y, gamma, beta, mean, var, axis=3) y = y[0] - y = relay.Function(analysis.free_vars(y), y) - return y + return relay.Function(analysis.free_vars(y), y) def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs @@ -509,9 +508,8 @@ def expected(): bias = relay.layout_transform(bias, src_layout="NCHW", dst_layout="NCHW16c") add = relay.add(y, bias) y = relay.layout_transform(add, src_layout="NCHW16c", dst_layout="NCHW") - y = relay.layout_transform(y, src_layout="NCHW", dst_layout="NHWC") - mean = relay.mean(y, axis=3, exclude=True) - var = relay.variance(y, axis=3, exclude=True) + mean = relay.mean(y, axis=1, exclude=True) + var = relay.variance(y, axis=1, exclude=True) denom = relay.const(1.0) / relay.sqrt(var + relay.const(1e-05)) gamma = relay.var("gamma", shape=(16,)) denom = denom * gamma diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 2b7e3e9eb3a9..a1965aa2d0c5 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """Test alter op layout pass""" +import pytest + import tvm from tvm import te @@ -1925,37 +1927,49 @@ def infer_correct_layout_relu(attrs, new_in_layouts, old_in_layouts, old_in_type assert test_infer_correct_layout_flag == True +def test_reduce_op_convert_layout(): + for reduce_op in [relay.argmax, relay.mean, relay.max]: + + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight = relay.var("weight", shape=(64, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + y = reduce_op(y, axis=[2, 3]) + y = relay.Function([x, weight], y) + return y + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight = relay.var("weight", shape=(64, 64, 3, 3)) + x = relay.layout_transform(x, "NCHW", "NHWC") + weight = relay.layout_transform(weight, "OIHW", "HWIO") + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = reduce_op(y, axis=[1, 2]) + y = relay.Function(relay.analysis.free_vars(y), y) + return y + + a = before() + a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]})) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + if __name__ == "__main__": - test_qnn_binary_no_convert_layout() - test_no_convert_layout() - test_conv_convert_layout() - test_conv_nhwc_convert_layout() - test_conv_bias_pool_convert_layout() - test_conv_concat_convert_layout() - test_dual_path_convert_layout() - test_bn_convert_layout() - test_slice_like_convert_layout() - test_transpose_convert_layout() - test_resnet_convert_layout() - test_scalar_convert_layout() - test_conv_bn_convert_layout() - test_qnn_conv_requantize_convert_layout() - test_qnn_conv_concat_convert_layout() - 
test_qnn_conv_add_convert_layout() - test_qnn_conv_nhwc_convert_layout() - test_conv_convert_kernel_layout() - test_conv_transpose_convert_layout() - test_conv_roi_align_convert_layout() - test_conv_roi_pool_convert_layout() - test_conv_strided_slice_convert_layout() - test_deformable_conv_bias_pool_convert_layout() - test_default_keyword() - test_different_ops_convert_layout() - test_no_desired_layout() - test_convert_with_config() - test_conv_squeeze_convert_layout() - test_conv_reduce_convert_layout() - test_conv_strided_slice_axes_convert_layout() - test_image_resize_convert_layout() - test_conv_image_resize_convert_layout() - test_infer_correct_layout() + pytest.main([__file__]) From c82b2bdb7011e58cffb7e9e5ef588c84af0017de Mon Sep 17 00:00:00 2001 From: Sen Yang Date: Mon, 27 Sep 2021 20:29:45 +0800 Subject: [PATCH 27/37] fix annotation of tir generic (#9119) --- python/tvm/tir/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/tir/generic.py b/python/tvm/tir/generic.py index 58efc0985970..68e995e01872 100644 --- a/python/tvm/tir/generic.py +++ b/python/tvm/tir/generic.py @@ -121,7 +121,7 @@ def floordiv(lhs, rhs, span=None): Returns ------- op : tvm.Expr - The result Expr of divide operaton. + The result Expr of floordiv operaton. """ return _ffi_api._OpFloorDiv(lhs, rhs, span) # type: ignore @@ -139,6 +139,6 @@ def cast(src, dtype, span=None): Returns ------- op : tvm.Expr - The result Expr of divide operaton. + The result Expr of cast operaton. """ return _ffi_api._cast(dtype, src, span) # type: ignore From f70ec813fb84ea643a6f3d2746034425f71422dd Mon Sep 17 00:00:00 2001 From: Sen Yang Date: Mon, 27 Sep 2021 20:30:06 +0800 Subject: [PATCH 28/37] prevent casting handle to other types (#9114) --- src/tir/op/op.cc | 1 + tests/python/unittest/test_tir_nodes.py | 32 ++++++------------------- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index 5db131c44f2a..d08bef2ab91a 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -273,6 +273,7 @@ PrimExpr cast(const DataType& t, PrimExpr value, Span span) { } else if (const FloatImmNode* op = value.as()) { return make_const(t, op->value, op->span); } + ICHECK(!value.dtype().is_handle()) << "Can't cast a handle to other types."; return tir::Cast(t, value, span); } else { if (value.dtype().lanes() == 1) { diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index dbae0b6fa516..de94464187b0 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
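# A minimal sketch of the behaviour enforced by the new ICHECK in cast() above
# (illustrative, mirroring the test added below): casting a handle-typed
# expression such as a StringImm now raises a TVMError instead of silently
# constructing an invalid Cast node, e.g.
#
#   s = tvm.tir.StringImm("s")
#   s.astype("int32")   # raises: Can't cast a handle to other types.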
+import pytest import tvm from tvm import te import numpy as np @@ -104,6 +105,11 @@ def test_cast(): assert isinstance(z, tvm.tir.Broadcast) assert z.lanes == 4 + s = tvm.tir.StringImm("s") + with pytest.raises(tvm.error.TVMError) as cm: + s.astype("int") + assert "Can't cast a handle to other types" in str(cm.execption) + def test_attr(): x = te.var("x") @@ -468,28 +474,4 @@ def test_block_blockrealize(): if __name__ == "__main__": - test_intimm_cond() - test_buffer_load_store() - test_vars() - test_prim_func() - test_cast() - test_attr() - test_const() - test_scalar_dtype_inference() - test_make() - test_ir() - test_basic() - test_stmt() - test_let() - test_dir() - test_dtype() - test_any() - test_all() - test_bitwise() - test_float_bitwise() - test_shift_bounds() - test_divide_by_zero() - test_isnan() - test_equality() - test_equality_string_imm() - test_block_blockrealize() + pytest.main([__file__]) From 861b47d5ecd74b3880dd79864056faf1f66c3874 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 27 Sep 2021 07:35:48 -0500 Subject: [PATCH 29/37] [LLVM] Refactor MakeCallPacked, NFC (#9118) Change the interface for `MakeCallPacked` in `CodeGenCPU` and in `CodeGenHexagon` to encapsulate the multiple returned values into a single structure. This should help readability, but also it will make the upcoming adoption of opaque pointers a bit easier. --- src/target/llvm/codegen_cpu.cc | 67 ++++++++++++------------ src/target/llvm/codegen_cpu.h | 8 ++- src/target/llvm/codegen_hexagon.cc | 82 ++++++++++++++++-------------- 3 files changed, 85 insertions(+), 72 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index ab96d6e69d14..e67dee3c37c4 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -686,10 +686,10 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) { return phi; } -llvm::BasicBlock* CodeGenCPU::MakeCallPacked(const Array& args, llvm::Value** rvalue, - llvm::Value** ret_tcode, const DataType& r_type, - const int64_t begin, const int64_t end) { - using llvm::BasicBlock; +CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const Array& args, + const DataType& r_type, + const int64_t begin, const int64_t end) { + PackedCall pc; std::string func_name = args[0].as()->value; llvm::Value* handle = GetPackedFuncHandle(func_name); // call the function @@ -702,66 +702,69 @@ llvm::BasicBlock* CodeGenCPU::MakeCallPacked(const Array& args, llvm:: llvm::Value* arg_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(begin)); llvm::Value* ret_value = builder_->CreateInBoundsGEP( builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), ConstInt32(end)); - *ret_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(end)); + llvm::Value* ret_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(end)); + #if TVM_LLVM_VERSION >= 90 auto call_callee = llvm::FunctionCallee(ftype_tvm_func_call_, RuntimeTVMFuncCall()); #else auto call_callee = RuntimeTVMFuncCall(); #endif - BasicBlock* end_block = CheckCallSuccess(builder_->CreateCall( - call_callee, {handle, arg_value, arg_tcode, ConstInt32(nargs), ret_value, *ret_tcode})); + llvm::Value* call = builder_->CreateCall( + call_callee, {handle, arg_value, arg_tcode, ConstInt32(nargs), ret_value, ret_tcode}); + llvm::BasicBlock* end_block = CheckCallSuccess(call); + + // Load the return value and cast it to the designated type (r_type). 
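  // The packed-call ABI returns results through the widened API slots of
  // TVMValue (e.g. an int32 result is stored and loaded as int64), so the
  // loaded value is narrowed back to the requested r_type via CreateCast.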
DataType r_api_type = tir::APIType(r_type); llvm::Value* load_ptr = builder_->CreatePointerCast(ret_value, DTypeToLLVMType(r_api_type)->getPointerTo()); #if TVM_LLVM_VERSION >= 110 - *rvalue = builder_->CreateAlignedLoad(load_ptr, llvm::Align(8)); + llvm::Value* rvalue = builder_->CreateAlignedLoad(load_ptr, llvm::Align(8)); #else - *rvalue = builder_->CreateAlignedLoad(load_ptr, 8); + llvm::Value* rvalue = builder_->CreateAlignedLoad(load_ptr, 8); #endif - *rvalue = CreateCast(r_api_type, r_type, *rvalue); - return end_block; + pc.ret_value = CreateCast(r_api_type, r_type, rvalue); + + // Load the return type code. +#if TVM_LLVM_VERSION >= 110 + pc.ret_tcode = builder_->CreateAlignedLoad(ret_tcode, llvm::Align(8)); +#else + pc.ret_tcode = builder_->CreateAlignedLoad(ret_tcode, 8); +#endif + + pc.end_block = end_block; + return pc; } llvm::Value* CodeGenCPU::CreateCallPacked(const CallNode* op) { ICHECK_EQ(op->args.size(), 5U); - llvm::Value* rvalue = nullptr; - llvm::Value* ret_tcode = nullptr; - MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, - op->args[4].as()->value); - return rvalue; + PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[3].as()->value, + op->args[4].as()->value); + return pc.ret_value; } llvm::Value* CodeGenCPU::CreateCallTracePacked(const CallNode* op) { - using llvm::BasicBlock; ICHECK_EQ(op->args.size(), 6U); - llvm::Value* rvalue = nullptr; - llvm::Value* ret_tcode = nullptr; - BasicBlock* end_block = - MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, - op->args[4].as()->value); + PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[3].as()->value, + op->args[4].as()->value); // Get traced value. llvm::Value* traced_value = MakeValue(op->args[5]); // The update_block handles case when we need to update the return value. - BasicBlock* update_block = BasicBlock::Create(*ctx_, "update_block", function_); + llvm::BasicBlock* update_block = llvm::BasicBlock::Create(*ctx_, "update_block", function_); // The continue_block handles case when we need to return original // traced value. - BasicBlock* continue_block = BasicBlock::Create(*ctx_, "continue_block", function_); -#if TVM_LLVM_VERSION >= 110 - llvm::Value* ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, llvm::Align(8)); -#else - llvm::Value* ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, 8); -#endif + llvm::BasicBlock* continue_block = llvm::BasicBlock::Create(*ctx_, "continue_block", function_); + // Check the ret_type_code and create cmp instruction. llvm::Value* cmp = - builder_->CreateICmpNE(ret_tcode_value, llvm::ConstantInt::get(t_int_, kTVMNullptr)); + builder_->CreateICmpNE(pc.ret_tcode, llvm::ConstantInt::get(t_int_, kTVMNullptr)); builder_->CreateCondBr(cmp, update_block, continue_block); builder_->SetInsertPoint(update_block); builder_->CreateBr(continue_block); builder_->SetInsertPoint(continue_block); // The return value depends on from what bb we come from. 
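  // (the freshly loaded packed-call result when its type code is not
  // kTVMNullptr, otherwise the original traced value).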
llvm::PHINode* phi_rvalue = builder_->CreatePHI(traced_value->getType(), 2); - phi_rvalue->addIncoming(rvalue, update_block); - phi_rvalue->addIncoming(traced_value, end_block); + phi_rvalue->addIncoming(pc.ret_value, update_block); + phi_rvalue->addIncoming(traced_value, pc.end_block); return phi_rvalue; } diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h index d08bd639e131..30e61ea63f12 100644 --- a/src/target/llvm/codegen_cpu.h +++ b/src/target/llvm/codegen_cpu.h @@ -110,8 +110,12 @@ class CodeGenCPU : public CodeGenLLVM { void UnpackClosureData(llvm::Value* cdata, const Array& fields, std::unordered_map* vmap); // Make packed call. - llvm::BasicBlock* MakeCallPacked(const Array& args, llvm::Value** rvalue, - llvm::Value** ret_tcode, const DataType& r_type, + struct PackedCall { + llvm::Value* ret_value; + llvm::Value* ret_tcode; + llvm::BasicBlock* end_block; + }; + PackedCall MakeCallPackedLowered(const Array& args, const DataType& r_type, const int64_t begin, const int64_t end); // create call into tvm packed function. llvm::Value* CreateCallPacked(const CallNode* op); diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index d9d0d1f3d6a4..d8a64102f9cd 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -97,8 +97,12 @@ class CodeGenHexagon final : public CodeGenLLVM { std::unordered_map func_handle_map_; // Make packed call. - llvm::BasicBlock* MakeCallPacked(const Array& args, llvm::Value** rvalue, - llvm::Value** ret_tcode, const DataType& r_type, + struct PackedCall { + llvm::Value* ret_value; + llvm::Value* ret_tcode; + llvm::BasicBlock* end_block; + }; + PackedCall MakeCallPackedLowered(const Array& args, const DataType& r_type, const int64_t begin, const int64_t end); // create call into tvm packed function. 
llvm::Value* CreateCallPacked(const CallNode* op); @@ -296,11 +300,11 @@ llvm::Value* CodeGenHexagon::RuntimeTVMAPISetLastError() { return GetContextPtr(gv_tvm_api_set_last_error_); } -llvm::BasicBlock* CodeGenHexagon::MakeCallPacked(const Array& args, llvm::Value** rvalue, - llvm::Value** ret_tcode, const DataType& r_type, - const int64_t begin, const int64_t end) { - using llvm::BasicBlock; - // using namespace tir; +CodeGenHexagon::PackedCall CodeGenHexagon::MakeCallPackedLowered(const Array& args, + const DataType& r_type, + const int64_t begin, + const int64_t end) { + PackedCall pc; std::string func_name = args[0].as()->value; llvm::Value* handle = GetPackedFuncHandle(func_name); // call the function @@ -313,25 +317,37 @@ llvm::BasicBlock* CodeGenHexagon::MakeCallPacked(const Array& args, ll llvm::Value* arg_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(begin)); llvm::Value* ret_value = builder_->CreateInBoundsGEP( builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), ConstInt32(end)); - *ret_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(end)); + llvm::Value* ret_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(end)); + #if TVM_LLVM_VERSION >= 90 auto call_callee = llvm::FunctionCallee(ftype_tvm_func_call_, RuntimeTVMFuncCall()); #else auto call_callee = RuntimeTVMFuncCall(); #endif - BasicBlock* end_block = CheckCallSuccess(builder_->CreateCall( - call_callee, {handle, arg_value, arg_tcode, ConstInt32(nargs), ret_value, *ret_tcode})); + llvm::Value* call = builder_->CreateCall( + call_callee, {handle, arg_value, arg_tcode, ConstInt32(nargs), ret_value, ret_tcode}); + llvm::BasicBlock* end_block = CheckCallSuccess(call); + + // Load the return value and cast it to the designated type (r_type). DataType r_api_type = tir::APIType(r_type); + llvm::Value* load_ptr = + builder_->CreatePointerCast(ret_value, DTypeToLLVMType(r_api_type)->getPointerTo()); #if TVM_LLVM_VERSION >= 110 - *rvalue = builder_->CreateAlignedLoad( - builder_->CreatePointerCast(ret_value, DTypeToLLVMType(r_api_type)->getPointerTo()), - llvm::Align(8)); + llvm::Value* rvalue = builder_->CreateAlignedLoad(load_ptr, llvm::Align(8)); #else - *rvalue = builder_->CreateAlignedLoad( - builder_->CreatePointerCast(ret_value, DTypeToLLVMType(r_api_type)->getPointerTo()), 8); + llvm::Value* rvalue = builder_->CreateAlignedLoad(load_ptr, 8); #endif - *rvalue = CreateCast(r_api_type, r_type, *rvalue); - return end_block; + pc.ret_value = CreateCast(r_api_type, r_type, rvalue); + + // Load the return type code. 
+#if TVM_LLVM_VERSION >= 110 + pc.ret_tcode = builder_->CreateAlignedLoad(ret_tcode, llvm::Align(8)); +#else + pc.ret_tcode = builder_->CreateAlignedLoad(ret_tcode, 8); +#endif + + pc.end_block = end_block; + return pc; } llvm::Value* CodeGenHexagon::GetPackedFuncHandle(const std::string& fname) { @@ -417,44 +433,34 @@ llvm::Value* CodeGenHexagon::CreateCallPacked(const CallNode* op) { } ICHECK_EQ(op->args.size(), 5U); - llvm::Value* rvalue = nullptr; - llvm::Value* ret_tcode = nullptr; - MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, - op->args[4].as()->value); - return rvalue; + PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[3].as()->value, + op->args[4].as()->value); + return pc.ret_value; } llvm::Value* CodeGenHexagon::CreateCallTracePacked(const CallNode* op) { - using llvm::BasicBlock; ICHECK_EQ(op->args.size(), 6U); - llvm::Value* rvalue = nullptr; - llvm::Value* ret_tcode = nullptr; - BasicBlock* end_block = - MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, - op->args[4].as()->value); + PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[3].as()->value, + op->args[4].as()->value); // Get traced value. llvm::Value* traced_value = MakeValue(op->args[5]); // The update_block handles case when we need to update the return value. - BasicBlock* update_block = BasicBlock::Create(*ctx_, "update_block", function_); + llvm::BasicBlock* update_block = llvm::BasicBlock::Create(*ctx_, "update_block", function_); // The continue_block handles case when we need to return original // traced value. - BasicBlock* continue_block = BasicBlock::Create(*ctx_, "continue_block", function_); -#if TVM_LLVM_VERSION >= 110 - llvm::Value* ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, llvm::Align(8)); -#else - llvm::Value* ret_tcode_value = builder_->CreateAlignedLoad(ret_tcode, 8); -#endif + llvm::BasicBlock* continue_block = llvm::BasicBlock::Create(*ctx_, "continue_block", function_); + // Check the ret_type_code and create cmp instruction. llvm::Value* cmp = - builder_->CreateICmpNE(ret_tcode_value, llvm::ConstantInt::get(t_int_, kTVMNullptr)); + builder_->CreateICmpNE(pc.ret_tcode, llvm::ConstantInt::get(t_int_, kTVMNullptr)); builder_->CreateCondBr(cmp, update_block, continue_block); builder_->SetInsertPoint(update_block); builder_->CreateBr(continue_block); builder_->SetInsertPoint(continue_block); // The return value depends on from what bb we come from. 
llvm::PHINode* phi_rvalue = builder_->CreatePHI(traced_value->getType(), 2); - phi_rvalue->addIncoming(rvalue, update_block); - phi_rvalue->addIncoming(traced_value, end_block); + phi_rvalue->addIncoming(pc.ret_value, update_block); + phi_rvalue->addIncoming(traced_value, pc.end_block); return phi_rvalue; } From d0c6ca5cacae8dcae26e26287d6d2a270ab6127c Mon Sep 17 00:00:00 2001 From: alter-xp Date: Tue, 28 Sep 2021 01:04:39 +0800 Subject: [PATCH 30/37] Frontend: add onnx GlobalLpPool op (#8845) * Frontend: add onnx GlobalLpPool op * update * fix for test Co-authored-by: xp224797 --- python/tvm/relay/frontend/onnx.py | 13 +++++++ tests/python/frontend/onnx/test_forward.py | 44 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index dfc0298979e6..1ebb12ac8199 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1008,6 +1008,18 @@ def _impl_v1(cls, inputs, attr, params): return _op.power(out, reci_p) +class GlobalLpPool(OnnxOpConverter): + """Operator converter for GlobalLpPool.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # TODO: GlobalLpPool does not yet support dynamic shapes + in_shape = infer_shape(inputs[0]) + attr["kernel_shape"] = in_shape[2:] + + return LpPool._impl_v1(inputs, attr, params) + + class Mul(Elemwise): """Operator converter for Multiply.""" @@ -4083,6 +4095,7 @@ def _get_convert_map(opset): # defs/nn "AveragePool": AveragePool.get_converter(opset), "LpPool": LpPool.get_converter(opset), + "GlobalLpPool": GlobalLpPool.get_converter(opset), "MaxPool": MaxPool.get_converter(opset), "MaxUnpool": MaxUnpool.get_converter(opset), "Conv": Conv.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index e049a195dc17..084a5b4e4733 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -3470,6 +3470,49 @@ def verify_lppool(x_shape, kernel_shape, p, strides, pads, out_shape, auto_pad=" ) +def verify_global_lppool(x_shape, p, out_shape, target, dev): + pool_node = helper.make_node( + "GlobalLpPool", + inputs=["x"], + outputs=["y"], + p=p, + ) + + graph = helper.make_graph( + [pool_node], + "global_lppool_test", + inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, list(x_shape))], + outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(out_shape))], + ) + + model = helper.make_model(graph, producer_name="global_lppool_test") + verify_with_ort( + model, [x_shape], out_shape, use_vm=True, convert_to_static=True, target=target, dev=dev + ) + + +@tvm.testing.parametrize_targets +def test_global_lppool(target, dev): + + # LpPool1D + verify_global_lppool(x_shape=[1, 15, 16], p=2, out_shape=[1, 15, 1], target=target, dev=dev) + + # LpPool2D + verify_global_lppool( + x_shape=[1, 15, 32, 32], p=2, out_shape=[1, 15, 1, 1], target=target, dev=dev + ) + + # LpPool2D + verify_global_lppool( + x_shape=[1, 15, 32, 32], p=3, out_shape=[1, 15, 1, 1], target=target, dev=dev + ) + + # LpPool3D + verify_global_lppool( + x_shape=[1, 15, 3, 32, 32], p=2, out_shape=[1, 15, 1, 1, 1], target=target, dev=dev + ) + + def verify_rnn( seq_length, batch_size, @@ -5826,3 +5869,4 @@ def repeat(N, D): test_random_uniform() test_convinteger() test_batch_matmul() + test_global_lppool() From 2c6fb65aba10439559d1c5747aed2b5d9a09a619 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Mon, 27 Sep 2021 20:36:21 +0100 Subject: 
[PATCH 31/37] Arm(R) Ethos(TM)-U NPU TIR to CS for Conv2D (#8811) This commit introduces the TIR to Command Stream(CS) translation using Vela API calls for conv2D and copy operations. It will create Vela npu_op objects for each command. Change-Id: I906d2cb333652813142cc70fb39b8372ec498bd0 --- .../contrib/ethosu/tir_to_cs_translator.py | 244 ++++++ .../test_ethosu/test_tir_to_cs_translator.py | 770 ++++++++++++++++++ 2 files changed, 1014 insertions(+) create mode 100644 tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index ce9abcbd683d..4b28dc5b191e 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -25,6 +25,8 @@ import ethosu.vela.api as vapi # type: ignore import tvm +from tvm.tir import stmt_functor +from tvm.relay.backend.contrib.ethosu import util from tvm.relay.backend.contrib.ethosu import vela_api from tvm.relay.backend.contrib.ethosu.tir import spec @@ -39,6 +41,14 @@ class BufferType(Enum): output = auto() +_REGION_MAP = { + BufferType.constant: 0, + BufferType.scratch: 1, + BufferType.input: 3, + BufferType.output: 4, +} + + class BufferInfo(NamedTuple): """A data structure to hold metadata of the buffer""" @@ -49,6 +59,72 @@ class BufferInfo(NamedTuple): btype: BufferType +def translate(tir_module, params): + """This will take an tir module for the NPU + and compile to command stream + + Parameters + ---------- + tir_module : tvm.IRModule + The TIR module containing ethosu extern calls + params : dict + A dictionary containing TIR primfunc argument ordering + idx to constant NDArray map + accel_type : ethosu.vela.api.NpuAccelerator + the accelerator variant the tir module needs to compiled to + + Returns + ------- + cs : str + An hex string of the bytes of command stream + encoded_constants : str + An hex string of the bytes that includes concat'd + encoded weights, encoded biases and scales. + scratch_size : int + The size of the scratch buffer needed. 
+ """ + + buffer_info = extract_buffer_info(tir_module, params) + extern_calls = extract_extern_calls(tir_module) + _npu_ops = list() + for extern_call in extern_calls: + _npu_ops.append(translate_ethosu_tir_extern_call(extern_call)) + _npu_ops, constant_tensor, scratch_size = assign_addresses(buffer_info, _npu_ops) + target_accel_type = vela_api.get_target_accel_type() + cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_type) + payload = vapi.npu_create_driver_payload(cmds, target_accel_type) + hex_value = "" if constant_tensor is None else constant_tensor.tobytes().hex() + return payload.hex(), hex_value, scratch_size + + +def extract_extern_calls(mod): + """This function will obtain all extern + calls from a TIR module + Parameters + ---------- + mod : tvm.IRModule + The TIR Module for NPU + + Returns + ------- + list + of tvm.tir.Call objects + that are tir extern calls + """ + # There should only be a single function + assert len(mod.functions.items()) == 1 + primfunc = mod.functions.items()[0][1] + + extern_calls = list() + + def populate_extern_calls(stmt): + if isinstance(stmt, tvm.tir.Call) and stmt.op.name == "tir.call_extern": + extern_calls.append(stmt) + + stmt_functor.post_order_visit(primfunc.body, populate_extern_calls) + return extern_calls + + def extract_buffer_info(mod, param_dict): """ This function is to read the tvm.IRModule that @@ -101,6 +177,156 @@ def populate_allocate_buffer_info(stmt): return buffer_info +def assign_addresses(buffer_info, npu_ops): + """This function will assign addresses to tensors + within two buffers : scratch and constants. + The scratch is the buffer created to hold all intermediary data + The constants is the buffer created via unifying all the constant data + (post-encoding). + Parameters + ---------- + buffer_info : dict + This is the dictionary obtained via calling extract_buffer_info. + The key is the buffer name to BufferInfo + npu_ops : list + A list of Vela NpuOps with tir.Loads for addresses + Returns + ------- + npu_ops : list + A list of Vela NpuOps with addesses within scratch and constant buffers + constant_tensor : NDArray + A unified constant data array of uint8 as the constant buffer + scratch_size : int + The size of the scratch tensor. + """ + + def replace_npu_fm_with_address(npu_fm): + assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.Load) + # We currently does not support tiles + # Change this when tiles are needed + # (i.e. 
when using rolling buffers) + assert npu_fm.tiles.addresses[1:] == [0, 0, 0] + npu_fm.tiles.addresses[1:] = [0, 0, 0] + buffer = npu_fm.tiles.addresses[0].buffer_var + assert buffer in buffer_addresses.keys() + address, buffer_type = buffer_addresses[buffer] + npu_fm.tiles.addresses[0] = address + npu_fm.region = _REGION_MAP[buffer_type] + return npu_fm + + def replace_npu_address_range_with_address(npu_addr_range): + assert isinstance(npu_addr_range.address, tvm.tir.Load) + buffer = npu_addr_range.address.buffer_var + assert buffer in buffer_addresses.keys() + address, buffer_type = buffer_addresses[buffer] + return vapi.NpuAddressRange(_REGION_MAP[buffer_type], address, npu_addr_range.length) + + def replace_tir_loads(npu_object): + if isinstance(npu_object, vapi.NpuFeatureMap): + return replace_npu_fm_with_address(npu_object) + if isinstance(npu_object, vapi.NpuAddressRange): + return replace_npu_address_range_with_address(npu_object) + return npu_object + + def classify_io(buffer): + for _npu_op in npu_ops: + if issubclass(type(_npu_op), vapi.NpuBlockOperation): + if _npu_op.ifm and _npu_op.ifm.tiles.addresses[0].buffer_var == buffer: + return BufferType.input + if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[0].buffer_var == buffer: + return BufferType.input + if _npu_op.ofm and _npu_op.ofm.tiles.addresses[0].buffer_var == buffer: + return BufferType.output + + raise ValueError(f"Unused IO : {buffer} in tir module.") + + scratch_size = 0 + constant_tensor = None + buffer_addresses = dict() + for _buffer, info in buffer_info.items(): + if info.values is not None: + assert np.dtype(info.dtype) == np.uint8 + assert info.btype == BufferType.constant + assert len(info.shape) == 1 + if constant_tensor is None: + buffer_addresses[_buffer] = (0, info.btype) + assert info.values.dtype == np.uint8 + size_in_bytes = info.values.size + # Every memory address the NPU access have to be 16 byte aligned + size_in_bytes = util.round_up(size_in_bytes, 16) + constant_tensor = np.resize(info.values, size_in_bytes) + else: + buffer_addresses[_buffer] = (constant_tensor.size, info.btype) + assert info.values.dtype == np.uint8 + size_in_bytes = info.values.size + # Every memory address the NPU access have to be 16 byte aligned + size_in_bytes = util.round_up(size_in_bytes, 16) + constant_tensor = np.append(constant_tensor, np.resize(info.values, size_in_bytes)) + else: + size_in_bytes = int( + (np.iinfo(np.dtype(info.dtype)).bits // 8) * np.prod(list(info.shape)) + ) + # Every memory address the NPU access have to be 16 byte aligned + size_in_bytes = util.round_up(size_in_bytes, 16) + if info.btype == BufferType.input_or_output: + buffer_type = classify_io(_buffer) + assert buffer_type in (BufferType.input, BufferType.output) + address = 0 + buffer_addresses[_buffer] = (address, buffer_type) + else: + assert info.btype == BufferType.scratch + address = scratch_size + scratch_size += size_in_bytes + buffer_addresses[_buffer] = (address, info.btype) + + for npu_op in npu_ops: + for attr_name, attr in npu_op.__dict__.items(): + if isinstance(attr, list): + new_attr = list() + for attr_ in attr: + new_attr.append(replace_tir_loads(attr_)) + setattr(npu_op, attr_name, new_attr) + else: + setattr(npu_op, attr_name, replace_tir_loads(attr)) + + return npu_ops, constant_tensor, scratch_size + + +def translate_ethosu_tir_extern_call(tir_extern_call): + """This is a dispatcher function to dispatch + correct translation call depending on the extern call's + first argument""" + supported_extern_calls = { + 
"ethosu_conv2d": translate_ethosu_conv2d, + "ethosu_copy": translate_ethosu_copy, + } + ext_call_type = tir_extern_call.args[0].value + assert ext_call_type in supported_extern_calls.keys(), f"{ext_call_type} is not yet supported" + npu_op = supported_extern_calls[ext_call_type](tir_extern_call) + # Some conversions return additional outputs + # if they are needed, the caller should use the function directly + if isinstance(npu_op, tuple): + return npu_op[0] + return npu_op + + +def translate_ethosu_copy(tir_extern_call): + """This function will translate a tir ethosu_copy extern_call + as produced by Relay to TIR compilation. + Parameters + ---------- + tir_extern_call : tvm.tir.Call + + Returns + ------- + ethosu.vela.api.NpuDmaOperation + The vela object containing the params of ethosu_copy + """ + # We skip the first element as it is the extern_call function name + serial_object = spec.create_serial_object(spec.SerialCopy, tir_extern_call.args[1:]) + return _create_npu_dma_op(serial_object) + + def _convert_clip_bounds(npu_op): """ This function will convert the min and max value @@ -330,3 +556,21 @@ def _create_npu_resampling_mode( mode = str(mode.value) assert mode in mode_map.keys() return mode_map[mode] + + +def _create_npu_dma_op(serial_copy): + """This is a helper function to capture the list of arguments + to create a NpuDmaOperation object""" + src = vapi.NpuAddressRange( + # region will be updated later + region=0, + address=serial_copy.read_address, + length=int(serial_copy.length.value), + ) + dest = vapi.NpuAddressRange( + # region will be updated later + region=0, + address=serial_copy.write_address, + length=int(serial_copy.length.value), + ) + return vapi.NpuDmaOperation(src, dest) diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py new file mode 100644 index 000000000000..479a1032453a --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -0,0 +1,770 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument +import pytest + +pytest.importorskip("ethosu.vela") +import numpy as np + +import tvm +from tvm import tir +from tvm.tir import stmt_functor +from tvm.script import ty +from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator +from tvm.relay.backend.contrib.ethosu import util +import ethosu.vela.api as vapi + + +# fmt: off +"""A sample tir test case for translator""" +@tvm.script.tir +class SingleEthosUConv2D: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_conv2d: ty.handle) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_4 = tir.match_buffer(placeholder_1, [1, 1, 3, 16], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 8, 8, 16], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 8, 8, 3, 8, 0, 8, tir.load("uint8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 8, 8, 16, 8, 0, 8, tir.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 16, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_4.data, 0), 0, 12, tir.load("uint8", placeholder_5.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8")) +# fmt: on + + +# fmt: off +"""A sample tir test case with multiple convolutions for translator""" +@tvm.script.tir +class MultiEthosUConv2D: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_conv2d: ty.handle) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_9 = tir.match_buffer(placeholder_3, [1, 1, 32, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 8, 8, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_7 = tir.match_buffer(placeholder_1, [1, 1, 3, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_6 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_8 = tir.match_buffer(placeholder_2, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder_4, [8], dtype="int32", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_conv2d_2 = tir.allocate([1024], "uint8", "global") + ethosu_conv2d_3 = tir.allocate([2048], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, tir.load("uint8", placeholder_6.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_7.data, 0), 0, 12, tir.load("uint8", placeholder_8.data, 0), 0, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="uint8")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, tir.load("uint8", 
ethosu_conv2d_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_9.data, 0), 0, 12, tir.load("uint8", placeholder_5.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, tir.load("uint8", placeholder_6.data, 96), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_7.data, 0), 0, 12, tir.load("uint8", placeholder_8.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, tir.load("uint8", ethosu_conv2d_1.data, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_9.data, 0), 0, 12, tir.load("uint8", placeholder_5.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8")) +# fmt: on + + +# fmt: off +"""A sample tir test case with copy operations for translator""" +@tvm.script.tir +class MultiEthosUCopy: + def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_conv2d: ty.handle) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_3 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 16, 16, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = tir.match_buffer(placeholder_2, [8], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = tir.match_buffer(placeholder_1, [8, 1, 1, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + placeholder_global = tir.allocate([256], "uint8", "global") + placeholder_d_global = tir.allocate([8], "int32", "global") + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", placeholder_4.data, 0), 256, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("int32", placeholder_5.data, 0), 8, tir.load("int32", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 8, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 0, 12, tir.load("uint8", placeholder_d_global, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) +# fmt: on + + +# fmt: off +"""A TIR test module of weight streaming""" +@tvm.script.tir +class WeightStreamOnly: + def main(placeholder: ty.handle, ethosu_conv2d: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, placeholder_5: ty.handle, placeholder_6: ty.handle, placeholder_7: ty.handle, placeholder_8: ty.handle) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + buffer_4 = tir.match_buffer(placeholder_5, [144], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_4, [20], dtype="uint8", elem_offset=0, align=128, 
offset_factor=1) + buffer_7 = tir.match_buffer(placeholder_7, [144], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_5 = tir.match_buffer(placeholder_1, [144], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_6 = tir.match_buffer(placeholder_6, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 16, 16, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_3, [144], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_2, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_9 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_8, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + placeholder_global = tir.allocate([144], "uint8", "global") + placeholder_d_global = tir.allocate([20], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_5.data, 0), 144, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_2.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 144, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 144, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_3.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 2), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 144, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_4.data, 0), 144, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_6.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 4), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 144, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_7.data, 0), 144, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", 
tir.load("uint8", buffer.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_9.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 6), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 144, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +# fmt: off +"""A TIR test module of weight streaming and direct reading""" +@tvm.script.tir +class MixedRead: + def main(placeholder: ty.handle, placeholder_1: ty.handle, ethosu_conv2d: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, placeholder_5: ty.handle, placeholder_6: ty.handle, placeholder_7: ty.handle, placeholder_8: ty.handle, placeholder_9: ty.handle, placeholder_10: ty.handle) -> None: + # function attr dict + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + buffer_5 = tir.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_7 = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_3 = tir.match_buffer(placeholder_7, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_6 = tir.match_buffer(placeholder_4, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_9 = tir.match_buffer(placeholder_5, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 16, 16, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer = tir.match_buffer(placeholder_8, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_1 = tir.match_buffer(placeholder_10, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_11 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_2 = tir.match_buffer(placeholder_6, [20], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_4 = tir.match_buffer(placeholder_3, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + buffer_8 = tir.match_buffer(placeholder_9, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + # body + ethosu_conv2d_2 = tir.allocate([4096], "uint8", "global") + placeholder_global = tir.allocate([80], "uint8", "global") + placeholder_d_global = tir.allocate([20], "uint8", "global") + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, tir.load("uint8", placeholder_11.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 16, 16, 0, 16, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_5.data, 0), 592, 12, tir.load("uint8", buffer_7.data, 0), 160, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_4.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_6.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 16, 16, 0, 16, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, 
tir.float32(0.5), 10, "NHWC", 256, 16, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_9.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_2.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 16, 16, 0, 16, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 2), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_3.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 16, 16, 0, 16, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 4), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_8.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 20, tir.load("uint8", placeholder_d_global, 0), dtype="handle")) + tir.evaluate(tir.call_extern("ethosu_conv2d", "uint8", 16, 16, 16, 16, 0, 16, tir.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "uint8", 16, 16, 2, 16, 0, 16, tir.load("uint8", ethosu_conv2d_1.data, 6), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 20, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle")) + __tvm_meta__ = None +# fmt: on + + +def test_buffer_info_extraction(): + test_cases = [ + { + # Stimulus + "tir_module": SingleEthosUConv2D(), + "param_dict": { + 1: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 16], "uint8" + ), + 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [16], "int32"), + }, + # Reference Outputs + "constants": { + "placeholder_4": 1, + "placeholder_5": 2, + }, + "data_buffers": { + "placeholder_3": ( + [1, 8, 8, 3], + "uint8", + tir_to_cs_translator.BufferType.input_or_output, + ), + "ethosu_conv2d_1": ( + [1, 8, 8, 16], + "uint8", + tir_to_cs_translator.BufferType.input_or_output, + ), + }, + }, + { + "tir_module": MultiEthosUConv2D(), + "param_dict": { + 1: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 32], "uint8" + ), + 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [32], "int32"), + 3: 
np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 32, 8], "uint8" + ), + 4: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [8], "int32"), + }, + # Reference Outputs + "constants": { + "placeholder_5": 4, + "placeholder_7": 1, + "placeholder_8": 2, + "placeholder_9": 3, + }, + "data_buffers": { + "placeholder_6": ( + [1, 8, 8, 3], + "uint8", + tir_to_cs_translator.BufferType.input_or_output, + ), + "ethosu_conv2d_1": ( + [1, 8, 8, 8], + "uint8", + tir_to_cs_translator.BufferType.input_or_output, + ), + "ethosu_conv2d_2": ([1024], "uint8", tir_to_cs_translator.BufferType.scratch), + "ethosu_conv2d_3": ([2048], "uint8", tir_to_cs_translator.BufferType.scratch), + }, + }, + ] + for test_case in test_cases: + buffer_info = tir_to_cs_translator.extract_buffer_info( + test_case["tir_module"], test_case["param_dict"] + ) + for buffer_var, info in buffer_info.items(): + buffer_name = buffer_var.name + if buffer_name in test_case["constants"].keys(): + assert ( + info.values == test_case["param_dict"][test_case["constants"][buffer_name]] + ).all() + assert ( + info.dtype == test_case["param_dict"][test_case["constants"][buffer_name]].dtype + ) + info.btype == tir_to_cs_translator.BufferType.constant + else: + assert list(info.shape) == test_case["data_buffers"][buffer_name][0] + assert info.dtype == test_case["data_buffers"][buffer_name][1] + assert info.btype == test_case["data_buffers"][buffer_name][2] + + +def test_translate_ethosu_conv2d(): + test_cases = [ + { + # Stimulus + "tir_module": SingleEthosUConv2D(), + "param_dict": { + 1: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 16], "uint8" + ), + 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [16], "int32"), + }, + # Reference outputs + "ref": [ + { + "ifm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(8, 8, 3), + "tiles": vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.5, 10), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(24, 3, 1), + }, + "ofm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(8, 8, 16), + "tiles": vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.25, 14), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(128, 16, 1), + }, + "kernel": vapi.NpuKernel( + w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 + ), + "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), + "activation": { + "op": vapi.NpuActivationOp.NONE_OR_RELU, + "min": -3.5, + "max": 60.25, + }, + "ifm_upscale": vapi.NpuResamplingMode.NONE, + "w_zero_point": 12, + } + ], + }, + { + "tir_module": MultiEthosUConv2D(), + "param_dict": { + 1: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 32], "uint8" + ), + 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [32], "int32"), + 3: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 32, 8], "uint8" + ), + 4: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [8], "int32"), + }, + # Reference Outputs + "ref": [ + { + "ifm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 3), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.5, 10), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(24, 3, 1), + }, + "ofm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 32), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 
0, 0, 0]), + "quantization": vapi.NpuQuantization(0.25, 14), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(256, 32, 1), + }, + "kernel": vapi.NpuKernel( + w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 + ), + "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), + "activation": {"op": None}, + "ifm_upscale": vapi.NpuResamplingMode.NONE, + "w_zero_point": 12, + }, + { + "ifm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 32), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.5, 10), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(256, 32, 1), + }, + "ofm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 8), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.25, 14), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(64, 8, 1), + }, + "kernel": vapi.NpuKernel( + w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 + ), + "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), + "activation": { + "op": vapi.NpuActivationOp.NONE_OR_RELU, + "min": -3.5, + "max": 60.25, + }, + "ifm_upscale": vapi.NpuResamplingMode.NONE, + "w_zero_point": 12, + }, + { + "ifm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 3), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.5, 10), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(24, 3, 1), + }, + "ofm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 32), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.25, 14), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(256, 32, 1), + }, + "kernel": vapi.NpuKernel( + w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 + ), + "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), + "activation": { + "op": vapi.NpuActivationOp.NONE_OR_RELU, + "min": -3.5, + "max": 60.25, + }, + "ifm_upscale": vapi.NpuResamplingMode.NONE, + "w_zero_point": 12, + }, + { + "ifm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 32), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.5, 10), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(256, 32, 1), + }, + "ofm": { + "data_type": vapi.NpuDataType.UINT8, + "shape": vapi.NpuShape3D(4, 8, 8), + "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), + "quantization": vapi.NpuQuantization(0.25, 14), + "layout": vapi.NpuLayout.NHWC, + "strides": vapi.NpuShape3D(64, 8, 1), + }, + "kernel": vapi.NpuKernel( + w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 + ), + "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), + "activation": { + "op": vapi.NpuActivationOp.NONE_OR_RELU, + "min": -3.5, + "max": 60.25, + }, + "ifm_upscale": vapi.NpuResamplingMode.NONE, + "w_zero_point": 12, + }, + ], + }, + ] + + def extract_ethosu_conv2d_extern_calls(mod): + """This function will obtain all ethosu_conv2d + calls from a NPU TIR module + Parameters + ---------- + mod : tvm.IRModule + This is a NPU TIR Module + + Returns + ------- + list + of tvm.tir.Call objects + that are tir extern calls + for ethosu_conv2d + """ + # There should only be a single function + assert len(mod.functions.items()) == 1 + primfunc = mod.functions.items()[0][1] + + ethosu_conv2d_calls = list() + + def populate_ethosu_conv2d_calls(stmt): + if ( + 
isinstance(stmt, tvm.tir.Call) + and stmt.op.name == "tir.call_extern" + and stmt.args[0] == "ethosu_conv2d" + ): + ethosu_conv2d_calls.append(stmt) + + stmt_functor.post_order_visit(primfunc.body, populate_ethosu_conv2d_calls) + return ethosu_conv2d_calls + + for test_case in test_cases: + ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_case["tir_module"]) + for idx, ethosu_conv2d_call in enumerate(ethosu_conv2d_calls): + ref = test_case["ref"][idx] + npu_op, w_zero_point = tir_to_cs_translator.translate_ethosu_conv2d(ethosu_conv2d_call) + # Compare IFM + assert npu_op.ifm.data_type == ref["ifm"]["data_type"] + assert npu_op.ifm.shape == ref["ifm"]["shape"] + assert npu_op.ifm.tiles.height_0 == ref["ifm"]["tiles"].height_0 + assert npu_op.ifm.tiles.height_1 == ref["ifm"]["tiles"].height_1 + assert npu_op.ifm.tiles.width_0 == ref["ifm"]["tiles"].width_0 + assert npu_op.ifm.quantization == ref["ifm"]["quantization"] + assert npu_op.ifm.layout == ref["ifm"]["layout"] + assert npu_op.ifm.strides == ref["ifm"]["strides"] + # Compare OFM + assert npu_op.ofm.data_type == ref["ofm"]["data_type"] + assert npu_op.ofm.shape == ref["ofm"]["shape"] + assert npu_op.ofm.tiles.height_0 == ref["ofm"]["tiles"].height_0 + assert npu_op.ofm.tiles.height_1 == ref["ofm"]["tiles"].height_1 + assert npu_op.ofm.tiles.width_0 == ref["ofm"]["tiles"].width_0 + assert npu_op.ofm.quantization == ref["ofm"]["quantization"] + assert npu_op.ofm.layout == ref["ofm"]["layout"] + assert npu_op.ofm.strides == ref["ofm"]["strides"] + # Compare kernel and padding + assert npu_op.kernel.__dict__ == ref["kernel"].__dict__ + assert npu_op.padding == ref["padding"] + # Compare activation + if ref["activation"]["op"] is None: + assert npu_op.activation is None + else: + assert npu_op.activation.op_type == ref["activation"]["op"] + assert npu_op.activation.min == ref["activation"]["min"] + assert npu_op.activation.max == ref["activation"]["max"] + # Compare ifm upscaling + assert npu_op.ifm_upscale == ref["ifm_upscale"] + # Compare weight quantization parameters + assert w_zero_point == ref["w_zero_point"] + + +def test_translate_ethosu_copy(): + def extract_ethosu_copy_extern_calls(mod): + """This function will obtain all ethosu_conv2d + calls from a NPU TIR module + Parameters + ---------- + mod : tvm.IRModule + This is a NPU TIR Module + + Returns + ------- + list + of tvm.tir.Call objects + that are tir extern calls + for ethosu_conv2d + """ + # There should only be a single function + assert len(mod.functions.items()) == 1 + primfunc = mod.functions.items()[0][1] + + ethosu_copy_calls = list() + + def populate_ethosu_copy_calls(stmt): + if ( + isinstance(stmt, tvm.tir.Call) + and stmt.op.name == "tir.call_extern" + and stmt.args[0] == "ethosu_copy" + ): + ethosu_copy_calls.append(stmt) + + stmt_functor.post_order_visit(primfunc.body, populate_ethosu_copy_calls) + return ethosu_copy_calls + + test_cases = [ + { + # Stimulus + "tir_module": MultiEthosUCopy(), + "param_dict": { + 1: np.random.randint( + np.iinfo("uint8").min, np.iinfo("uint8").max, [8, 1, 1, 32], "uint8" + ), + 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [8], "int32"), + }, + # Reference outputs + "ref": [ + { + "src": "placeholder_4", + "dest": "placeholder_global", + "length": 256, + }, + { + "src": "placeholder_5", + "dest": "placeholder_d_global", + "length": 8, + }, + ], + }, + ] + + for test_case in test_cases: + ethosu_copy_calls = extract_ethosu_copy_extern_calls(test_case["tir_module"]) + for idx, ethosu_copy_call in 
enumerate(ethosu_copy_calls): + npu_dma_op = tir_to_cs_translator.translate_ethosu_tir_extern_call(ethosu_copy_call) + assert npu_dma_op.src.address.buffer_var.name == test_case["ref"][idx]["src"] + assert npu_dma_op.dest.address.buffer_var.name == test_case["ref"][idx]["dest"] + assert npu_dma_op.src.length == test_case["ref"][idx]["length"] + assert npu_dma_op.dest.length == test_case["ref"][idx]["length"] + + +def test_assign_addresses(): + test_cases = [ + { + # Stimulus + "tir_module": WeightStreamOnly(), + "param_dict": { + 2: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"), + 3: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 4: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"), + 5: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 6: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"), + 7: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 8: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"), + 9: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + }, + }, + { + # Stimulus + "tir_module": MixedRead(), + "param_dict": { + 1: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [592], "uint8"), + 3: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [160], "uint8"), + 4: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"), + 5: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 6: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"), + 7: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 8: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"), + 9: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + 10: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"), + 11: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), + }, + }, + ] + + def extract_extern_calls(mod): + """This function will obtain all ethosu_conv2d + calls from a NPU TIR module + Parameters + ---------- + mod : tvm.IRModule + This is a NPU TIR Module + + Returns + ------- + list + of tvm.tir.Call objects + that are tir extern calls + for ethosu_conv2d + """ + # There should only be a single function + assert len(mod.functions.items()) == 1 + primfunc = mod.functions.items()[0][1] + + extern_calls = list() + + def populate_extern_calls(stmt): + if isinstance(stmt, tvm.tir.Call) and stmt.op.name == "tir.call_extern": + extern_calls.append(stmt) + + stmt_functor.post_order_visit(primfunc.body, populate_extern_calls) + return extern_calls + + def collect_tir_buffer_info(npu_ops): + """This is run prior to address assigning to collect tir buffer information + for verification later on""" + _npu_op_tir_buffers = dict() + for npu_op in npu_ops: + if isinstance(npu_op, vapi.NpuDmaOperation): + _npu_op_tir_buffers[npu_op] = (npu_op.src.address, npu_op.dest.address) + elif issubclass(type(npu_op), vapi.NpuBlockOperation): + _npu_op_tir_buffers[npu_op] = ( + npu_op.ifm.tiles.addresses[0], + npu_op.ofm.tiles.addresses[0], + npu_op.weights, + npu_op.biases, + ) + return _npu_op_tir_buffers + + def _check_buffer(address, region, length, buffer_var): + """Checks whether the buffer information is valid with + original tir buffers. 
+ - If its constant, this will check + the slice in the constant tensor has the values. + - If its scratch, this will check + the slice is within scratch and does not have conflicts + with other scratch tensors. + - If its input/output, this will check the + address is zero + """ + inverse_region_map = { + 0: tir_to_cs_translator.BufferType.constant, + 1: tir_to_cs_translator.BufferType.scratch, + 3: tir_to_cs_translator.BufferType.input, + 4: tir_to_cs_translator.BufferType.output, + } + buffer_type = inverse_region_map[region] + if buffer_type == tir_to_cs_translator.BufferType.constant: + ref = buffer_info[buffer_var].values + assert (constant_tensor[address : address + length] == ref).all() + # Every buffer is adjusted to align to 16 bytes + length = util.round_up(length, 16) + # Mark these constants are read at least once + constant_tensor_read_mask[address : address + length] = np.ones(length, dtype="uint8") + elif buffer_type == tir_to_cs_translator.BufferType.scratch: + shape = list(buffer_info[buffer_var].shape) + assert length == np.prod(shape) + assert address < scratch_size + # Every buffer is adjusted to align to 16 bytes + length = util.round_up(length, 16) + assert address + length <= scratch_size + # The scratch area should not be used by anyother buffer + assert not scratch_allocation_mask[address : address + length].any() + # The scratch area is marked as used + scratch_allocation_mask[address : address + length] = np.ones(length, dtype="uint8") + elif buffer_type == tir_to_cs_translator.BufferType.input: + assert address == 0 + else: + assert buffer_type == tir_to_cs_translator.BufferType.output + assert address == 0 + + def verify(npu_ops): + """This wrapper verifies the allocated addresses matches with original tir buffers""" + checked_buffers = set() + + def check_buffer(address, region, length, buffer_var): + if buffer_var not in checked_buffers: + _check_buffer(address, region, length, buffer_var) + checked_buffers.add(buffer_var) + + for npu_op in npu_ops: + if isinstance(npu_op, vapi.NpuDmaOperation): + src_tir_buffer_var = npu_op_tir_buffers[npu_op][0].buffer_var + check_buffer( + npu_op.src.address, npu_op.src.region, npu_op.src.length, src_tir_buffer_var + ) + dest_tir_load = npu_op_tir_buffers[npu_op][1].buffer_var + check_buffer( + npu_op.dest.address, + npu_op.dest.region, + npu_op.dest.length, + dest_tir_load, + ) + elif issubclass(type(npu_op), vapi.NpuBlockOperation): + ifm_tir_buffer_var = npu_op_tir_buffers[npu_op][0].buffer_var + ifm_length = ( + npu_op.ifm.shape.height * npu_op.ifm.shape.width * npu_op.ifm.shape.depth + ) + check_buffer( + npu_op.ifm.tiles.addresses[0], + npu_op.ifm.region, + ifm_length, + ifm_tir_buffer_var, + ) + ofm_tir_buffer_var = npu_op_tir_buffers[npu_op][1].buffer_var + ofm_length = ( + npu_op.ofm.shape.height * npu_op.ofm.shape.width * npu_op.ofm.shape.depth + ) + check_buffer( + npu_op.ofm.tiles.addresses[0], + npu_op.ofm.region, + ofm_length, + ofm_tir_buffer_var, + ) + for idx, weight in enumerate(npu_op_tir_buffers[npu_op][2]): + assert isinstance(weight, vapi.NpuAddressRange) + check_buffer( + npu_op.weights[idx].address, + npu_op.weights[idx].region, + npu_op.weights[idx].length, + weight.address.buffer_var, + ) + for idx, bias in enumerate(npu_op_tir_buffers[npu_op][3]): + assert isinstance(bias, vapi.NpuAddressRange) + check_buffer( + npu_op.biases[idx].address, + npu_op.biases[idx].region, + npu_op.biases[idx].length, + bias.address.buffer_var, + ) + + for test_case in test_cases: + buffer_info = 
tir_to_cs_translator.extract_buffer_info( + test_case["tir_module"], test_case["param_dict"] + ) + extern_calls = extract_extern_calls(test_case["tir_module"]) + _npu_ops = list() + for extern_call in extern_calls: + _npu_ops.append(tir_to_cs_translator.translate_ethosu_tir_extern_call(extern_call)) + npu_op_tir_buffers = collect_tir_buffer_info(_npu_ops) + _npu_ops, constant_tensor, scratch_size = tir_to_cs_translator.assign_addresses( + buffer_info, _npu_ops + ) + scratch_allocation_mask = np.zeros(scratch_size, dtype="uint8") + constant_tensor_read_mask = np.zeros(constant_tensor.size, dtype="uint8") + verify(_npu_ops) + # This will be only 1 if all allocated scratch is used. + assert np.prod(scratch_allocation_mask) == 1 + # This will be only 1 if all constant tensors is read at least once. + assert np.prod(constant_tensor_read_mask) == 1 + + +if __name__ == "__main__": + test_buffer_info_extraction() + test_translate_ethosu_conv2d() + test_translate_ethosu_copy() + test_assign_addresses() From 507400eec2c6dedfc68422afafbedbda3b83539c Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Mon, 27 Sep 2021 20:54:53 +0100 Subject: [PATCH 32/37] Ensure google-mock is installed and setup (#9107) Google Mock is the mocking/helper framework that gets bundled with Google Test, it used to be separate but now isn't. I ran into the issue of Google Mock not being configured fully in the i386 build of #9106, which uses the `HasSubtr` matcher. This PR aims to fully configure Google Mock for use, which is interesting in itself... The headers are installed as part of Ubuntu 18.04's `googletest` package: ```shell $ dpkg -S /usr/include/gmock/ googletest:amd64: /usr/include/gmock ``` But not the lib sources, that requires another package named `google-mock`: ```shell $ dpkg -S /usr/src/gmock google-mock:amd64: /usr/src/gmock ``` But in Ubuntu 16.04 the includes and lib sources are in the `google-mock` package: ```shell $ dpkg -S /usr/include/gmock google-mock:i386: /usr/include/gmock $ dpkg -S /usr/src/gmock/ google-mock:i386: /usr/src/gmock ``` And excitingly, in Ubuntu 20.04 this will again be changed to `libgmock-dev` by the looks of things, just to keep us on our toes. --- docker/install/ubuntu_install_core.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh index 2a50afcf5985..fb167b92f5c5 100755 --- a/docker/install/ubuntu_install_core.sh +++ b/docker/install/ubuntu_install_core.sh @@ -22,9 +22,10 @@ set -o pipefail # install libraries for building c++ core on ubuntu apt-get update && apt-get install -y --no-install-recommends \ - git make libgtest-dev cmake wget unzip libtinfo-dev libz-dev\ + git make google-mock libgtest-dev cmake wget unzip libtinfo-dev libz-dev \ libcurl4-openssl-dev libssl-dev libopenblas-dev g++ sudo \ apt-transport-https graphviz pkg-config curl cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib +cd /usr/src/gmock && cmake CMakeLists.txt && make && cp *.a /usr/lib From 49c99edf9277d36315c353121a3f20c64bc820f3 Mon Sep 17 00:00:00 2001 From: Ioannis Doudalis Date: Mon, 27 Sep 2021 13:06:43 -0700 Subject: [PATCH 33/37] [CI] bash.sh, build.sh: add option to set the container name and hostname (#9110) This commit adds option "--name" to bash.sh and build.sh to enable the user specify the name of the container and set the hostname inside the container as well. 
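For illustration, an invocation could look roughly like the snippet below; the container name "tvm-ci-cpu" and the container type "ci_cpu" are placeholder examples rather than values taken from this patch, and the exact argument order should follow the usage text updated here.

```shell
# Start a named, interactive container; its hostname inside the container will also be "tvm-ci-cpu"
$ docker/bash.sh -i -t --name tvm-ci-cpu ci_cpu

# Build using a container that is named (and hostnamed) the same way
$ docker/build.sh ci_cpu --name tvm-ci-cpu
```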
This helps the developer idenitfy that they are inside the container and which container they are working inside. --- docker/bash.sh | 24 ++++++++++++++++++++++-- docker/build.sh | 12 +++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docker/bash.sh b/docker/bash.sh index 372cfded8f89..cbd71870747c 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -22,7 +22,7 @@ # # Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty] # [--mount MOUNT_DIR] [--repo-mount-point REPO_MOUNT_POINT] -# [--dry-run] +# [--dry-run] [--name NAME] # [--] [COMMAND] # # Usage: docker/bash.sh @@ -40,7 +40,7 @@ function show_usage() { cat < [--] [COMMAND] -h, --help @@ -85,6 +85,11 @@ Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty] Print the docker command to be run, but do not execute it. +--name + + Set the name of the docker container, and the hostname that will + appear inside the container. + DOCKER_IMAGE_NAME The name of the docker container to be run. This can be an @@ -118,6 +123,7 @@ USE_NET_HOST=false DOCKER_IMAGE_NAME= COMMAND=bash MOUNT_DIRS=( ) +CONTAINER_NAME= # TODO(Lunderberg): Remove this if statement and always set to # "${REPO_DIR}". The consistent directory for Jenkins is currently @@ -180,6 +186,15 @@ while (( $# )); do shift ;; + --name) + if [[ -n "$2" ]]; then + CONTAINER_NAME="$2" + shift 2 + else + parse_error 'ERROR: --name requires a non empty argument' + fi + ;; + --dry-run) DRY_RUN=true shift @@ -312,6 +327,11 @@ if ${TTY}; then DOCKER_FLAGS+=( --tty ) fi +# Setup the docker name and the hostname inside the container +if [[ ! -z "${CONTAINER_NAME}" ]]; then + DOCKER_FLAGS+=( --name ${CONTAINER_NAME} --hostname ${CONTAINER_NAME}) +fi + # Expose external directories to the docker container for MOUNT_DIR in ${MOUNT_DIRS[@]+"${MOUNT_DIRS[@]}"}; do DOCKER_MOUNT+=( --volume "${MOUNT_DIR}:${MOUNT_DIR}" ) diff --git a/docker/build.sh b/docker/build.sh index 3b58bcc52a75..4e1a9b346895 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -23,7 +23,8 @@ # Usage: build.sh [--tag ] # [--dockerfile ] [-it] # [--net=host] [--cache-from ] -# [--context-path ] [] +# [--name CONTAINER_NAME] [--context-path ] +# [] # # CONTAINER_TYPE: Type of the docker container used the run the build, # e.g. "ci_cpu", "ci_gpu" @@ -38,6 +39,9 @@ # IMAGE_NAME: An image to be as a source for cached layers when building the # Docker image requested. # +# CONTAINER_NAME: The name of the docker container, and the hostname that will +# appear inside the container. +# # CONTEXT_PATH: Path to be used for relative path resolution when building # the Docker images. # @@ -95,6 +99,12 @@ else echo "Using default context path: ${DOCKER_CONTEXT_PATH}" fi +if [[ "$1" == "--name" ]]; then + CI_DOCKER_EXTRA_PARAMS+=("--name ${2} --hostname ${2}") + echo "Using container name ${2}" + shift 2 +fi + if [[ ! -f "${DOCKERFILE_PATH}" ]]; then echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\"" exit 1 From 9df2ae8eaa8b394013182a7ad09ac57fe401f80e Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 27 Sep 2021 19:38:26 -0500 Subject: [PATCH 34/37] [Codegen] Swap out analyzer when outlining (#9117) Problem: the `analyzer_` in `CodeGenLLVM` and derived classes can generate invalid code for outlined functions. 
Consider code like this: let x = y in // attr compute_scope blah = x Then it gets outlined in codegen_cpu (for example): let x = y in call foo(x) foo(x) { blah = x } Now, if `analyzer_->Simplify` was run on the body of `foo`, it would produce: foo(x) { blah = y } Because the `analyzer_` knows that `x` is same as `y` (because of the `Let` statemement), but doesn't know that `y` is no longer available in the outlined function `foo`. See https://discuss.tvm.apache.org/t/compute-scope-issue-with-analyzer-invalid-simplification/11111 --- src/target/llvm/codegen_cpu.cc | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index e67dee3c37c4..40e6245e8beb 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -473,14 +473,17 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { } #endif } + auto new_analyzer = std::make_unique(); std::swap(function_, fcompute); - std::swap(new_vmap, var_map_); + std::swap(analyzer_, new_analyzer); + std::swap(var_map_, new_vmap); BasicBlock* compute_entry = BasicBlock::Create(*ctx_, "entry", function_); builder_->SetInsertPoint(compute_entry); this->VisitStmt(op->body); builder_->CreateRet(ConstInt32(0)); // swap the var map back, now we are back on track. - std::swap(new_vmap, var_map_); + std::swap(var_map_, new_vmap); + std::swap(analyzer_, new_analyzer); std::swap(function_, fcompute); builder_->SetInsertPoint(compute_call_end); } @@ -551,13 +554,16 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { new_vmap[par_env.num_task.get()] = builder_->CreateLoad(builder_->CreateInBoundsGEP(penv, {ConstInt32(0), ConstInt32(1)})); par_env.penv = penv; + auto new_analyzer = std::make_unique(); std::swap(function_, f); std::swap(parallel_env_, par_env); + std::swap(analyzer_, new_analyzer); std::swap(var_map_, new_vmap); this->VisitStmt(body); builder_->CreateRet(ConstInt32(0)); // swap the var map back, now we are back on track. std::swap(var_map_, new_vmap); + std::swap(analyzer_, new_analyzer); std::swap(parallel_env_, par_env); std::swap(function_, f); ICHECK_NE(par_env.parallel_loop_count, 0) << "Cannot find parallel loop within parallel launch"; @@ -604,12 +610,15 @@ void CodeGenCPU::CreateStaticInit(const std::string& init_fname, const Stmt& bod std::unordered_map new_vmap; UnpackClosureData(cdata, vfields, &new_vmap); ICHECK(parallel_env_.penv == nullptr); + auto new_analyzer = std::make_unique(); std::swap(function_, f); + std::swap(analyzer_, new_analyzer); std::swap(var_map_, new_vmap); this->VisitStmt(body); builder_->CreateRet(ConstInt32(0)); // swap the var map back, now we are back on track. std::swap(var_map_, new_vmap); + std::swap(analyzer_, new_analyzer); std::swap(function_, f); builder_->SetInsertPoint(init_end); } From 4251103f5d752263892e71224266dfbc25079490 Mon Sep 17 00:00:00 2001 From: Sen Yang Date: Tue, 28 Sep 2021 23:08:32 +0800 Subject: [PATCH 35/37] [Bugfix] Add nullptr checking for `AttrStmt` with `coproc_uop_scope` attr key (#9123) --- src/target/llvm/codegen_cpu.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 40e6245e8beb..c98c23ae8c61 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -440,9 +440,11 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { // $xxx_compute_ functions are not global. 
They should be marked as static (via InternalLinkage) // to call them correctly on MIPS platform (CALL16 issue) // Linkage ld Error: CALL16 reloc at 0x290 not against global symbol - llvm::Function* fcompute = llvm::Function::Create( - ftype, llvm::Function::InternalLinkage, - op->value.as()->value.operator llvm::StringRef(), module_.get()); + const StringImmNode* value = op->value.as(); + ICHECK(value != nullptr); + llvm::Function* fcompute = + llvm::Function::Create(ftype, llvm::Function::InternalLinkage, + value->value.operator llvm::StringRef(), module_.get()); BasicBlock* compute_call_end = CheckCallSuccess(builder_->CreateCall(fcompute, arg_values)); // setup compute function. std::unordered_map new_vmap; @@ -953,7 +955,9 @@ void CodeGenCPU::VisitStmt_(const AssertStmtNode* op) { void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { if (op->attr_key == tir::attr::coproc_uop_scope) { - this->CreateStaticInit(op->value.as()->value, op->body); + const StringImmNode* value = op->value.as(); + ICHECK(value != nullptr); + this->CreateStaticInit(value->value, op->body); } else if (op->attr_key == tir::attr::compute_scope) { this->CreateComputeScope(op); } else if (tir::attr::IsPragmaKey(op->attr_key)) { From 9e47b43fc9f6ab93079225fd4fa90d5fff96d81c Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 28 Sep 2021 09:22:37 -0700 Subject: [PATCH 36/37] [Meta Schedule][M3b] Database (#9061) This PR is part of the meta schedule project (#8473) that adds a generic Database interface of tuning records, as well as a default implementation of using two JSON-files to mimic the database. This feature is future-compatible with dynamic shape auto-tuning. Co-authored-by: Xiyou Zhou Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Co-authored-by: Ruihang Lai Co-authored-by: Hongyi Jin <3231950289@qq.com> Co-authored-by: Wuwei Lin Co-authored-by: Siyuan Feng --- include/tvm/meta_schedule/database.h | 279 ++++++++++++++++++ include/tvm/runtime/container/string.h | 7 + python/tvm/meta_schedule/__init__.py | 4 +- python/tvm/meta_schedule/database/__init__.py | 22 ++ python/tvm/meta_schedule/database/database.py | 240 +++++++++++++++ .../meta_schedule/database/json_database.py | 61 ++++ python/tvm/meta_schedule/utils.py | 28 +- src/meta_schedule/database/database.cc | 179 +++++++++++ src/meta_schedule/database/json_database.cc | 161 ++++++++++ src/meta_schedule/utils.h | 106 ++++++- .../unittest/test_meta_schedule_database.py | 274 +++++++++++++++++ 11 files changed, 1358 insertions(+), 3 deletions(-) create mode 100644 include/tvm/meta_schedule/database.h create mode 100644 python/tvm/meta_schedule/database/__init__.py create mode 100644 python/tvm/meta_schedule/database/database.py create mode 100644 python/tvm/meta_schedule/database/json_database.py create mode 100644 src/meta_schedule/database/database.cc create mode 100644 src/meta_schedule/database/json_database.cc create mode 100644 tests/python/unittest/test_meta_schedule_database.py diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h new file mode 100644 index 000000000000..7ba3c207e349 --- /dev/null +++ b/include/tvm/meta_schedule/database.h @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_META_SCHEDULE_DATABASE_H_ +#define TVM_META_SCHEDULE_DATABASE_H_ + +#include +#include +#include + +namespace tvm { +namespace meta_schedule { + +/*! \brief A workload, i.e. an IRModule and its structural hash. */ +class WorkloadNode : public runtime::Object { + public: + /*! \brief The type of structural hash */ + using THashCode = size_t; + /*! \brief The workload's IRModule. */ + IRModule mod; + /*! \brief The workload's structural hash. */ + THashCode shash; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("mod", &mod); + // `shash` is not visited because TVM FFI doesn't support uint64_t + } + + static constexpr const char* _type_key = "meta_schedule.Workload"; + TVM_DECLARE_FINAL_OBJECT_INFO(WorkloadNode, runtime::Object); + + /*! + * \brief Export the workload to a JSON string. + * \return An array containing the structural hash and the base64 json string. + */ + ObjectRef AsJSON() const; +}; + +/*! + * \brief Managed reference to WorkloadNode. + * \sa WorkloadNode + */ +class Workload : public runtime::ObjectRef { + public: + using THashCode = WorkloadNode::THashCode; + /*! + * \brief Constructor of Workload. + * \param mod The workload's IRModule. + */ + TVM_DLL explicit Workload(IRModule mod); + /*! + * \brief Constructor of Workload. + * \param mod The workload's IRModule. + * \param shash The workload's structural hash. + */ + TVM_DLL explicit Workload(IRModule mod, THashCode shash); + /*! + * \brief Create a workload from a json object. + * \param json_obj The json object. + * \return The created workload. + */ + TVM_DLL static Workload FromJSON(const ObjectRef& json_obj); + + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Workload, runtime::ObjectRef, WorkloadNode); +}; + +/*! \brief The hash method for Workload */ +struct WorkloadHash { + size_t operator()(const Workload& a) const { return a->shash; } +}; + +/*! \brief The equality check for Workload */ +struct WorkloadEqual { + bool operator()(const Workload& a, const Workload& b) const { + return a->shash == b->shash && tvm::StructuralEqual()(a->mod, b->mod); + } +}; + +/*! \brief The class of tuning records. */ +class TuningRecordNode : public runtime::Object { + public: + /*! \brief The trace tuned. */ + tir::Trace trace; + /*! \brief The profiling result in seconds. */ + Array run_secs; + /*! \brief The workload. */ + Workload workload{nullptr}; + /*! \brief The target for tuning. */ + Target target; + /*! \brief The argument information. */ + Array args_info; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("trace", &trace); + v->Visit("run_secs", &run_secs); + v->Visit("workload", &workload); + v->Visit("target", &target); + v->Visit("args_info", &args_info); + } + + static constexpr const char* _type_key = "meta_schedule.TuningRecord"; + TVM_DECLARE_FINAL_OBJECT_INFO(TuningRecordNode, runtime::Object); + + /*! + * \brief Export the tuning record to a JSON string. 
+ * \return An array containing the trace, running secs, serialized target, and + * argument information. + */ + ObjectRef AsJSON() const; +}; + +/*! + * \brief The managed reference of TuningRecordNode. + * \sa TuningRecordNode + */ +class TuningRecord : public runtime::ObjectRef { + public: + /*! + \brief Constructor of a tuning record. + \param trace The trace of the tuning record. + \param run_secs The running time of the tuning record. + \param workload The workload of the tuning record. + \param target The target of the tuning record. + \param args_info The argument information of the tuning record. + */ + TVM_DLL explicit TuningRecord(tir::Trace trace, Array run_secs, Workload workload, + Target target, Array args_info); + /*! + * \brief Create a tuning record from a json object. + * \param json_obj The json object. + * \param workload The workload. + * \return The tuning record created. + */ + TVM_DLL static TuningRecord FromJSON(const ObjectRef& json_obj, const Workload& workload); + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TuningRecord, runtime::ObjectRef, TuningRecordNode); +}; + +/* \brief The abstract interface of database. */ +class DatabaseNode : public runtime::Object { + public: + /*! \brief Default destructor */ + virtual ~DatabaseNode() = default; + /*! + * \brief Look up or add workload to the database if missing. + * \param mod The IRModule to be searched for or added. + * \return The workload corresponding to the given IRModule. + */ + virtual Workload CommitWorkload(const IRModule& mod) = 0; + /*! + * \brief Add a tuning record to the database. + * \param record The tuning record to be added. + */ + virtual void CommitTuningRecord(const TuningRecord& record) = 0; + /*! + * \brief Get the top K tuning records of given workload from the database. + * \param workload The workload to be searched for. + * \param top_k The number of top records to be returned. + * \return An array of top K tuning records for the given workload. + */ + virtual Array GetTopK(const Workload& workload, int top_k) = 0; + /*! + * \brief Get the size of the database. + * \return The size of the database. + */ + virtual int64_t Size() = 0; + + static constexpr const char* _type_key = "meta_schedule.Database"; + TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object); +}; + +/*! \brief The database with customized methods on the python-side. */ +class PyDatabaseNode : public DatabaseNode { + public: + /*! + * \brief The function type of `CommitWorkload` method. + * \param mod The IRModule to be searched for or added. + * \return The workload corresponding to the given IRModule. + */ + using FCommitWorkload = runtime::TypedPackedFunc; + /*! + * \brief The function type of `CommitTuningRecord` method. + * \param record The tuning record to be added. + */ + using FCommitTuningRecord = runtime::TypedPackedFunc; + /*! + * \brief The function type of `GetTopK` method. + * \param workload The workload to be searched for. + * \param top_k The number of top records to be returned. + * \return An array of top K tuning records for the given workload. + */ + using FGetTopK = runtime::TypedPackedFunc(const Workload&, int)>; + /*! + * \brief The function type of `Size` method. + * \return The size of the database. + */ + using FSize = runtime::TypedPackedFunc; + + /*! \brief The packed function to the `CommitWorkload` function. */ + FCommitWorkload f_commit_workload; + /*! \brief The packed function to the `CommitTuningRecord` function. */ + FCommitTuningRecord f_commit_tuning_record; + /*! 
\brief The packed function to the `GetTopK` function. */ + FGetTopK f_get_top_k; + /*! \brief The packed function to the `Size` function. */ + FSize f_size; + + void VisitAttrs(tvm::AttrVisitor* v) { + // PackedFuncs are all not visited, because the reflection system doesn't take care of them, + // so it cannot be accessible on the python side. If there is such need from the future, + // we can then add corresponding accessor methods to help access on python. + // + // `f_commit_workload` is not visited + // `f_commit_tuning_record` is not visited + // `f_get_top_k` is not visited + // `f_size` is not visited + } + + static constexpr const char* _type_key = "meta_schedule.PyDatabase"; + TVM_DECLARE_FINAL_OBJECT_INFO(PyDatabaseNode, DatabaseNode); + + Workload CommitWorkload(const IRModule& mod) final { return f_commit_workload(mod); } + + void CommitTuningRecord(const TuningRecord& record) final { f_commit_tuning_record(record); } + + Array GetTopK(const Workload& workload, int top_k) final { + return f_get_top_k(workload, top_k); + } + + int64_t Size() final { return f_size(); } +}; + +/*! + * \brief Managed reference to DatabaseNode. + * \sa DatabaseNode + */ +class Database : public runtime::ObjectRef { + public: + /*! + * \brief Create a default database that uses JSON file for tuning records. + * \param path_workload The path to the workload table. + * \param path_tuning_record The path to the database table. + * \param allow_missing Whether to create new file when the given path is not found. + */ + TVM_DLL static Database JSONDatabase(String path_workload, String path_tuning_record, + bool allow_missing); + /*! + * \brief Create a database with customized methods on the python-side. + * \param f_commit_workload The packed function of `CommitWorkload`. + * \param f_commit_tuning_record The packed function of `CommitTuningRecord`. + * \param f_get_top_k The packed function of `GetTopK`. + * \param f_size The packed function of `Size`. + * \return The created database. + */ + TVM_DLL static Database PyDatabase(PyDatabaseNode::FCommitWorkload f_commit_workload, + PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, + PyDatabaseNode::FGetTopK f_get_top_k, + PyDatabaseNode::FSize f_size); + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Database, runtime::ObjectRef, DatabaseNode); +}; + +} // namespace meta_schedule +} // namespace tvm + +#endif // TVM_META_SCHEDULE_DATABASE_H_ diff --git a/include/tvm/runtime/container/string.h b/include/tvm/runtime/container/string.h index 664d19818be1..bb9e7ff65adc 100644 --- a/include/tvm/runtime/container/string.h +++ b/include/tvm/runtime/container/string.h @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -149,6 +150,12 @@ class String : public ObjectRef { String(const char* other) // NOLINT(*) : String(std::string(other)) {} + /*! + * \brief Construct a new null object + */ + String(std::nullptr_t) // NOLINT(*) + : ObjectRef(nullptr) {} + /*! * \brief Change the value the reference object points to. * diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py index c07b28b4fc9f..f8b2b026c83b 100644 --- a/python/tvm/meta_schedule/__init__.py +++ b/python/tvm/meta_schedule/__init__.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. """Package `tvm.meta_schedule`. The meta schedule infrastructure.""" -from . import builder from . import arg_info +from . import builder +from . import database from . 
import space_generator +from .database import TuningRecord from .tune_context import TuneContext diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py new file mode 100644 index 000000000000..dcd430d39407 --- /dev/null +++ b/python/tvm/meta_schedule/database/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +The tvm.meta_schedule.database package. +The database that stores serialized tuning records and workloads +""" +from .database import Database, PyDatabase, TuningRecord +from .json_database import JSONDatabase diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py new file mode 100644 index 000000000000..3d05441fe22b --- /dev/null +++ b/python/tvm/meta_schedule/database/database.py @@ -0,0 +1,240 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tuning record database""" +from typing import Any, List + +from tvm._ffi import register_object +from tvm.ir.module import IRModule +from tvm.runtime import Object +from tvm.target import Target +from tvm.tir.schedule import Trace + +from .. import _ffi_api +from ..arg_info import ArgInfo +from ..utils import _json_de_tvm + + +@register_object("meta_schedule.Workload") +class Workload(Object): + """A workload, i.e. an IRModule and its structural hash. + + Parameters + ---------- + mod : IRModule + The workload's IRModule + """ + + mod: IRModule + + def __init__(self, mod: IRModule) -> None: + self.__init_handle_by_constructor__( + _ffi_api.Workload, # type: ignore # pylint: disable=no-member + mod, + ) + + def as_json(self) -> Any: + """Export the workload to a JSON string. + + Returns + ------- + json_str : str + The JSON string exported. + """ + return _json_de_tvm(_ffi_api.WorkloadAsJSON(self)) # type: ignore # pylint: disable=no-member + + @staticmethod + def from_json(json_obj: Any) -> "Workload": + """Create a workload from a json object. + + Parameters + ---------- + json_obj : Any + The json object to parse. 
+ + Returns + ------- + tuning_record : TuningRecord + The parsed tuning record. + """ + return _ffi_api.WorkloadFromJSON(json_obj) # type: ignore # pylint: disable=no-member + + +@register_object("meta_schedule.TuningRecord") +class TuningRecord(Object): + """The class of tuning records. + + Parameters + ---------- + trace : tvm.ir.Trace + The trace of the tuning record. + run_secs : List[float] + The run time of the tuning record. + workload : Workload + The workload of the tuning record. + target : Target + The target of the tuning record. + args_info : List[ArgInfo] + The argument information of the tuning record. + """ + + trace: Trace + run_secs: List[float] + workload: Workload + target: Target + args_info: List[ArgInfo] + + def __init__( + self, + trace: Trace, + run_secs: List[float], + workload: Workload, + target: Target, + args_info: List[ArgInfo], + ) -> None: + self.__init_handle_by_constructor__( + _ffi_api.TuningRecord, # type: ignore # pylint: disable=no-member + trace, + run_secs, + workload, + target, + args_info, + ) + + def as_json(self) -> Any: + """Export the tuning record to a JSON string. + + Returns + ------- + json_str : str + The JSON string exported. + """ + return _json_de_tvm(_ffi_api.TuningRecordAsJSON(self)) # type: ignore # pylint: disable=no-member + + @staticmethod + def from_json(json_obj: Any, workload: Workload) -> "TuningRecord": + """Create a tuning record from a json object. + + Parameters + ---------- + json_obj : Any + The json object to parse. + workload : Workload + The workload. + + Returns + ------- + tuning_record : TuningRecord + The parsed tuning record. + """ + return _ffi_api.TuningRecordFromJSON(json_obj, workload) # type: ignore # pylint: disable=no-member + + +@register_object("meta_schedule.Database") +class Database(Object): + """The abstract database interface.""" + + def commit_workload(self, mod: IRModule) -> Workload: + """Commit a workload to the database if missing. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for or added. + + Returns + ------- + workload : Workload + The workload corresponding to the given IRModule. + """ + return _ffi_api.DatabaseCommitWorkload(self, mod) # type: ignore # pylint: disable=no-member + + def commit_tuning_record(self, record: TuningRecord) -> None: + """Commit a tuning record to the database. + + Parameters + ---------- + record : TuningRecord + The tuning record to add. + """ + _ffi_api.DatabaseCommitTuningRecord(self, record) # type: ignore # pylint: disable=no-member + + def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]: + """Get the top K tuning records of given workload from the database. + + Parameters + ---------- + workload : Workload + The workload to be searched for. + top_k : int + The number of top records to get. + + Returns + ------- + top_k_records : List[TuningRecord] + The top K records. + """ + return _ffi_api.DatabaseGetTopK(self, workload, top_k) # type: ignore # pylint: disable=no-member + + def __len__(self) -> int: + """Get the number of records in the database. 
+ + Returns + ------- + num_records : int + The number of records in the database + """ + return _ffi_api.DatabaseSize(self) # type: ignore # pylint: disable=no-member + + +@register_object("meta_schedule.PyDatabase") +class PyDatabase(Database): + """An abstract Database with customized methods on the python-side.""" + + def __init__(self): + """Constructor.""" + + def f_commit_workload(mod: IRModule) -> Workload: + return self.commit_workload(mod) + + def f_commit_tuning_record(record: TuningRecord) -> None: + self.commit_tuning_record(record) + + def f_get_top_k(workload: Workload, top_k: int) -> List[TuningRecord]: + return self.get_top_k(workload, top_k) + + def f_size() -> int: + return len(self) + + self.__init_handle_by_constructor__( + _ffi_api.DatabasePyDatabase, # type: ignore # pylint: disable=no-member + f_commit_workload, + f_commit_tuning_record, + f_get_top_k, + f_size, + ) + + def commit_workload(self, mod: IRModule) -> Workload: + raise NotImplementedError + + def commit_tuning_record(self, record: TuningRecord) -> None: + raise NotImplementedError + + def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]: + raise NotImplementedError + + def __len__(self) -> int: + raise NotImplementedError diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py new file mode 100644 index 000000000000..6897b82d9888 --- /dev/null +++ b/python/tvm/meta_schedule/database/json_database.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The default database that uses a JSON File to store tuning records""" +from tvm._ffi import register_object + +from .. import _ffi_api +from .database import Database + + +@register_object("meta_schedule.JSONDatabase") +class JSONDatabase(Database): + """The class of tuning records. + + Parameters + ---------- + path_workload : str + The path to the workload table. + path_tuning_record : str + The path to the tuning record table. + """ + + path_workload: str + path_tuning_record: str + + def __init__( + self, + path_workload: str, + path_tuning_record: str, + allow_missing: bool = True, + ) -> None: + """Constructor. + + Parameters + ---------- + path_workload : str + The path to the workload table. + path_tuning_record : str + The path to the tuning record table. + allow_missing : bool + Whether to create new file when the given path is not found. 
+ """ + self.__init_handle_by_constructor__( + _ffi_api.DatabaseJSONDatabase, # type: ignore # pylint: disable=no-member + path_workload, + path_tuning_record, + allow_missing, + ) diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py index abde198cf6ec..e710b0ed06f3 100644 --- a/python/tvm/meta_schedule/utils.py +++ b/python/tvm/meta_schedule/utils.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. """Utilities for meta schedule""" +import json import os import shutil -from typing import Any, Callable, Union +from typing import Any, Callable, List, Union import psutil @@ -126,3 +127,28 @@ def _json_de_tvm(obj: Any) -> Any: if isinstance(obj, Map): return {_json_de_tvm(k): _json_de_tvm(v) for k, v in obj.items()} raise TypeError("Not supported type: " + str(type(obj))) + + +@register_func("meta_schedule.json_obj2str") +def json_obj2str(json_obj: Any) -> str: + json_obj = _json_de_tvm(json_obj) + return json.dumps(json_obj) + + +@register_func("meta_schedule.batch_json_str2obj") +def batch_json_str2obj(json_strs: List[str]) -> List[Any]: + """Covert a list of JSON strings to a list of json objects. + Parameters + ---------- + json_strs : List[str] + The list of JSON strings + Returns + ------- + result : List[Any] + The list of json objects + """ + return [ + json.loads(json_str) + for json_str in map(str.strip, json_strs) + if json_str and (not json_str.startswith("#")) and (not json_str.startswith("//")) + ] diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc new file mode 100644 index 000000000000..e67b3d1ab9b6 --- /dev/null +++ b/src/meta_schedule/database/database.cc @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+/******** Workload ********/
+
+Workload::Workload(IRModule mod) {
+  ObjectPtr<WorkloadNode> n = runtime::make_object<WorkloadNode>();
+  n->shash = tvm::StructuralHash()(mod);
+  n->mod = mod;
+  data_ = std::move(n);
+}
+
+Workload::Workload(IRModule mod, Workload::THashCode shash) {
+  ObjectPtr<WorkloadNode> n = runtime::make_object<WorkloadNode>();
+  n->mod = mod;
+  n->shash = shash;
+  data_ = std::move(n);
+}
+
+ObjectRef WorkloadNode::AsJSON() const {
+  // Convert `this->mod` to JSON
+  std::string json_mod = tvm::SaveJSON(this->mod);
+  // Dump the JSON string to base64
+  std::string b64_mod = Base64Encode(json_mod);
+  // Output
+  return Array<ObjectRef>{SHash2Str(this->shash), String(b64_mod)};
+}
+
+Workload Workload::FromJSON(const ObjectRef& json_obj) {
+  IRModule mod{nullptr};
+  THashCode shash = 0;
+  try {
+    const ArrayNode* json_array = json_obj.as<ArrayNode>();
+    CHECK(json_array && json_array->size() == 2);
+    // Load json[0] => shash
+    String str_shash = Downcast<String>(json_array->at(0));
+    // Load json[1] => mod
+    {
+      String b64_mod = Downcast<String>(json_array->at(1));
+      std::string json_mod = Base64Decode(b64_mod);
+      mod = Downcast<IRModule>(LoadJSON(json_mod));
+    }
+    // Verify SHash(mod) == shash
+    shash = tvm::StructuralHash()(mod);
+    String recalc_shash = SHash2Str(shash);
+    CHECK_EQ(recalc_shash, str_shash) << "ValueError: Structural hash changed. Given: " << str_shash
+                                      << "; Recalculated: " << recalc_shash;
+  } catch (const std::runtime_error& e) {  // includes tvm::Error and dmlc::Error
+    LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj
+               << "\nThe error is: " << e.what();
+  }
+  return Workload(mod, shash);
+}
+
+/******** TuningRecord ********/
+
+TuningRecord::TuningRecord(tir::Trace trace, Array<FloatImm> run_secs, Workload workload,
+                           Target target, Array<ArgInfo> args_info) {
+  ObjectPtr<TuningRecordNode> n = make_object<TuningRecordNode>();
+  n->trace = trace;
+  n->run_secs = run_secs;
+  n->workload = workload;
+  n->target = target;
+  n->args_info = args_info;
+  this->data_ = n;
+}
+
+ObjectRef TuningRecordNode::AsJSON() const {
+  Array<ObjectRef> json_args_info;
+  json_args_info.reserve(args_info.size());
+  for (const ArgInfo& arg_info : args_info) {
+    json_args_info.push_back(arg_info->AsJSON());
+  }
+  return Array<ObjectRef>{trace->AsJSON(false),  //
+                          run_secs,              //
+                          target->Export(),      //
+                          json_args_info};
+}
+
+TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& workload) {
+  tir::Trace trace{nullptr};
+  Array<FloatImm> run_secs{nullptr};
+  Target target{nullptr};
+  Array<ArgInfo> args_info;
+  try {
+    const ArrayNode* json_array = json_obj.as<ArrayNode>();
+    CHECK(json_array && json_array->size() == 4);
+    // Load json[1] => run_secs
+    run_secs = Downcast<Array<FloatImm>>(json_array->at(1));
+    // Load json[2] => target
+    target = Target(Downcast<Map<String, ObjectRef>>(json_array->at(2)));
+    // Load json[3] => args_info
+    {
+      const ArrayNode* json_args_info = json_array->at(3).as<ArrayNode>();
+      args_info.reserve(json_args_info->size());
+      for (const ObjectRef& json_arg_info : *json_args_info) {
+        args_info.push_back(ArgInfo::FromJSON(json_arg_info));
+      }
+    }
+    // Load json[0] => trace
+    {
+      const ObjectRef& json_trace = json_array->at(0);
+      tir::Schedule sch =
+          tir::Schedule::Traced(workload->mod, /*seed=*/-1, /*debug_mask=*/0,
+                                /*error_render_level=*/tir::ScheduleErrorRenderLevel::kNone);
+      tir::Trace::ApplyJSONToSchedule(json_trace, sch);
+      trace = sch->trace().value();
+    }
+  } catch (const std::runtime_error& e) {  // includes tvm::Error and dmlc::Error
+    LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj
+               << "\nThe error is: " << e.what();
+  }
+  return TuningRecord(trace, run_secs, workload, target, args_info);
+}
+
+/******** PyDatabase ********/
+
+Database Database::PyDatabase(PyDatabaseNode::FCommitWorkload f_commit_workload,
+                              PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record,
+                              PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FSize f_size) {
+  ObjectPtr<PyDatabaseNode> n = make_object<PyDatabaseNode>();
+  n->f_commit_workload = f_commit_workload;
+  n->f_commit_tuning_record = f_commit_tuning_record;
+  n->f_get_top_k = f_get_top_k;
+  n->f_size = f_size;
+  return Database(n);
+}
+
+/******** FFI ********/
+
+TVM_REGISTER_NODE_TYPE(WorkloadNode);
+TVM_REGISTER_NODE_TYPE(TuningRecordNode);
+TVM_REGISTER_OBJECT_TYPE(DatabaseNode);
+TVM_REGISTER_NODE_TYPE(PyDatabaseNode);
+TVM_REGISTER_GLOBAL("meta_schedule.Workload").set_body_typed([](IRModule mod) {
+  return Workload(mod);
+});
+TVM_REGISTER_GLOBAL("meta_schedule.WorkloadAsJSON")
+    .set_body_method<Workload>(&WorkloadNode::AsJSON);
+TVM_REGISTER_GLOBAL("meta_schedule.WorkloadFromJSON").set_body_typed(&Workload::FromJSON);
+TVM_REGISTER_GLOBAL("meta_schedule.TuningRecord")
+    .set_body_typed([](tir::Trace trace, Array<FloatImm> run_secs, Workload workload,
+                       Target target, Array<ArgInfo> args_info) {
+      return TuningRecord(trace, run_secs, workload, target, args_info);
+    });
+TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON")
+    .set_body_method<TuningRecord>(&TuningRecordNode::AsJSON);
+TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordFromJSON").set_body_typed(TuningRecord::FromJSON);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitWorkload")
+    .set_body_method<Database>(&DatabaseNode::CommitWorkload);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitTuningRecord")
+    .set_body_method<Database>(&DatabaseNode::CommitTuningRecord);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetTopK")
+    .set_body_method<Database>(&DatabaseNode::GetTopK);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseSize").set_body_method<Database>(&DatabaseNode::Size);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabasePyDatabase").set_body_typed(Database::PyDatabase);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc
new file mode 100644
index 000000000000..3efb72e2fa74
--- /dev/null
+++ b/src/meta_schedule/database/json_database.cc
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include 
+#include 
+
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+/*! \brief The struct defining comparison function of sorting by mean run seconds. */
+struct SortTuningRecordByMeanRunSecs {
+  static const constexpr double kMaxMeanTime = 1e10;
+
+  static double Mean(const Array<FloatImm>& a) {
+    if (a.empty()) {
+      return kMaxMeanTime;
+    }
+    double sum = 0.0;
+    for (const FloatImm& i : a) {
+      sum += i->value;
+    }
+    return sum / a.size();
+  }
+
+  bool operator()(const TuningRecord& a, const TuningRecord& b) const {
+    double a_time = Mean(a->run_secs);
+    double b_time = Mean(b->run_secs);
+    return a_time < b_time;
+  }
+};
+
+/*! \brief The default database implementation, which mimics two database tables with two files. */
+class JSONDatabaseNode : public DatabaseNode {
+ public:
+  /*! \brief The path to the workload table */
+  String path_workload;
+  /*! \brief The path to the tuning record table */
+  String path_tuning_record;
+  /*! \brief All the workloads in the database */
+  std::unordered_map<Workload, int, WorkloadHash, WorkloadEqual> workloads2idx_;
+  /*! \brief All the tuning records in the database */
+  std::multiset<TuningRecord, SortTuningRecordByMeanRunSecs> tuning_records_;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("path_workload", &path_workload);
+    v->Visit("path_tuning_record", &path_tuning_record);
+    // `workloads2idx_` is not visited
+    // `tuning_records_` is not visited
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.JSONDatabase";
+  TVM_DECLARE_FINAL_OBJECT_INFO(JSONDatabaseNode, DatabaseNode);
+
+ public:
+  Workload CommitWorkload(const IRModule& mod) {
+    // Try to insert `mod` into `workloads_`
+    decltype(this->workloads2idx_)::iterator it;
+    bool inserted = false;
+    std::tie(it, inserted) =
+        this->workloads2idx_.emplace(Workload(mod, tvm::StructuralHash()(mod)), -1);
+    Workload workload = it->first;
+    // If `mod` is new in `workloads2idx_`, append it to the workload file
+    if (inserted) {
+      it->second = static_cast<int>(this->workloads2idx_.size()) - 1;
+      JSONFileAppendLine(this->path_workload, JSONObj2Str(workload->AsJSON()));
+    }
+    return it->first;
+  }
+
+  void CommitTuningRecord(const TuningRecord& record) {
+    this->tuning_records_.insert(record);
+    JSONFileAppendLine(this->path_tuning_record,
+                       JSONObj2Str(Array<ObjectRef>{
+                           /*workload_index=*/Integer(this->workloads2idx_.at(record->workload)),
+                           /*tuning_record=*/record->AsJSON()  //
+                       }));
+  }
+
+  Array<TuningRecord> GetTopK(const Workload& workload, int top_k) {
+    CHECK_GE(top_k, 0) << "ValueError: top_k must be non-negative";
+    if (top_k == 0) {
+      return {};
+    }
+    Array<TuningRecord> results;
+    results.reserve(top_k);
+    int counter = 0;
+    for (const TuningRecord& record : this->tuning_records_) {
+      if (WorkloadEqual()(record->workload, workload)) {
+        results.push_back(record);
+        if (++counter == top_k) {
+          break;
+        }
+      }
+    }
+    return results;
+  }
+
+  int64_t Size() { return tuning_records_.size(); }
+};
+
+Database Database::JSONDatabase(String path_workload, String path_tuning_record,
+                                bool allow_missing) {
+  ObjectPtr<JSONDatabaseNode> n = make_object<JSONDatabaseNode>();
+  // Load `n->workloads2idx_` from `path_workload`
+  std::vector<Workload> workloads;
+  {
+    Array<ObjectRef> json_objs = JSONStr2Obj(JSONFileReadLines(path_workload, allow_missing));
+    int n_objs = json_objs.size();
+    n->workloads2idx_.reserve(n_objs);
+    workloads.reserve(n_objs);
+    for (int i = 0; i < n_objs; ++i) {
+      Workload workload = Workload::FromJSON(json_objs[i]);
+      n->workloads2idx_.emplace(workload, i);
+      workloads.push_back(workload);
+    }
+  }
+  // Load `n->tuning_records_` from `path_tuning_record`
+  {
+    Array<ObjectRef> json_objs = JSONStr2Obj(JSONFileReadLines(path_tuning_record, allow_missing));
+    for (const ObjectRef& json_obj : json_objs) {
+      int workload_index = -1;
+      ObjectRef tuning_record{nullptr};
+      try {
+        const ArrayNode* arr = json_obj.as<ArrayNode>();
+        ICHECK_EQ(arr->size(), 2);
+        workload_index = Downcast<Integer>(arr->at(0));
+        tuning_record = arr->at(1);
+      } catch (std::runtime_error& e) {
+        LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj
+                   << "\nThe error is: " << e.what();
+      }
+      n->tuning_records_.insert(TuningRecord::FromJSON(tuning_record, workloads[workload_index]));
+    }
+  }
+  n->path_workload = path_workload;
+  n->path_tuning_record = path_tuning_record;
+  return Database(n);
+}
+
+TVM_REGISTER_NODE_TYPE(JSONDatabaseNode);
+TVM_REGISTER_GLOBAL("meta_schedule.DatabaseJSONDatabase").set_body_typed(Database::JSONDatabase);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h
index a2b5ac4d3184..4c9e1e2c10a1 100644
--- a/src/meta_schedule/utils.h
+++ b/src/meta_schedule/utils.h
@@ -19,15 +19,119 @@
 #ifndef TVM_META_SCHEDULE_UTILS_H_
 #define TVM_META_SCHEDULE_UTILS_H_
 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+
+#include 
 
 #include "../support/array.h"
+#include "../support/base64.h"
 
 namespace tvm {
-namespace meta_schedule {}  // namespace meta_schedule
+namespace meta_schedule {
+
+/*!
+ * \brief Read lines from a json file.
+ * \param path The path to the json file.
+ * \param allow_missing Whether to create new file when the given path is not found.
+ * \return An array containing lines read from the json file.
+ */
+inline Array<String> JSONFileReadLines(const String& path, bool allow_missing) {
+  std::ifstream is(path);
+  if (is.good()) {
+    Array<String> results;
+    for (std::string str; std::getline(is, str);) {
+      results.push_back(str);
+    }
+    return results;
+  }
+  CHECK(allow_missing) << "ValueError: File doesn't exist: " << path;
+  std::ofstream os(path);
+  CHECK(os.good()) << "ValueError: Cannot create new file: " << path;
+  return {};
+}
+
+/*!
+ * \brief Append a line to a json file.
+ * \param path The path to the json file.
+ * \param line The line to append.
+ */
+inline void JSONFileAppendLine(const String& path, const std::string& line) {
+  std::ofstream os(path, std::ofstream::app);
+  CHECK(os.good()) << "ValueError: Cannot open the file to write: " << path;
+  os << line << std::endl;
+}
+
+/*!
+ * \brief Get the base64 encoded result of a string.
+ * \param str The string to encode.
+ * \return The base64 encoded string.
+ */
+inline std::string Base64Encode(std::string str) {
+  std::string result;
+  dmlc::MemoryStringStream m_stream(&result);
+  support::Base64OutStream b64stream(&m_stream);
+  static_cast<dmlc::Stream*>(&b64stream)->Write(str);
+  b64stream.Finish();
+  return result;
+}
+
+/*!
+ * \brief Get the base64 decoded result of a string.
+ * \param str The string to decode.
+ * \return The base64 decoded string.
+ */
+inline std::string Base64Decode(std::string str) {
+  std::string result;
+  dmlc::MemoryStringStream m_stream(&str);
+  support::Base64InStream b64stream(&m_stream);
+  b64stream.InitPosition();
+  static_cast<dmlc::Stream*>(&b64stream)->Read(&result);
+  return result;
+}
+
+/*!
+ * \brief Parse lines of json string into a json object.
+ * \param lines The lines of json string.
+ * \return Array of json objects parsed.
+ * \note The function calls the python-side json parser in runtime registry.
+ */
+inline Array<ObjectRef> JSONStr2Obj(const Array<String>& lines) {
+  static const runtime::PackedFunc* f_to_obj =
+      runtime::Registry::Get("meta_schedule.batch_json_str2obj");
+  ICHECK(f_to_obj) << "IndexError: Cannot find the packed function "
+                      "`meta_schedule.batch_json_str2obj` in the global registry";
+  return (*f_to_obj)(lines);
+}
+
+/*!
+ * \brief Serialize a json object into a json string.
+ * \param json_obj The json object to serialize.
+ * \return A string containing the serialized json object.
+ * \note The function calls the python-side json obj serializer in runtime registry.
+ */
+inline String JSONObj2Str(const ObjectRef& json_obj) {
+  static const runtime::PackedFunc* f_to_str =
+      runtime::Registry::Get("meta_schedule.json_obj2str");
+  ICHECK(f_to_str) << "IndexError: Cannot find the packed function "
+                      "`meta_schedule.json_obj2str` in the global registry";
+  return (*f_to_str)(json_obj);
+}
+
+/*!
+ * \brief Converts a structural hash code to string
+ * \param hash_code The hash code
+ * \return The string representation of the hash code
+ */
+inline String SHash2Str(Workload::THashCode hash_code) { return std::to_string(hash_code); }
+
+}  // namespace meta_schedule
 }  // namespace tvm
 
 #endif  // TVM_META_SCHEDULE_UTILS_H_
diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py
new file mode 100644
index 000000000000..feef023675b0
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_database.py
@@ -0,0 +1,274 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
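For orientation, a minimal usage sketch of the JSONDatabase API introduced above; the file paths are placeholders and `mod` stands for any IRModule (the tests in this file build a Matmul module for that purpose):

    import tvm
    from tvm import tir
    from tvm.meta_schedule.arg_info import ArgInfo
    from tvm.meta_schedule.database import JSONDatabase, TuningRecord

    # Create (or reopen) the two JSON tables; allow_missing=True creates empty files.
    database = JSONDatabase("/tmp/workloads.json", "/tmp/tuning_records.json")
    # Commit the workload; entries are deduplicated by structural hash.
    workload = database.commit_workload(mod)
    # Record one measurement against the trace of an (unscheduled) traced schedule.
    trace = tir.Schedule(mod=mod, debug_mask="all").trace
    record = TuningRecord(
        trace,
        [1.5, 2.5, 1.8],  # measured run times in seconds
        workload,
        tvm.target.Target("llvm"),
        ArgInfo.from_prim_func(mod["main"]),
    )
    database.commit_tuning_record(record)  # appended to the tuning-record file
    # Query the best records for this workload, sorted by mean run time.
    top_records = database.get_top_k(workload, 1)
    assert len(database) == 1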
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring +"""Test Meta Schedule Database""" +import os.path as osp +import sys +import tempfile +from typing import Callable + +import pytest + +import tvm +from tvm import tir +from tvm.ir.module import IRModule +from tvm.meta_schedule.arg_info import ArgInfo +from tvm.meta_schedule.database import JSONDatabase, TuningRecord +from tvm.script import ty +from tvm.tir import Schedule + +# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument +# fmt: off + +@tvm.script.tir +class Matmul: + def main(a: ty.handle, b: ty.handle, c: ty.handle) -> None: + tir.func_attr({"global_symbol": "main"}) + A = tir.match_buffer(a, (1024, 1024), "float32") + B = tir.match_buffer(b, (1024, 1024), "float32") + C = tir.match_buffer(c, (1024, 1024), "float32") + with tir.block([1024, 1024, tir.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + + +@tvm.script.tir +class MatmulRelu: + def main(a: ty.handle, b: ty.handle, d: ty.handle) -> None: # pylint: disable=no-self-argument + tir.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = tir.match_buffer(a, (16, 16), "float32") + B = tir.match_buffer(b, (16, 16), "float32") + D = tir.match_buffer(d, (16, 16), "float32") + C = tir.alloc_buffer((16, 16), "float32") + with tir.block([16, 16, tir.reduce_axis(0, 16)], "matmul") as [vi, vj, vk]: + with tir.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + with tir.block([16, 16], "relu") as [vi, vj]: + D[vi, vj] = tir.max(C[vi, vj], 0.0) + + +# fmt: on +# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument + + +def _schedule_matmul(sch: Schedule): + block = sch.get_block("matmul") + i, j, k = sch.get_loops(block=block) + i_tiles = [1, 1, 2, 512] + j_tiles = [1, 512, 1, 2] + k_tiles = [256, 4] + i_0, i_1, i_2, i_3 = sch.split(loop=i, factors=i_tiles) + j_0, j_1, j_2, j_3 = sch.split(loop=j, factors=j_tiles) + k_0, k_1 = sch.split(loop=k, factors=k_tiles) + sch.reorder(i_0, j_0, i_1, j_1, k_0, i_2, j_2, k_1, i_3, j_3) + + +def _create_schedule(mod: IRModule, sch_fn: Callable[[Schedule], None]) -> Schedule: + sch = tir.Schedule(mod=mod, debug_mask="all") + sch_fn(sch) + return sch + + +def _create_tmp_database(tmpdir: str) -> JSONDatabase: + path_workload = osp.join(tmpdir, "workloads.json") + path_tuning_record = osp.join(tmpdir, "tuning_records.json") + return JSONDatabase(path_workload, path_tuning_record) + + +def _equal_record(a: TuningRecord, b: TuningRecord): + assert str(a.trace) == str(b.trace) + assert str(a.run_secs) == str(b.run_secs) + # AWAIT(@zxybazh): change to export after fixing "(bool)0" + assert str(a.target) == str(b.target) + assert tvm.ir.structural_equal(a.workload.mod, b.workload.mod) + for arg0, arg1 in zip(a.args_info, b.args_info): + assert str(arg0.as_json()) == str(arg1.as_json()) + + +def test_meta_schedule_tuning_record_round_trip(): + mod: IRModule = Matmul() + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + workload = database.commit_workload(mod) + record = TuningRecord( + _create_schedule(mod, _schedule_matmul).trace, + [1.5, 2.5, 1.8], + workload, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ) + database.commit_tuning_record(record) + new_record = TuningRecord.from_json(record.as_json(), 
workload) + _equal_record(record, new_record) + + +def test_meta_schedule_database_create(): + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + assert osp.exists(database.path_workload) + assert osp.exists(database.path_tuning_record) + + +def test_meta_schedule_database_add_entry(): + mod: IRModule = Matmul() + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + workload = database.commit_workload(mod) + record = TuningRecord( + _create_schedule(mod, _schedule_matmul).trace, + [1.5, 2.5, 1.8], + workload, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ) + database.commit_tuning_record(record) + assert len(database) == 1 + (ret,) = database.get_top_k(workload, 3) + _equal_record(ret, record) + + +def test_meta_schedule_database_missing(): + mod: IRModule = Matmul() + mod_2: IRModule = MatmulRelu() + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + workload = database.commit_workload(mod) + workload_2 = database.commit_workload(mod_2) + record = TuningRecord( + _create_schedule(mod, _schedule_matmul).trace, + [1.5, 2.5, 1.8], + workload, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ) + database.commit_tuning_record(record) + ret = database.get_top_k(workload_2, 3) + assert len(ret) == 0 + + +def test_meta_schedule_database_sorting(): + mod: IRModule = Matmul() + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + token = database.commit_workload(mod) + trace = _create_schedule(mod, _schedule_matmul).trace + records = [ + TuningRecord( + trace, + [7.0, 8.0, 9.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [1.0, 2.0, 3.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [4.0, 5.0, 6.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [1.1, 1.2, 600.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [1.0, 100.0, 6.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [4.0, 9.0, 8.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + ] + for record in records: + database.commit_tuning_record(record) + ret = database.get_top_k(token, 2) + assert len(ret) == 2 + try: + _equal_record(ret[0], records[2]) + _equal_record(ret[1], records[1]) + except AssertionError: + _equal_record(ret[0], records[1]) + _equal_record(ret[1], records[2]) + + +def test_meta_schedule_database_reload(): + mod: IRModule = Matmul() + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + token = database.commit_workload(mod) + trace = _create_schedule(mod, _schedule_matmul).trace + records = [ + TuningRecord( + trace, + [7.0, 8.0, 9.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + 
[1.0, 2.0, 3.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + TuningRecord( + trace, + [4.0, 5.0, 6.0], + token, + tvm.target.Target("llvm"), + ArgInfo.from_prim_func(func=mod["main"]), # pylint: disable=unsubscriptable-object + ), + ] + for record in records: + database.commit_tuning_record(record) + new_database = JSONDatabase( # pylint: disable=unused-variable + path_workload=database.path_workload, + path_tuning_record=database.path_tuning_record, + ) + token = new_database.commit_workload(mod) + ret = new_database.get_top_k(token, 2) + assert len(ret) == 2 + try: + _equal_record(ret[0], records[2]) + _equal_record(ret[1], records[1]) + except AssertionError: + _equal_record(ret[0], records[1]) + _equal_record(ret[1], records[2]) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 5e46e757483e6e199a4244dbabe92a71642eee0d Mon Sep 17 00:00:00 2001 From: CircleSpin <2keepconnected@gmail.com> Date: Tue, 28 Sep 2021 12:25:25 -0400 Subject: [PATCH 37/37] [ONNX] [Relay] Dynamic squeeze (#9095) * adding dynamic squeeze first steps * Matt B. implementing shape * squeeze implemented, dynamic_to_static and onnx importer next * add Squeeze op convert to onnx.py * dynamic to static * removed comments * removed comments * added comment * adjusted comment * black and lint * ran make format in root directory Co-authored-by: CircleSpin --- python/tvm/relay/frontend/onnx.py | 22 ++++++- python/tvm/relay/op/dyn/_transform.py | 22 +++++++ python/tvm/relay/op/transform.py | 6 +- src/relay/op/dyn/tensor/transform.cc | 57 +++++++++++++++++++ src/relay/transforms/dynamic_to_static.cc | 9 +++ tests/python/frontend/onnx/test_forward.py | 2 - .../relay/dyn/test_dynamic_op_level3.py | 16 ++++++ .../relay/test_pass_dynamic_to_static.py | 25 ++++++++ 8 files changed, 154 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 1ebb12ac8199..4444b15dfb12 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1495,6 +1495,23 @@ def _impl_v12(cls, inputs, attr, params): return result +class Squeeze(OnnxOpConverter): + """Operator converter for Squeeze.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + axis = attr.get("axes", None) + return _op.squeeze(*inputs, axis) + + @classmethod + def _impl_v13(cls, inputs, attr, params): + axis = inputs[1] + dtype = infer_type(axis).checked_type.dtype + rank = _op.shape_of(_op.shape_of(inputs[0], dtype), dtype) + axis = _op.where(axis < _op.const(0, dtype), axis + rank, axis) + return _op.squeeze(inputs[0], fold_constant(axis)) + + class Split(OnnxOpConverter): """Operator converter for Split.""" @@ -2818,7 +2835,8 @@ def _impl_v12(cls, inputs, attr, params): alpha = _op.const(attr.get("alpha", 1.0), dtype) zero = _op.const(0, dtype) one = _op.const(1, dtype) - return _op.maximum(zero, x) + _op.minimum(zero, alpha * (_op.exp(x / alpha) - one)) + out = _op.maximum(zero, x) + _op.minimum(zero, alpha * (_op.exp(x / alpha) - one)) + return out class MaxRoiPool(OnnxOpConverter): @@ -4149,7 +4167,7 @@ def _get_convert_map(opset): "ScatterElements": Scatter.get_converter(opset), "ScatterND": ScatterND.get_converter(opset), "EyeLike": EyeLike.get_converter(opset), - "Squeeze": AttrCvt("squeeze", {"axes": "axis"}), + "Squeeze": Squeeze.get_converter(opset), "Unsqueeze": Unsqueeze.get_converter(opset), "Pad": Pad.get_converter(opset), "Shape": 
Shape.get_converter(opset), diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py index c8235ec9375a..c909764319d9 100644 --- a/python/tvm/relay/op/dyn/_transform.py +++ b/python/tvm/relay/op/dyn/_transform.py @@ -26,6 +26,7 @@ _reg.register_broadcast_schedule("dyn.broadcast_to") _reg.register_injective_schedule("dyn.reshape") _reg.register_injective_schedule("dyn.expand_dims") +_reg.register_injective_schedule("dyn.squeeze") _reg.register_broadcast_schedule("dyn.tile") _reg.register_injective_schedule("dyn.one_hot") _reg.register_injective_schedule("dyn.full") @@ -258,3 +259,24 @@ def _sparse_to_dense_shape_func(output_shape, ndim): @_reg.register_shape_func("dyn.sparse_to_dense", True) def sparse_to_dense_shape_func(attrs, inputs, out_ndims): return [_sparse_to_dense_shape_func(inputs[3], out_ndims[0])] + + +@script +def _squeeze_shape_func_input_data(data, axis, ndims): + out = output_tensor((ndims,), "int64") + out_i = 0 + for i in const_range(data.shape[0]): + not_in_axis = True + for j in const_range(axis.shape[0]): + if i == axis[j]: + not_in_axis = False + if not_in_axis: + out[out_i] = int64(data[i]) + out_i += 1 + + return out + + +@_reg.register_shape_func("dyn.squeeze", [False, True]) +def dynamic_squeeze_shape_func(attrs, inputs, out_ndims): + return [_squeeze_shape_func_input_data(inputs[0], inputs[1], out_ndims[0])] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index fe1a73ca231a..234e76b11813 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -149,7 +149,7 @@ def squeeze(data, axis=None): data : tvm.relay.Expr The input data to the operator. - axis : None or List[int] + axis : None or List[int] or Expr The set of axes to remove. If axis = None, remove all axis of dimensions 1. If any specified axis has dimension that does not equal 1, it is an error. @@ -159,6 +159,10 @@ def squeeze(data, axis=None): result : tvm.relay.Expr The squeezed result. 
""" + if isinstance(axis, Constant): + axis = list(axis.data.numpy()) + if isinstance(axis, Expr): + return _dyn_make.squeeze(data, axis) return _make.squeeze(data, axis) diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index 848d058f0af3..64baa6066522 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -692,6 +692,63 @@ RELAY_REGISTER_OP("dyn.expand_dims") .set_attr("FTVMCompute", ExpandDimsCompute) .set_attr("TOpPattern", kInjective); +bool DynSqueezeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // [data, axes, output] + ICHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + if (data == nullptr) { + return false; + } + const auto* axes = types[1].as(); + if (axes == nullptr) { + return false; + } + + ICHECK_EQ(axes->shape.size(), 1) << "Got" << axes->shape.size() << "expected 1"; + ICHECK(axes->shape[0].as()) << "axes expected to be static rank"; + size_t output_rank = data->shape.size() - axes->shape[0].as()->value; + std::vector result_shape(output_rank, Any()); + reporter->Assign(types[2], TensorType(result_shape, data->dtype)); + return true; +} + +Array SqueezeCompute(const Attrs& attrs, const Array& inputs, + const Type& out_type) { + const auto* out_ttype = out_type.as(); + ICHECK(out_ttype != nullptr); + Array newshape; + for (auto val : out_ttype->shape) { + newshape.push_back(val.as()->ToVar()); + } + return {topi::reshape(inputs[0], newshape)}; +} + +Expr MakeDynSqueeze(Expr data, Expr axes) { + auto attrs = make_object(); + static const Op& op = Op::Get("dyn.squeeze"); + return Call(op, {data, axes}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.dyn._make.squeeze").set_body_typed(MakeDynSqueeze); + +RELAY_REGISTER_OP("dyn.squeeze") + .describe(R"code(Remove axes of value 1 in input tensor at the dimensions given by axes + +- **data**: The input data to the operator. +- **axes**: The axes to squeeze. 
+
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(2)
+    .set_attrs_type<SqueezeAttrs>()
+    .add_argument("data", "Tensor", "The input tensor.")
+    .add_argument("axes", "Tensor", "The axes to squeeze.")
+    .set_support_level(3)
+    .add_type_rel("DynSqueeze", DynSqueezeRel)
+    .set_attr<FTVMCompute>("FTVMCompute", SqueezeCompute)
+    .set_attr<TOpPattern>("TOpPattern", kInjective)
+    .set_attr<TReshapeOp>("TReshapeOp", true);
+
 }  // namespace dyn
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc
index 318022fb86f5..751271d2add3 100644
--- a/src/relay/transforms/dynamic_to_static.cc
+++ b/src/relay/transforms/dynamic_to_static.cc
@@ -45,6 +45,15 @@ class DynamicToStaticMutator : public MixedModeMutator {
           }
           return Expr(nullptr);
         }},
+        {Op::Get("dyn.squeeze"),
+         [this](const CallNode* call_node) {
+           auto args = PrepareArgs(call_node);
+           if (const ConstantNode* axis = args[1].as<ConstantNode>()) {
+             ICHECK_EQ(axis->data->ndim, 1);
+             return MakeSqueeze(call_node->args[0], ToVector(axis->data));
+           }
+           return Expr(nullptr);
+         }},
         {Op::Get("dyn.tile"),
          [this](const CallNode* call_node) {
           auto args = PrepareArgs(call_node);
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 084a5b4e4733..1cf6ffff762c 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -4993,8 +4993,6 @@ def verify_eyelike(indata):
     "test_split_variable_parts_2d",
     "test_split_variable_parts_default_axis",
     "test_split_zero_size_splits",
-    "test_squeeze",
-    "test_squeeze_negative_axes",
     "test_strnormalizer_export_monday_casesensintive_lower",
     "test_strnormalizer_export_monday_casesensintive_nochangecase",
     "test_strnormalizer_export_monday_casesensintive_upper",
diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py
index 8c57e1dc4a9f..22583eda4a40 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level3.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level3.py
@@ -92,6 +92,22 @@ def verify_reshape(shape, newshape, oshape):
     verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
 
 
+def test_squeeze():
+    def verify_squeeze(shape, dtype, axis):
+        x = relay.var("x", relay.TensorType(shape, dtype))
+        assert axis is not None
+        np_axis = tuple(axis)
+        axis = relay.var("axis", relay.TensorType([len(axis)], "int64"))
+        squeeze = relay.squeeze(x, axis=axis)
+        func = relay.Function([x, axis], squeeze)
+        x_data = np.random.random_sample(shape).astype(dtype)
+        ref_res = np.squeeze(x_data, axis=np_axis)
+        verify_func(func, [x_data, np.array(np_axis).astype("int64")], ref_res)
+
+    verify_squeeze((1, 3, 1), "float32", [0])
+    verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
+
+
 @tvm.testing.uses_gpu
 def test_dyn_expand_dims():
     def verify_expand_dims(
diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py
index a34c4ac6f705..5b61733bbd76 100644
--- a/tests/python/relay/test_pass_dynamic_to_static.py
+++ b/tests/python/relay/test_pass_dynamic_to_static.py
@@ -72,6 +72,31 @@ def verify_reshape(shape, newshape, oshape):
     verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
 
 
+@tvm.testing.uses_gpu
+def test_dynamic_to_static_squeeze():
+    def verify_squeeze(shape, axis, oshape):
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        y = relay.var("y", relay.TensorType(axis, "float32"))
+        z = relay.squeeze(x, relay.shape_of(y))
+        func = run_infer_type(relay.Function([x, y], z))
+        func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType())
+
+        zz = func2.body
+        assert isinstance(zz, relay.Call)
+        assert zz.op == relay.op.get("squeeze")
+        assert "axis=" in zz.astext()
+        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
+
+        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
+        y_data = np.random.uniform(low=-1, high=1, size=axis).astype("float32")
+        ref_res = np.squeeze(x_data, axis)
+        verify_func(func2, [x_data, y_data], ref_res)
+
+    verify_squeeze((1, 3, 4, 1), (0,), (3, 4, 1))
+    verify_squeeze((1, 3, 4, 1), (3,), (1, 3, 4))
+    verify_squeeze((1, 3, 4, 1), (0, 3), (3, 4))
+
+
 @tvm.testing.uses_gpu
 def test_dynamic_to_static_double_reshape():
     def verify_reshape(shape, newshape):