From eba5f8d6293acba3bcfe056a73da66da760c6129 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 14 Oct 2017 12:56:52 -0700 Subject: [PATCH 1/2] [TVM] Introduce target generic dispatch system --- docs/api/python/index.rst | 1 + docs/api/python/target.rst | 13 + docs/api/python/topi.rst | 30 +- python/tvm/_ffi/ndarray.py | 6 +- python/tvm/_ffi/runtime_ctypes.py | 3 + python/tvm/build_module.py | 59 ++-- python/tvm/target.py | 329 +++++++++++++++--- src/codegen/llvm/llvm_common.cc | 2 + tests/python/integration/test_gemm.py | 3 +- tests/python/unittest/test_lang_target.py | 44 +++ topi/python/topi/cuda/conv2d_nchw.py | 3 + topi/python/topi/cuda/dense.py | 2 + topi/python/topi/cuda/depthwise_conv2d.py | 2 + topi/python/topi/cuda/injective.py | 6 +- topi/python/topi/cuda/pooling.py | 3 + topi/python/topi/cuda/reduction.py | 2 + topi/python/topi/cuda/softmax.py | 2 + topi/python/topi/generic/__init__.py | 19 + topi/python/topi/generic/injective.py | 32 ++ topi/python/topi/generic/nn.py | 142 ++++++++ topi/python/topi/nn/conv2d.py | 27 +- topi/python/topi/rasp/conv2d.py | 18 +- topi/python/topi/rasp/depthwise_conv2d.py | 4 +- topi/tests/python/test_topi_broadcast.py | 7 +- topi/tests/python/test_topi_conv2d.py | 2 +- topi/tests/python/test_topi_conv2d_nchw.py | 5 +- topi/tests/python/test_topi_dense.py | 3 +- .../python/test_topi_depthwise_conv2d.py | 15 +- topi/tests/python/test_topi_pooling.py | 6 +- topi/tests/python/test_topi_reduce.py | 4 +- topi/tests/python/test_topi_softmax.py | 7 +- topi/tests/python/test_topi_transform.py | 31 +- 32 files changed, 671 insertions(+), 161 deletions(-) create mode 100644 docs/api/python/target.rst create mode 100644 tests/python/unittest/test_lang_target.py create mode 100644 topi/python/topi/generic/__init__.py create mode 100644 topi/python/topi/generic/injective.py create mode 100644 topi/python/topi/generic/nn.py diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index 4a5ddebc02d7..ee830e031462 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -8,6 +8,7 @@ Python API intrin tensor schedule + target build module ndarray diff --git a/docs/api/python/target.rst b/docs/api/python/target.rst new file mode 100644 index 000000000000..0f824324d0c5 --- /dev/null +++ b/docs/api/python/target.rst @@ -0,0 +1,13 @@ +tvm.target +---------- +.. automodule:: tvm.target + +.. autofunction:: tvm.target.generic_func + +.. autoclass:: tvm.target.Target + :members: + +.. autofunction:: tvm.target.cuda +.. autofunction:: tvm.target.rocm +.. autofunction:: tvm.target.rasp +.. autofunction:: tvm.target.create diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index a645a701f012..4f7d8cace31b 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -37,13 +37,11 @@ Index .. autosummary:: - topi.cuda.schedule_conv2d_nchw - topi.cuda.schedule_conv2d_hwcn - topi.cuda.schedule_depthwise_conv2d_nchw - topi.cuda.schedule_depthwise_conv2d_nhwc - topi.cuda.schedule_reduce - topi.cuda.schedule_broadcast - topi.cuda.schedule_injective + topi.generic.schedule_conv2d_nchw + topi.generic.schedule_depthwise_conv2d_nchw + topi.generic.schedule_reduce + topi.generic.schedule_broadcast + topi.generic.schedule_injective topi ~~~~ @@ -75,14 +73,12 @@ topi.nn .. autofunction:: topi.nn.depthwise_conv2d_nhwc -topi.cuda -~~~~~~~~~ -.. automodule:: topi.cuda +topi.generic +~~~~~~~~~~~~ +.. automodule:: topi.generic -.. autofunction:: topi.cuda.schedule_conv2d_nchw -.. autofunction:: topi.cuda.schedule_conv2d_hwcn -.. 
autofunction:: topi.cuda.schedule_depthwise_conv2d_nchw -.. autofunction:: topi.cuda.schedule_depthwise_conv2d_nhwc -.. autofunction:: topi.cuda.schedule_reduce -.. autofunction:: topi.cuda.schedule_broadcast -.. autofunction:: topi.cuda.schedule_injective +.. autofunction:: topi.generic.schedule_conv2d_nchw +.. autofunction:: topi.generic.schedule_depthwise_conv2d_nchw +.. autofunction:: topi.generic.schedule_reduce +.. autofunction:: topi.generic.schedule_broadcast +.. autofunction:: topi.generic.schedule_injective diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index b12e1fdf7c8e..b0dfd0f73fd9 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -56,11 +56,7 @@ def context(dev_type, dev_id=0): assert tvm.context("cuda", 0) == tvm.gpu(0) """ if isinstance(dev_type, string_types): - if dev_type not in TVMContext.STR2MASK: - if dev_type.find("nvptx") != -1: - dev_type = "cuda" - if dev_type.find("rocm") != -1: - dev_type = "rocm" + dev_type = dev_type.split()[0] if dev_type not in TVMContext.STR2MASK: raise ValueError("Unknown device type %s" % dev_type) dev_type = TVMContext.STR2MASK[dev_type] diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 596e13f1f3fe..cfadd18188f5 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -100,9 +100,12 @@ class TVMContext(ctypes.Structure): 12: 'ext_dev', } STR2MASK = { + 'llvm': 1, + 'stackvm': 1, 'cpu': 1, 'gpu': 2, 'cuda': 2, + 'nvptx': 2, 'cl': 4, 'opencl': 4, 'metal': 8, diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 621f8a727274..259a33773c76 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -15,6 +15,7 @@ from . import module from . import codegen from . import ndarray +from . import target as _target class BuildConfig(object): """Configuration scope to set a build config option. @@ -238,7 +239,7 @@ def lower(sch, def build(sch, args=None, - target="llvm", + target=None, target_host=None, name="default_function", binds=None): @@ -252,36 +253,10 @@ def build(sch, args : list of Buffer or Tensor or Var, optional The argument lists to the function. - target : str, optional + target : str or :any:`tvm.target.Target`, optional The target and option of the compilation. - When the target is llvm, you can set options like: - - **-mtriple=** or **-target** - - Specify the target triple, which is useful for cross - compilation. - - - **-mcpu=** - - Specify a specific chip in the current architecture to - generate code for. By default this is infered from the - target triple and autodetected to the current architecture. - - - **-mattr=a1,+a2,-a3,...** - - Override or control specific attributes of the target, - such as whether SIMD operations are enabled or not. The - default set of attributes is set by the current CPU. - - - **-system-lib** - - Build TVM system library module. System lib is a global module that contains - self registered functions in program startup. User can get the module using - :any:`tvm.module.system_lib`. - It is useful in environments where dynamic loading api like dlopen is banned. - The system lib will be available as long as the result code is linked by the program. - - target_host : str, optional + target_host : str or :any:`tvm.target.Target` optional Host compilation target, if target is device. 
When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver @@ -301,6 +276,10 @@ def build(sch, ------- f : Function, or pair of functions The result function. + + Note + ---- + See the note on :any:`tvm.target` on target string format. """ if isinstance(sch, schedule.Schedule): if args is None: @@ -325,6 +304,9 @@ def build(sch, if x.name in fname_set: raise ValueError("Duplicate function name %s" % x.name) + target = _target.current_target() if target is None else target + target = _target.create(target) if target else _target.create("llvm") + fhost = [] fdevice = [] for func in flist: @@ -332,7 +314,7 @@ def build(sch, if BuildConfig.current.detect_global_barrier: func = ir_pass.ThreadSync(func, "global") func = ir_pass.ThreadSync(func, "shared") - warp_size = 32 if target == "cuda" else 1 + warp_size = target.thread_warp_size func = ir_pass.LowerThreadAllreduce(func, warp_size) fsplits = [s for s in ir_pass.SplitHostDevice(func)] fhost.append(fsplits[0]) @@ -345,29 +327,28 @@ def build(sch, else: raise ValueError("unknown function type %d" % func.func_type) - if not target.startswith("llvm") and target not in ("stackvm", "ext_dev") and not fdevice: + if "gpu" in target.keys and not fdevice: warnings.warn( "Specified target %s, but cannot find device code, did you do bind?" % target) - device = "cpu" if target.startswith("llvm") or target == "stackvm" else target - device_type = ndarray.context(device, 0).device_type + device_type = ndarray.context(target.target_name, 0).device_type fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost] fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost] if not target_host: - if device == "cpu": + if device_type == ndarray.cpu(0).device_type: target_host = target assert not fdevice else: target_host = "llvm" if module.enabled("llvm") else "stackvm" - + target_host = _target.create(target_host) target_device = target - fdevice = [ir_pass.LowerIntrin(x, target_device) for x in fdevice] - fhost = [ir_pass.LowerIntrin(x, target_host) for x in fhost] + fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice] + fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost] fhost = [ir_pass.CombineContextCall(x) for x in fhost] - mhost = codegen.build_module(fhost, target_host) + mhost = codegen.build_module(fhost, str(target_host)) if fdevice: - mdev = codegen.build_module(fdevice, target_device) + mdev = codegen.build_module(fdevice, str(target_device)) mhost.import_module(mdev) return mhost diff --git a/python/tvm/target.py b/python/tvm/target.py index 8fcc44519703..930ca33b760e 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -1,63 +1,310 @@ -"""Target management API of tvm""" +"""Target management API of TVM. +TVM's target string is in fomat `` [-option=value]...``. + +Note +---- +The list of options include: + +- **-device=** + + The device name. + +- **-mtriple=** or **-target** + + Specify the target triple, which is useful for cross + compilation. + +- **-mcpu=** + + Specify a specific chip in the current architecture to + generate code for. By default this is infered from the + target triple and autodetected to the current architecture. + +- **-mattr=a1,+a2,-a3,...** + + Override or control specific attributes of the target, + such as whether SIMD operations are enabled or not. The + default set of attributes is set by the current CPU. + +- **-system-lib** + + Build TVM system library module. 
System lib is a global module that contains + self registered functions in program startup. User can get the module using + :any:`tvm.module.system_lib`. + It is useful in environments where dynamic loading api like dlopen is banned. + The system lib will be available as long as the result code is linked by the program. + +We can use :any:`tvm.target.create` to create a tvm.target.Target from the target string. +We can also use other specific function in this module to create specific targets. +""" from __future__ import absolute_import +from ._ffi.base import _LIB_NAME + +try: + from decorator import decorate +except ImportError as err_msg: + # Allow decorator to be missing in runtime + if _LIB_NAME != "libtvm_runtime.so": + raise err_msg + + +def _merge_opts(opts, new_opts): + """Helper function to merge options""" + if isinstance(new_opts, str): + new_opts = new_opts.split() + if new_opts: + return opts + new_opts + return opts + + class Target(object): - """A Target describes the target type on which computation should be carried on""" - default_target = None - str2type = {'x86': 1, 'cuda': 2, 'rasp': 3} - type2str = {1: 'x86', 2: 'cuda', 3: 'rasp'} - def __init__(self, target_type): - """Constructs a context.""" - if isinstance(target_type, Target): - self.target_typeid = target_type.target_typeid - else: - self.target_typeid = Target.str2type[target_type] + """Target device information, use through TVM API. - @property - def target_type(self): - """Returns the target type of current target.""" - return Target.type2str[self.target_typeid] + Parameters + ---------- + target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "ext_dev"} + The major target name. - def __hash__(self): - """Compute hash value of target for dictionary lookup""" - return hash(self.target_typeid) + options : list of str, optional + Additional arguments appended to the target. - def __eq__(self, other): - """Compares two targets. Two targets are equal if they - have the same target type. 
- """ - return isinstance(other, Target) and \ - self.target_typeid == other.target_typeid + Note + ---- + Do not use class constructor, you can create target using the following functions + + - :any:`tvm.target.create` create target from string + - :any:`tvm.target.rasp` create raspberry pi target + - :any:`tvm.target.cuda` create CUDA target + - :any:`tvm.target.rocm` create ROCM target + """ + current = None + + def __init__(self, + target_name, + options=None): + self.target_name = target_name + self.options = _merge_opts([], options) + self.device_name = "" + # Parse device option + for item in self.options: + if item.startswith("-device="): + self.device_name = item.split("=")[1] + # Target query searchs device name first + if self.device_name: + self.keys = (self.device_name,) + else: + self.keys = () + # Target configuration handling + self.thread_warp_size = 1 + if target_name in ("llvm", ): + self.keys += ("cpu",) + elif target_name in ("cuda", "nvptx"): + self.keys += ("cuda", "gpu") + self.max_num_threads = 512 + self.thread_warp_size = 32 + elif target_name in ("rocm", "opencl"): + # For now assume rocm schedule for opencl + self.keys += ("rocm", "gpu") + self.max_num_threads = 256 + elif target_name in ("metal",): + self.keys += ("gpu",) + self.max_num_threads = 256 + elif target_name in ("stackvm", "ext_dev"): + # Do not now class for stacvm or ext_dev + pass + else: + raise ValueError("Unknown target name %s" % target_name) def __str__(self): - return '%s' % (self.target_type) + return " ".join([self.target_name] + self.options) def __repr__(self): return self.__str__() def __enter__(self): - self._old_target = Target.default_target - Target.default_target = self + self._old_target = Target.current + if self._old_target is not None and str(self) != str(self._old_target): + warnings.warn( + "Override target '%s' with new target scope '%s'" % ( + self._old_target, self)) + Target.current = self return self def __exit__(self, ptype, value, trace): - Target.default_target = self._old_target + Target.current = self._old_target + + +def generic_func(fdefault): + """Wrap a target generic function. + + Generic function allows registeration of further functions + that can be dispatched on current target context. + If no registered dispatch is matched, the fdefault will be called. + + Parameters + ---------- + fdefault : function + The default function. + + Returns + ------- + fgeneric : function + A wrapped generic function. + + Example + ------- + .. code-block:: python + + import tvm + # wrap function as target generic + @tvm.target.generic_func + def my_func(a): + return a + 1 + # register specialization of my_func under target cuda + @my_func.register("cuda") + def my_func_cuda(a): + return a + 2 + # displays 3, because my_func is called + print(my_func(2)) + # displays 4, because my_func_cuda is called + with tvm.target.cuda(): + print(my_func(2)) + """ + dispatch_dict = {} + func_name = fdefault.__name__ + + def register(key, func=None, override=False): + """Register function to be the dispatch function. + + Parameters + ---------- + key : str or list of str + The key to be registered. + + func : function + The function to be registered. + + override : bool + Whether override existing registeration. + + Returns + ------- + The register function is necessary. 
+ """ + def _do_reg(myf): + key_list = [key] if isinstance(key, str) else key + for k in key_list: + if k in dispatch_dict and not override: + raise ValueError( + "Key is already registered for %s" % func_name) + dispatch_dict[k] = myf + return myf + if func: + return _do_reg(myf) + return _do_reg + + def dispatch_func(func, *args, **kwargs): + """The wrapped dispath function""" + target = current_target() + if target is None: + return func(*args, **kwargs) + for k in target.keys: + if k in dispatch_dict: + return dispatch_dict[k](*args, **kwargs) + return func(*args, **kwargs) + fdecorate = decorate(fdefault, dispatch_func) + fdecorate.register = register + return fdecorate + + +def cuda(options=None): + """Returns a cuda target. + + Parameters + ---------- + options : list of str + Additional options + """ + return Target("cuda", options) + + +def rocm(options=None): + """Returns a ROCM target. + + Parameters + ---------- + options : list of str + Additional options + """ + return Target("rocm", options) + + +def rasp(options=None): + """Returns a rasp target. + + Parameters + ---------- + options : list of str + Additional options + """ + opts = ["-device=rasp", + "-mtriple=armv7l-none-linux-gnueabihf", + "-mcpu=cortex-a53", + "-mattr=+neon"] + opts = _merge_opts(opts, options) + return Target("llvm", opts) + + +def create(target_str): + """Get a target given target string. + + Parameters + ---------- + target_str : str + The target string. + + Returns + ------- + target : Target + The target object -Target.default_target = Target('x86') + Note + ---- + See the note on :any:`tvm.target` on target string format. + """ + if isinstance(target_str, Target): + return target_str + if not isinstance(target_str, str): + raise ValueError("target_str has to be string type") + arr = target_str.split() + # Parse device option + device_name = "" + for item in arr[1:]: + if item.startswith("-device="): + device_name = item.split("=")[1] + if device_name == "rasp": + return rasp(arr[1:]) + return Target(arr[0], arr[1:]) -def x86(): - """Returns a x86 target.""" - return Target('x86') -def cuda(): - """Returns a cuda target.""" - return Target('cuda') +def current_target(allow_none=True): + """Returns the current target. -def rasp(): - """Returns a rasp target.""" - return Target('rasp') + Parameters + ---------- + allow_none : bool + Whether allow the current target to be none -def current_target(): - """Returns the current target.""" - return Target.default_target + Raises + ------ + ValueError if current target is not set. + """ + if Target.current: + return Target.current + if not allow_none: + raise RuntimeError( + "Requires a current target in generic function, but it is not set. " + "Please set it using `with TargetObject:`") + return Target.current diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 05972c3bcfe7..dfe51fb373f7 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -82,6 +82,8 @@ GetLLVMTargetMachine(const std::string& target_str, } else { LOG(FATAL) << "invalid -mfloat-abi option " << value; } + } else if (key == "-device") { + // pass } else { LOG(FATAL) << "unknown option " << key; } diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index 5733e11a84ec..fea342baade6 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -68,7 +68,8 @@ def check_device(device): print("skip because %s is not enabled.." 
% device) return - f = tvm.build(s, [A, B, C], device) + with tvm.target.create(device): + f = tvm.build(s, [A, B, C]) ctx = tvm.context(device, 0) # launch the kernel. n = nn diff --git a/tests/python/unittest/test_lang_target.py b/tests/python/unittest/test_lang_target.py new file mode 100644 index 000000000000..4e13c76d5f39 --- /dev/null +++ b/tests/python/unittest/test_lang_target.py @@ -0,0 +1,44 @@ +import tvm + +@tvm.target.generic_func +def mygeneric(data): + # default generic function + return data + 1 + +@mygeneric.register(["cuda", "gpu"]) +def cuda_func(data): + return data + 2 + +@mygeneric.register("rocm") +def rocm_func(data): + return data + 3 + +@mygeneric.register("cpu") +def rocm_func(data): + return data + 10 + + +def test_target_dispatch(): + with tvm.target.cuda(): + assert mygeneric(1) == 3 + + with tvm.target.rocm(): + assert mygeneric(1) == 4 + + with tvm.target.create("cuda"): + assert mygeneric(1) == 3 + + with tvm.target.rasp(): + assert mygeneric(1) == 11 + + with tvm.target.create("metal"): + assert mygeneric(1) == 3 + + try: + mygeneric(0) + raise RuntimeError("not reached") + except RuntimeError: + pass + +if __name__ == "__main__": + test_target_dispatch() diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py index 8e0f22781c1d..34cc61b1d78c 100644 --- a/topi/python/topi/cuda/conv2d_nchw.py +++ b/topi/python/topi/cuda/conv2d_nchw.py @@ -3,6 +3,7 @@ import tvm from .. import util from .. import tag +from .. import generic def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L): """Schedule conv2d for specific feature_in_out_filter pattern""" @@ -483,6 +484,8 @@ def traverse(OP): traverse(outs[0].op) return s + +@generic.schedule_conv2d_nchw.register(["cuda", "gpu"]) def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw. diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 207aabc9e7ce..e7b142758295 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -3,7 +3,9 @@ from __future__ import absolute_import as _abs import tvm from .. import tag +from .. import generic +@generic.schedule_dense.register(["cuda", "gpu"]) def schedule_dense(outs): """Schedule for dense operator. diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index e14e189957f1..935d8a98b811 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -3,7 +3,9 @@ import tvm from ..util import get_const_tuple from .. import tag +from .. import generic +@generic.schedule_depthwise_conv2d_nchw.register(["cuda", "gpu"]) def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d nchw forward. diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py index a83eb67be8bd..940f91e5938c 100644 --- a/topi/python/topi/cuda/injective.py +++ b/topi/python/topi/cuda/injective.py @@ -1,17 +1,21 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm +from .. 
import generic def _schedule_injective(op, sch): x = op.output(0) fused = sch[x].fuse(*sch[x].op.axis) - num_thread = 512 + target = tvm.target.current_target() + target = target if target else tvm.target.cuda() + num_thread = target.max_num_threads bx, tx = sch[x].split(fused, factor=num_thread) sch[x].bind(bx, tvm.thread_axis("blockIdx.x")) sch[x].bind(tx, tvm.thread_axis("threadIdx.x")) return sch +@generic.schedule_injective.register(["cuda", "gpu"]) def schedule_injective(outs): """Schedule for injective op. diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 800da009f3c2..fc1850d31221 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -2,7 +2,9 @@ """Schedule for pooling operators""" import tvm from .. import tag +from .. import generic +@generic.schedule_global_pool.register(["cuda", "gpu"]) def schedule_global_pool(outs): """Schedule for global_pool. @@ -63,6 +65,7 @@ def traverse(OP): return s +@generic.schedule_pool.register(["cuda", "gpu"]) def schedule_pool(outs): """Schedule for pool. diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index bfa2615bf6e3..02250fadb90d 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -3,6 +3,7 @@ from __future__ import absolute_import as _abs import tvm from .. import tag +from .. import generic def _schedule_reduce(op, sch, is_idx_reduce=False): if is_idx_reduce: @@ -62,6 +63,7 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): return sch +@generic.schedule_reduce.register(["cuda", "gpu"]) def schedule_reduce(outs): """Schedule for inject->reduce->bcast ops. diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index d57c9e10fe0e..5d98fb45df75 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -1,7 +1,9 @@ # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" import tvm +from .. import generic +@generic.schedule_softmax.register(["cuda", "gpu"]) def schedule_softmax(outs): """Schedule for softmax op. diff --git a/topi/python/topi/generic/__init__.py b/topi/python/topi/generic/__init__.py new file mode 100644 index 000000000000..d10d5c5ebecc --- /dev/null +++ b/topi/python/topi/generic/__init__.py @@ -0,0 +1,19 @@ +# pylint: disable=wildcard-import +"""Generic declaration and schedules. + +This is a recommended way of using TOPI API. +To use the generic schedule function, user must set +the current target scope using with block. See also :any:`tvm.target` + +Example +------- +.. code-block:: python + + # create schedule that dispatches to topi.cuda.schedule_injective + with tvm.target.create("cuda"): + s = tvm.generic.schedule_injective(outs) +""" +from __future__ import absolute_import as _abs + +from .nn import * +from .injective import * diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py new file mode 100644 index 000000000000..6adf218d18a1 --- /dev/null +++ b/topi/python/topi/generic/injective.py @@ -0,0 +1,32 @@ +# pylint: disable=invalid-name +"""generic declaration and schedules.""" +from __future__ import absolute_import as _abs + +import tvm + +@tvm.target.generic_func +def schedule_injective(outs): + """Schedule for injective op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. 
+ + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + target = tvm.target.current_target(allow_none=False) + if target.target_name != "llvm": + raise RuntimeError("schedule_injective not registered for '%s'" % target) + x = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + s[x].fuse(s[x].op.axis) + return s + +schedule_elemwise = schedule_injective +schedule_broadcast = schedule_injective diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py new file mode 100644 index 000000000000..7f45ccd03e45 --- /dev/null +++ b/topi/python/topi/generic/nn.py @@ -0,0 +1,142 @@ +"""Generic nn operators""" +from __future__ import absolute_import as _abs +import tvm + + +def _default_schedule(outs, auto_inline): + """Default schedule for llvm.""" + target = tvm.target.current_target(allow_none=False) + if target.target_name != "llvm": + raise RuntimeError("schedule_pool not registered for '%s'" % target) + s = tvm.create_schedule([x.op for x in outs]) + if auto_inline: + x = outs[0] + tvm.schedule.AutoInlineInjective(s) + s[x].fuse(s[x].op.axis) + return s + + +@tvm.target.generic_func +def schedule_conv2d_nchw(outs): + """Schedule for conv2d nchow + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_depthwise_conv2d_nchw(outs): + """Schedule for conv2d nchow + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_reduce(outs): + """Schedule for reduction + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, True) + + +@tvm.target.generic_func +def schedule_softmax(outs): + """Schedule for softmax + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_dense(outs): + """Schedule for dense + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_pool(outs): + """Schedule for pool + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_global_pool(outs): + """Schedule for global pool + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. 
+ + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 215ef43ecee0..db3a6079f96a 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -1,9 +1,8 @@ -# pylint: disable=invalid-name, unused-variable, too-many-locals +# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument """Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple import tvm -from tvm import target as _target from .pad import pad from .util import get_pad_tuple from ..util import simplify @@ -51,9 +50,7 @@ # platform specific schedule _CONV_SCHEDULE = {} -# platform specific declaration -_CONV_DECLARATION = {} - +@tvm.target.generic_func def conv2d(data, kernel, stride, padding, layout='NCHW'): """Conv2D operator. @@ -80,10 +77,6 @@ def conv2d(data, kernel, stride, padding, layout='NCHW'): 4-D with shape [batch, out_channel, out_height, out_width] """ # search platform specific declaration first - target = _target.current_target() - if target in _CONV_DECLARATION: - return _CONV_DECLARATION[target](data, kernel, stride, padding, layout) - # default declaration if layout == 'NCHW': return conv2d_nchw(data, kernel, stride, padding) @@ -105,15 +98,15 @@ def _get_workload(data, kernel, stride, padding): return Workload(IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) -def _get_schedule(wkl, target=None): +@tvm.target.generic_func +def _get_schedule(wkl): + # pylint: disable=unreachable """ Get the platform specific schedule. """ - if target is None: - target = _target.current_target() - else: - target = _target.Target(target) - assert target in _CONV_SCHEDULE, "no schedule for such target: {}".format(target) - return _CONV_SCHEDULE[target](wkl) - + target = tvm.target.current_target() + raise RuntimeError( + "No schedule for current target:{}".format(target)) + # This return has no use, merely to supress pylint warning + return wkl def _spatial_pack(data, kernel, stride, padding): """ Compute convolution with pack on spatial axes. """ diff --git a/topi/python/topi/rasp/conv2d.py b/topi/python/topi/rasp/conv2d.py index 7d07030ad887..86cb8a9d01a0 100644 --- a/topi/python/topi/rasp/conv2d.py +++ b/topi/python/topi/rasp/conv2d.py @@ -4,11 +4,12 @@ import tvm from tvm import target as _target from .. import tag +from ..nn.conv2d import conv2d, _get_schedule from ..nn.conv2d import SpatialPack, Im2ColPack -from ..nn.conv2d import _CONV_DECLARATION, _CONV_SCHEDULE from ..nn.conv2d import _WORKLOADS, _SCH_TO_DECL_FUNC -from ..nn.conv2d import _get_workload, _get_schedule +from ..nn.conv2d import _get_workload from ..nn.util import infer_pad, infer_stride +from .. 
import generic _SCHEDULES = [ SpatialPack(1, 8, 4, 1, 4, True), @@ -36,6 +37,7 @@ Im2ColPack(7, 4, 1, 4, True), ] +@_get_schedule.register("rasp") def _schedule_conv2d(wkl): if wkl not in _WORKLOADS: raise ValueError("no schedule for such workload: {}".format(wkl)) @@ -43,8 +45,8 @@ def _schedule_conv2d(wkl): sch = _SCHEDULES[idx] return sch -_CONV_SCHEDULE[_target.rasp()] = _schedule_conv2d +@conv2d.register("rasp") def _declaration_conv2d(data, kernel, stride, padding, layout): assert layout == 'NCHW', "only support NCHW convolution on rasp" assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" @@ -52,7 +54,6 @@ def _declaration_conv2d(data, kernel, stride, padding, layout): sch = _get_schedule(wkl) return _SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding) -_CONV_DECLARATION[_target.rasp()] = _declaration_conv2d def _schedule_spatial_conv2d(s, data, data_pad, data_vec, kernel, kernel_vec, @@ -64,7 +65,9 @@ def _schedule_spatial_conv2d(s, data, data_pad, data_vec, else: stride = infer_stride(data_pad, kernel, output) wkl = _get_workload(data, kernel, stride, padding) - sch = _get_schedule(wkl, 'rasp') + + with tvm.target.rasp(): + sch = _get_schedule(wkl) H, W = wkl.height, wkl.width CI, CO = wkl.in_filter, wkl.out_filter @@ -170,7 +173,9 @@ def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, else: stride = infer_stride(data_pad, kernel, output) wkl = _get_workload(data, kernel, stride, padding) - sch = _get_schedule(wkl, 'rasp') + + with _target.rasp(): + sch = _get_schedule(wkl) H, W = wkl.height, wkl.width CI = wkl.in_filter @@ -275,6 +280,7 @@ def _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, return s +@generic.schedule_conv2d_nchw.register(["cpu", "rasp"]) def schedule_conv2d(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/rasp/depthwise_conv2d.py b/topi/python/topi/rasp/depthwise_conv2d.py index 49465bdad639..e695f0463852 100644 --- a/topi/python/topi/rasp/depthwise_conv2d.py +++ b/topi/python/topi/rasp/depthwise_conv2d.py @@ -5,7 +5,7 @@ import tvm from .. import tag from ..nn.util import infer_pad, infer_stride, get_pad_tuple - +from .. import generic _Workload = namedtuple('Workload', ['height', 'width', 'channel', 'multiplier', @@ -145,7 +145,7 @@ def _schedule(s, data, data_pad, kernel, output, last): return s - +@generic.schedule_depthwise_conv2d_nchw.register(["cpu", "rasp"]) def schedule_depthwise_conv2d(outs): """Schedule for depthwise_conv2d nchw forward. 
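
The test diffs that follow all use the same pattern enabled by this patch: declare the
compute, enter a target scope, and let ``topi.generic`` dispatch to whichever schedule
was registered for that target's keys. A minimal sketch of that flow (the op and shape
here are illustrative, not taken from the tests):

.. code-block:: python

    import tvm
    import topi

    # A simple elementwise op; the shape is illustrative only.
    A = tvm.placeholder((1024,), name="A")
    B = topi.nn.relu(A)

    # Inside a target scope, topi.generic dispatches to the schedule
    # registered for the target's keys (topi.cuda.schedule_injective here),
    # and tvm.build picks up the same target when none is passed explicitly.
    with tvm.target.create("cuda"):
        s = topi.generic.schedule_injective([B])
        f_gpu = tvm.build(s, [A, B])

    # Under llvm the generic fallback (a plain CPU schedule) is used instead.
    with tvm.target.create("llvm"):
        s = topi.generic.schedule_injective([B])
        f_cpu = tvm.build(s, [A, B])
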
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index e993b5324176..dbd1c37a221b 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -8,16 +8,16 @@ def verify_broadcast_to_ele(in_shape, out_shape): # Build the logic and compile the function A = tvm.placeholder(shape=in_shape, name="A") B = topi.broadcast_to(A, out_shape) - s = topi.cuda.schedule_broadcast(B) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) - data_nd = tvm.nd.array(data_npy, ctx) out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx) for _ in range(1): @@ -48,11 +48,12 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"): C = topi.broadcast_minimum(A, B) else: raise NotImplementedError - s = topi.cuda.schedule_broadcast(C) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(C) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py index b731bfee997a..3e6978752b29 100644 --- a/topi/tests/python/test_topi_conv2d.py +++ b/topi/tests/python/test_topi_conv2d.py @@ -14,8 +14,8 @@ def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, paddin A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d(A, W, stride, padding) + s = topi.generic.schedule_conv2d_nchw([B]) - s = topi.rasp.schedule_conv2d([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index d2fc7cd5b2a3..fd645ecd024b 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -14,8 +14,6 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d_nchw(A, W, stride, padding) C = topi.nn.relu(B) - s1 = topi.cuda.schedule_conv2d_nchw([B]) - s2 = topi.cuda.schedule_conv2d_nchw([C]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @@ -35,6 +33,9 @@ def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s1 = topi.generic.schedule_conv2d_nchw([B]) + s2 = topi.generic.schedule_conv2d_nchw([C]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index 94dcb14af4d2..94badb81769c 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -12,7 +12,6 @@ def verify_dense(batch, in_dim, out_dim, use_bias=True): C = tvm.placeholder((out_dim,), name='C') D = topi.nn.dense(A, B, C if use_bias else None) D = 
topi.nn.relu(D) - s = topi.cuda.schedule_dense(D) dtype = A.dtype # use memoize to pickle the test data for next time use @@ -33,6 +32,8 @@ def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_dense(D) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index b0f09c55ca4e..dc507d1bfc3d 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -4,7 +4,7 @@ from scipy import signal from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nchw, schedule_depthwise_conv2d_nhwc +from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding): @@ -21,15 +21,18 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, stride=[stride_h, stride_w], padding=padding) ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) - # schedule - s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d) - s2 = schedule_depthwise_conv2d_nchw(ScaleShift) - s3 = schedule_depthwise_conv2d_nchw(Relu) + def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + # schedule + s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) + s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift) + s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) + ctx = tvm.context(device, 0) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) @@ -88,7 +91,7 @@ def get_ref_data(): check_device("cuda") check_device("metal") check_device("rocm") - + def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding): in_width = in_height filter_channel = in_channel diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index 88b5a769093c..3cddb7dfbc64 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -12,7 +12,6 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type): A = tvm.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type) B = topi.nn.relu(B) - s = topi.cuda.schedule_pool(B) dtype = A.dtype a_np = np.random.uniform(size=(n, ic, ih, iw)).astype(dtype) @@ -36,6 +35,8 @@ def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_pool(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) @@ -57,7 +58,6 @@ def verify_global_pool(n, c, h, w, pool_type): A = tvm.placeholder((n, c, h, w), name='A') B = topi.nn.global_pool(A, pool_type=pool_type) B = topi.nn.relu(B) - s = topi.cuda.schedule_global_pool(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) if pool_type == 'avg': @@ -70,6 +70,8 @@ def 
check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_global_pool(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 7922090ec112..07e55a54bcf6 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -45,11 +45,13 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"): out_dtype = "int32" else: raise NotImplementedError - s = topi.cuda.schedule_reduce(B) + def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_reduce(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="sum") # Test diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py index 7359c6fa56f0..912e2a8c3e5d 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -12,8 +12,6 @@ def verify_softmax(m, n): s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) - s = topi.cuda.schedule_softmax(B) - a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) @@ -21,6 +19,8 @@ def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_softmax(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -43,7 +43,6 @@ def verify_log_softmax(m, n): s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) - s = topi.cuda.schedule_softmax(B) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) @@ -52,6 +51,8 @@ def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_softmax(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 623c8c6185ab..cd5bedf6c46e 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -6,11 +6,12 @@ def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): A = tvm.placeholder(shape=in_shape, name="A") B = topi.expand_dims(A, axis, num_newaxis) - s = topi.cuda.schedule_broadcast(B) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="expand_dims") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) @@ -23,17 +24,18 @@ def check_device(device): check_device("opencl") check_device("cuda") check_device("metal") - check_device("rocm") + check_device("rocm") def verify_tranpose(in_shape, axes): A = tvm.placeholder(shape=in_shape, name="A") B = topi.transpose(A, axes) - s = topi.cuda.schedule_injective(B) def check_device(device): if not 
tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="tranpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) @@ -46,16 +48,17 @@ def check_device(device): check_device("cuda") check_device("opencl") check_device("metal") - check_device("rocm") + check_device("rocm") def verify_reshape(src_shape, dst_shape): A = tvm.placeholder(shape=src_shape, name="A") B = topi.reshape(A, dst_shape) - s = topi.cuda.schedule_injective(B) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="reshape") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -68,16 +71,17 @@ def check_device(device): check_device("cuda") check_device("opencl") check_device("metal") - check_device("rocm") + check_device("rocm") def verify_squeeze(src_shape, axis): A = tvm.placeholder(shape=src_shape, name="A") B = topi.squeeze(A, axis=axis) - s = topi.cuda.schedule_injective(B) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) ctx = tvm.context(device, 0) foo = tvm.build(s, [A, B], device, name="squeeze") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -94,18 +98,19 @@ def check_device(device): check_device("cuda") check_device("opencl") check_device("metal") - check_device("rocm") + check_device("rocm") def verify_concatenate(shapes, axis): tensor_l = [] for i, shape in enumerate(shapes): tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis) - s = topi.cuda.schedule_injective(out_tensor) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_injective(out_tensor) ctx = tvm.context(device, 0) foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -118,16 +123,17 @@ def check_device(device): check_device("cuda") check_device("opencl") check_device("metal") - check_device("rocm") + check_device("rocm") def verify_split(src_shape, indices_or_sections, axis): A = tvm.placeholder(shape=src_shape, name="A") tensor_l = topi.split(A, indices_or_sections, axis=axis) - s = topi.cuda.schedule_injective(tensor_l) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return + with tvm.target.create(device): + s = topi.generic.schedule_injective(tensor_l) ctx = tvm.context(device, 0) foo = tvm.build(s, [A] + tensor_l, device, name="split") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -142,7 +148,7 @@ def check_device(device): check_device("opencl") check_device("metal") check_device("rocm") - + def test_expand_dims(): verify_expand_dims((3, 10), (3, 10, 1, 1), 2, 2) verify_expand_dims((3, 10), (1, 3, 10), -3, 1) @@ -190,4 +196,3 @@ def test_split(): test_squeeze() test_concatenate() test_split() - From 25cf3894d8ac1a8e81464898649c6a5e8d0f313b Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 14 Oct 2017 14:33:40 
-0700 Subject: [PATCH 2/2] fix target warning --- python/tvm/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/target.py b/python/tvm/target.py index 930ca33b760e..f55a2d613d05 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -40,6 +40,7 @@ """ from __future__ import absolute_import +import warnings from ._ffi.base import _LIB_NAME try:
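
The ``import warnings`` added here backs the override check in ``Target.__enter__``:
entering a scope whose target string differs from the currently active one emits a
warning, while re-entering the same target stays silent. A small sketch of the scoping
behavior (the asserts are illustrative):

.. code-block:: python

    import tvm

    # No target scope is active by default.
    assert tvm.target.current_target() is None

    with tvm.target.create("cuda"):
        assert tvm.target.current_target().target_name == "cuda"
        with tvm.target.rocm():  # warns: overriding target 'cuda' with 'rocm'
            assert tvm.target.current_target().target_name == "rocm"
        # The previous scope is restored on exit.
        assert tvm.target.current_target().target_name == "cuda"

    # Requesting a target where one is mandatory raises instead of returning None.
    try:
        tvm.target.current_target(allow_none=False)
    except RuntimeError:
        pass
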