diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd84e67fce5a..62aa6db6e46d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,8 +119,9 @@ else(MSVC)
 endif(MSVC)
 
 # add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc")
-FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h")
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h"
+  "nnvm/src/*.h" "nnvm/include/*.h")
 assign_source_group("Source" ${GROUP_SOURCE})
 assign_source_group("Include" ${GROUP_INCLUDE})
 
@@ -169,6 +170,19 @@ endif(USE_VM_PROFILER)
 file(GLOB DATATYPE_SRCS src/codegen/datatype/*.cc)
 list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
 
+if(NOT MSVC)
+  file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc)
+  list(APPEND COMPILER_SRCS ${COMPILER_VERILOG_SRCS})
+endif()
+
+file(GLOB_RECURSE NNVM_COMPILER_SRCS
+  nnvm/src/c_api/*.cc
+  nnvm/src/core/*.cc
+  nnvm/src/pass/*.cc
+  nnvm/src/compiler/*.cc
+  nnvm/src/top/*.cc
+)
+
 file(GLOB TOPI_SRCS
     topi/src/*.cc
 )
@@ -241,8 +255,6 @@ include(cmake/modules/LLVM.cmake)
 include(cmake/modules/Micro.cmake)
 include(cmake/modules/ANTLR.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
-include(cmake/modules/contrib/CODEGENC.cmake)
-include(cmake/modules/contrib/DNNL.cmake)
 include(cmake/modules/contrib/Random.cmake)
 include(cmake/modules/contrib/MicroStandaloneRuntime.cmake)
 include(cmake/modules/contrib/Sort.cmake)
@@ -283,6 +295,7 @@ if(NOT USE_SGX STREQUAL "OFF")
   add_dependencies(tvm_runtime sgx_edl tvm_t)
   install(TARGETS tvm_t ARCHIVE DESTINATION lib${LIB_SUFFIX})
 endif()
+add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
 
 if(USE_THREADS)
   message(STATUS "Build with thread support...")
@@ -292,11 +305,14 @@ if(USE_THREADS)
   target_link_libraries(tvm Threads::Threads)
   target_link_libraries(tvm_topi Threads::Threads)
   target_link_libraries(tvm_runtime Threads::Threads)
+  target_link_libraries(nnvm_compiler Threads::Threads)
 endif(USE_THREADS)
 
 target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
 target_link_libraries(tvm_topi tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
 target_link_libraries(tvm_runtime ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(tvm_runtime_static ${TVM_RUNTIME_LINKER_LIBS})
+target_link_libraries(nnvm_compiler tvm)
 
 if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
   set(HIDE_SYMBOLS_LINKER_FLAGS "-Wl,--exclude-libs,ALL")
@@ -306,6 +322,7 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
   target_link_libraries(tvm ${HIDE_SYMBOLS_LINKER_FLAGS})
   target_link_libraries(tvm_topi ${HIDE_SYMBOLS_LINKER_FLAGS})
   target_link_libraries(tvm_runtime ${HIDE_SYMBOLS_LINKER_FLAGS})
+  target_link_libraries(nnvm_compiler ${HIDE_SYMBOLS_LINKER_FLAGS})
 endif()
 
 # Related headers
@@ -315,7 +332,10 @@ target_include_directories(
 target_include_directories(
   tvm_topi
   PUBLIC "topi/include")
-
+target_include_directories(
+  nnvm_compiler
+  PUBLIC "nnvm/include"
+  PUBLIC "topi/include")
 
 # Tests
 set(TEST_EXECS "")
@@ -354,6 +374,8 @@ add_custom_target(runtime DEPENDS tvm_runtime)
 install(TARGETS tvm DESTINATION lib${LIB_SUFFIX})
 install(TARGETS tvm_topi DESTINATION lib${LIB_SUFFIX})
 install(TARGETS tvm_runtime DESTINATION lib${LIB_SUFFIX})
+install(TARGETS tvm_runtime_static DESTINATION lib${LIB_SUFFIX})
+install(TARGETS nnvm_compiler DESTINATION lib${LIB_SUFFIX})
 
 if (INSTALL_DEV)
   install(
@@ -376,6 +398,11 @@ if (INSTALL_DEV)
     FILES_MATCHING
     PATTERN "*.h"
     )
+  install(
+    DIRECTORY "nnvm/include/." DESTINATION "include"
+    FILES_MATCHING
+    PATTERN "*.h"
+    )
 else(INSTALL_DEV)
   install(
     DIRECTORY "include/tvm/runtime/." DESTINATION "include/tvm/runtime"
@@ -388,4 +415,5 @@ endif(INSTALL_DEV)
 if(MSVC)
   target_compile_definitions(tvm PRIVATE -DTVM_EXPORTS)
   target_compile_definitions(tvm_runtime PRIVATE -DTVM_EXPORTS)
+  target_compile_definitions(nnvm_compiler PRIVATE -DNNVM_EXPORTS)
 endif()
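The new `nnvm_compiler` target above produces `libnnvm_compiler.so` next to `libtvm.so`. As a quick sanity check after building, a minimal sketch (the default in-tree `build/` directory is an assumption; adjust the path for out-of-tree builds):

```python
import ctypes
import os

# Assumed default CMake build directory for this repo.
lib_path = os.path.join("build", "libnnvm_compiler.so")
ctypes.CDLL(lib_path)  # raises OSError if the library or its dependencies are missing
print("loaded", lib_path)
```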
DESTINATION "include" + FILES_MATCHING + PATTERN "*.h" + ) else(INSTALL_DEV) install( DIRECTORY "include/tvm/runtime/." DESTINATION "include/tvm/runtime" @@ -388,4 +415,5 @@ endif(INSTALL_DEV) if(MSVC) target_compile_definitions(tvm PRIVATE -DTVM_EXPORTS) target_compile_definitions(tvm_runtime PRIVATE -DTVM_EXPORTS) + target_compile_definitions(nnvm_compiler PRIVATE -DNNVM_EXPORTS) endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 99b6a35c576e..b402e72b5e53 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -69,7 +69,6 @@ We do encourage everyone to work anything they are interested in. - [Liangfu Chen](https://github.com/liangfu): @liangfu - [Wei Chen](https://github.com/wweic): @wweic - [Zhi Chen](https://github.com/zhiics): @zhiics -- [Neo Chien](https://github.com/cchung100m): @cchung100m - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg - [Balint Cristian](https://github.com/cbalint13): @cbalint13 - [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h @@ -121,3 +120,4 @@ We do encourage everyone to work anything they are interested in. - [Cody Hao Yu](https://github.com/comaniac) - [Chris Nuernberger](https://github.com/cnuernber) - [Shoubhik Bhattacharya](https://github.com/shoubhik) +- [Neo Chien](https://github.com/cchung100m) diff --git a/Jenkinsfile b/Jenkinsfile index ec79b9718a67..43118590629b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -57,7 +57,7 @@ tvm_multilib = "build/libtvm.so, " + "build/libvta_tsim.so, " + "build/libvta_fsim.so, " + "build/libtvm_topi.so, " + - tvm_runtime + "build/libnnvm_compiler.so, " + tvm_runtime // command to start a docker container docker_run = 'docker/bash.sh' @@ -309,15 +309,14 @@ stage('Integration Test') { } } }, - 'docs: GPU': { + 'legacy: GPU': { node('GPU') { - ws(per_exec_ws("tvm/docs-python-gpu")) { + ws(per_exec_ws("tvm/legacy-python-gpu")) { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh" + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_legacy.sh" } - pack_lib('mydocs', 'docs.tgz') } } } diff --git a/Makefile b/Makefile index d34fbe4c9d88..d3ad1030b9f2 100644 --- a/Makefile +++ b/Makefile @@ -69,12 +69,14 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc cpplint: python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include; + python3 3rdparty/dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src; python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src \ examples/extension/src examples/graph_executor/src pylint: python3 -m pylint python/tvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc python3 -m pylint topi/python/topi --rcfile=$(ROOTDIR)/tests/lint/pylintrc + python3 -m pylint nnvm/python/nnvm --rcfile=$(ROOTDIR)/tests/lint/pylintrc python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc jnilint: diff --git a/NEWS.md b/NEWS.md index 438bd3059ef5..658848b3e3bb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,899 +26,6 @@ Refer to the Roadmap issue for complete list on on-going version features. If you check in something that is not reflected in Roadmap issue, please reply to that issue so it can get added. -## 0.6 - -### Relay in Production -Relay is a functional, differentiable programming language designed to be an expressive intermediate representation for machine learning systems. 
-
-* Algebraic Data Types (ADT) support (#2442, #2575). ADT provides an expressive, efficient, and safe way to realize recursive computation (e.g., RNN). Refer to https://docs.tvm.ai/langref/relay_adt.html for more information.
-* Pass manager for Relay (#2546, #3226, #3234, #3191)
-* Most frameworks are now supported in Relay, including ONNX, Keras, Tensorflow, Caffe2, CoreML, NNVMv1, and MXNet (#2246).
-* Explicitly manifest memory and tensor allocations in Relay. (#3560)
-
-### Relay Virtual Machine
-The Relay Virtual Machine (Relay VM) is a new generation of runtime that strikes a balance between performance and flexibility when deploying and executing Relay programs. Previously, the graph runtime was able to utilize the fully static nature of the input graphs to perform aggressive optimizations such as fully static allocation and optimal memory reuse. When we introduce models which make use of control flow, recursion, dynamic shapes, and dynamic allocation, we must change how execution works.
-
-Relay VM is now usable and achieves decent performance for a variety of models and targets.
-
-* Design (#2810, #2915) and a first version of implementation (#2889)
-* Add VM runtime for Relay and compiler support (#3120, #3121, #2889, #3139)
-* Relay VM (pattern matching #3470, port to python #3391, serialization #3647)
-* Relay VM Profiler (#3727)
-* Support execution on devices for Relay VM (#3678)
-* [Relay][VM] Add more passes to VMCompiler (#4058)
-* [relay][vm] Separate VM runtime with executable (#4100)
-* Port VM, VM compiler, and Object into Python (#3391)
-* VM: Add AllocTensor instruction and better instruction printer (#3306)
-* [Relay][VM][Interpreter] Enable first-class constructors in VM and interpreter via eta expansion. (#4218)
-* [Relay][VM] Clean up the VM and VM profiler code (#4391)
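For reference, a minimal sketch of running a program through the VM executor with the v0.6-era API (`relay.create_executor` with kind `"vm"`; exact names are version-dependent assumptions):

```python
import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(2, 3), dtype="float32")
f = relay.Function([x], x + relay.const(1.0))
mod = relay.Module.from_expr(f)

# Compile and run via the Relay VM instead of the graph runtime.
ex = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")
print(ex.evaluate()(np.ones((2, 3), dtype="float32")))
```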
-
-### Training
-Relay is designed to natively support first-order and higher-order differentiation. The automatic differentiation infrastructure is now usable, and a number of operators with gradient support are available in the v0.6 release.
-
-* Higher order reverse mode automatic differentiation that works with control flow (#2496)
-* Higher order continuation passing style (#3456, #3485)
-* Relay gradient registration (clip #3509, `max_pool2d` and `avg_pool2d` #3601)
-* Relay AD algorithm (#3585)
-* Relay Training - allow gradient to return a tuple (#3600), numerical gradient check (#3630)
-* Improve AD for concatenate (#3729)
-* [Relay][Training] Add missing gradient check to gradient pass (#4169)
-* As a part of Relay's automatic differentiation system, we are adding primal gradients for Relay operators. Please refer to #2562 for tracking the progress.
-* Gradient for Conv2d (#3636)
-* Add gradient operators (#3857, #3894, #3901, #3915)
-* Add gradient for log-softmax (#4069)
-* [Relay][Training] Add gradient for Crossentropy (#3925)
-* [Relay][Training] Add and fix gradients (#4126)
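A minimal sketch of the AD entry point as it looked around v0.6 (`relay.transform.gradient`; the mode argument and the need to run type inference first are assumptions based on that era's API):

```python
import tvm
from tvm import relay

x = relay.var("x", shape=(4,), dtype="float32")
fwd = relay.Function([x], x * x)

mod = relay.Module.from_expr(fwd)
mod = relay.transform.InferType()(mod)  # gradient expects a typed function

# Returns a function computing (forward value, gradients w.r.t. inputs).
bwd = relay.transform.gradient(mod["main"], mod=mod, mode="first_order")
print(bwd)
```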
-
-### Quantization
-
-Low-bit inference is getting more and more popular as it benefits both performance and storage usage. TVM now supports two types of quantization. 1. Automatic quantization takes a floating-point model, does per-layer calibration, and generates a low-bit model. 2. TVM also imports pre-quantized models from Tensorflow and MXNet; a new dialect, QNN, is introduced to handle further lowering to normal operators.
-
-* Automatic Quantization
-  - Low-bit automatic quantization supported. (#2116). The workflow includes annotation, calibration and transformation.
-  - Refactor quantization codebase and fix model accuracy. (#3543)
-  - KL-divergence-based per-layer calibration. (#3538)
-  - Add option to select which convolution layers are quantized. (#3173)
-  - [Relay][Quantize] Integrate data-aware calibration into quantization. (#4295)
-* Pre-quantized model support (QNN operators and legalize pass).
-  - Add a legalize pass to Relay (#3672)
-  - Qnn Concatenate, quantize, dequantize and requantize operators (#3819, #3730, #3745, #3531)
-  - QNNtoRelay & QNNLegalize Pass utility (#3838, #3782)
-  - Requantize: Optimize lowering for some corner cases. (#3864)
-  - New quantized operator support: conv2d, add, dense (#3580, #3736, #3896, #3910)
-  - Do type checking for the input and kernel in the qnn conv2d (#3904)
-  - Legalize and AlterOpLayout for Intel int8. (#3961)
-  - Renaming tests to follow the Relay nomenclature. (#3975)
-  - Fix padding changes due to #3739 (#3989)
-  - Memorizing quantize node mapping to avoid duplicated simulated quantization (#3233)
-  - Infrastructure to support pre-quantized models (QNN) (#3971).
-  - [Relay][AlterOp] NHWC to NCHWc support for Pool, concatenate, sum. (#4059)
-  - [TOPI][x86] Cascade lake support. (#4123)
-  - [TOPI][x86] Legalize - Support int8xint8 convolution to use VNNI inst (#4196)
-  - Qnn dequantize with min max using Mxnet flavor to support Mxnet prequantized models. (#3945)
-  - Improve the lowering of Qnn Dense (#4213)
-  - Adding support for dequantizing from int32 to float32. (#4130)
-  - [QNN] Refactor fixed point multiplication in requantize (#4073)
-  - [Relay][Quantize] Use fixed point multiplications (#4160)
-  - Add support for quantized multiply to Relay (#4141)
-  - Use legalize to handle NHWC layout for `arm_cpu` (#3754)
-  - [QNN][Legalize] Specialize for Platforms w/o fast Int8 support (#4307)
-  - [QNN] Use Int16 upcast in Fallback Conv2D. (#4329)
-  - Retain input kernel scales in QNN dialect (#4292)
-  - [QNN] Lowering for Depthwise Convolution. (#4351)
-  - [QNN][TFLite] Parsing QNN Add op. Adding MobilenetV2. (#4142)
-  - [QNN][TFLite] Parsing TFLite quantized models. (#3900)
-  - Added tflite frontend support for quantized mean. (#4339)
-  - [Relay][Legalize] Legalize `conv2d_transpose` for NHWC (#4399)
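A minimal sketch of the automatic-quantization entry point circa v0.6 (`relay.quantize.qconfig`/`quantize`; the configuration knob shown and the toy network are illustrative assumptions, not the full calibration workflow):

```python
import tvm
from tvm import relay

data = relay.var("data", shape=(1, 3, 32, 32), dtype="float32")
weight = relay.var("weight", shape=(8, 3, 3, 3), dtype="float32")
conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3),
                       padding=(1, 1), channels=8)
mod = relay.Module.from_expr(relay.Function([data, weight], conv))

# Annotate, calibrate (here with a simple global scale), and transform.
with relay.quantize.qconfig(global_scale=8.0):
    qmod = relay.quantize.quantize(mod, params={})
print(qmod)
```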
-
-### Accelerator and Microcontroller Support
-
-TSIM is introduced to improve software and hardware integration and simulation accuracy. It integrates the hardware development process into the software stack. TSIM enables VTA to provide more accurate performance feedback, i.e., clock cycles, compared to the traditional functional model of a hardware accelerator. Moreover, a Chisel implementation for VTA is available, and it runs on top of TSIM.
-
-There has been a proliferation of resource-constrained and embedded devices that do not have operating systems or a mature software stack. MicroTVM is intended to support TVM on such bare-metal devices.
-
-* [TSIM] Enabling Cycle-Accurate Hardware Simulation for VTA (#3010, #3206, #3242)
-* Chisel implementation for VTA that runs on top of TSIM (#3258, #3347)
-* MicroTVM (#3227)
-* Relay Compilation + AutoTVM compatible operator libraries for VTA (#3135)
-* ChangeBatch pass for batched VTA compilation (#3656, #3660)
-* VTA fast simulator statistics (#3481)
-* TSIM improvements and fixes (#3505)
-* Chisel VTA enhancements and fixes (32bit support #3558, alu instruction generation #3592, coherence support #3593, separate types #3605, tensor issue/commit #3637, uop load request #3643, uop dma requests #3654)
-* VTA Runtime refactor for non-shared memory FPGAs (#3590)
-* VTA HLS codebase refactor for Ultra96 (#3496)
-* VTA support for batched inference (#3661)
-* VTA bitstream compilation for Intel FPGA (#3494)
-* TSIM: Introduce Virtual Memory for TSIM Driver (#3686)
-* Parallel TSIM hardware compilation with macOS and debug support (#3797)
-* Chisel: scale dram base address in hardware instead of runtime (#3772)
-* Chisel: run all unittests by default (#3766)
-* Chisel: improved Data Gen, Added ALU Test (#3743)
-* Chisel dependencies for TSIM CI (#3721)
-* Chisel: Added Module Unit Test Infrastructure (#3698)
-* Add ISA BitPat generation (#3891)
-* de10-nano driver (#3394)
-* Extending Vision model coverage compilation for VTA (#3740)
-* Conv2d transpose (deconvolution) operator support (#3777)
-* Support TLPP in function simulator. (#3555)
-* [VTA][Chisel] TSIM VTA Source Refactor (#4163)
-* [VTA][TSIM] Serial GEMM Application Added (#4082)
-
-### Rust Support
-Rust language support in TVM includes two parts. 1. The frontend wraps the current C API and exposes a Rust programming model. 2. The backend serves as an alternative to the C++ runtime. It provides a standalone WASM module and security support, e.g., SGX.
-
-* Rust frontend (#2292).
-* Unify types between bindings and pure Rust impl (#2616)
-* Rust: load syslib modules at compile time (#3274)
-* Rustify PackedFunc & Friends (#2969)
-* Rust DSO module (#2976)
-
-### Operator Support
-* A special operator `annotation.stop_fusion` to prevent it being fused with previous expressions (#2624).
-* `batch_matmul` supported (#2561).
-* `reverse_reshape` supported (#2503).
-* Faster-RCNN proposal operator for CUDA (#2420).
-* Vision operator for YOLO `yolo_reorg` (#1941).
-* `slice` operator for MXNet (#2662).
-* `arange` supported (#2621).
-* Vision operator `roi_align` (#2618).
-* `where` operator for MXNet (#2647).
-* Deformable conv2d (#2908)
-* Faster-RCNN Proposal OP (#2725)
-* ROI Pool operator (#2811)
-* Gluoncv SSD support on CPU (#2353)
-* shape, reverse, and sign op (#2749, #2800, #2775)
-* tile and repeat op (#2720)
-* logical operators (#2743, #2453)
-* stack op (#2729)
-* NCHWc upsampling (#2806)
-* clip and wrap mode support in take (#2858)
-* AlterLayout support for `intel_graphics` conv2d, depthwise conv2d (#2729, #2806)
-* Add foldr1 operator (#2928)
-* Add rsqrt operator (#2949)
-* `Gather_nd` exposed to relay (#2945)
-* `bitserial_conv2d` move to autotvm template and updates (#2819)
-* Port x86 NCHWc to AutoTVM for Task Extraction (#2664)
-* Implement relay `nn.bias_add` compute in C++ (#3027)
-* Rename output tensors for better readability (#3006)
-* int8 dense on CUDA & Dense op quantization (#2877)
-* Bitserial dense operators for CPU (#3051)
-* Enhance upsample operator to adapt onnx opset v9 (#2968)
-* Add adaptive pooling operator (#3085)
-* Add all operator (#3124)
-* Add cblas `batch_matmul` (#3210)
-* Add packing for int8 1x1 convolution and support the int8 group convolution on X86 (#2991)
-* Add op size (#3094)
-* x86 TOPI (`roi_align` #3475, `conv2d_transpose` #3491)
-* Intel INT8 (dilation in conv2d #3510, type checking #3516)
-* Reinterpretation of tensor elements (#3599)
-* Sparse-Dense for block-sparse multiplication (#3566)
-* Winograd matrix computation (#3553)
-* CUDA schedule for `pool_grad` (#3622), `group_conv2d` (#3663)
-* Bitserial operations conv2d, dense and bitpack (#3844)
-* Improve numeric gradient check (#3856)
-* Resize rework (#3788)
-* Improve `conv2d_transpose` CUDA schedule template (#3796)
-* SpaceToDepth and MirrorPad Operators (#3718)
-* Add variance and layer norm op (#3700)
-* Add `sparse_transpose` for Square CSR matrices (#3707)
-* TOPI: Memoize winograd matrix (#3687)
-* New TOPI operators: `erf`, `logical_and`, `logical_or`, `logical_not`, `isnan` (#3702, #3929, #3979)
-* Improve `ceil_divide` in tile/split (#3842)
-* [Relay][Frontend][TF] Add tensor array ops (#3798, #4309)
-* [TF][Op] Op where (#4045)
-* [TOPI] Add op argwhere (#3994)
-* [Relay] `crossentropy_with_logits` and its gradient (#4075)
-* [Relay][Op] Enhance Upsample Operator to support float scales (#4206)
-* [Relay][Op] Add instance norm op (#4004)
-
-### Frontend and User Interface
-* Frontend darknet (#2773)
-* Support tf.gather (#2935)
-* Support tf.where (#2936)
-* Adding ADD operator to tflite frontend for compiling the MobileNetV2 (#2919)
-* Support SpaceToBatchND/BatchToSpaceND in Tensorflow frontend (#2943)
-* Simplify TF `get_output_names` (#3025)
-* TF Tile Round Sign Pow Exp Reverse (#2960)
-* Gluoncv SSD support on the GPU (#2784)
-* Allow an op as loop var in Tensorflow (#3056)
-* Add `FULLY_CONNECTED` op into tflite frontend (#3019)
-* Add MXNet converter for RNN layer ops (#3125)
-* Add log op in tf frontend (#3111)
-* Add SoftPlus Sqrt in Tensorflow frontend (#3187)
-* Add onnx elemwise greater/less (#3186)
-* Add PlaceholderWithDefault (limited) implementation in TensorFlow (#3184)
-* Support `tf.math.reduce_prod` (#3166)
-* Better shape inference in TensorFlow Frontend (#3176)
-* Get list of unsupported ONNX operators (#2995)
-* Implement ONNX MaxPool-v8 and MaxPool-v10 (#3114)
-* Convert TFLite NCHW to NHWC (#3141)
-* Add Crop op converter (#3241)
-* TFLite frontend operator support: PAD, RESIZE, MUL, Reduce (min, max, mean, prod), LOGISTIC, elemwise operators (Sub, Divide, Power, Max, Min) (#3310, #3370, #3304, #3421, #3313, #3357)
-* Tensorflow frontend operator support: Abs, FloorDiv, GatherND, LeftShift, LogSoftmax, Max, Min, Mod, RightShift, ZerosLike, TruncateMod, Neg, ClipByValue, ResizeNearestNeighbor (#3270, #3211, #3393)
-* TFLite: Add `fused_activation_function` for ADD, SUB, MUL, DIV (#3372)
-* Support bidirectional RNN layer for MXNet (#3397)
-* TFLite operator support (pack #3521, split #3520)
-* Keras operator support (permute, softmax #3618)
-* TF operator support (BatchMatMul #3634)
-* TFLite frontend operator support: tile, transpose (#3814, #3705)
-* ONNX frontend operator support: PReLU for NNVM, Not, Sign, Equal (#3813, #3836, #3760)
-* Keras frontend operator support: Dot (#3668)
-* Add more cases to Keras `_convert_reshape` (#3846)
-* TensorFlow frontend operator support: OneHot, log1p, cos, sin (#3781, #3614)
-* Support BatchMatMul with input dimensions larger than 3 for TensorFlow (#3732)
-* ONNX new operator support: And, Tile, Erf (#3878, #3941, #3988)
-* MXNet new operator support: pad, conv1d, deconv1d (#3739)
-* TFLite new operator support: `batch_to_space_nd`, `space_to_batch_nd`, tanh, greater, relu (#3850, #3996, #3963, #4022)
-* TFLite: Support depthwise convolution multiplier greater than 1 (#3922)
-* Keras: Fix a case missed by the ReLU converter (#3917)
-* Keras: frontend upsample and 1 channel conv2d fixes (#3937)
-* Tensorflow: Convert scalar Const into tvm.relay.const (#3885)
-* TensorFlow: Add support for SquaredDifference (#3930)
-* [relay][frontend] clean up tf frontend (#3710)
-* [Relay][Topi][TensorFlow][ONNX][Lang] Add support for Any op (#4205)
-* [Relay][Frontend][ONNX] Add support for op Where (#4184)
-* [Relay][TopHub] Add switch to disable TopHub download (#4015)
-* Add parser support for CAST tflite operator (#4096)
-* Add parser support for `zeros_like` tflite operator (#4042)
-* Add parser support for SUM tflite operator (#4182)
-* Add support for tf.assert (as no-op) and `tf.no_op` to TF Relay frontend. (#4172)
-* [Relay][Frontend][ONNX] New Operators and Opsets to Support BERT (#4197)
-* [Relay][Params] Add APIs for storing and retrieving parameters from individual functions. (#4194)
-* Add `build_create_shared_func` to tvm/contrib/cc.py (#3840)
-* Tensorflow saved model support for NNVM (#2493) and Relay (#2586).
-* Introduced `HybridModule` (#2477) so that a normal TVM schedule can be compiled to the hybrid target, run, and dumped to Hybrid Script.
-* [Relay][Frontend][Tensorflow] add operator `add_n` (#4181)
-* [Relay][Frontend][Tensorflow] StopGradient (#4238)
-* [Relay][Frontend][ONNX] Add support for broadcasting to Where and MatMul (#4267)
-* [TFLite] Support PRelu (#4298)
-* [Frontend][MxNet] support mxnet cond op (#4311)
-* Add support for `quant.mul` operator in tflite frontend (#4283)
-* [Relay][Frontend][ONNX] operator support: DepthToSpace, SpaceToDepth (#4271)
-* [Relay][Frontend][Tensorflow] Add `conv2d_transpose`. (#4300)
-* [Frontend] Add TensorFlow FloorMod (#4308)
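As a concrete entry point, a minimal sketch of importing a model through one of these frontends (the MXNet importer shown here mirrors the `relay.frontend.from_mxnet` call used elsewhere in this diff; the model choice is illustrative):

```python
from mxnet.gluon.model_zoo.vision import get_model
from tvm import relay

block = get_model("mobilenet0.25", pretrained=True)
shape_dict = {"data": (1, 3, 224, 224)}
mod, params = relay.frontend.from_mxnet(block, shape_dict)
print(mod["main"])  # the imported network as a Relay function
```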
-
-### Runtime and Backend Support
-* Make external library extend TVM's NDArray more easily (#2613).
-* Improvements for NNPACK integration, including CI tests and winograd (#2846, #2868, #2856, #2721)
-* Improvements for OpenCL runtime (#2741, #2737)
-* GraphRuntime: Enable sharing parameters of a model among multiple threads (#3384)
-* Android runtime argsort support (#3472)
-* GraphRuntime enhancements (`set_input_zero_copy` #3416)
-* A new minimal runtime implementation (~12kb .text on ARMv7/x86) for TVM.
-* Add AVX512VNNI support for TVM (#3388)
-* Enable miopen Group Convolution (#3987)
-* Minimal runtime (~12kb .text on ARMv7/x86) for subset of TVM models (#3567)
-* [RUNTIME] Separate runtime related contrib into runtime/contrib (#4207)
-* [topi] add ARM v8.2 udot (uint8) support (#3978)
-* [codegen] Add multiple operands and function support when using fp16 compilation (#4056)
-* [TOPI] Added support for Mali Bifrost target (#4047)
-* [topi] enable fp16 sort for arm (#4084)
-* Add OpenOCD Low-Level Device (RISC-V Support) (#3756)
-* Add wave 32 bc for AMD ROCm backend (#3984)
-* [RUNTIME] Support C++ RPC (#4281)
-* [TOPI][OP] Support Faster-RCNN Proposal OP on CPU (#4297)
-* [TVM][RUNTIME] A minimum example to generate external library wrappers for DSOModule (#4280)
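For context, a minimal end-to-end sketch of the standard graph-runtime flow these items improve (v0.6-era `tvm.contrib.graph_runtime` API; names are version-dependent):

```python
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime

# Build a tiny Relay function to LLVM.
x = relay.var("x", shape=(1, 8), dtype="float32")
func = relay.Function([x], relay.nn.softmax(x))
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(func, target="llvm")

# Deploy it through the graph runtime.
module = graph_runtime.create(graph, lib, tvm.cpu(0))
module.set_input("x", np.random.rand(1, 8).astype("float32"))
module.run()
print(module.get_output(0).asnumpy())
```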
-
-### Language and Architecture
-* Support custom datatypes (#2900)
-* Add the acc16 intrinsic support (#3081)
-* Handle float16 constants & fix BatchNorm (#3260)
-* Structural hash - incorporate the var type into its hash (#3267)
-* Relay C++ Build Module (#3082, #3144, #3174)
-* Enable decorating python class to be a Relay Pass (#3364)
-* Make Partial Eval support interprocedural optimization and termination check. (#3033)
-* Introduce feature manager to Relay. (#3236)
-* Use Relay parser to define the Relay prelude (#3043)
-* Mechanism to detect incomplete expression match in Relay (#3203)
-* EQ/NE operators support for StringImm expressions (#3283)
-* Introduce CanonicalizeCast pass to formally reduce memory overhead introduced by fused cast operations (#3280)
-* Support overloading comparison operations in Relay (#3168)
-* Mac count: provide a pass to calculate the number of multiply-accumulate operations in a network (#2609).
-  - support for `conv_2d_transpose` (#3469)
-  - [Relay][Pass] Count MAC for BatchMatMul (#4157)
-  - Detect depthwise conv2d in `mac_count` pass (#3083)
-* Add Tuple pattern (#3596)
-* Text format support for ADTs and prelude (#3863, #3939)
-* Add new IR pass CombineParallelDense (#3862)
-* Add support for `EQ` op in the deduce bound and the loop partition (#3775)
-* Introduce base-class IRMutatorWithAnalyzer (#3969)
-* Define more standard global functions in the prelude of relay program, including foldr1, hd, tl, nth, list update (#2928, #2917, #2771, #2866)
-* Add SkipVectorize pass (#3222, #3228)
-* [Relay][Pass] Add pass to remove unused functions in relay module (#4334)
-
-### Symbolic shape enhancement
-* Add shape function for symbolic shape. It enables certain cases for broadcast with symbolic shapes. (#3606)
-* [tvm][any] broadcast with values other than one (#3967)
-* Symbolic shape support (broadcast op #3389)
-* Support reshape for dynamic shape in tf converter (#4185)
-* Runtime Shape Functions (#4179)
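A minimal sketch of what the symbolic-shape work enables, using `relay.Any()` for an unknown dimension (v0.6-era API; treat exact names as assumptions):

```python
import tvm
from tvm import relay

# First dimension unknown at compile time.
x = relay.var("x", shape=(relay.Any(), 4), dtype="float32")
y = relay.var("y", shape=(1, 4), dtype="float32")
f = relay.Function([x, y], relay.add(x, y))  # broadcast over a symbolic dim

mod = relay.Module.from_expr(f)
mod = relay.transform.InferType()(mod)
print(mod["main"].ret_type)  # e.g. Tensor[(?, 4), float32]
```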
-
-### Language and Architecture
-* An optimization pass to eliminate expressions which have the same functionality and same inputs (#2639).
-* Refactor text printer to add stream-like API and FunctionType support (#2605, #2882)
-* Build a scaffold for structured error handling (#2838). The new mechanism detects and rewrites error messages so that C++ and Python stack traces are unified and not redundant. Guidelines and conventions for error handling are also discussed.
-* Higher order reverse mode automatic differentiation that works with control flow (#2496)
-* Integer arithmetic analyzers, including modular set analysis, const integer bound analysis and rewrite simplifier (#2904, #2851, #2768, #2722, #2668, #2860)
-* Improve operator fusion for TupleGetItem in relay (#2914, #2929)
-* Compute FLOP of autotvm template for int8 models (#2776)
-* Common subexpression elimination pass in Relay (#2639)
-* Improve quantization in Relay (#2723)
-* Refactor `build_func` in measure module of autotvm to better support cross compiler (#2927)
-* Quantize all fields of concatenate (#2913)
-* Remove stale verilog generator (#2964)
-* Improve Relay printing (#2984, #2881, #3030, #3041)
-* Add `min_num_branches` option in CombineParallelConv2D (#2961)
-* Add `expr_visitor`, fix `expr_functor` exponential blowup problem (#2988)
-* Support Deriving channels when it is not provided in AlterLayout. (#2972)
-* Enhance BoundDeduce algorithm (#2795)
-* Enhance loop partition algorithm (#2956)
-* Better tuple fusion implementation (#3092)
-* Enhance fusion rule that starts from elemwise and broadcast (#2932)
-* Remove `on_device` op after annotation in heterogeneous pass (#3204)
-* Improve canonical and rewrite simplifier (#3132, #3149)
-* Capture constant external python variables in hybrid script (#3157)
-* Remove Peano nats from the prelude (#3045)
-* Macro to define NodeRef methods, constructor style example (#3224)
-* Consistent RAII scoping API (#3231)
-* Register all operators' attributes in Python (#3175)
-* Add module support in relay.build (#3424)
-* Relay pass infrastructure improvement (#3319, #3336, #3430, #3353)
-* Migrate Relay passes to pass manager (#3323, #3289, #3251, #3406)
-* Improve heterogeneous annotation by using visitor (#3261)
-* Support export ADT value in Python (#3299)
-* Extend TensorComputeOp to allow scalar inputs (#3300)
-* Transitioning low-level IR away from HalideIR (#3533, #3535)
-* Tags for ADT constructors (#3369)
-* IR dumping for debugging (#3493)
-* Pretty printer and parser roundtrip (#3460, #3536)
-* Relay type checking (conv2d weight dimension #3511, any shape #3221)
-* Relay Module enhancements (remove free variables #3476)
-* LLVM DWARF debug information (#3420)
-* Printer for Layout/BijectiveLayout (#3582)
-* Type inference escape hatch (#3571)
-* Making iterators compatible with constructors of STL containers (#3624)
-* Moving Conv, Dense, Concatenate InferTypes to header (#3783)
-* Simplify casts of constants 0 and 1 (#3758)
-* Conditionally replace reduction init axis. (#3408)
-* Improve Partial Evaluator (#3749, #3703)
-* Strict mode in Relay pattern matching (#3620)
-* Quit and clean when TVM is interrupted (#3640)
-* Make Type Relation catch more errors (#3899, #3699)
-* Refactor the way we interface between different modules of Relay (#3906)
-* Introduce `schedule_injective_from_existing` and unify external schedules for all targets (#3983)
-* [NODE][REFACTOR] Refactor reflection system in node. (#4189)
-* Unify node system and object (#4161, #4115, #4128)
-* [Relay][Refactor] Rename Datatype to ADT (#4156)
-* [Relay] fix exponential blowup in interpreter (#3559)
-* [Relay] Fix memory leak in the interpreter (#4155)
-* [rpc] use callback func to do send & recv (#4147)
-* Add `lift_if_then_else` pass to improve loop partitioning (#3865)
-* Decrease the complexity of CalcDep from exponential to linear (#4053)
-* [IR] Make iterators compatible with constructors of STL containers (#3624)
-* [Relay][Pass] Avoid FoldConstant folding some ops (#4245)
-* [Relay][Prelude] More dtypes support in `tensor_t` (#4233)
-* [NODE][REFACTOR] Rename IRFunctor->NodeFunctor, use func pointer (#4247)
-* [RUNTIME][REFACTOR] Use object protocol to support runtime::Module (#4289)
-* [CodeGen] Add build config option `disable_assert` to control whether to generate assert. (#4340)
-
-### Arithmetic Analysis
-* Formalize Integer Arithmetic Analysis (RFC: #2588). It aims to perform better context-dependent analysis, bound analysis, centralized arithmetic logic, and arithmetic simplification. (#3272, #3463, #3464, #3368, #3503, #3504, #3502, #3479, #3568)
-* Introduce FloorDiv/Mod, TruncDiv/Mod, and IndexDiv/Mod for better arithmetic simplification (#3976, #3986, #4000, #4014, #4008, #4028)
-* [ARITH] Use floordiv for the deduce bound (#4025)
-* [Simplifier] Rewrite simplification rule to eliminate unnecessary conditionals. (#4076)
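A small sketch of the simplifier at work on an index expression, via the v0.6-era `tvm.ir_pass.Simplify` helper and the newly introduced `tvm.floordiv` (the exact namespaces moved in later releases, so treat both as assumptions):

```python
import tvm

n = tvm.var("n")
expr = tvm.floordiv(n * 4 + 8, 4)  # floordiv comes from the work listed above
print(tvm.ir_pass.Simplify(expr))  # expect something equivalent to n + 2
```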
-
-### Runtime and Backend Support
-* Provide error msg for failure function call in tvm4j (#2967)
-* Expose backtrace symbols in Debug mode (#3001)
-* C++ GraphRuntimeCodegen, Deprecate Python2 (#2986)
-* Ensure interpreted functions can take values that are not TensorValues (#3015)
-* Make OpenCL runtime Compatible with OpenCL2.0 (#2897)
-* Handle INF and NAN in CUDA and OpenCL (#3194)
-* Update debug graph runtime for more precise layerwise timing (#3232)
-* ROCM support (llvm printing #3662, ld.lld finding #3664, save to file #3665)
-* Threadpool: make `spin_count` configurable (#3577)
-* RPC worker children termination (#3669)
-* Vulkan runtime reimplementation (stream approach) (#3849)
-* Vulkan backend supports Call::reinterpret and vectorized comparison (#3795)
-* Support MKL on Windows (#3837)
-* Vulkan IR builder (bool to float #3513)
-* Force `code_object_v2` for amd gpu backend (#4099)
-* [Codegen][cuda-fp16] fallback to fp32 simulation when cuda arch < sm53 (#4268)
-* Fix and refactoring for AMD gpu backend (#4305, #4321, #4341, #4342)
-* [Debugger] Sorting op-time breakdown for quicker analysis. (#4352)
-* [nvcc] enable multiple arch in one fatbin (#4377)
-* [RUNTIME] Move module export to the function level. (#4405)
-
-### Frontend and User Interface
-* Relay now supports saving and loading parameter dictionaries. (#2620)
-* Add `max_num_threads` to Hybrid Script, which allows users to get the max number of threads for GPU targets (#2672).
-* Improvements for tensorflow frontend (#2830, #2757, #2586), including decompiling tf control flow (#2830)
-* Improvements for mxnet frontend (#2844, #2777, #2772, #2706, #2704, #2709, #2739)
-* Improvements for keras frontend (#2842, #2854)
-* Improvements for DarkNet frontend (#2673)
-* Improvements for ONNX frontend (#2843, #2840)
-* Better profile result dump in Chrome Tracing format (#2922, #2863)
-* Unified error handling in NNVM and Relay frontends (#2828)
-* Improve NNVM to Relay conversion (#2734)
-* Remove `input_0d_mismatch` special handling for TF Frontend (#3087)
-* Bumped ONNX version from 1.1.0 to 1.4.1 (#3286)
-* Simplify parameter handling in Tensorflow frontend (#2993)
-* CoreML improvement for image scaler and padding (#3800)
-* Clean up TensorFlow frontend (#3710)
-* Darknet: Solve tvm parsing darknet resnext failure bug (#3778)
-* Frontend changes in `get_workload` (#3483)
-* [TF][Relay][Op] Pass module when infer shape (#4287)
-
-### AutoTVM
-* Support override in `register_topi_compute` and `register_topi_schedule`. (#3292)
-* Improve graph tuner dealing with Tuple. (#3649)
-* Add AutoTVM template for conv2d Intel int8. (#3955)
-* Add AutoTVM template for dense on CUDA. (#3923)
-* Add AutoTVM template for conv2d on Intel graphics. (#3839)
-* Optimizing autotvm task extraction speed. (#4138)
-* [AutoTVM] Add `batch_matmul` to tunable operations. (#4242)
-* Selecting tuning templates when extracting task. (#4338)
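For orientation, a compressed sketch of the AutoTVM flow these changes touch, assuming `mod` and `params` come from a frontend importer as shown earlier (API names reflect the v0.6 era; `tuning.log` is a placeholder file of tuning records):

```python
from tvm import autotvm, relay

# Extract tunable conv2d tasks from a Relay program.
tasks = autotvm.task.extract_from_program(
    mod["main"], target="llvm", params=params, ops=(relay.op.nn.conv2d,))

# ... tune `tasks` and append the measured records to tuning.log ...

# Compile, applying the best records found during tuning.
with autotvm.apply_history_best("tuning.log"):
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod, target="llvm", params=params)
```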
-
-### Performance Improvements
-* Enable AlterOpLayout pass for x86 on Relay (#2585). It is essential to get decent performance for CNN-based models on Intel CPUs.
-* Better intrinsic matching for x86 CPU and ARM CPU, including variants of vcvtph2ps and vmlal.s16 (#2925, #2748).
-* Improve injective schedule for ARM CPU (#2801)
-* Core functionality for Graph tuner (#2184)
-* Fast tanh implementation (#3255)
-* Improve multi-batch conv2d on x86 (#3308)
-* Improve `non_max_suppression` and `get_valid_counts` for CPU (#3305)
-* Improve `roi_align` performance for CPU (#3296)
-* Improve `nms` and `get_valid_count` performance (#3282)
-* Graph tuner for multiple subgraph (#3490)
-* For sparsity, fast transpose for square CSR matrices has now been merged, which is a good starting point for more general sparse type support.
-* Reduce `set_input` and `set_input_zero_copy` overhead (#3805)
-* Parallelize batch axis for ARM (#3931)
-* Support cuBLAS BatchMatMul (#3936)
-* Add AVX512VNNI support for TVM (#3388)
-* Enhance tuning space of split (#3949)
-* Enable miopen transpose convolution and fp16 support (#3952)
-* Improve `conv2d_transpose` schedule on X86 and CUDA (#3948)
-* Expose llvm.nearbyint intrinsic (#4001)
-* [TOPI][X86] Pool operator parallel support. (#4090)
-* Improve layout for several operators (#4103, #4040, #4080)
-* [Relay][VM] Fix constant folding issue in VM compiler (#4077)
-* [relay][vm] Reuse allocated device memory (#4170)
-* [Runtime] Enable option to use OpenMP thread pool (#4089)
-* [PERF] Parallelize reduction for CPU (#4158)
-* [TOPI] Tunable Template for Conv2D HWCN on CUDA (#4168)
-* [TOPI] Add valid auto tvm for Intel Graphics (#4078)
-* [TOPI] FIFO buffer op, to accelerate sequence modeling with dilated convolutions (#4039)
-* TensorCore Support using Intrinsic (#4136)
-* Auto TensorCore CodeGen (#4234)
-* Use cblas for dense and `batch_matmul` (#3787)
-* Update TOPI softmax compute and CPU schedule (#3680)
-* [VTA] Performance optimize, remove unnecessary contiguous memory use. (#4246)
-* [TOPI][AlterOpLayout][ARM] Enabling NHWC to NCHW layout transformation. (#4249)
-* [ThreadPool] Solve thread transitions issue (#4344)
-
-### Documentation
-* Tutorials for deep learning frameworks support in Relay.
-* Tutorial for running AutoTVM with Relay (#2594).
-* Document for Algebraic Data Types (#2575).
-* Move NNVM tutorials to Relay (#2783, #2785, #2766, #2693)
-* Documentation on operators (#2761)
-* Add gradient operator tutorial docs (#2751)
-* Add compiler pass tutorial docs (#2746)
-* Add Android Tutorial (#2977)
-* Developer documentation for InferBound pass (#3126)
-* Add missing targets to `target_name` documentation (#3128)
-* Various documentation improvements (#3133)
-* Add VM doc (#3188)
-* Update documents for TSim (#3409, #3318, #3302, #3343, #3206)
-* Improve tvm4j document describing LLVM support (#3404)
-* Tutorial migration to Python3 (#3498)
-* Android RPC README (#3500)
-* Documentation for Relay opcode (#3522)
-* Tutorial for pass manager (#3515)
-* Minimum version of Python in docs (#3588)
-* Relay pass infra (#3583)
-* X86 Autotune tutorial improvements (#3609)
-* YOLOv3 tiny Darknet tutorial (#3674)
-* SSD doc to avoid confusion (#3677)
-* Tutorial: Build a Graph Convolutional Network on TVM (#3681)
-* Add docs for analysis namespace (#3985)
-* [tutorial] Relay pass infra tutorial (#4083)
-* [DOCS] Add TensorFlow frontend docs (#4154)
-* Tutorial: update Building a Graph Convolutional Network tutorial (#4060)
-* [Docs] Add dependency of compilation with LLVM (#4117)
-* [Documentation] Fix example code in comment of `tvm.build_module.build()` (#4195)
-* TSIM: add virtual memory support to examples (#3868)
-* Fix the TF tutorial to run against TF2.0 and TF1.x (#4104)
-* Add `topi.nn.fifo_buffer` to TVM doc (#4343)
-* License statement (#4345, #4359, #4401, #4402, #4408, #4409, #4410, #4414, #4431)
-
-### Build and Test
-* Increase the robustness of CI tests (#2841, #2798, #2793, #2788, #2781, #2727, #2710, #2711, #2923)
-* Improve conda build (#2742)
-* Add caffe2 nnvm frontend to CI (#3018)
-* Use bridge network and expose port on macOS when launching docker image (#3086)
-* Run DarkNet tests (#2673)
-* Add file type check (#3116)
-* Always run cpptest during build to ensure library correctness (#3147)
-* Handle more file types in ASF header (#3235)
-* Add `test_forward_ssd_mobilenet_v1` to `tflite/test_forward` (#3350)
-* Add Azure build pipeline (#3458, #3459)
-* Update ci-gpu to v0.52 (#3374)
-* Enable more visible symbols by default (#3365)
-* Separate out legacy as a stage in CI (#3337)
-* Simplify build script, remove python 2 support (#3419)
-* Ignore rust cargo lock files in rat (#3314)
-* Improve CUDA Conda package build (#3281)
-* Update CMakeLists.txt to be more flexible to find third-party libraries (#3354)
-* Docker update conda package (#3344), requests and pillow (#3495), Android demo (#3499), rat install (#3527), ARM support (#3546), LLVM (#3590)
-* Relay-to-Python testing (#3156)
-* Code refactoring/remove (#3523, #3667)
-* Zero-rank testing (#3612)
-* CMake compilation (#3611, #3650, google test #3628)
-* Standalone wheel build for TOPI (#3657)
-* Fixing performance issues in PassUpDomain when fusing and splitting axes (#3073)
-* conda recipe (#3791)
-* Allow users to specify download directory (#3803)
-* Update docs for installation for CUDA (#3832)
-* Update `hybrid_script.rst` (#3799)
-* Acknowledge Halide attributions (#3824)
-* Add psutil dependency (#3780)
-* Temporary disable rust test (#3809)
-* Solve occasional CI issue when pad value is all 0 (#3801)
-* Towards TSIM CI testing (#3704)
-* Use pip3 for python3 (#3742)
-* Update docker image `ci_cpu,i386` to include verilator (#3738)
-* Remove sccache from Rust install (#3728)
-* Update dmlc-core to the latest commit (#3716)
-* Update GPU docker (#3709)
-* Add an option to build with -pthread (#3671)
-* Add DGL to `{ci_gpu, demo_cpu, demo_gpu}` docker images (#3692)
-* Use pytest instead of nosetest (#3524)
-* Enable NHWC of `relay.testing.mobilenet` (#3886)
-* Add .hsaco save/load for `tensor_expr` Tutorial (#3852)
-* Support LLVM trunk (#3907)
-* Remove GTest cmake flag from install docs (#3953)
-* Allow `USE_LLVM` to take extra arguments (#3954)
-* [CI] Pin NNPack pthreadtools version (#4152)
-* [TOPI] Fix flaky testcase for check round (#4211)
-* [CI] Move gpu docker binary to cuda10 (#4229)
-* [CI] use llvm9 for the gpu tests (#4224)
-* [CI] Update GPU docker to cuda10 (#4228)
-* [Relay] Install Relay Prelude program in package install (#4227)
-* [relay] use `time_evaluator` for measurement (#4191)
-* [Relay] Improve build error when no lowered funcs are produced (#4132)
-* [llvm] switch to use Align for llvm trunk (#4051)
-* [CUDA] Update `have_int8` condition to run on compute capability 7.x devices (#4214)
-* [DOCKER] Pin torchvision==0.4.1 (#4140)
-* [DOCKER] torch install depends on future package (#4098)
-* [CodeGen] Disable -mfloat-abi hard option for LLVM < 6.0 (#4071)
-* Add a python how-to example of deploying a tvm module with the tvm runtime only (#4094)
-* Hide symbols from dependent libraries if `HIDE_PRIVATE_SYMBOLS` is ON. (#4041)
-* [BUILD] Disable utvm standalone runtime by default (#4240)
-* Fix TSIM compile error in Linux (add missing -fPIC flag) (#3876)
-* Add scalafmt and format existing scala codebase (#3880)
-* Update TFLite wheel version to 1.13.1 (#3435)
-* Remove PEP498 f-string new feature to support python3.5 (#4250)
-* Require LLVM >= 9 for AMDGPU backend (#4253)
-* Rename ml.dmlc.tvm to org.apache.tvm (#4290)
-* [Test][TF][Relay] Fix argument preparation for vm test mode (#4296)
-* Add test for the `qnn_add` operator (#4282)
-* [CI][DOCKER] Add ONNX runtime dep (#4314)
-* [CI][DOCKER] Upgrade image to include onnx runtime (#4313)
-* [CI] Set workspace to be per executor (#4336)
-* [Build][Windows] Fix Windows build by including cctype (#4319)
-* [Contrib] Add MKL DNN option (#4323)
-* [Test][Relay][Pass] Add test case for lambda lift (#4317)
-* Remove Python imp module as it is deprecated (#4275)
-* Bump up CUDA log version in tophub.py (#4347)
-* Add rule for clean in APPs (#4364)
-* [Relay tests] Temporary Attr Update for Order-Independent Testing (#4357)
-* [CI] Avoid content-length request in test data download (#4375)
-* Compare all outputs in TFLite `test_forward_ssd_mobilenet_v1` (#4373)
-
-### Bug Fixes
-* [RELAY] Fix `get_int_tuple`. (#2691)
-* [ARITH] Select support for integer set analysis. (#2687)
-* [Relay] Fix error in ANF (too aggressively inlining atomic expressions and creating free variables). (#2665)
-* [Hybrid Script] Fix name conflict and attached scope problem. (#2649)
-* [Relay] Fix ANF for reference and pattern matching. (#2637)
-* [Relay] Fix fusion bug when call symbol that is not an operator. (#2630)
-* Fix missing header file. (#2629)
-* [Relay] Fix the bug in heterogeneous annotation which mistakenly steps into the fused op. (#2622)
-* [AutoTVM] Fix incorrect localhost usage in RPC mode. (#2619)
-* [NNVM] Fix incorrectly getting layout attribute as a tuple. (#2610)
-* [Relay] Fix mutating IF expression. (#2601)
-* [Tutorial] Fix downloaded file path. (#2590)
-* [Storage] Fix int32 overflow bug when input is big. (#2580)
-* [NNVM] Fix non-identity problem for FInplaceIdentity. (#2572)
-* [Golang] Fix compilation error. (#2558)
-* [Tensor Expression] Fix missing reduction init predicates. (#2495)
-* [Relay] Fix missing argument for NCHWc in Relay. (#2627)
-* [TOPI] Fix `Nms_ir` data race. (#2600)
-* Fix `compute_inline` with multiple outputs (#2934)
-* [TEXPR][PASS] Fix thread all reduce to avoid write-after-read hazard (#2937)
-* [FRONTEND][TENSORFLOW] bug fix for tensorflow official slim models. (#2864)
-* [FRONTEND][ONNX] Some bug fixes and Shape operator fixed for relay. (#2850)
-* Turn on `USE_SORT` by default (#2916)
-* [DOCKER] Upgrade ci-cpu to latest v0.50 (#2901)
-* [TESTS] Import script robustness (set -u) (#2896)
-* [Relay] Fix name of bias in testing.mlp (#2892)
-* [TESTS] Improve script robustness (#2893)
-* Add dense schedules to `__init__` for cpu (#2855)
-* [Apps] [howto_deploy] fix cxx-flags order and build directory (#2888)
-* [Relay] Add TVM_DLL for ANF/GNF conversion (#2883)
-* [Relay] Fix Relay ARM CPU depthwise spatial pack schedule alter op layout issue. (#2861)
-* Fix setting up hints for getaddrinfo (#2872)
-* Add missing sgx includes (#2878)
-* Fix error reporting for missing axis (#2835)
-* Fix an OrderedDict initialization bug. (#2862)
-* Fix Xcode 10 metal compile error (#2836)
-* tvmrpc: Fix includes (#2825)
-* Fix `init_proj.py`: Team ID expected (#2824)
-* [DOCKER] Fix git clone failure. (#2816)
-* Upgrade java style-check due to CVE-2019-9658 (#2817)
-* [Relay][Quantization] Fix duplicated simulated quantization (#2803)
-* [Bugfix] Repeat and tile bug fixed, relay tests added (#2804)
-* Fix caffe2 relay frontend (#2733)
-* Fix a bug in nnvm to relay converter. (#2756)
-* Ensure loop count is a constant before trying to unroll. (#2797)
-* xcode.py: Decode bytes before output (#2833)
-* [WIN] Fix a bug in `find_llvm` when specifying llvm-config (#2758)
-* [DLPACK] fix flaky ctypes support (#2759)
-* [Bugfix][Relay][Frontend] Fix bug in mxnet converter for `slice_like` (#2744)
-* [DOCS] Fix tutorial (#2724)
-* [TOPI][Relay] Fix default `out_dtype` for `conv2d_NCHWc` and Relay (#2702)
-* [Relay] fix checkwellform (#2705)
-* Fix prelu so it can be used on 2d input, and add a test (#2875)
-* [CODEGEN][OPENCL] Fix compile error about ternary expression. (#2821)
-* Fix Placeholder issue (#2834)
-* Fix makedirs() condition in contrib (#2942)
-* Add missing #!/bin/bash directive (#2951)
-* Bilinear resize bug fix from PR #2777 (#2857)
-* Fix `bias_add` default axis (#2829)
-* Remove empty ty.rs (#2958)
-* fix undefined reference to dlopen, etc (#2957)
-* Removed deprecated `std::unary_function` (#2962)
-* Add output format to ndk build func (#2999)
-* Fix java checkstyle version (#2998)
-* Fix relay invariant error message (#3011)
-* Fix for caffe2 nnvm frontend (#2996)
-* Fix rust resnet example (#3000)
-* Fix x||!x for comparisons in rewrite simplifier (#3029)
-* Fix BatchMatMulRel typerelation (#3032)
-* Update dmlc-core, fix default ctors of NodeEntry (#3017)
-* Fix Fuse (#3035)
-* Fix PostOrderVisit signature (#3048)
-* Fix winograd nnpack fp16 (#3046)
-* Fix some typos (#3063, #3112)
-* Fix `group_conv2d` unit test (#3113)
-* Fix bug in ONNX importer (#3084)
-* Fixing a doc nit (#3123)
-* Fix type code error for StringImm (#3050)
-* Fix bug of wrongly generated `device_map` (#2990)
-* use `unordered_map` instead of map in ANF (#3024)
-* Fix PRelu layout in Relay (#3013)
-* Minor addition to graph runtime debug (#3129)
-* Fix mali conv2d performance regression (#3131)
-* Fix dense autotvm template registration in ROCm (#3136)
-* Fix `conv2d_transpose` (#3138)
-* Fix python lint warnings (#3145)
-* Some fixes for golang latest version compiler (#3119, #3182)
-* Add more syncs to fix flaky test caused by `get_valid_counts` (#3151)
-* Fix AlterLayout Pass (#3155)
-* Fix a multithreaded bug in llvm LazyInitJIT (#3158)
-* Fix a tensorflow test bug. (#3165)
-* Fix concat for ARM (#3061)
-* Handle vectorize for LE statement (#3137)
-* Raise exception `group_conv2d_nchw` not supported (#3195)
-* Quick fix of VTA FPGA Toolchain Installation documentation (#3196)
-* Check file exists before removing it (#3178)
-* Fix a bug of flatten in ONNX to Relay converter (#3180)
-* Fix converter where initializers were not registered as nodes (#3143)
-* Fix bug in cast to bool (#3207)
-* Hotfix `build_module` creation (#3198)
-* Fix sort changing original input data issue (#3212)
-* Fix bug in vta runtime DepPop function (#3208)
-* Fix resize nearest with fractional scaling (#3244)
-* Fix `vta_conv2d` crash issue after change `vta_config.json` (#3213)
-* Fix a memory leak in OpManager (#3263)
-* PkgConfig cause crash in PYNQ board due to link library (#3257)
-* Fix Error messages in tflite.py (#3320)
-* Fix typos in docs and comments (#3309, #3376)
-* Bugfix min/max const canonicalize rule (#3386)
-* Return module from frontend for autotvm (#3401)
-* Fix constant and reshape in ONNX (#3387)
-* Default verilator location fix (#3324)
-* Fix autodiff for conditional expression (#3453)
-* Grammatical improvements to `tensor_expr_get_started` (#3330)
-* Fix AutoTVM data structure bug (#3462)
-* Fix MXNet RNN without providing state initialization as input (#3326)
-* Fix flaky test on topk and quantize pass (#3362)
-* Add VTA PYNQ `metal_test` bitstream program logic and fix compilation issue. (#3400)
-* Fix VTA function Vivado Compile Error. (#3375)
-* Fix VTA DRAM functionality issue. (#3278)
-* Fix reshape precompute and type error in ONNX frontend (#3230)
-* Fix interpreter argument conversion for tuples. (#3349)
-* Fix code generation for packed functions + tuples in VM (#3287)
-* Fix memory leak in Relay interpreter (#3448)
-* Fix x86 depthwise conv2d `alter_op_layout` (#3264)
-* Create closure object for GlobalVar (#3411)
-* Fix getting global var in prelude (#3405)
-* Fix rfactor bugs related to predicate and loop partition (#3382, #3444)
-* Fix the bug in AutoTVM where SimulatedAnnealingOptimizer sometimes finds useless candidates (#3413)
-* Fix name conflict in PartialEval (#3402)
-* Fix int bound analysis bug for modular (#3288)
-* Check arg positiveness for modular rules (#3279)
-* Fixes failure of `sum` and `all` on `axis=0` (#3422)
-* Fix package path in tflite test (#3427)
-* Fix Windows build (#3429)
-* Fix `LSTMBlockCell` in Tensorflow frontend (#3410)
-* TF fix where output index is ignored (#3622)
-* Runtime fix for custom datatypes (#3471)
-* Relay build module warnings (#3452)
-* Relay partial evaluator (#3482)
-* Pynq AutoTVM tracker (#3497, #3578)
-* A normal form test (#3525)
-* Lint issue (#3519, #3615)
-* Any shape testing (#3528)
-* Android `posix_memalign` (#3532)
-* Quantization `add_rewrite` and UnifyDTypeScale (#3534)
-* Bound inference fix (#3526)
-* Tensorflow NCHW data format (#3514)
-* First order gradient (#3550)
-* JS load module example (#3556)
-* Build error (#3552)
-* Relay VM debug statements (#3565)
-* C++ lambda expr (#3570)
-* Handling of tempdir if subprocess is killed (#3574)
-* Remove tabs in Chisel source (#3603)
-* Relay VM DataTypeObject (#3604)
-* Removing prints (#3616)
-* Average Pool2D Bug (#3607)
-* Missing header in `cuda_device_api.cc` (#3621)
-* Tensorflow frontend fix where `output_shape` is None (#3632)
-* Winograd accuracy fix (#3644)
-* Fix comment (#3646)
-* Zero-input op fix for recursive traversals (#3623)
-* Python 3.5 compatibility (#3675)
-* Fix infinite recursive `device_api.ext_dev` call in VTA. (#3843)
-* Fix `depth_mult` for TensorFlow frontend (#3676)
-* Fix database APIs for AutoTVM (#3821)
-* Fix axis of softmax in Keras (#3834)
-* Fix VTA TensorLoad module (#3841)
-* Fix inconsistent python/cpp API behavior for `if_then_else`, power (#3829)
-* Fix code comment of operators in ONNX frontend (#3830)
-* Added repo for llvm-9 to fix missing dependency issue (#3826)
-* Fix typo in Relay text parser (#3785)
-* Fix tvm const warnings (#3817)
-* Add gfx906 bc (#3808)
-* Fixed onnx test failures when run on a cpu backend (#3764)
-* Fix ArgBinder assert order (#3794)
-* Fix for NoneType Target for quantization (#3792)
-* Fix out-of-date quantization realize (#3790)
-* Fix Qnn concatenate InferType (#3779)
-* Fix dense tuning (#3768)
-* Fix `visit_pattern` in ExprMutator (#3769)
-* Fix Chisel Scala style (#3765)
-* Fix some pass docs (#3767)
-* Fix mistype in rpc tutorial (#3763)
-* Fix tvm.scan followed by tvm.compute segfault (#3723)
-* Fix the potential index overflow in where operator (#3751)
-* Revert `compile_cmd` kwarg name change (#3746)
-* Update tophub (#3752)
-* Fix typo in `ir_pass.h` (#3741)
-* Bug fix for VME Shell (#3737)
-* Fix missing apt https transport support (#3735)
-* Take zero-extent loops as NoOp and remove them (#3724)
-* Fix mxnet converter for hybridblock and add `div_sqrt_dim` (#3701)
-* Fix partial eval unit test name (#3719)
-* Fix conv2d schedule code (#3648, #3717)
-* Remove thread related headers (#3713)
-* Fix FunctionPass (#3712)
-* Export tvm::relay::OpRegistry::OpRegistry (#3711)
-* Fix Metal reinterpret (#3706)
-* Fix `gather_nd` in Relay (#3442)
-* Fix error in partial evaluator (#3693)
-* Align the naming rule for OpAttributeUnImplemented (#3695)
-* Enable the sparse schedule (#3651)
-* Fix typo names in Caffe2 frontend (#3685)
-* Make tests multi-process friendly. (#3683)
-* Fix typo in README.md (#3684)
-* Fix doc rendering (#3897)
-* Add test script starter command to document (#3993)
-* Add type solver unit tests for unifying quantified funcs (#3947)
-* Change Vivado install instructions to version 2018.3 (#4003)
-* Add a link to the defining network description of auto-tuning tutorial (#4023)
-* Additional MXNet Convolution and Deconvolution tests (#4026)
-* Adding support to check if an attribute is present or not without having to get the value (#3957)
-* Fix parser for cast. (#3873)
-* Fix operator fusion for multiple output (#3871)
-* Remove extern C wrapper for cuBLAS (#3877)
-* Fix int32 range overflow by using int64 (#3870)
-* Remove duplicate resize (#3902)
-* Fix blas cmake for mac os (#3898)
-* Add another MKL name alias for MKL installed through pypi (#3853)
-* Numpy compatible dtype inference for `tvm.convert` and `tvm.const` (#3861)
-* Remove incorrect check for LLVM in C codegen test (#3921)
-* Fix exponential blowup in interpreter (#3559)
-* Fix CUDA int8x4 vectorize (#3928)
-* Make buffer auto broadcast independent to the order of input args (#3956)
-* Fix benchmark layout in graph tuner (#3926)
-* Fix Android Demo LLVM version (#3962)
-* Cast filepath arguments to string (#3968)
-* Fixes "common" sub crate using nightly and master (#3965)
-* Changes to make tensorize work. These changes also fix the previously broken test. (#3981)
-* Tensorflow control flow has not been fully supported in the frontend converter. -* `topi.floor_div` is inconsistent with floor division semantic when result number is close to an integer. - - -### Depreciations -* Deprecating python2 support in the master branch and following release (v0.6). (#2994, #2986) -* NNVM is deprecated and will be removed in a future version. (#4333, #4368) - ## 0.5 This release features several major improvements. Some of the highlights are: Arbitrary bits quantization algorithm; High-level auto-differentiable programming IR -- Relay. @@ -1172,5 +279,3 @@ We also make major improvements in supporting new backends: ROCm for AMDGPUs and - DLPack integration support - AOT and module system - Basic code structure ready. - - diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py index c7de3a1dda31..0af1669ec364 100644 --- a/apps/benchmark/util.py +++ b/apps/benchmark/util.py @@ -34,8 +34,8 @@ def get_network(name, batch_size, dtype='float32'): Returns ------- - net: relay.Module - The relay function of network definition + net: nnvm.symbol + The NNVM symbol of network definition params: dict The random parameters for benchmark input_shape: tuple diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile index 57e484379a4e..8550a0ee1f00 100644 --- a/apps/bundle_deploy/Makefile +++ b/apps/bundle_deploy/Makefile @@ -16,15 +16,15 @@ # under the License. # Makefile Example to bundle TVM modules. - TVM_ROOT=$(shell cd ../..; pwd) +NNVM_PATH=nnvm DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core -PKG_CFLAGS = -std=c++14 -O2 -fPIC\ +PKG_CFLAGS = -std=c++14 -Oz -fPIC\ -I${TVM_ROOT}/include\ -I${DMLC_CORE}/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ -PKG_LDFLAGS = -pthread +PKG_LDFLAGS = -L${TVM_ROOT}/build build_dir := build @@ -33,7 +33,7 @@ test: $(build_dir)/demo $(build_dir)/bundle.so $(build_dir)/demo: demo.cc @mkdir -p $(@D) - $(CXX) $(PKG_CFLAGS) -o $@ $^ -ldl + $(CXX) $(PKG_CFLAGS) -o $@ $^ # Serialize our graph.json file. 
$(build_dir)/graph.json.cc: $(build_dir)/graph.json @@ -44,13 +44,13 @@ $(build_dir)/params.bin.cc: $(build_dir)/params.bin xxd -i $^ > $@ $(build_dir)/model.o $(build_dir)/graph.json $(build_dir)/params.bin: build_model.py - python3 $< -o $(build_dir) + python $< -o $(build_dir) # Build our bundle against the serialized bundle.cc API, the runtime.cc API, and # the serialized graph.json and params.bin $(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model.o $(build_dir)/graph.json.cc $(build_dir)/params.bin.cc @mkdir -p $(@D) - $(CXX) -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) + $(CXX) $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) -shared clean: rm -r $(build_dir) diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py index de9e73522ca2..dc4c14b47a01 100644 --- a/apps/bundle_deploy/build_model.py +++ b/apps/bundle_deploy/build_model.py @@ -18,7 +18,8 @@ import argparse import os -from tvm import relay +import nnvm.compiler +import nnvm.testing import tvm import logging @@ -33,24 +34,22 @@ def main(): dshape = (1, 3, 224, 224) from mxnet.gluon.model_zoo.vision import get_model block = get_model('mobilenet0.25', pretrained=True) - shape_dict = {'data': dshape} - mod, params = relay.frontend.from_mxnet(block, shape_dict) - func = mod["main"] - func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) - - with relay.build_config(opt_level=3): - graph, lib, params = relay.build( - func, 'llvm --system-lib', params=params) + net, params = nnvm.frontend.from_mxnet(block) + net = nnvm.sym.softmax(net) + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build( + net, 'llvm --system-lib', shape={'data': dshape}, params=params) + print(graph.symbol().debug_str()) build_dir = os.path.abspath(opts.out_dir) if not os.path.isdir(build_dir): os.makedirs(build_dir) lib.save(os.path.join(build_dir, 'model.o')) with open(os.path.join(build_dir, 'graph.json'), 'w') as f_graph_json: - f_graph_json.write(graph) + f_graph_json.write(graph.json()) with open(os.path.join(build_dir, 'params.bin'), 'wb') as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(nnvm.compiler.save_param_dict(params)) if __name__ == '__main__': diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc index 14f0b7edc301..61169f17cf71 100644 --- a/apps/bundle_deploy/bundle.cc +++ b/apps/bundle_deploy/bundle.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at
- *
+ *
 * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -26,9 +26,7 @@ extern unsigned int build_graph_json_len;
extern unsigned char build_params_bin[];
extern unsigned int build_params_bin_len;
-#define TVM_BUNDLE_FUNCTION __attribute__((visibility("default")))
-
-extern "C" {
+#define TVM_BUNDLE_FUNCTION __attribute__((visibility("default")))

extern "C" TVM_BUNDLE_FUNCTION void *tvm_runtime_create() {
  const std::string json_data(&build_graph_json[0],
@@ -66,4 +64,3 @@ TVM_BUNDLE_FUNCTION void tvm_runtime_get_output(void *handle, int index,
  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("get_output")(
      index, reinterpret_cast<DLTensor *>(tensor));
}
-}
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
index 7a116e89fa88..f1c2ba2f54ec 100644
--- a/apps/bundle_deploy/runtime.cc
+++ b/apps/bundle_deploy/runtime.cc
@@ -25,7 +25,7 @@
 #include "../../src/runtime/c_runtime_api.cc"
 #include "../../src/runtime/cpu_device_api.cc"
 #include "../../src/runtime/workspace_pool.cc"
-#include "../../src/runtime/library_module.cc"
+#include "../../src/runtime/module_util.cc"
 #include "../../src/runtime/module.cc"
 #include "../../src/runtime/registry.cc"
 #include "../../src/runtime/file_util.cc"
@@ -33,5 +33,5 @@
 #include "../../src/runtime/thread_pool.cc"
 #include "../../src/runtime/ndarray.cc"
 #include "../../src/runtime/object.cc"
-#include "../../src/runtime/system_library.cc"
+#include "../../src/runtime/system_lib_module.cc"
 #include "../../src/runtime/graph/graph_runtime.cc"
diff --git a/apps/extension/Makefile b/apps/extension/Makefile
index 1680a003e06f..14c71d92ca20 100644
--- a/apps/extension/Makefile
+++ b/apps/extension/Makefile
@@ -20,7 +20,8 @@ TVM_ROOT=$(shell cd ../..; pwd)
PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 -I${TVM_ROOT}/include\
 -I${TVM_ROOT}/3rdparty/dmlc-core/include\
- -I${TVM_ROOT}/3rdparty/dlpack/include
+ -I${TVM_ROOT}/3rdparty/dlpack/include\
+ -I${TVM_ROOT}/3rdparty/HalideIR/src
PKG_LDFLAGS =-L${TVM_ROOT}/build
UNAME_S := $(shell uname -s)
diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py
index 31b149eb4913..38d511eeb617 100644
--- a/apps/extension/python/tvm_ext/__init__.py
+++ b/apps/extension/python/tvm_ext/__init__.py
@@ -38,9 +38,18 @@ def load_lib():
ivec_create = tvm.get_global_func("tvm_ext.ivec_create")
ivec_get = tvm.get_global_func("tvm_ext.ivec_get")
-@tvm.register_object("tvm_ext.IntVector")
-class IntVec(tvm.Object):
+class IntVec(object):
    """Example for using extension class in c++ """
+    _tvm_tcode = 17
+
+    def __init__(self, handle):
+        self.handle = handle
+
+    def __del__(self):
+        # You can also call your own customized
+        # deleter if you can free it via your own FFI.
+        tvm.nd.free_extension_handle(self.handle, self.__class__._tvm_tcode)
+
    @property
    def _tvm_handle(self):
        return self.handle.value
@@ -48,26 +57,32 @@ def _tvm_handle(self):
    def __getitem__(self, idx):
        return ivec_get(self, idx)
+# Register IntVec extension on python side.
+tvm.register_extension(IntVec, IntVec)
+
nd_create = tvm.get_global_func("tvm_ext.nd_create")
nd_add_two = tvm.get_global_func("tvm_ext.nd_add_two")
-nd_get_additional_info = tvm.get_global_func("tvm_ext.nd_get_additional_info")
+nd_get_addtional_info = tvm.get_global_func("tvm_ext.nd_get_addtional_info")
-@tvm.register_object("tvm_ext.NDSubClass")
class NDSubClass(tvm.nd.NDArrayBase):
    """Example for subclassing TVM's NDArray infrastructure.
    By inheriting TVM's NDArray, external libraries could
    leverage TVM's FFI without any modification.
    """
+    # Should be consistent with the type-trait set in the backend
+    _array_type_code = 1
    @staticmethod
-    def create(additional_info):
-        return nd_create(additional_info)
+    def create(addtional_info):
+        return nd_create(addtional_info)
    @property
-    def additional_info(self):
-        return nd_get_additional_info(self)
+    def addtional_info(self):
+        return nd_get_addtional_info(self)
    def __add__(self, other):
        return nd_add_two(self, other)
+
+tvm.register_extension(NDSubClass, NDSubClass)
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index d57b41b1215b..8655fa7d0c30 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- *
+ *
 *   http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -29,6 +29,24 @@
 #include
 #include
+namespace tvm_ext {
+using IntVector = std::vector<int>;
+class NDSubClass;
+}  // namespace tvm_ext

+namespace tvm {
+namespace runtime {
+template<>
+struct extension_type_info<tvm_ext::IntVector> {
+  static const int code = 17;
+};
+template<>
+struct array_type_info<tvm_ext::NDSubClass> {
+  static const int code = 1;
+};
+}  // namespace runtime
+}  // namespace tvm
+
 using namespace tvm;
 using namespace tvm::runtime;
@@ -39,95 +57,71 @@ namespace tvm_ext {
 * To use this extension, an external library should
 *
 * 1) Inherit TVM's NDArray and NDArray container,
+ *    and define the trait `array_type_info` for this class.
 *
- * 2) Follow the new object protocol to define new NDArray as a reference class.
+ * 2) Define a constructor in the inherited class that accepts
+ *    a pointer to TVM's Container, which is nullable.
 *
- * 3) On Python frontend, inherit `tvm.nd.NDArray`,
- *    register the type using tvm.register_object
+ * 3) On Python frontend, inherit `tvm.nd.NDArrayBase`,
+ *    define the class attribute `_array_type_code` consistent to
+ *    the C++ type trait, and register the subclass using `tvm.register_extension`.
 */
class NDSubClass : public tvm::runtime::NDArray {
 public:
  class SubContainer : public NDArray::Container {
   public:
-    SubContainer(int additional_info) :
-        additional_info_(additional_info) {
-      type_index_ = SubContainer::RuntimeTypeIndex();
+    SubContainer(int addtional_info) :
+        addtional_info_(addtional_info) {
+      array_type_code_ = array_type_info<NDSubClass>::code;
    }
-    int additional_info_{0};
-
-    static constexpr const uint32_t _type_index = TypeIndex::kDynamic;
-    static constexpr const char* _type_key = "tvm_ext.NDSubClass";
-    TVM_DECLARE_FINAL_OBJECT_INFO(SubContainer, NDArray::Container);
+    static bool Is(NDArray::Container *container) {
+      SubContainer *c = static_cast<SubContainer*>(container);
+      return c->array_type_code_ == array_type_info<NDSubClass>::code;
+    }
+    int addtional_info_{0};
  };
-
-  static void SubContainerDeleter(Object* obj) {
-    auto* ptr = static_cast<SubContainer*>(obj);
-    delete ptr;
+  NDSubClass(NDArray::Container *container) {
+    if (container == nullptr) {
+      data_ = nullptr;
+      return;
+    }
+    CHECK(SubContainer::Is(container));
+    container->IncRef();
+    data_ = container;
  }
-
-  NDSubClass() {}
-  explicit NDSubClass(ObjectPtr<Object> n) : NDArray(n) {}
-  explicit NDSubClass(int additional_info) {
-    SubContainer* ptr = new SubContainer(additional_info);
-    ptr->SetDeleter(SubContainerDeleter);
-    data_ = GetObjectPtr<Object>(ptr);
+  ~NDSubClass() {
+    this->reset();
  }
-
  NDSubClass AddWith(const NDSubClass &other) const {
-    SubContainer *a = static_cast<SubContainer*>(get_mutable());
-    SubContainer *b = static_cast<SubContainer*>(other.get_mutable());
+    SubContainer *a = static_cast<SubContainer*>(data_);
+    SubContainer *b = static_cast<SubContainer*>(other.data_);
    CHECK(a != nullptr && b != nullptr);
-    return NDSubClass(a->additional_info_ + b->additional_info_);
+    return NDSubClass(new SubContainer(a->addtional_info_ + b->addtional_info_));
  }
  int get_additional_info() const {
-    SubContainer *self = static_cast<SubContainer*>(get_mutable());
+    SubContainer *self = static_cast<SubContainer*>(data_);
    CHECK(self != nullptr);
-    return self->additional_info_;
+    return self->addtional_info_;
  }
-  using ContainerType = SubContainer;
-};
-
-TVM_REGISTER_OBJECT_TYPE(NDSubClass::SubContainer);
-
-/*!
- * \brief Introduce additional extension data structures
- *        by sub-classing TVM's object system.
- */
-class IntVectorObj : public Object {
- public:
-  std::vector<int> vec;
-
-  static constexpr const char* _type_key = "tvm_ext.IntVector";
-  TVM_DECLARE_FINAL_OBJECT_INFO(IntVectorObj, Object);
};
-
-/*!
- * \brief Int vector reference class.
- */
-class IntVector : public ObjectRef {
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(IntVector, ObjectRef, IntVectorObj);
-};
-
-TVM_REGISTER_OBJECT_TYPE(IntVectorObj);
-
}  // namespace tvm_ext

namespace tvm_ext {
+TVM_REGISTER_EXT_TYPE(IntVector);
+
TVM_REGISTER_GLOBAL("tvm_ext.ivec_create")
.set_body([](TVMArgs args, TVMRetValue *rv) {
-    auto n = tvm::runtime::make_object<IntVectorObj>();
+    IntVector vec;
    for (int i = 0; i < args.size(); ++i) {
-      n->vec.push_back(args[i].operator int());
+      vec.push_back(args[i].operator int());
    }
-    *rv = IntVector(n);
+    *rv = vec;
  });

TVM_REGISTER_GLOBAL("tvm_ext.ivec_get")
.set_body([](TVMArgs args, TVMRetValue *rv) {
-    IntVector p = args[0];
-    *rv = p->vec[args[1].operator int()];
+    *rv = args[0].AsExtension<IntVector>()[args[1].operator int()];
  });
@@ -154,10 +148,8 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
TVM_REGISTER_GLOBAL("tvm_ext.nd_create")
.set_body([](TVMArgs args, TVMRetValue *rv) {
-  int additional_info = args[0];
-  *rv = NDSubClass(additional_info);
-  CHECK_EQ(rv->type_code(), kNDArrayContainer);
-
+  int addtional_info = args[0];
+  *rv = NDSubClass(new NDSubClass::SubContainer(addtional_info));
});

TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two")
@@ -167,7 +159,7 @@ TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two")
    *rv = a.AddWith(b);
  });
-TVM_REGISTER_GLOBAL("tvm_ext.nd_get_additional_info")
+TVM_REGISTER_GLOBAL("tvm_ext.nd_get_addtional_info")
.set_body([](TVMArgs args, TVMRetValue *rv) {
    NDSubClass a = args[0];
    *rv = a.get_additional_info();
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index a5e7e0f69456..e481e82fefb3 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -87,17 +87,16 @@ def check_llvm():
def test_nd_subclass():
-    a = tvm_ext.NDSubClass.create(additional_info=3)
-    b = tvm_ext.NDSubClass.create(additional_info=5)
-    assert isinstance(a, tvm_ext.NDSubClass)
+    a = tvm_ext.NDSubClass.create(addtional_info=3)
+    b = tvm_ext.NDSubClass.create(addtional_info=5)
    c = a + b
    d = a + a
    e = b + b
-    assert(a.additional_info == 3)
-    assert(b.additional_info == 5)
-    assert(c.additional_info == 8)
-    assert(d.additional_info == 6)
-    assert(e.additional_info == 10)
+    assert(a.addtional_info == 3)
+    assert(b.addtional_info == 5)
+    assert(c.addtional_info == 8)
+    assert(d.addtional_info == 6)
+    assert(e.addtional_info == 10)

if __name__ == "__main__":
diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
index a260e89bc042..5c4a6d6e89da 100644
--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
@@ -17,6 +17,7 @@
# Makefile Example to deploy TVM modules.
TVM_ROOT=$(shell cd ../..; pwd) +NNVM_PATH=nnvm DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core PKG_CFLAGS = -std=c++11 -O2 -fPIC\ @@ -24,7 +25,7 @@ PKG_CFLAGS = -std=c++11 -O2 -fPIC\ -I${DMLC_CORE}/include\ -I${TVM_ROOT}/3rdparty/dlpack/include\ -PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -pthread +PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread .PHONY: clean all @@ -38,7 +39,7 @@ lib/libtvm_runtime_pack.o: tvm_runtime_pack.cc # The code library built by TVM lib/test_addone_sys.o: prepare_test_libs.py @mkdir -p $(@D) - python3 prepare_test_libs.py + python prepare_test_libs.py # Deploy using the all in one TVM package library lib/cpp_deploy_pack: cpp_deploy.cc lib/test_addone_sys.o lib/libtvm_runtime_pack.o diff --git a/apps/rocm_rpc/Makefile b/apps/rocm_rpc/Makefile index 36eb41596be8..8d30fb6ef780 100644 --- a/apps/rocm_rpc/Makefile +++ b/apps/rocm_rpc/Makefile @@ -19,6 +19,7 @@ ROCM_PATH=/opt/rocm TVM_ROOT=$(shell cd ../..; pwd) +NNVM_PATH=nnvm DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core PKG_CFLAGS = -std=c++11 -O2 -fPIC\ diff --git a/apps/sgx/README.md b/apps/sgx/README.md index ad87be4e93db..13f72b0629cf 100644 --- a/apps/sgx/README.md +++ b/apps/sgx/README.md @@ -49,7 +49,7 @@ mkdir build && cd build cmake .. -DUSE_LLVM=ON -DUSE_SGX=/opt/sgxsdk -DRUST_SGX_SDK=/opt/rust-sgx-sdk make -j4 cd .. -pip install -e python -e topi/python +pip install -e python -e topi/python -e nnvm/python cd apps/sgx ``` diff --git a/apps/sgx/enclave/src/build_model.py b/apps/sgx/enclave/src/build_model.py index dff571668422..5a6b10cfcd38 100644 --- a/apps/sgx/enclave/src/build_model.py +++ b/apps/sgx/enclave/src/build_model.py @@ -20,8 +20,8 @@ import os from os import path as osp -from tvm import relay -from tvm.relay import testing +import nnvm.compiler +import nnvm.testing import tvm @@ -30,13 +30,14 @@ def main(): parser.add_argument('-o', '--out-dir', default='.') opts = parser.parse_args() + # from tutorials/nnvm_quick_start.py dshape = (1, 3, 224, 224) - net, params = relay.testing.resnet.get_workload( + net, params = nnvm.testing.resnet.get_workload( layers=18, batch_size=dshape[0], image_shape=dshape[1:]) - with relay.build_config(opt_level=3): - graph, lib, params = relay.build( - net, 'llvm --system-lib', params=params) + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build( + net, 'llvm --system-lib', shape={'data': dshape}, params=params) build_dir = osp.abspath(opts.out_dir) if not osp.isdir(build_dir): @@ -44,9 +45,9 @@ def main(): lib.save(osp.join(build_dir, 'model.bc')) with open(osp.join(build_dir, 'graph.json'), 'w') as f_graph_json: - f_graph_json.write(graph) + f_graph_json.write(graph.json()) with open(osp.join(build_dir, 'params.bin'), 'wb') as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(nnvm.compiler.save_param_dict(params)) if __name__ == '__main__': diff --git a/cmake/config.cmake b/cmake/config.cmake index 42c19b5277be..dbad944c5459 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -175,10 +175,6 @@ set(USE_SORT ON) # Whether use TensorRT # /path/to/tensorrt that contains include and lib dirs set(USE_TENSORRT OFF) - -# Whether use MKL-DNN (DNNL) codegen -set(USE_DNNL_CODEGEN OFF) - # Build ANTLR parser for Relay text format # Possible values: # - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) diff --git a/conda/tvm/build.sh b/conda/tvm/build.sh index 358e0b91798a..494f90f0afa0 100644 --- a/conda/tvm/build.sh +++ b/conda/tvm/build.sh @@ -6,9 +6,9 @@ # to you 
under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,3 +26,7 @@ cd .. cd topi/python $PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt cd ../.. + +cd nnvm/python +$PYTHON setup.py install --single-version-externally-managed --record=/tmp/record.txt +cd ../.. diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index 3ce0f5e4726c..12f9a9698d70 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -48,6 +48,7 @@ test: imports: - tvm - topi + - nnvm requires: - pytest - scipy diff --git a/dmlc_tvm_commit_id.txt b/dmlc_tvm_commit_id.txt index 86cce8c71a91..461bca727d0a 100644 --- a/dmlc_tvm_commit_id.txt +++ b/dmlc_tvm_commit_id.txt @@ -1 +1 @@ -475158f6285c63b42efe574cb9ba8afec24261be +6e085b40328a99ec59a7e4ff50017edab31eb553 \ No newline at end of file diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index 13d1a2175b88..4d52411444f7 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -70,5 +70,5 @@ RUN cd /usr && \ make -j10 # Environment variables -ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/vta/python:${PYTHONPATH} +ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH} ENV ANDROID_HOME=/opt/android-sdk-linux/ diff --git a/docker/Dockerfile.demo_cpu b/docker/Dockerfile.demo_cpu index 6700579bc41b..63dc3a15d088 100644 --- a/docker/Dockerfile.demo_cpu +++ b/docker/Dockerfile.demo_cpu @@ -30,4 +30,4 @@ COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh RUN bash /install/install_tvm_cpu.sh # Environment variables -ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/vta/python:${PYTHONPATH} +ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH} diff --git a/docker/Dockerfile.demo_gpu b/docker/Dockerfile.demo_gpu index 0591050c5270..9be8c00f941c 100644 --- a/docker/Dockerfile.demo_gpu +++ b/docker/Dockerfile.demo_gpu @@ -28,7 +28,7 @@ COPY install/install_tvm_gpu.sh /install/install_tvm_gpu.sh RUN bash /install/install_tvm_gpu.sh # Environment variables -ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/vta/python:${PYTHONPATH} +ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH} ENV PATH=/usr/local/nvidia/bin:${PATH} ENV PATH=/usr/local/cuda/bin:${PATH} ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl index bf27eef862f5..7afb2243cb52 100644 --- a/docker/Dockerfile.demo_opencl +++ b/docker/Dockerfile.demo_opencl @@ -76,6 +76,7 @@ RUN mkdir -p ${TVM_BUILD_DIR} && \ make -j6 RUN echo "Building Python package" -ENV PYTHONPATH=${TVM_HOME}/python:${TVM_HOME}/topi/python:${PYTHONPATH} +ENV PYTHONPATH=${TVM_HOME}/python:${TVM_HOME}/topi/python:${TVM_HOME}/nnvm/python:${PYTHONPATH} RUN cd ${TVM_HOME}/python && python3 setup.py install --user RUN cd ${TVM_HOME}/topi/python && python3 setup.py install --user +RUN cd ${TVM_HOME}/nnvm/python && python3 setup.py install --user diff --git a/docker/install/ubuntu_install_iverilog.sh 
b/docker/install/ubuntu_install_iverilog.sh new file mode 100755 index 000000000000..da20730d491e --- /dev/null +++ b/docker/install/ubuntu_install_iverilog.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +apt-get install -y --no-install-recommends make bison flex +wget -q ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz +tar xf verilog-10.1.tar.gz +cd verilog-10.1 +./configure --prefix=/usr +make install -j8 +cd .. +rm -rf verilog-10.1 verilog-10.1.tar.gz diff --git a/docs/Doxyfile b/docs/Doxyfile index b96678ca1696..c52e14867a82 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -770,7 +770,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT = include/tvm topi/include/topi vta/include/vta +INPUT = include/tvm topi/include/topi nnvm/include/nnvm vta/include/vta # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1991,7 +1991,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DMLC_USE_CXX11 TVM_DLL= __attribute__(x)= +PREDEFINED = DMLC_USE_CXX11 TVM_DLL= NNVM_DLL= __attribute__(x)= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst index 5e8778502457..f6a9ff5f6aa8 100644 --- a/docs/api/python/autotvm.rst +++ b/docs/api/python/autotvm.rst @@ -83,6 +83,9 @@ tvm.autotvm.task .. automodule:: tvm.autotvm.task.topi_integration :members: +.. automodule:: tvm.autotvm.task.nnvm_integration + :members: + tvm.autotvm.record ~~~~~~~~~~~~~~~~~~ .. automodule:: tvm.autotvm.record diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index 7a8566eec7ba..2773fefedee3 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -40,5 +40,6 @@ Python API dev topi vta/index + nnvm/index hybrid relay/index diff --git a/docs/api/python/nnvm/compiler.rst b/docs/api/python/nnvm/compiler.rst new file mode 100644 index 000000000000..4cf1b083df60 --- /dev/null +++ b/docs/api/python/nnvm/compiler.rst @@ -0,0 +1,40 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +nnvm.compiler +------------- + +.. automodule:: nnvm.compiler + +.. autofunction:: nnvm.compiler.build + +.. autofunction:: nnvm.compiler.build_config + +.. autofunction:: nnvm.compiler.save_param_dict + +.. autofunction:: nnvm.compiler.load_param_dict + +.. autofunction:: nnvm.compiler.optimize + +.. automodule:: nnvm.compiler.graph_util + :members: + +.. automodule:: nnvm.compiler.graph_attr + :members: + +.. automodule:: nnvm.compiler.compile_engine + :members: diff --git a/docs/api/python/nnvm/frontend.rst b/docs/api/python/nnvm/frontend.rst new file mode 100644 index 000000000000..ca8c4088fd08 --- /dev/null +++ b/docs/api/python/nnvm/frontend.rst @@ -0,0 +1,33 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +nnvm.frontend +------------- + +.. automodule:: nnvm.frontend + +.. autofunction:: nnvm.frontend.from_mxnet + +.. autofunction:: nnvm.frontend.from_onnx + +.. autofunction:: nnvm.frontend.from_coreml + +.. autofunction:: nnvm.frontend.from_keras + +.. autofunction:: nnvm.frontend.from_tensorflow + +.. autofunction:: nnvm.frontend.from_darknet diff --git a/docs/api/python/nnvm/graph.rst b/docs/api/python/nnvm/graph.rst new file mode 100644 index 000000000000..e9f667e416e8 --- /dev/null +++ b/docs/api/python/nnvm/graph.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +nnvm.graph +---------- +.. automodule:: nnvm.graph + +.. autofunction:: nnvm.graph.create + +.. autoclass:: nnvm.graph.Graph + :members: diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst new file mode 100644 index 000000000000..493a8fc1a772 --- /dev/null +++ b/docs/api/python/nnvm/index.rst @@ -0,0 +1,31 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +NNVM API +======== + +This document contains the python API to NNVM compiler toolchain. + +.. toctree:: + :maxdepth: 2 + + compiler + frontend + symbol + graph + top + testing diff --git a/docs/api/python/nnvm/symbol.rst b/docs/api/python/nnvm/symbol.rst new file mode 100644 index 000000000000..46dcac97ddf9 --- /dev/null +++ b/docs/api/python/nnvm/symbol.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +nnvm.symbol +----------- +.. automodule:: nnvm.symbol + +.. autoclass:: nnvm.symbol.Symbol + :members: + +.. autoclass:: nnvm.symbol.Variable + +.. autofunction:: nnvm.symbol.Group diff --git a/docs/api/python/nnvm/testing.rst b/docs/api/python/nnvm/testing.rst new file mode 100644 index 000000000000..9ee72d41b2eb --- /dev/null +++ b/docs/api/python/nnvm/testing.rst @@ -0,0 +1,31 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +nnvm.testing +------------ + +.. automodule:: nnvm.testing + +.. autofunction:: nnvm.testing.ctx_list + +nnvm.testing.check_computation +------------------------------ + +.. automodule:: nnvm.testing.check_computation + :members: + +.. 
include:: testing_new_ops.rst
diff --git a/docs/api/python/nnvm/testing_new_ops.rst b/docs/api/python/nnvm/testing_new_ops.rst
new file mode 100644
index 000000000000..bf80b526a5d5
--- /dev/null
+++ b/docs/api/python/nnvm/testing_new_ops.rst
@@ -0,0 +1,152 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+..  http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+Testing new operations
+----------------------
+
+When adding new operations, it is a good idea to test them. Testing
+should be done with the function ``nnvm.testing.check_function``. You
+should provide it with the symbol representing the result of a
+computation and a reference numpy implementation. By default, it will
+also check analytical gradients against numerical gradients if
+analytical gradients are implemented for your operation. You can also
+pass a reference implementation for the gradients, but numerical
+gradients will still be checked. Numerical gradient checking may be
+switched off explicitly, but doing this is generally not a good idea.
+Here is an example testing the logarithm operation:
+
+.. code:: python
+
+    import numpy as np
+    import nnvm
+    import nnvm.symbol as sym
+    from nnvm.testing.check_computation import check_function
+
+    x = sym.Variable("x")
+    y = sym.log(x)
+
+    def forward(x):
+        return np.log(x)
+
+    def backward(head_grads, x):
+        return [1. / x * head_grads]
+
+    dtype = "float32"
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above, you might get an ``AssertionError`` in rare
+cases. That’s why it is recommended to run new tests many times.
+
+.. code:: python
+
+    for _ in range(10000):
+        check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above then sooner or later you will get an exception
+which may look like this:
+
+.. code-block:: text
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [
+        ...
+    ]
+    numerical grad = [
+        ...
+    ]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 308.50885009765625 > 0.01*55.42562584220407 + 0.1*2167.70703125
+
+It means that either you have a mistake in the ``FGradient`` function or
+the numerical error is too high. Generally, if you look at the printed
+gradients and see that they differ only slightly or just in a single
+position, then it is a numerical error. But if the gradients look
+completely different, especially if many corresponding positions have
+different signs, then it must be something wrong with the analytical
+gradient implementation.
+
+Then try to make this error reproducible, and also try to reduce the
+shape of inputs, but not too much; a vector of 10 elements is a
+reasonable choice. You also won’t need the reference functions ``forward``
+and ``backward``, and restricting the number of targets might also be a
+good idea. Since the error may manifest itself only in rare cases, you
+might want to run it in a loop.
+
+.. code:: python
+
+    shape = {'x': (10,)}
+    np.random.seed(42)
+
+    for _ in range(1000):
+        check_function(y, in_range=(0.001, 2.0), dtype=dtype, shape=shape,
+                       numerical_grads=True, only_targets=['llvm'])
+
+Running this code will result in the following:
+
+.. code-block:: text
+
+    check_function failed while checking gradients numerically, here is the main graph
+    Graph(%x, %head_grads_0) {
+      %x, shape=[10], dtype=0
+      %head_grads_0, shape=[10], dtype=0
+      %1 = log(%x), shape=[10], dtype=0
+      %3 = elemwise_div(%head_grads_0, %x), shape=[10], dtype=0
+      ret %1, %3, %head_grads_0
+    }
+    graph_attr_keys = [layout_inputs, dtype_num_unknown_nodes, dtype, shape_num_unknown_nodes, shape]
+
+    Generated inputs:
+    {'x': array([2.5660574e-01, 1.5313280e+00, 1.0232578e-03, 8.3371508e-01,
+           1.0454979e+00, 1.1021420e-01, 1.9461832e+00, 4.5302454e-01,
+           6.0909325e-01, 6.0858107e-01], dtype=float32), 'head_grads_0': array([0.4616029 , 0.00394617, 1.4589603 , 1.9337242 , 0.44936267,
+           1.3264314 , 1.4840508 , 1.6970023 , 0.84583575, 0.60655886],
+          dtype=float32)}
+
+    ...
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [1.7988799e+00 2.5769596e-03 1.4257993e+03 2.3194065e+00 4.2980734e-01
+     1.2035031e+01 7.6254421e-01 3.7459390e+00 1.3886802e+00 9.9667716e-01]
+    numerical grad = [1.7948151e+00 1.9073486e-03 9.9268610e+02 2.3174286e+00 4.2915344e-01
+     1.1980057e+01 7.6198578e-01 3.7412643e+00 1.3866425e+00 9.9563599e-01]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 433.11322021484375 > 0.01*3.1622776601683795 + 0.1*992.7716674804688
+
+In this case the largest difference is in the 2nd position (starting
+from 0), which corresponds to input value ``1.0232578e-03``. This value
+is too close to the singularity, so the numerical derivative gets too
+imprecise. The solution is to shrink the range for ``x``; here, for
+example, ``(0.002, 2.0)`` turned out to be enough. Don’t forget to run
+lots of tests, so that other people don’t get false positives.
+
+.. code:: python
+
+    for _ in range(100):
+        check_function(y, in_range={x: (0.002, 2.0)}, dtype=dtype, shape=(1, 3, 32, 32),
+                       numerical_grads=True, only_targets=['llvm'])
+
+If you need more precise control over which values get passed to the
+checking function, you can use ``values={x: ...}``:
+
+.. code:: python
+
+    x_val = np.array([1.2594858e+00, 1.0960974e-01, 1.4975418e+00, 6.3585603e-01,
+                      1.2692513e-03, 1.0227472e+00, 9.4656967e-02, 5.5306298e-01,
+                      1.4142460e+00, 1.2631655e-01], dtype=np.float32)
+    check_function(y, values={x: x_val}, dtype=dtype, shape=shape,
+                   numerical_grads=True, only_targets=['llvm'])
diff --git a/docs/api/python/nnvm/top.rst b/docs/api/python/nnvm/top.rst
new file mode 100644
index 000000000000..ff946e7639c8
--- /dev/null
+++ b/docs/api/python/nnvm/top.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+..  http://www.apache.org/licenses/LICENSE-2.0
+
+..
Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+nnvm.top
+--------
+.. automodule:: nnvm.top
+
+.. autofunction:: register_compute
+
+.. autofunction:: register_schedule
+
+.. autofunction:: register_pattern
+
+
+.. autoclass:: nnvm.top.AttrDict
+   :members:
diff --git a/docs/conf.py b/docs/conf.py
index a098ad4e4d55..c4410e5864f9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -43,6 +43,7 @@
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
sys.path.insert(0, os.path.join(curr_path, '../python/'))
sys.path.insert(0, os.path.join(curr_path, '../topi/python'))
+sys.path.insert(0, os.path.join(curr_path, '../nnvm/python'))
sys.path.insert(0, os.path.join(curr_path, '../vta/python'))
# -- General configuration ------------------------------------------------
@@ -59,6 +60,7 @@
    '.md': CommonMarkParser
}
os.environ['TVM_BUILD_DOC'] = '1'
+os.environ['NNVM_BUILD_DOC'] = '1'
# Version information.
import tvm
version = tvm.__version__
diff --git a/docs/deploy/android.md b/docs/deploy/android.md
index 788ab412db62..daf023c38042 100644
--- a/docs/deploy/android.md
+++ b/docs/deploy/android.md
@@ -20,15 +20,18 @@
## Build model for Android Target
-Relay compilation of model for android target could follow same approach like android_rpc.
-The code below will save the compilation output which is required on android target.
+NNVM compilation of a model for an Android target can follow the same approach as android_rpc.
+
+A reference example can be found at [chainer-nnvm-example](https://github.com/tkat0/chainer-nnvm-example)
+
+The above example runs the compiled model directly on an RPC target. The modification below at [run_mobile.py](https://github.com/tkat0/chainer-nnvm-example/blob/5b97fd4d41aa4dde4b0aceb0be311054fb5de451/run_mobile.py#L64) will save the compilation output that is required on the Android target.
```
lib.export_library("deploy_lib.so", ndk.create_shared)
with open("deploy_graph.json", "w") as fo:
    fo.write(graph.json())
with open("deploy_param.params", "wb") as fo:
-    fo.write(relay.save_param_dict(params))
+    fo.write(nnvm.compiler.save_param_dict(params))
```
deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target.
diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst
index 9a30b96ca66e..db50865008a2 100644
--- a/docs/deploy/index.rst
+++ b/docs/deploy/index.rst
@@ -67,4 +67,5 @@ target device without relying on RPC. see the following resources on how to do s
   cpp_deploy
   android
+   nnvm
   integrate
diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
new file mode 100644
index 000000000000..650912231b12
--- /dev/null
+++ b/docs/deploy/nnvm.md
@@ -0,0 +1,196 @@
+ + + + + + + + + + + + + + + + +
+# Deploy NNVM Modules
+NNVM compiled modules are fully embedded in the TVM runtime as long as the ```GRAPH_RUNTIME``` option
+is enabled in the tvm runtime.
+
+
+In a nutshell, we will need three items to deploy a compiled module.
+Check out our tutorials on getting started with the NNVM compiler for more details.
+
+- The graph json data which contains the execution graph.
+- The tvm module library of compiled functions.
+- The parameter blobs for stored parameters.
+
+We can then use TVM's runtime API to deploy the compiled module.
+Here is an example in python.
+
+```python
+import tvm
+
+# tvm module for compiled functions.
+loaded_lib = tvm.module.load("deploy.so")
+# json graph
+loaded_json = open(temp.relpath("deploy.json")).read()
+# parameters in binary
+loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
+
+fcreate = tvm.get_global_func("tvm.graph_runtime.create")
+ctx = tvm.gpu(0)
+gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
+set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
+set_input("x", tvm.nd.array(x_np))
+gmodule["load_params"](loaded_params)
+run()
+out = tvm.nd.empty(shape)
+get_output(0, out)
+print(out.asnumpy())
+```
+
+An example in C++:
+```cpp
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+
+int main()
+{
+    // tvm module for compiled functions
+    tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile("deploy.so");
+
+    // json graph
+    std::ifstream json_in("deploy.json", std::ios::in);
+    std::string json_data((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
+    json_in.close();
+
+    // parameters in binary
+    std::ifstream params_in("deploy.params", std::ios::binary);
+    std::string params_data((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
+    params_in.close();
+
+    // parameters need to be TVMByteArray type to indicate the binary data
+    TVMByteArray params_arr;
+    params_arr.data = params_data.c_str();
+    params_arr.size = params_data.length();
+
+    int dtype_code = kDLFloat;
+    int dtype_bits = 32;
+    int dtype_lanes = 1;
+    int device_type = kDLCPU;
+    int device_id = 0;
+
+    // get global function module for graph runtime
+    tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(json_data, mod_syslib, device_type, device_id);
+
+    DLTensor* x;
+    int in_ndim = 4;
+    int64_t in_shape[4] = {1, 3, 224, 224};
+    TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
+    // load image data saved in binary
+    const std::string data_filename = "cat.bin";
+    std::ifstream data_fin(data_filename, std::ios::binary);
+    if(!data_fin) throw std::runtime_error("Could not open: " + data_filename);
+    data_fin.read(static_cast<char*>(x->data), 3 * 224 * 224 * 4);
+
+    // get the function from the module(set input data)
+    tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
+    set_input("data", x);
+
+    // get the function from the module(load parameters)
+    tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
+    load_params(params_arr);
+
+    // get the function from the module(run it)
+    tvm::runtime::PackedFunc run = mod.GetFunction("run");
+    run();
+
+    DLTensor* y;
+    int out_ndim = 2;
+    int64_t out_shape[2] = {1, 1000, };
+    TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
+
+    // get the function from the module(get output data)
+    tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
+    get_output(0, y);
+
+    // get the maximum position in output vector
+    auto y_iter = static_cast<float*>(y->data);
+    auto max_iter = std::max_element(y_iter, y_iter + 1000);
+    auto max_index = std::distance(y_iter, max_iter);
+    std::cout << "The maximum position in output vector is: " << max_index << std::endl;
+
+    TVMArrayFree(x);
+    TVMArrayFree(y);
+
+    return 0;
+}
+```
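Back in Python, the same three artifacts can also be driven through the convenience wrapper ``tvm.contrib.graph_runtime`` instead of fetching ``tvm.graph_runtime.create`` by hand. A minimal sketch under the same assumptions (the file names and the zero-filled input are placeholders):

```python
import numpy as np
import tvm
from tvm.contrib import graph_runtime

# Same three artifacts as above; the paths are placeholders.
loaded_lib = tvm.module.load("deploy.so")
loaded_json = open("deploy.json").read()
loaded_params = bytearray(open("deploy.params", "rb").read())

# graph_runtime.create wraps the "tvm.graph_runtime.create" global function.
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)
module.set_input("x", tvm.nd.array(np.zeros((1, 3, 224, 224), dtype="float32")))
module.run()
print(module.get_output(0).asnumpy())
```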
+
+## Deploy as System Module
+C++ additionally supports deployment as a system module.
+This process needs a few additional options, given below, for the NNVM build.
+
+- For an llvm target, append --system-lib: ```target=llvm --system-lib```
+- For a GPU build (or a non-llvm target), the additional option should be given to target_host: ```target_host=llvm --system-lib```
+
+Module export requires an additional option to save instead of compile: ```lib.export_library(path, fcompile=False)```
+
+The output of the above API is a tar-compressed file containing an object file ```(lib.o)``` and a cpp source file ```(devc.cc)``` which embeds the device blob. These two files should be compiled along with other files or objects while building the c++ application.
+Please refer to [Makefile](https://github.com/apache/incubator-tvm/tree/master/apps/howto_deploy/Makefile#L32) for a reference.
+
+The c++ code to load this system module requires the change below.
+
+```cpp
+  // tvm module for compiled functions
+  tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))();
+```
+
+Depending on the build environment, the system object and the device blob source should be included in the final executable. An example with a bazel build is given below.
+```bash
+cc_library(
+    name = "host_module",
+    srcs = ["lib.o"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "device_module",
+    srcs = ["devc.cc"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "tvm_runtime",
+    srcs = ["libtvm_runtime_pack.cc"],
+)
+
+cc_binary(
+    name = "bazel_deploy",
+    srcs = ["cpp_deploy.cc"],
+    deps = [
+        ":tvm_runtime", ":host_module", ":device_module"
+    ],
+    linkopts = [ "-lpthread -ldl" ]
+)
+
+```
+
+This build directive creates
+- a new library ```host_module``` out of ```lib.o```
+- a new library ```device_module``` out of ```devc.cc```
+
+These intermediate modules can be used as dependencies of the final deploy application.
+
+In bazel, ```alwayslink=1``` enforces embedding the entire lib into the application (even though it doesn't call any API from this module).
diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst
index 7e78d5753027..ffda632f738f 100644
--- a/docs/dev/codebase_walkthrough.rst
+++ b/docs/dev/codebase_walkthrough.rst
@@ -16,7 +16,7 @@ under the License.
=======================================
-TVM Codebase Walkthrough by Example
+**TVM Codebase Walkthrough by Example**
=======================================
Getting to know a new codebase can be a challenge. This is especially true for a codebase like that of TVM, where different components interact in non-obvious ways. In this guide, we try to illustrate the key elements that comprise a compilation pipeline with a simple example. For each important step, we show where in the codebase it is implemented. The purpose is to let new developers and interested users dive into the codebase more quickly.
@@ -28,13 +28,16 @@ Codebase Structure Overview
At the root of the TVM repository, we have following subdirectories that together comprise a bulk of the codebase.
- ``src`` - C++ code for operator compilation and deployment runtimes.
-- ``src/relay`` - Implementation of Relay, a new functional IR for deep learning framework.
+- ``src/relay`` - Implementation of Relay, a new IR for deep learning frameworks, superseding ``nnvm`` below.
- ``python`` - Python frontend that wraps C++ functions and objects implemented in ``src``.
- ``topi`` - Compute definitions and backend schedules for standard neural network operators.
+- ``nnvm`` - C++ code and Python frontend for graph optimization and compilation. After the introduction of Relay, it remains in the codebase for backward compatibility.
Using standard Deep Learning terminology, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructure implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations of operators are in ``topi``, and they are coded in either C++ or Python.

-When a user invokes graph compilation by ``relay.build(...)``, the following sequence of actions happens for each node in the graph:
+Relay is the new IR for deep networks that is intended to replace NNVM. If you have used NNVM, Relay provides equivalent or better functionality. In fact, Relay goes beyond the traditional way of thinking about deep networks in terms of computational graphs. For the purposes of this document, though, we can think of Relay as a traditional computational graph framework. You can read more about Relay `here `_.
+
+When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compiler.build(...)`` for the older API), the following sequence of actions happens for each node in the graph:

- Look up an operator implementation by querying the operator registry
- Generate a compute expression and a schedule for the operator
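To make the "compute expression and schedule" step above concrete, here is a toy sketch using the low-level TVM API of this release (the elementwise-doubling operator and the names ``A``/``B`` are illustrative only, not taken from the codebase):

.. code:: python

    import tvm

    # A toy "operator": elementwise doubling.
    n = tvm.var("n")
    A = tvm.placeholder((n,), name="A")
    B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")

    # A trivial schedule for it, then lowering to the IR that
    # the rest of ``src`` compiles for the target.
    s = tvm.create_schedule(B.op)
    print(tvm.lower(s, [A, B], simple_mode=True))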
- ``num_inputs`` - Number of inputs for this node - ``num_outputs`` - Number of outputs this node produces @@ -82,7 +82,7 @@ Example of dumped graph: "name": "relu0", # Name of the node "attrs": { # Attributes of the node "flatten_data": "0", # Whether this data need to be flattened - "func_name": "fuse_l2_normalize_relu", # Fused function name, corresponds to the symbol in the lib generated by compilation process + "func_name": "fuse_l2_normalize_relu", # Fused function name, corresponds to the symbol in the lib generated by NNVM compilation process "num_inputs": "1", # Number of inputs for this node "num_outputs": "1" # Number of outputs this node produces }, @@ -105,8 +105,8 @@ Example of dumped graph: } } -2. Tensor dumping -================= +**2. Tensor dumping** +===================== The tensor received after execution is in ``tvm.ndarray`` type. All the tensors will be saved as binary bytes in serialized format. The result binary bytes can be loaded by the @@ -155,7 +155,7 @@ folder specified while creating the runtime. Sample Output *************************************** -The below is the an example output of the debugger. +The below is the output of running ``tvm/nnvm/tutorials/from_onnx.py`` with debugger. :: diff --git a/docs/dev/index.rst b/docs/dev/index.rst index 9f46c39ce0a7..983d04f93fe3 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -34,4 +34,6 @@ In this part of documentation, we share the rationale for the specific choices m virtual_machine codebase_walkthrough inferbound + nnvm_json_spec + nnvm_overview benchmark diff --git a/docs/dev/nnvm_json_spec.rst b/docs/dev/nnvm_json_spec.rst new file mode 100644 index 000000000000..60d1b9b12bfe --- /dev/null +++ b/docs/dev/nnvm_json_spec.rst @@ -0,0 +1,229 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +NNVM Graph JSON Specification +============================= + +NNVM uses JSON for graph serialization. This allows NNVM graph to be +exported to any backend either natively supported or by third-party +without any dependency such as protobuf. + +Getting started +--------------- + +A serialized NNVM graph in JSON format can be deserialized by any JSON +parser. + +.. code:: python + + # python + import json + with open('model.json', 'r') as f: + graph = json.loads(f.read()) + print(graph.keys()) + +``['nodes', 'arg_nodes', 'heads', 'node_row_ptr']`` + +Actually, the following keys are valid in JSON graph. + ++--------------------------------------+------------+-----------------------------------+ +| Keys | Required | Description | ++======================================+============+===================================+ +| `nodes <#nodes>`__ | Yes | The nodes in graph. 
++-------------------------------------+----------+---------------------------------+
+| Keys                                | Required | Description                     |
++=====================================+==========+=================================+
+| `nodes <#nodes>`__                  | Yes      | The nodes in the graph.         |
++-------------------------------------+----------+---------------------------------+
+| `arg\_nodes <#arg_nodes>`__         | Yes      | Indices of input nodes.         |
++-------------------------------------+----------+---------------------------------+
+| `heads <#heads>`__                  | Yes      | Indices of output nodes.        |
++-------------------------------------+----------+---------------------------------+
+| `node\_row\_ptr <#node_row_ptr>`__  | Optional | Depth first search row indices. |
++-------------------------------------+----------+---------------------------------+
+| `attrs <#attrs>`__                  | Optional | Additional information.         |
++-------------------------------------+----------+---------------------------------+
+
+nodes
+-----
+
+As the name suggests, ``nodes`` are either placeholders or computational
+nodes in the NNVM graph. The ``nodes`` are stored as a list.
+
+.. code:: python
+
+    nodes = graph['nodes']
+    print(len(nodes))
+    print(nodes[0])
+    print(nodes[3])
+
+::
+
+    53
+    {'inputs': [], 'name': 'data', 'op': 'null'}
+    {'inputs': [[0, 0, 0], [1, 0, 0], [2, 0, 0]], 'attrs': {'channels': '64',
+    'padding': '(1, 1)', 'layout': 'NCHW', 'kernel_size': '[3, 3]', 'groups': '1',
+    'strides': '(1, 1)', 'use_bias': 'True', 'dilation': '(1, 1)'},
+    'name': 'conv1_1', 'op': 'conv2d'}
+
+The following keys are valid in each node:
+
++---------------+----------+--------------------------------------------------------------+
+| Keys          | Required | Description                                                  |
++===============+==========+==============================================================+
+| op            | Yes      | The operator type name; 'null' is used if it is a            |
+|               |          | placeholder/variable/input node.                             |
++---------------+----------+--------------------------------------------------------------+
+| name          | Yes      | The given name of the node, defined by the user              |
+|               |          | composing the network.                                       |
++---------------+----------+--------------------------------------------------------------+
+| inputs        | Yes      | List of entries of the input nodes; can be an empty          |
+|               |          | list []. Each entry is a list of [node\_id, index, version]. |
++---------------+----------+--------------------------------------------------------------+
+| attrs         | Optional | Extra attributes for the specific operator.                  |
++---------------+----------+--------------------------------------------------------------+
+| control\_deps | Optional | Control dependencies; left blank unless specifically used.   |
++---------------+----------+--------------------------------------------------------------+
+
+``attrs`` for operators is a dictionary. Key-value pair examples:
+
++----------------+----------+----------+-------------------------------------------------+
+| Keys           | Value    | Operator | Description                                     |
++================+==========+==========+=================================================+
+| 'channels'     | '64'     | conv2d   | Output channels for 2d convolution.             |
++----------------+----------+----------+-------------------------------------------------+
+| 'kernel\_size' | '[3, 3]' | conv2d   | Convolution filter kernel size in (h, w);       |
+|                |          |          | list and tuple both work.                       |
++----------------+----------+----------+-------------------------------------------------+
+| 'use\_bias'    | '1'      | conv2d   | Whether to use bias such that `y = w * x + b`.  |
++----------------+----------+----------+-------------------------------------------------+
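+
+Since keys and values are both stored as strings (see the note below), a
+consumer has to convert attribute values back to numbers or tuples. A minimal
+sketch, reusing the ``conv1_1`` node printed above (the variable names here
+are illustrative only):
+
+.. code:: python
+
+    import ast
+
+    node = graph['nodes'][3]                         # the conv2d node shown above
+    attrs = node['attrs']
+    channels = int(attrs['channels'])                # '64' -> 64
+    kernel = ast.literal_eval(attrs['kernel_size'])  # '[3, 3]' -> [3, 3]
+    strides = ast.literal_eval(attrs['strides'])     # '(1, 1)' -> (1, 1)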
+
+.. note::
+
+   Tips for parsing key-value pairs:
+
+   * Both key and value are stored as strings.
+
+   * Boolean values need extra attention: converting to int first is recommended, since ``bool('0') == True`` in Python.
+
+   * For a full list of operator attributes, please refer to the core operator `documentation `__.
+
+arg\_nodes
+----------
+
+``arg_nodes`` is a list of indices of the nodes that are
+placeholders/variables/inputs to the graph.
+
+.. code:: python
+
+    print(graph['arg_nodes'])
+
+::
+
+    [0, 1, 2, 6, 7, 11, 12, 15, 16, 20, 21, 24, 25, 29, 30, 33, 34, 39, 40, 44, 45, 49, 50]
+
+For example, ``nodes[3]`` is not in ``arg_nodes`` because it's an
+internal node.
+
+heads
+-----
+
+``heads`` is a list of entries that are the outlets/outputs of the graph.
+
+.. code:: python
+
+    print(graph['heads'])
+
+::
+
+    [[52, 0, 0]]
+
+This example indicates that there's only one output in the graph, with
+index 52.
+
+node\_row\_ptr
+--------------
+
+``node_row_ptr`` stores the history of the forward path, so you can skip
+constructing the entire graph in inference tasks.
+
+attrs
+-----
+
+``attrs`` can contain version numbers or similar helpful information.
diff --git a/docs/dev/nnvm_overview.md b/docs/dev/nnvm_overview.md
new file mode 100644
index 000000000000..b4a8ee7ccb9f
--- /dev/null
+++ b/docs/dev/nnvm_overview.md
@@ -0,0 +1,143 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# NNVM Design Overview
+
+NNVM is a reusable graph IR stack for deep learning systems. It provides useful APIs to construct, represent, and transform computation graphs, enabling most of the high-level optimizations needed in deep learning.
+As part of the TVM stack for deep learning, NNVM also provides a shared compiler that deep learning frameworks can use to optimize, compile, and deploy models to different hardware backends via [TVM](https://github.com/apache/incubator-tvm)
+
+## Key Requirements and Design Choices
+
+- Have minimal dependencies in the deployment module.
+- Be able to add new operators to the IR in a decentralized fashion.
+- Be able to add new optimization passes to the IR and apply them to existing graphs.
+
+Items 2 and 3 are particularly interesting if we compare them to a typical compiler IR. A compiler IR usually contains a fixed set of primitives (instructions) and uses them as a contract between optimization pass designers. This design enables easy addition of new optimization passes, but not of new operators (instructions), because every time we add a new instruction, we need to modify the passes to accommodate the change.
+
+Deep learning frameworks usually have a fixed operator interface (schema). Such an interface can contain properties like the shape inference function or whether in-place computation can happen. The operator interface is again a contract that makes it easy to add a new operator, but it makes it hard to add new passes in a decentralized fashion: a new optimization pass usually requires additional information, and this results in frequent changes to the centralized operator interface when we are exploring new optimizations. There is also a drawback for modularization; for example, a graph compiler for FPGA devices may not need the GPU-specific attributes.
+
+During our explorations in graph optimization and compilation, we found that it is important to be able to quickly add both operators and passes to the framework without changing the core library.
+
+Here is a list of key elements in NNVM's design:
+
+- An operator registry system to register and add new operators
+- An operator attribute system to provide properties of operators in a decentralized fashion
+- A reusable IR data structure for optimization passes.
+
+The above list is the generic, language-like part of NNVM; besides that, we also provide a collection of core operator primitives and graph optimization passes. The core tensor operator primitives and optimizations already cover common deep learning workloads. This design allows the NNVM compiler to be used directly as the optimization and compilation stack for frameworks. The extensible nature of NNVM makes new adjustments easy without constraining the backend providers.
+
+## Minimum Registration for a Symbolic Front-End
+To use NNVM to build a language front end, a developer only needs to register minimal information about each operator.
+
+```c++
+NNVM_REGISTER_OP(add)
+.describe("add two data together")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(conv2d)
+.describe("take 2d convolution of input")
+.set_num_inputs(2);
+
+NNVM_REGISTER_OP(assign)
+.describe("assign second input argument to the first one")
+.set_num_inputs(2);
+```
+
+After compiling the code against the NNVM library, users can compose the computation graph in Python, as in the following code.
+
+```python
+import nnvm.symbol as nn
+
+# symbolic variable
+x = nn.Variable('x')
+y = nn.Variable('y')
+w = nn.Variable('w')
+
+z = nn.conv2d(nn.elemwise_add(x, y), w, kernel_size=(2,2), name='conv1')
+```
+
+The graph structure is interchangeable between the front end and the backend. A Python interface is supported currently; more language support can easily be added in the future.
+
+## Operator Attribute for More Extensions
+
+The minimal information provided by the operator is enough to build a front end. However, we need more knowledge about each operator to transform and execute the graph.
+A typical difference between a neural net's computation graph and a traditional compiler IR is that there are a lot more high-level operators. We cannot fix the set of operators in the IR.
+
+NNVM allows developers to register attributes of each operator. The attributes can include the shape inference function, whether the operator can perform in-place calculation, etc.
+
+This design of having an operator attribute registry is not uncommon in deep learning systems.
+For example, MXNet has an ```OpProperty``` class, TensorFlow has an ```OpDef``` and Caffe2 has an ```OperatorSchema``` class.
+However, the operator attribute interface in these frameworks only supports a fixed set of attributes of interest to the system. If we want to extend the framework to add a new attribute to each operator, we need to change the operator registry.
+Eventually, the operator interface grows to be very big and has to evolve in the centralized repo.
+
+In NNVM, we decided to change the design and support arbitrary types of operator attributes, without changing the interface registry. The minimal interface also makes it easier to share across multiple projects.
+
+Users can register a new attribute, such as an in-place property checking function, as follows.
+```c++
+using FInplaceOption = std::function<
+  std::vector<std::pair<int, int> > (const NodeAttrs& attrs)>;
+
+// we can register attributes from multiple places.
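+// (note: set_attr is templated on the attribute type, so each registration
+//  below type-checks against the FInplaceOption signature declared above;
+//  these calls may live in separate files or even separate projects.)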
+NNVM_REGISTER_OP(elemwise_add)
+.set_num_inputs(2);
+
+// register to tell that the first input can be computed in place with the first output
+NNVM_REGISTER_OP(add)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+  return std::vector<std::pair<int, int> >{{0, 0}};
+});
+
+NNVM_REGISTER_OP(exp)
+.set_num_inputs(1)
+.set_attr<FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
+  return std::vector<std::pair<int, int> >{{0, 0}};
+});
+```
+
+We can query these attributes from arbitrary parts of the code, as in the following example. Under the hood, each attribute is stored in a columnar store that can easily be retrieved as a table for quick lookups.
+
+```c++
+void MyFunction() {
+  const Op* add = Op::Get("add");
+  // if we need quick query, we can use static variable
+  // attribute map contains attributes of all operators.
+  static auto& finplace_option_map = Op::GetAttr<FInplaceOption>("FInplaceOption");
+
+  // quick lookup of add's attribute: O(1) time, vector index lookup internally.
+  auto add_inplace = finplace_option_map[add];
+}
+```
+Besides keeping the code minimal, this attribute store enables decentralization of projects.
+Before, all the attributes of an operator had to sit in a centralized interface class.
+Now, everyone can register attributes of their own, and take the other attributes they need from another project, without changing the operator interface and core library.
+
+
+## Graph and Pass
+
+We can use the additional information in the attribute registry to do optimizations and get more information about the graph. The Graph is the unit we manipulate in these steps. A Graph in NNVM contains
+two parts:
+- The computation graph structure
+- An attribute map from string to any type: ```map<string, shared_ptr<any> >```
+
+The second part, the attribute map, is quite important, as we may need different kinds
+of information about the graph during the transformation process, be it the
+shapes of each tensor, the types of each tensor, or the storage allocation plan.
+
+A ```Pass``` can take a graph with existing attribute information
+and transform it into the same graph structure with more graph attributes, or into another graph.
diff --git a/docs/faq.md b/docs/faq.md
index f070ed59a575..3161e3bff082 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -26,7 +26,7 @@ See [Installation](http://docs.tvm.ai/install/)
 TVM's relation to Other IR/DSL Projects
 ---------------------------------------
 There are usually two levels of abstractions of IR in the deep learning systems.
-TensorFlow's XLA and Intel's ngraph uses computation graph representation.
+NNVM, TensorFlow's XLA and Intel's ngraph use a computation graph representation.
 This representation is high level, and can be helpful to perform generic optimizations
 such as memory reuse, layout transformation and automatic differentiation.
diff --git a/docs/frontend/tensorflow.rst b/docs/frontend/tensorflow.rst
index 87341ab6b7c6..c2fefedfebf7 100644
--- a/docs/frontend/tensorflow.rst
+++ b/docs/frontend/tensorflow.rst
@@ -97,7 +97,7 @@ Import the Model
 Explicit Shape:
 ~~~~~~~~~~~~~~~

-To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples.
+To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples.
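+
+A minimal sketch of the above (the ``graph_def`` and the input name ``'input'`` here are placeholders for your own model):
+
+.. code:: python
+
+    import tvm.relay as relay
+
+    # map the graph's input tensor name to its concrete shape
+    shape_dict = {'input': (1, 224, 224, 3)}
+    mod, params = relay.frontend.from_tensorflow(graph_def, shape=shape_dict)
+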
Data Layout
~~~~~~~~~~~
diff --git a/docs/index.rst b/docs/index.rst
index 258547a34acd..f02dcc7c91e2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -45,6 +45,7 @@ Developer Guide
    :maxdepth: 2

    dev/index
+   nnvm_top

 Frontends
----------------
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index b1b780b28c0d..acbd9b413d74 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -62,7 +62,8 @@ The minimal building requirements are
 - CMake 3.5 or higher
 - We highly recommend to build with LLVM to enable all the features.
 - If you want to use CUDA, CUDA toolkit version >= 8.0 is required. If you are upgrading from an older version, make sure you purge the older version and reboot after installation.
-
+- It is possible to build TVM without the LLVM dependency if you only want to use CUDA/OpenCL.
+- If you want to use the NNVM compiler, then LLVM is required.

 We use cmake to build the library.
 The configuration of TVM can be modified by `config.cmake`.
@@ -106,14 +107,6 @@ The configuration of TVM can be modified by `config.cmake`.
       cmake ..
       make -j4
-
-  You can also use Ninja build system instead of Unix Makefiles. It can be faster to build than using Makefiles.
-
-  .. code:: bash
-
-      cd build
-      cmake .. -G Ninja
-      ninja
-
 If everything goes well, we can go to :ref:`python-package-installation`

Building on Windows
@@ -131,6 +124,7 @@ In order to generate the VS solution file using cmake, make sure you have a rece
 This will generate the VS project using the MSVC 14 64 bit generator. Open the .sln file in the build directory and build with Visual Studio.
 In order to build with LLVM in windows, you will need to build LLVM from source.
+You need to build nnvm by running the same script under the nnvm folder.

Building ROCm support
~~~~~~~~~~~~~~~~~~~~~
@@ -163,7 +157,7 @@ Method 1

    .. code:: bash

       export TVM_HOME=/path/to/tvm
-      export PYTHONPATH=$TVM_HOME/python:$TVM_HOME/topi/python:${PYTHONPATH}
+      export PYTHONPATH=$TVM_HOME/python:$TVM_HOME/topi/python:$TVM_HOME/nnvm/python:${PYTHONPATH}

 Method 2
@@ -178,6 +172,7 @@ Method 2
       export MACOSX_DEPLOYMENT_TARGET=10.9  # This is required for mac to avoid symbol conflicts with libstdc++
       cd python; python setup.py install --user; cd ..
       cd topi/python; python setup.py install --user; cd ../..
+      cd nnvm/python; python setup.py install --user; cd ../..

 Python dependencies
diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index 1fabd704482c..fc77869a6261 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -71,13 +71,10 @@ This level enables typical convnet models.
    tvm.relay.nn.conv2d_transpose
    tvm.relay.nn.dense
    tvm.relay.nn.max_pool2d
-   tvm.relay.nn.max_pool3d
    tvm.relay.nn.avg_pool2d
-   tvm.relay.nn.avg_pool3d
    tvm.relay.nn.global_max_pool2d
    tvm.relay.nn.global_avg_pool2d
    tvm.relay.nn.upsampling
-   tvm.relay.nn.upsampling3d
    tvm.relay.nn.batch_flatten
    tvm.relay.nn.pad
    tvm.relay.nn.lrn
@@ -249,13 +246,10 @@ Level 2 Definitions
.. autofunction:: tvm.relay.nn.conv2d_transpose
.. autofunction:: tvm.relay.nn.dense
.. autofunction:: tvm.relay.nn.max_pool2d
-.. autofunction:: tvm.relay.nn.max_pool3d
.. autofunction:: tvm.relay.nn.avg_pool2d
-.. autofunction:: tvm.relay.nn.avg_pool3d
.. autofunction:: tvm.relay.nn.global_max_pool2d
.. autofunction:: tvm.relay.nn.global_avg_pool2d
.. autofunction:: tvm.relay.nn.upsampling
-.. autofunction:: tvm.relay.nn.upsampling3d
.. autofunction:: tvm.relay.nn.batch_flatten
.. autofunction:: tvm.relay.nn.pad
.. autofunction:: tvm.relay.nn.lrn
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
new file mode 100644
index 000000000000..8679cae9b256
--- /dev/null
+++ b/docs/nnvm_top.rst
@@ -0,0 +1,297 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+NNVM Core Tensor Operators
+==========================
+
+This page contains the list of core tensor operator primitives pre-defined in NNVM.
+The core tensor operator primitives (``nnvm.top``) cover typical workloads in deep learning.
+They can represent workloads in front-end frameworks, and provide basic building blocks for optimization.
+Since deep learning is a fast-evolving field, it is possible to encounter operators that are not listed here.
+NNVM is designed for this problem and can easily accommodate new operators without changing the core library.
+
+.. note::
+
+   Each operator node in the graph IR contains the following two kinds of parameters.
+
+   - inputs: positional list of input tensors
+   - attrs: attributes of the operator (e.g. kernel_size in conv2d)
+
+   This document lists both inputs and attributes in the parameter field. You can distinguish them by the marked type. The inputs are of type Tensor, while the remaining parameters are attributes.
+   To construct the graph with the NNVM Python API, a user can pass in the input Tensors as positional arguments, and attributes as keyword arguments.
+
+
+Overview of Operators
+---------------------
+**Level 1: Basic Operators**
+
+This level enables fully connected multi-layer perceptron.
+
+.. autosummary::
+   :nosignatures:
+
+   nnvm.symbol.dense
+   nnvm.symbol.relu
+   nnvm.symbol.prelu
+   nnvm.symbol.tanh
+   nnvm.symbol.sigmoid
+   nnvm.symbol.exp
+   nnvm.symbol.log
+   nnvm.symbol.sqrt
+   nnvm.symbol.logical_and
+   nnvm.symbol.logical_or
+   nnvm.symbol.logical_not
+   nnvm.symbol.elemwise_add
+   nnvm.symbol.elemwise_sub
+   nnvm.symbol.elemwise_mul
+   nnvm.symbol.elemwise_div
+   nnvm.symbol.elemwise_sum
+   nnvm.symbol.elemwise_mod
+   nnvm.symbol.elemwise_pow
+   nnvm.symbol.flatten
+   nnvm.symbol.concatenate
+   nnvm.symbol.expand_dims
+   nnvm.symbol.squeeze
+   nnvm.symbol.split
+   nnvm.symbol.dropout
+   nnvm.symbol.batch_norm
+   nnvm.symbol.softmax
+   nnvm.symbol.log_softmax
+   nnvm.symbol.pad
+   nnvm.symbol.block_grad
+   nnvm.symbol.matmul
+   nnvm.symbol.resize
+   nnvm.symbol.upsampling
+   nnvm.symbol.take
+   nnvm.symbol.l2_normalize
+   nnvm.symbol.flip
+   nnvm.symbol.lrn
+   nnvm.symbol.where
+   nnvm.symbol.gather_nd
+
+
+**Level 2: Convolutions**
+
+This level enables typical convnet models.
+
+.. autosummary::
+   :nosignatures:
+
+   nnvm.symbol.conv2d
+   nnvm.symbol.conv2d_transpose
+   nnvm.symbol.max_pool2d
+   nnvm.symbol.avg_pool2d
+   nnvm.symbol.global_max_pool2d
+   nnvm.symbol.global_avg_pool2d
+
+
+**Level 3: Additional Tensor Ops**
+
+.. 
autosummary:: + :nosignatures: + + nnvm.symbol.reshape + nnvm.symbol.copy + nnvm.symbol.negative + nnvm.symbol.floor + nnvm.symbol.ceil + nnvm.symbol.round + nnvm.symbol.trunc + nnvm.symbol.abs + nnvm.symbol.leaky_relu + nnvm.symbol.__add_scalar__ + nnvm.symbol.__sub_scalar__ + nnvm.symbol.__rsub_scalar__ + nnvm.symbol.__mul_scalar__ + nnvm.symbol.__div_scalar__ + nnvm.symbol.__rdiv_scalar__ + nnvm.symbol.__pow_scalar__ + nnvm.symbol.__rpow_scalar__ + nnvm.symbol.__lshift_scalar__ + nnvm.symbol.__rshift_scalar__ + + +**Level 4: Broadcast and Reductions** + +.. autosummary:: + :nosignatures: + + nnvm.symbol.transpose + nnvm.symbol.broadcast_to + nnvm.symbol.sum + nnvm.symbol.min + nnvm.symbol.max + nnvm.symbol.mean + nnvm.symbol.prod + nnvm.symbol.broadcast_add + nnvm.symbol.broadcast_sub + nnvm.symbol.broadcast_mul + nnvm.symbol.broadcast_div + nnvm.symbol.clip + nnvm.symbol.greater + nnvm.symbol.less + nnvm.symbol.expand_like + nnvm.symbol.reshape_like + nnvm.symbol.full + nnvm.symbol.full_like + nnvm.symbol.ones + nnvm.symbol.ones_like + nnvm.symbol.zeros + nnvm.symbol.zeros_like + nnvm.symbol.slice_like + nnvm.symbol.strided_slice + nnvm.symbol.argmax + nnvm.symbol.argmin + nnvm.symbol.collapse_sum + nnvm.symbol.broadcast_equal + nnvm.symbol.broadcast_greater_equal + nnvm.symbol.broadcast_greater + nnvm.symbol.broadcast_left_shift + nnvm.symbol.broadcast_less_equal + nnvm.symbol.broadcast_less + nnvm.symbol.broadcast_max + nnvm.symbol.broadcast_min + nnvm.symbol.broadcast_mod + nnvm.symbol.broadcast_not_equal + nnvm.symbol.broadcast_pow + nnvm.symbol.broadcast_right_shift + + +**Level 5: Vision Operators** + +.. autosummary:: + :nosignatures: + + nnvm.symbol.multibox_prior + nnvm.symbol.multibox_transform_loc + nnvm.symbol.nms + nnvm.symbol.yolo_region + nnvm.symbol.yolo_reorg + +Detailed Definitions +-------------------- +.. autofunction:: nnvm.symbol.dense +.. autofunction:: nnvm.symbol.relu +.. autofunction:: nnvm.symbol.prelu +.. autofunction:: nnvm.symbol.tanh +.. autofunction:: nnvm.symbol.sigmoid +.. autofunction:: nnvm.symbol.exp +.. autofunction:: nnvm.symbol.log +.. autofunction:: nnvm.symbol.sqrt +.. autofunction:: nnvm.symbol.logical_and +.. autofunction:: nnvm.symbol.logical_or +.. autofunction:: nnvm.symbol.logical_not +.. autofunction:: nnvm.symbol.elemwise_add +.. autofunction:: nnvm.symbol.elemwise_sub +.. autofunction:: nnvm.symbol.elemwise_mul +.. autofunction:: nnvm.symbol.elemwise_div +.. autofunction:: nnvm.symbol.elemwise_sum +.. autofunction:: nnvm.symbol.elemwise_mod +.. autofunction:: nnvm.symbol.elemwise_pow +.. autofunction:: nnvm.symbol.flatten +.. autofunction:: nnvm.symbol.concatenate +.. autofunction:: nnvm.symbol.expand_dims +.. autofunction:: nnvm.symbol.squeeze +.. autofunction:: nnvm.symbol.split +.. autofunction:: nnvm.symbol.dropout +.. autofunction:: nnvm.symbol.batch_norm +.. autofunction:: nnvm.symbol.softmax +.. autofunction:: nnvm.symbol.log_softmax +.. autofunction:: nnvm.symbol.pad +.. autofunction:: nnvm.symbol.block_grad +.. autofunction:: nnvm.symbol.matmul +.. autofunction:: nnvm.symbol.resize +.. autofunction:: nnvm.symbol.upsampling +.. autofunction:: nnvm.symbol.take +.. autofunction:: nnvm.symbol.l2_normalize +.. autofunction:: nnvm.symbol.flip +.. autofunction:: nnvm.symbol.lrn +.. autofunction:: nnvm.symbol.where +.. autofunction:: nnvm.symbol.gather_nd + +.. autofunction:: nnvm.symbol.conv2d +.. autofunction:: nnvm.symbol.conv2d_transpose +.. autofunction:: nnvm.symbol.max_pool2d +.. autofunction:: nnvm.symbol.avg_pool2d +.. 
autofunction:: nnvm.symbol.global_max_pool2d +.. autofunction:: nnvm.symbol.global_avg_pool2d + +.. autofunction:: nnvm.symbol.reshape +.. autofunction:: nnvm.symbol.copy +.. autofunction:: nnvm.symbol.negative +.. autofunction:: nnvm.symbol.floor +.. autofunction:: nnvm.symbol.ceil +.. autofunction:: nnvm.symbol.round +.. autofunction:: nnvm.symbol.trunc +.. autofunction:: nnvm.symbol.abs +.. autofunction:: nnvm.symbol.leaky_relu +.. autofunction:: nnvm.symbol.__add_scalar__ +.. autofunction:: nnvm.symbol.__sub_scalar__ +.. autofunction:: nnvm.symbol.__rsub_scalar__ +.. autofunction:: nnvm.symbol.__mul_scalar__ +.. autofunction:: nnvm.symbol.__div_scalar__ +.. autofunction:: nnvm.symbol.__rdiv_scalar__ +.. autofunction:: nnvm.symbol.__pow_scalar__ +.. autofunction:: nnvm.symbol.__rpow_scalar__ +.. autofunction:: nnvm.symbol.__lshift_scalar__ +.. autofunction:: nnvm.symbol.__rshift_scalar__ + +.. autofunction:: nnvm.symbol.transpose +.. autofunction:: nnvm.symbol.broadcast_to +.. autofunction:: nnvm.symbol.sum +.. autofunction:: nnvm.symbol.min +.. autofunction:: nnvm.symbol.max +.. autofunction:: nnvm.symbol.mean +.. autofunction:: nnvm.symbol.prod +.. autofunction:: nnvm.symbol.broadcast_add +.. autofunction:: nnvm.symbol.broadcast_sub +.. autofunction:: nnvm.symbol.broadcast_mul +.. autofunction:: nnvm.symbol.broadcast_div +.. autofunction:: nnvm.symbol.clip +.. autofunction:: nnvm.symbol.greater +.. autofunction:: nnvm.symbol.less +.. autofunction:: nnvm.symbol.expand_like +.. autofunction:: nnvm.symbol.reshape_like +.. autofunction:: nnvm.symbol.full +.. autofunction:: nnvm.symbol.full_like +.. autofunction:: nnvm.symbol.ones +.. autofunction:: nnvm.symbol.ones_like +.. autofunction:: nnvm.symbol.zeros +.. autofunction:: nnvm.symbol.zeros_like +.. autofunction:: nnvm.symbol.slice_like +.. autofunction:: nnvm.symbol.strided_slice +.. autofunction:: nnvm.symbol.argmax +.. autofunction:: nnvm.symbol.argmin +.. autofunction:: nnvm.symbol.collapse_sum +.. autofunction:: nnvm.symbol.broadcast_equal +.. autofunction:: nnvm.symbol.broadcast_greater_equal +.. autofunction:: nnvm.symbol.broadcast_greater +.. autofunction:: nnvm.symbol.broadcast_left_shift +.. autofunction:: nnvm.symbol.broadcast_less_equal +.. autofunction:: nnvm.symbol.broadcast_less +.. autofunction:: nnvm.symbol.broadcast_max +.. autofunction:: nnvm.symbol.broadcast_min +.. autofunction:: nnvm.symbol.broadcast_mod +.. autofunction:: nnvm.symbol.broadcast_not_equal +.. autofunction:: nnvm.symbol.broadcast_pow +.. autofunction:: nnvm.symbol.broadcast_right_shift + +.. autofunction:: nnvm.symbol.multibox_prior +.. autofunction:: nnvm.symbol.multibox_transform_loc +.. autofunction:: nnvm.symbol.nms +.. autofunction:: nnvm.symbol.yolo_region +.. 
autofunction:: nnvm.symbol.yolo_reorg diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h index 8810c4e4a0df..2fbb9e6a866e 100644 --- a/include/tvm/attrs.h +++ b/include/tvm/attrs.h @@ -159,7 +159,7 @@ class AttrsEqual { bool operator()(const std::string& lhs, const std::string& rhs) const { return lhs == rhs; } - bool operator()(const DataType& lhs, const DataType& rhs) const { + bool operator()(const Type& lhs, const Type& rhs) const { return lhs == rhs; } // node comparator @@ -506,8 +506,8 @@ inline void SetValue(std::string* ptr, const TVMArgValue& val) { } } template<> -inline void SetValue(DataType* ptr, const TVMArgValue& val) { - *ptr = val.operator DataType(); +inline void SetValue(Type* ptr, const TVMArgValue& val) { + *ptr = val.operator Type(); } template<> inline void SetValue(double* ptr, const TVMArgValue& val) { @@ -611,7 +611,7 @@ struct TypeName { }; template<> -struct TypeName { +struct TypeName { static constexpr const char* value = "Type"; }; diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index fac18a9b1753..d2c2b40661e2 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -74,16 +74,14 @@ class Buffer : public NodeRef { * \param content_lanes The number of lanes for the (data) type. * \param offset The offset of ptr. */ - TVM_DLL Expr access_ptr(int access_mask, - DataType ptr_type = DataType::Handle(), - int content_lanes = 1, - Expr offset = make_const(DataType::Int(32), 0)) const; + TVM_DLL Expr access_ptr(int access_mask, Type ptr_type = Handle(), + int content_lanes = 1, Expr offset = make_const(Int(32), 0)) const; /*! * \brief Create an Expr that does a vector load at begin index. * \param begin The beginning index * \param dtype The data type to be loaded. */ - TVM_DLL Expr vload(Array begin, DataType dtype) const; + TVM_DLL Expr vload(Array begin, Type dtype) const; /*! * \brief Create a Stmt that does a vector store at begin index. * \param begin The beginning index @@ -110,7 +108,7 @@ class BufferNode : public Node { */ Var data; /*! \brief data type in the content of the tensor */ - DataType dtype; + Type dtype; /*! \brief The shape of the buffer */ Array shape; /*! @@ -151,14 +149,14 @@ class BufferNode : public Node { } /*! \return preferred index type for this buffer node */ - DataType DefaultIndexType() const { - return shape.size() != 0 ? shape[0].dtype() : DataType::Int(32); + Type DefaultIndexType() const { + return shape.size() != 0 ? shape[0].type() : Int(32); } // User can specify data_alignment and offset_factor to be 0 // A default value will be picked. TVM_DLL static Buffer make(Var ptr, - DataType dtype, + Type dtype, Array shape, Array strides, Expr elem_offset, @@ -185,7 +183,7 @@ inline const BufferNode* Buffer::operator->() const { * \sa BufferNode::make for complete constructor. */ TVM_DLL Buffer decl_buffer(Array shape, - DataType dtype = DataType::Float(32), + Type dtype = Float(32), std::string name = "buffer"); } // namespace tvm #endif // TVM_BUFFER_H_ diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index fba929cda1be..a83288ce3662 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -170,9 +170,6 @@ TVM_DLL Target intel_graphics(const std::vector& options = TVM_DLL Target stackvm(const std::vector& options = std::vector()); -/*! \return A target for external device */ -TVM_DLL Target ext_dev(const std::vector& options = - std::vector()); } // namespace target /*! 
diff --git a/include/tvm/channel.h b/include/tvm/channel.h new file mode 100644 index 000000000000..3a40a787d891 --- /dev/null +++ b/include/tvm/channel.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/channel.h + * \brief Channel object for pipeline. + */ +#ifndef TVM_CHANNEL_H_ +#define TVM_CHANNEL_H_ + +#include + +namespace tvm { +// Node container of channel +struct ChannelNode; + +/*! \brief The data channel. */ +class Channel : public NodeRef { + public: + /*! \brief default constructor */ + Channel() {} + explicit Channel(ObjectPtr n) : NodeRef(n) {} + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const ChannelNode* operator->() const; + // The container type + using ContainerType = ChannelNode; +}; + +/*! + * \brief Generalized FIFO channel. + */ +struct ChannelNode : public Node { + /*! \brief Variable to channel handle */ + Var handle_var; + /*! \brief default data type in read/write */ + Type dtype; + // visit all attributes + void VisitAttrs(AttrVisitor* v) { + v->Visit("handle_var", &handle_var); + v->Visit("dtype", &dtype); + } + + static Channel make(Var handle_var, Type dtype); + static constexpr const char* _type_key = "Channel"; + + TVM_DECLARE_NODE_TYPE_INFO(ChannelNode, Node); +}; + +// Inline implementations +inline const ChannelNode* Channel::operator->() const { + return static_cast(get()); +} +} // namespace tvm +#endif // TVM_CHANNEL_H_ diff --git a/include/tvm/runtime/data_type.h b/include/tvm/dtype.h similarity index 57% rename from include/tvm/runtime/data_type.h rename to include/tvm/dtype.h index 5b222ac6b442..9f7902deb960 100644 --- a/include/tvm/runtime/data_type.h +++ b/include/tvm/dtype.h @@ -17,35 +17,23 @@ * under the License. */ /* - * \file tvm/runtime/data_type.h - * \brief Primitive runtime data type. + * \file tvm/dtype.h + * \brief Data type used in IR. */ // Acknowledgement: DataType structure design originates from Halide. -#ifndef TVM_RUNTIME_DATA_TYPE_H_ -#define TVM_RUNTIME_DATA_TYPE_H_ - -#include -#include -#include +#ifndef TVM_DTYPE_H_ +#define TVM_DTYPE_H_ +#include "runtime/packed_func.h" namespace tvm { -namespace runtime { +class Expr; + /*! - * \brief Runtime primitive data type. - * - * This class is a thin wrapper of DLDataType. - * We also make use of DataType in compiler to store quick hint + * \brief Primitive data types in tvm. */ class DataType { public: - /*! \brief Type code for the DataType. */ - enum TypeCode { - kInt = kDLInt, - kUInt = kDLUInt, - kFloat = kDLFloat, - kHandle = TVMTypeCode::kHandle, - }; /*! \brief default constructor */ DataType() {} /*! @@ -87,23 +75,23 @@ class DataType { } /*! \return whether type is a scalar type. 
*/ bool is_bool() const { - return code() == DataType::kUInt && bits() == 1; + return code() == kDLUInt && bits() == 1; } /*! \return whether type is a float type. */ bool is_float() const { - return code() == DataType::kFloat; + return code() == kDLFloat; } /*! \return whether type is an int type. */ bool is_int() const { - return code() == DataType::kInt; + return code() == kDLInt; } /*! \return whether type is an uint type. */ bool is_uint() const { - return code() == DataType::kUInt; + return code() == kDLUInt; } /*! \return whether type is a handle type. */ bool is_handle() const { - return code() == DataType::kHandle; + return code() == kHandle; } /*! \return whether type is a vector type. */ bool is_vector() const { @@ -132,93 +120,107 @@ class DataType { DataType element_of() const { return with_lanes(1); } - /*! - * \brief Equal comparator. - * \param other The data type to compre against. - * \return The comparison resilt. - */ + // operator overloadings bool operator==(const DataType& other) const { return data_.code == other.data_.code && data_.bits == other.data_.bits && data_.lanes == other.data_.lanes; } - /*! - * \brief NotEqual comparator. - * \param other The data type to compre against. - * \return The comparison resilt. - */ bool operator!=(const DataType& other) const { return !operator==(other); } - /*! - * \brief Converter to DLDataType - * \return the result. - */ operator DLDataType () const { return data_; } - - /*! - * \brief Construct an int type. - * \param bits The number of bits in the type. - * \param lanes The number of lanes. - * \return The constructed data type. - */ - static DataType Int(int bits, int lanes = 1) { - return DataType(kDLInt, bits, lanes); - } - /*! - * \brief Construct an uint type. - * \param bits The number of bits in the type. - * \param lanes The number of lanes - * \return The constructed data type. - */ - static DataType UInt(int bits, int lanes = 1) { - return DataType(kDLUInt, bits, lanes); - } - /*! - * \brief Construct an uint type. - * \param bits The number of bits in the type. - * \param lanes The number of lanes - * \return The constructed data type. - */ - static DataType Float(int bits, int lanes = 1) { - return DataType(kDLFloat, bits, lanes); - } - /*! - * \brief Construct a bool type. - * \param lanes The number of lanes - * \return The constructed data type. - */ - static DataType Bool(int lanes = 1) { - return DataType::UInt(1, lanes); - } - /*! - * \brief Construct a handle type. - * \param bits The number of bits in the type. - * \param lanes The number of lanes - * \return The constructed data type. - */ - static DataType Handle(int bits = 64, int lanes = 1) { - return DataType(kHandle, bits, lanes); - } - /*! - * \brief Get the corresponding type of TVMShapeIndex. - * \return The type of TVM shape index. - */ - static DataType ShapeIndex() { - if (std::is_signed::value) { - return DataType::Int(sizeof(tvm_index_t) * 8); - } else { - return DataType::UInt(sizeof(tvm_index_t) * 8); - } - } + /*! \return the maximum possible value in this format. */ + TVM_DLL Expr max() const; + /*! \return the minimum possible value in this format. */ + TVM_DLL Expr min() const; private: DLDataType data_; }; +/*! + * \brief Construct an int type. + * \param bits The number of bits in the type. + * \param lanes The number of lanes. + * \return The constructed data type. + */ +inline DataType Int(int bits, int lanes = 1) { + return DataType(kDLInt, bits, lanes); +} + +/*! + * \brief Construct an uint type. 
+ * \param bits The number of bits in the type.
+ * \param lanes The number of lanes
+ * \return The constructed data type.
+ */
+inline DataType UInt(int bits, int lanes = 1) {
+  return DataType(kDLUInt, bits, lanes);
+}
+
+/*!
+ * \brief Construct a bool type.
+ * \param lanes The number of lanes
+ * \return The constructed data type.
+ */
+inline DataType Bool(int lanes = 1) {
+  return UInt(1, lanes);
+}
+
+/*!
+ * \brief Construct a float type.
+ * \param bits The number of bits in the type.
+ * \param lanes The number of lanes
+ * \return The constructed data type.
+ */
+inline DataType Float(int bits, int lanes = 1) {
+  return DataType(kDLFloat, bits, lanes);
+}
+
+/*!
+ * \brief Construct a handle type.
+ * \param bits The number of bits in the type.
+ * \param lanes The number of lanes
+ * \return The constructed data type.
+ */
+inline DataType Handle(int bits = 64, int lanes = 1) {
+  return DataType(kHandle, bits, lanes);
+}
+
+/*!
+ * \brief Get the corresponding type of TVMShapeIndex.
+ * \return The type of TVM shape index.
+ */
+inline DataType TVMShapeIndexType() {
+  if (std::is_signed::value) {
+    return Int(sizeof(tvm_index_t) * 8);
+  } else {
+    return UInt(sizeof(tvm_index_t) * 8);
+  }
+}
+
+/*!
+ * \brief Convert DLDataType to DataType.
+ * \param t The original type.
+ * \return The conversion result.
+ */
+inline DataType TVMType2Type(DLDataType t) {
+  return DataType(t.code, t.bits, t.lanes);
+}
+
+/*!
+ * \brief Convert DataType to DLDataType.
+ * \param t The original type.
+ * \return The conversion result.
+ */
+inline DLDataType Type2TVMType(DataType t) {
+  return t.operator DLDataType();
+}
+
 /*!
  * \brief Get the number of bytes needed in a vector.
  * \param dtype The data type.
@@ -227,15 +229,19 @@ class DataType {
 inline int GetVectorBytes(DataType dtype) {
   int data_bits = dtype.bits() * dtype.lanes();
   // allow bool to exist
-  if (dtype == DataType::Bool()) return 1;
+  if (dtype == Bool()) return 1;
   CHECK_EQ(data_bits % 8, 0U)
       << "Need to load/store by multiple of bytes";
   return data_bits / 8;
 }
-}  // namespace runtime
-
-using DataType = runtime::DataType;
+// Overload print function.
+inline std::ostream& operator<<(std::ostream& os, DataType dtype) { // NOLINT(*)
+  using namespace tvm::runtime;
+  return os << dtype.operator DLDataType();
+}
+// Backward compatibility
+using Type = DataType;
 }  // namespace tvm
-#endif  // TVM_RUNTIME_DATA_TYPE_H_
+#endif  // TVM_DTYPE_H_
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index f27cb9879fb7..fc52421d903b 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -29,11 +29,11 @@
 #include
 #include
 #include "base.h"
+#include "dtype.h"
 #include "node/node.h"
 #include "node/container.h"
 #include "node/functor.h"
 #include "runtime/c_runtime_api.h"
-#include "runtime/data_type.h"

 namespace tvm {

@@ -41,7 +41,7 @@
 class ExprNode : public Node {
  public:
   /*! \brief The data type of the expression. */
-  DataType dtype;
+  DataType type;

   static constexpr const char* _type_key = "Expr";
   TVM_DECLARE_BASE_NODE_INFO(ExprNode, Node);
@@ -69,8 +69,8 @@
   TVM_DLL Expr(std::string str);  // NOLINT(*)

   /*! \return the data type of this expression. */
-  DataType dtype() const {
-    return static_cast(get())->dtype;
+  DataType type() const {
+    return static_cast(get())->type;
   }
   /*! 
\brief type indicate the container type */ @@ -113,7 +113,7 @@ class Variable : public ExprNode { static Var make(DataType dtype, std::string name_hint); void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("name", &name_hint); } @@ -126,14 +126,14 @@ class Var : public Expr { public: explicit Var(ObjectPtr n) : Expr(n) {} TVM_DLL explicit Var(std::string name_hint = "v", - DataType t = DataType::Int(32)); + Type t = Int(32)); /*! * \brief Make a new copy of var with same type, append suffix * \param suffix The suffix to be appended. * \return the new Var copy */ Var copy_with_suffix(const std::string& suffix) const { - return Var((*this)->name_hint + suffix, (*this)->dtype); + return Var((*this)->name_hint + suffix, (*this)->type); } /*! * \brief Get pointer to the internal value. @@ -167,7 +167,7 @@ class IntImm : public ExprNode { int64_t value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("value", &value); } @@ -452,7 +452,7 @@ inline const char* IterVarType2String(IterVarType t) { * \param name_hint The name hint for the expression * \param t The type of the expression */ -TVM_DLL Var var(std::string name_hint, DataType t = DataType::Int(32)); +TVM_DLL Var var(std::string name_hint, Type t = Int(32)); /* * \brief Template function to convert Map to unordered_map diff --git a/include/tvm/expr_operator.h b/include/tvm/expr_operator.h index 41e7aa5b7796..625ee8e49286 100644 --- a/include/tvm/expr_operator.h +++ b/include/tvm/expr_operator.h @@ -44,20 +44,20 @@ namespace tvm { */ template::value>::type> -inline Expr make_const(DataType t, ValueType value); +inline Expr make_const(Type t, ValueType value); /*! * \brief Make a const zero expr. * \param t The target type. * \return the result expression. */ -inline Expr make_zero(DataType t); +inline Expr make_zero(Type t); /*! * \brief Make a constant true expression. * \param lanes The number of lanes in the bool * \return The result expression. */ inline Expr const_true(int lanes = 1) { - return make_const(DataType::UInt(1, lanes), 1); + return make_const(UInt(1, lanes), 1); } /*! * \brief Make a constant false expression. @@ -65,7 +65,7 @@ inline Expr const_true(int lanes = 1) { * \return The result expression. */ inline Expr const_false(int lanes = 1) { - return make_const(DataType::UInt(1, lanes), 0); + return make_const(UInt(1, lanes), 0); } /*! * \brief Get x as constant int expression. @@ -139,20 +139,6 @@ inline bool is_zero(const Expr& x) { */ inline bool is_const(const Expr& x); -/*! - * Query the maximum possible value of dtype. - * \param dtype The data type. - * \return the maximum possible value in this format. - */ -TVM_DLL Expr max_value(const DataType& dtype); - -/*! - * Query the minimum possible value of dtype. - * \param dtype The data type. - * \return the minimum possible value in this format. - */ -TVM_DLL Expr min_value(const DataType& dtype); - /*! * \brief Check whether x is a constant power of two * If x is power of two, write the power to the shift. @@ -171,7 +157,7 @@ TVM_DLL bool is_const_power_of_two_integer(const Expr& x, int* shift); * \return The result expression. * \note This function may return value if the type is the same. */ -TVM_DLL Expr cast(const DataType& t, Expr value); +TVM_DLL Expr cast(const Type& t, Expr value); /*! * \brief perform reinterpret cast value to type. * @@ -180,7 +166,7 @@ TVM_DLL Expr cast(const DataType& t, Expr value); * \return The result expression. 
* \note This function may return value if the type is the same. */ -TVM_DLL Expr reinterpret(const DataType& t, Expr value); +TVM_DLL Expr reinterpret(const Type& t, Expr value); /*! * \brief add operator * @@ -600,7 +586,7 @@ TVM_DLL Expr trunc(Expr x); // Intrinsic operators #define TVM_DECLARE_INTRIN_UNARY(OpName) \ inline Expr OpName(Expr x) { \ - return ir::Call::make(x.dtype(), #OpName, {x}, ir::Call::PureIntrinsic); \ + return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \ } \ TVM_DECLARE_INTRIN_UNARY(exp); @@ -671,7 +657,7 @@ inline bool is_no_op(const Stmt& stmt) { } template -inline Expr MakeConstScalar(DataType t, ValueType value) { +inline Expr MakeConstScalar(Type t, ValueType value) { if (t.is_int()) return ir::IntImm::make(t, static_cast(value)); if (t.is_uint()) return ir::UIntImm::make(t, static_cast(value)); if (t.is_float()) return ir::FloatImm::make(t, static_cast(value)); @@ -686,7 +672,7 @@ inline Expr MakeConstScalar(DataType t, ValueType value) { } template -inline Expr make_const(DataType t, ValueType value) { +inline Expr make_const(Type t, ValueType value) { if (t.lanes() == 1) { return MakeConstScalar(t, value); } else { @@ -695,9 +681,9 @@ inline Expr make_const(DataType t, ValueType value) { } } -inline Expr make_zero(DataType t) { +inline Expr make_zero(Type t) { if (t.is_handle()) { - return reinterpret(t, make_const(DataType::UInt(64), 0)); + return reinterpret(t, make_const(UInt(64), 0)); } return make_const(t, 0); } @@ -717,13 +703,13 @@ inline Expr make_zero(DataType t) { return Name(Expr(a), b); \ } \ inline Expr Name(int a, const Expr& b) { \ - return Name(make_const(b.dtype(), a), b); \ + return Name(make_const(b.type(), a), b); \ } \ inline Expr Name(const Expr& a, int b) { \ - return Name(a, make_const(a.dtype(), b)); \ + return Name(a, make_const(a.type(), b)); \ } \ inline Expr Name(const Expr& a, double b) { \ - return Name(a, make_const(DataType::Float(64), b)); \ + return Name(a, make_const(Float(64), b)); \ } #define TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(Name) \ @@ -736,10 +722,10 @@ inline Expr make_zero(DataType t) { #define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name) \ inline Expr Name(const Expr& a, int b) { \ - return Name(a, make_const(a.dtype(), b)); \ + return Name(a, make_const(a.type(), b)); \ } \ inline Expr Name(int a, const Expr& b) { \ - return Name(make_const(b.dtype(), a), b); \ + return Name(make_const(b.type(), a), b); \ } diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 33aa72b50805..226d6f83dcc7 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -46,11 +46,11 @@ class UIntImm : public ExprNode { uint64_t value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("value", &value); } - TVM_DLL static Expr make(DataType t, uint64_t value); + TVM_DLL static Expr make(Type t, uint64_t value); static constexpr const char* _type_key = "UIntImm"; TVM_DECLARE_NODE_TYPE_INFO(UIntImm, ExprNode); @@ -63,11 +63,11 @@ class FloatImm : public ExprNode { double value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("value", &value); } - TVM_DLL static Expr make(DataType t, double value); + TVM_DLL static Expr make(Type t, double value); static constexpr const char* _type_key = "FloatImm"; TVM_DECLARE_NODE_TYPE_INFO(FloatImm, ExprNode); @@ -80,7 +80,7 @@ class StringImm : public ExprNode { std::string value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); 
v->Visit("value", &value); } @@ -100,11 +100,11 @@ class Cast : public ExprNode { Expr value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("value", &value); } - TVM_DLL static Expr make(DataType t, Expr v); + TVM_DLL static Expr make(Type t, Expr v); static constexpr const char* _type_key = "Cast"; TVM_DECLARE_NODE_TYPE_INFO(Cast, ExprNode); @@ -123,7 +123,7 @@ class BinaryOpNode : public ExprNode { Expr b; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &(this->dtype)); + v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); } @@ -131,9 +131,9 @@ class BinaryOpNode : public ExprNode { static Expr make(Expr a, Expr b) { CHECK(a.defined()) << "ValueError: a is undefined\n"; CHECK(b.defined()) << "ValueError: b is undefined\n"; - CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; + CHECK(a.type() == b.type()) << "TypeError: mismatched types\n"; NodePtr node = make_node(); - node->dtype = a.dtype(); + node->type = a.type(); node->a = std::move(a); node->b = std::move(b); return Expr(node); @@ -215,7 +215,7 @@ class CmpOpNode : public ExprNode { Expr b; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &(this->dtype)); + v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); } @@ -223,9 +223,9 @@ class CmpOpNode : public ExprNode { static Expr make(Expr a, Expr b) { CHECK(a.defined()) << "ValueError: a is undefined\n"; CHECK(b.defined()) << "ValueError: b is undefined\n"; - CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types\n"; + CHECK(a.type() == b.type()) << "TypeError: mismatched types\n"; NodePtr node = make_node(); - node->dtype = DataType::Bool(a.dtype().lanes()); + node->type = Bool(a.type().lanes()); node->a = std::move(a); node->b = std::move(b); return Expr(node); @@ -279,7 +279,7 @@ class And : public ExprNode { Expr b; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &(this->dtype)); + v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); } @@ -299,7 +299,7 @@ class Or : public ExprNode { Expr b; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("a", &a); v->Visit("b", &b); } @@ -317,7 +317,7 @@ class Not : public ExprNode { Expr a; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("a", &a); } @@ -344,7 +344,7 @@ class Select : public ExprNode { Expr false_value; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("condition", &condition); v->Visit("true_value", &true_value); v->Visit("false_value", &false_value); @@ -381,13 +381,13 @@ class Load : public ExprNode { Expr predicate; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("buffer_var", &buffer_var); v->Visit("index", &index); v->Visit("predicate", &predicate); } - TVM_DLL static Expr make(DataType dtype, Var buffer_var, Expr index, Expr predicate); + TVM_DLL static Expr make(Type type, Var buffer_var, Expr index, Expr predicate); static constexpr const char* _type_key = "Load"; TVM_DECLARE_NODE_TYPE_INFO(Load, ExprNode); @@ -412,7 +412,7 @@ class Ramp : public ExprNode { int lanes; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("base", &base); v->Visit("stride", &stride); v->Visit("lanes", &lanes); @@ -433,7 +433,7 @@ class Broadcast : public ExprNode { int lanes; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + 
v->Visit("dtype", &type); v->Visit("value", &value); v->Visit("lanes", &lanes); } @@ -457,7 +457,7 @@ class Let : public ExprNode { Expr body; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("var", &var); v->Visit("value", &value); v->Visit("body", &body); @@ -523,7 +523,7 @@ class Call : public ExprNode { int value_index{0}; void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("name", &name); v->Visit("args", &args); v->Visit("call_type", &call_type); @@ -531,7 +531,7 @@ class Call : public ExprNode { v->Visit("value_index", &value_index); } - TVM_DLL static Expr make(DataType dtype, + TVM_DLL static Expr make(Type type, std::string name, Array args, CallType call_type, @@ -695,7 +695,7 @@ class Reduce : public ExprNode { int value_index); void VisitAttrs(AttrVisitor* v) { - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("combiner", &combiner); v->Visit("source", &source); v->Visit("axis", &axis); @@ -713,7 +713,7 @@ class Any : public ExprNode { void VisitAttrs(AttrVisitor* v) {} /*! \brief Convert to var. */ Var ToVar() const { - return Variable::make(DataType::Int(32), "any_dim"); + return Variable::make(Int(32), "any_dim"); } TVM_DLL static Expr make(); @@ -917,7 +917,7 @@ class Allocate : public StmtNode { /*! \brief The buffer variable. */ Var buffer_var; /*! \brief The type of the buffer. */ - DataType dtype; + DataType type; /*! \brief The extents of the buffer. */ Array extents; /*! \brief Only allocate buffer when condition is satisfied. */ @@ -931,14 +931,14 @@ class Allocate : public StmtNode { void VisitAttrs(AttrVisitor* v) { v->Visit("buffer_var", &buffer_var); - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("extents", &extents); v->Visit("condition", &condition); v->Visit("body", &body); } TVM_DLL static Stmt make(Var buffer_var, - DataType dtype, + DataType type, Array extents, Expr condition, Stmt body, @@ -993,7 +993,7 @@ class Realize : public StmtNode { /*! \brief The output value index if func's value is a tuple. */ int value_index; /*! \brief The data type of the array. */ - DataType dtype; + DataType type; /*! \brief Bounds to be realized. */ Region bounds; /*! \brief Only realize if condition holds. */ @@ -1004,7 +1004,7 @@ class Realize : public StmtNode { void VisitAttrs(AttrVisitor* v) { v->Visit("func", &func); v->Visit("value_index", &value_index); - v->Visit("dtype", &dtype); + v->Visit("dtype", &type); v->Visit("bounds", &bounds); v->Visit("condition", &condition); v->Visit("body", &body); @@ -1012,7 +1012,7 @@ class Realize : public StmtNode { TVM_DLL static Stmt make(FunctionRef func, int value_index, - DataType dtype, + DataType type, Region bounds, Expr condition, Stmt body); @@ -1165,20 +1165,20 @@ class Prefetch : public StmtNode { /*! \brief The output value index if func's value is a tuple. */ int value_index; /*! \brief The data type of the array. */ - DataType dtype; + DataType type; /*! \brief Bounds to be prefetched. 
*/
  Region bounds;

  void VisitAttrs(AttrVisitor* v) {
    v->Visit("func", &func);
    v->Visit("value_index", &value_index);
-    v->Visit("dtype", &dtype);
+    v->Visit("type", &type);
    v->Visit("bounds", &bounds);
  }

  TVM_DLL static Stmt make(FunctionRef func,
                           int value_index,
-                           DataType dtype,
+                           DataType type,
                           Region bounds);

  static constexpr const char* _type_key = "Prefetch";
@@ -1620,7 +1620,7 @@ constexpr const char* tvm_store_matrix_sync = "tvm_store_matrix_sync";
 * \param dtype The data type
 * \return Expr a expression with dtype.
 */
-inline Expr TypeAnnotation(DataType dtype) {
+inline Expr TypeAnnotation(Type dtype) {
  return ir::Call::make(dtype,
                        "type_annotation", {},
                        ir::Call::PureIntrinsic);
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index b0b13df729cc..5c5c4bb2f452 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -236,6 +236,21 @@ bool VerifyCompactBuffer(Stmt stmt);
 */
Stmt RemoveNoOp(Stmt stmt);

+/*!
+ * \brief Split statement into pipeline stages.
+ * \param stmt The stmt to be split
+ * \param split_load Whether to split load into its own stage.
+ * \return Transformed stmt.
+ */
+Stmt SplitPipeline(Stmt stmt, bool split_load);
+
+/*!
+ * \brief Narrow channel access to smaller range.
+ * \param stmt The stmt to do access rewriting.
+ * \return Transformed stmt.
+ */
+Stmt NarrowChannelAccess(Stmt stmt);
+
/*!
 * \brief unroll the constant loop marked by unroll.
 This pass also automatically attach pragma unroll tag to loops which meets the standard.
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 1a276ae695fc..41b47d3a679e 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -23,14 +23,14 @@
 #ifndef TVM_NODE_CONTAINER_H_
 #define TVM_NODE_CONTAINER_H_

-#include
-
 #include
 #include
 #include
 #include
 #include
 #include
+#include "node.h"
+#include "memory.h"

 namespace tvm {
diff --git a/include/tvm/node/reflection.h b/include/tvm/node/reflection.h
index daffeb859668..35a8e1d4a657 100644
--- a/include/tvm/node/reflection.h
+++ b/include/tvm/node/reflection.h
@@ -28,7 +28,6 @@
 #include
 #include
 #include
-#include
 #include
 #include

@@ -36,6 +35,8 @@
 namespace tvm {
 // forward declaration

+class DataType;
+
 using runtime::Object;
 using runtime::ObjectPtr;
 using runtime::ObjectRef;
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index 34f584b63261..f53c1ce56a93 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -75,7 +75,7 @@ class OperationNode : public ir::FunctionBaseNode {
   * \param i The output index.
   * \return type of i-th output.
   */
-  virtual DataType output_dtype(size_t i) const = 0;
+  virtual Type output_dtype(size_t i) const = 0;
  /*!
   * \brief Get shape of i-th output tensor.
   * \param i The output index.
@@ -160,11 +160,11 @@ class PlaceholderOpNode : public OperationNode {
  /*! \brief The shape of the input */
  Array shape;
  /*! \brief The data type of the input. */
-  DataType dtype;
+  Type dtype;
  // override behavior. 
int num_outputs() const final; Array root_iter_vars() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( @@ -197,7 +197,7 @@ class PlaceholderOpNode : public OperationNode { } static Operation make(std::string name, Array shape, - DataType dtype); + Type dtype); static constexpr const char* _type_key = "PlaceholderOp"; TVM_DECLARE_NODE_TYPE_INFO(PlaceholderOpNode, OperationNode); @@ -243,7 +243,7 @@ class TVM_DLL ComputeOpNode : public BaseComputeOpNode { ComputeOpNode() {} // override functions int num_outputs() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -296,7 +296,7 @@ class TensorComputeOpNode : public BaseComputeOpNode { TensorComputeOpNode() {} // override functions int num_outputs() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -370,7 +370,7 @@ class ScanOpNode : public OperationNode { // override behavior. int num_outputs() const final; Array root_iter_vars() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( @@ -437,7 +437,7 @@ class ExternOpNode : public OperationNode { // override functions int num_outputs() const final; Array root_iter_vars() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( @@ -505,7 +505,7 @@ class HybridOpNode : public OperationNode { // override functions int num_outputs() const final; Array root_iter_vars() const final; - DataType output_dtype(size_t i) const final; + Type output_dtype(size_t i) const final; Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( @@ -562,7 +562,7 @@ using FBatchCompute = std::function (const Array& i)>; * \param name The name of the Tensor. */ TVM_DLL Tensor placeholder(Array shape, - DataType dtype = DataType::Float(32), + Type dtype = Float(32), std::string name = "placeholder"); /*! diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h index c9f7a580621f..71f8f55b2655 100644 --- a/include/tvm/packed_func_ext.h +++ b/include/tvm/packed_func_ext.h @@ -25,6 +25,7 @@ #ifndef TVM_PACKED_FUNC_EXT_H_ #define TVM_PACKED_FUNC_EXT_H_ +#include #include #include #include @@ -42,7 +43,22 @@ using runtime::TVMRetValue; using runtime::PackedFunc; namespace runtime { - +/*! + * \brief Runtime type checker for node type. + * \tparam T the type to be checked. 
+ */ +template +struct ObjectTypeChecker { + static bool Check(const Object* ptr) { + using ContainerType = typename T::ContainerType; + if (ptr == nullptr) return true; + return ptr->IsInstance(); + } + static void PrintName(std::ostream& os) { // NOLINT(*) + using ContainerType = typename T::ContainerType; + os << ContainerType::_type_key; + } +}; template struct ObjectTypeChecker > { @@ -57,8 +73,10 @@ struct ObjectTypeChecker > { } return true; } - static std::string TypeName() { - return "List[" + ObjectTypeChecker::TypeName() + "]"; + static void PrintName(std::ostream& os) { // NOLINT(*) + os << "List["; + ObjectTypeChecker::PrintName(os); + os << "]"; } }; @@ -73,9 +91,11 @@ struct ObjectTypeChecker > { } return true; } - static std::string TypeName() { - return "Map[str, " + - ObjectTypeChecker::TypeName()+ ']'; + static void PrintName(std::ostream& os) { // NOLINT(*) + os << "Map[str"; + os << ','; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; @@ -91,16 +111,39 @@ struct ObjectTypeChecker > { } return true; } - static std::string TypeName() { - return "Map[" + - ObjectTypeChecker::TypeName() + - ", " + - ObjectTypeChecker::TypeName()+ ']'; + static void PrintName(std::ostream& os) { // NOLINT(*) + os << "Map["; + ObjectTypeChecker::PrintName(os); + os << ','; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; +template +inline std::string ObjectTypeName() { + std::ostringstream os; + ObjectTypeChecker::PrintName(os); + return os.str(); +} + // extensions for tvm arg value -inline TVMPODValue_::operator tvm::Expr() const { + +template +inline TObjectRef TVMArgValue::AsObjectRef() const { + static_assert( + std::is_base_of::value, + "Conversion only works for ObjectRef"); + if (type_code_ == kNull) return TObjectRef(NodePtr(nullptr)); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TObjectRef(ObjectPtr(ptr)); +} + +inline TVMArgValue::operator tvm::Expr() const { if (type_code_ == kNull) return Expr(); if (type_code_ == kDLInt) { CHECK_LE(value_.v_int64, std::numeric_limits::max()); @@ -121,12 +164,12 @@ inline TVMPODValue_::operator tvm::Expr() const { return Tensor(ObjectPtr(ptr))(); } CHECK(ObjectTypeChecker::Check(ptr)) - << "Expect type " << ObjectTypeChecker::TypeName() + << "Expected type " << ObjectTypeName() << " but get " << ptr->GetTypeKey(); return Expr(ObjectPtr(ptr)); } -inline TVMPODValue_::operator tvm::Integer() const { +inline TVMArgValue::operator tvm::Integer() const { if (type_code_ == kNull) return Integer(); if (type_code_ == kDLInt) { CHECK_LE(value_.v_int64, std::numeric_limits::max()); @@ -136,10 +179,52 @@ inline TVMPODValue_::operator tvm::Integer() const { TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); Object* ptr = static_cast(value_.v_handle); CHECK(ObjectTypeChecker::Check(ptr)) - << "Expect type " << ObjectTypeChecker::TypeName() + << "Expected type " << ObjectTypeName() << " but get " << ptr->GetTypeKey(); return Integer(ObjectPtr(ptr)); } + +template +inline bool TVMPODValue_::IsObjectRef() const { + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + return ObjectTypeChecker::Check(ptr); +} + +// extensions for TVMRetValue +template +inline TObjectRef TVMRetValue::AsObjectRef() const { + static_assert( + std::is_base_of::value, + "Conversion only works for ObjectRef"); + if (type_code_ == kNull) return
TObjectRef(); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + + Object* ptr = static_cast(value_.v_handle); + + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TObjectRef(ObjectPtr(ptr)); +} + +// type related stuffs +inline TVMRetValue& TVMRetValue::operator=(const DataType& t) { + return this->operator=(t.operator DLDataType()); +} + +inline TVMRetValue::operator tvm::DataType() const { + return DataType(operator DLDataType()); +} + +inline TVMArgValue::operator tvm::DataType() const { + return DataType(operator DLDataType()); +} + +inline void TVMArgsSetter::operator()( + size_t i, const DataType& t) const { + this->operator()(i, t.operator DLDataType()); +} } // namespace runtime } // namespace tvm #endif // TVM_PACKED_FUNC_EXT_H_ diff --git a/include/tvm/relay/attrs/memory.h b/include/tvm/relay/attrs/memory.h index c74b6487de54..2e279a56bbde 100644 --- a/include/tvm/relay/attrs/memory.h +++ b/include/tvm/relay/attrs/memory.h @@ -43,7 +43,7 @@ struct AllocTensorAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(dtype) .describe( "The dtype of the tensor to allocate.") - .set_default(DataType::Float(32, 1)); + .set_default(Float(32, 1)); TVM_ATTR_FIELD(const_shape) .describe( "The shape of constant used to aid in type inference."); diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index d724f8173832..4422fce250c2 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -315,64 +315,6 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes used in 1D transposed convolution operator */ -struct Conv1DTransposeAttrs : public tvm::AttrsNode { - IndexExpr channels; - Array kernel_size; - Array strides; - Array padding; - Array output_padding; - Array dilation; - int groups; - std::string data_layout; - std::string kernel_layout; - std::string out_layout; - DataType out_dtype; - - TVM_DECLARE_ATTRS(Conv1DTransposeAttrs, "relay.attrs.Conv1DTransposeAttrs") { - TVM_ATTR_FIELD(channels) - .set_default(NullValue()) - .describe("The dimensionality of the output space" - "i.e. the number of output channels in the convolution."); - TVM_ATTR_FIELD(kernel_size) - .describe("The dimensions of the convolution window.") - .set_default(NullValue >()); - TVM_ATTR_FIELD(strides).set_default(Array({1})) - .describe("The strides of the convolution."); - TVM_ATTR_FIELD(output_padding).set_default(Array({0})) - .describe("Zero-padding added to one side of the output."); - TVM_ATTR_FIELD(padding).set_default(Array({0})) - .describe("Symmetric or asymmetric padding." - "Single value: the input is implicitly zero-padded on both sides." - "Two values: padding[0] is used for left input padding, " - "padding[1] is used for right input padding,"); - TVM_ATTR_FIELD(dilation).set_default(Array({1})) - .describe("Specifies the dilation rate to use for dilated convolution."); - TVM_ATTR_FIELD(groups).set_default(1) - .describe("Controls the connections between inputs and outputs." - "At groups=1, all inputs are convolved to all outputs." - "At groups=2, the operation becomes equivalent to having two convolution" - "layers side by side, each seeing half the input channels, and producing" - "half the output channels, and both subsequently concatenated."); - TVM_ATTR_FIELD(data_layout).set_default("NCW") - .describe("Dimension ordering of data. Can be 'NCW', 'NWC', etc." - "'N', 'C', 'W' stands for batch, channel, and width" - "dimensions respectively. 
Convolution is applied on the" - "'W' dimension."); - TVM_ATTR_FIELD(kernel_layout).set_default("OIW") - .describe("Dimension ordering of data and weight. Can be 'OIW', 'OIW16o16i', etc." - "'O', 'I', 'W' stands for num_filter, input_channel, and width" - "dimensions respectively."); - TVM_ATTR_FIELD(out_layout).set_default("") - .describe("Dimension ordering of output. Can be 'NCW', 'NWC', etc." - "'N', 'C', 'W' stands for batch, channel, and width" - "dimensions respectively. Default to be same as input layout."); - TVM_ATTR_FIELD(out_dtype) - .set_default(NullValue()) - .describe("Output data type, set to explicit type under mixed precision setting"); - } -}; - /*! \brief Attributes for max pool operator */ struct MaxPool2DAttrs : public tvm::AttrsNode { Array pool_size; @@ -589,39 +531,6 @@ struct UpSamplingAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for upsampling3d operator */ -struct UpSampling3DAttrs : public tvm::AttrsNode { - double scale_d; - double scale_h; - double scale_w; - std::string layout; - std::string method; - std::string coordinate_transformation_mode; - - TVM_DECLARE_ATTRS(UpSampling3DAttrs, "relay.attrs.UpSampling3DAttrs") { - TVM_ATTR_FIELD(scale_d) - .describe("The upsampling factor for depth"); - TVM_ATTR_FIELD(scale_h) - .describe("The upsampling factor for height"); - TVM_ATTR_FIELD(scale_w) - .describe("The upsampling factor for width"); - TVM_ATTR_FIELD(layout).set_default("NCDHW") - .describe("Dimension ordering of input data. Can be 'NCDHW', 'NDHWC', etc." - "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" - "dimensions respectively. Upsampling is applied on the 'D', 'H' and" - "'W' dimensions."); - TVM_ATTR_FIELD(method).set_default("nearest_neighbor") - .describe("Specify the mode to use for scaling." - "nearest_neighbor - Nearest Neighbor" - "trilinear - Trilinear Interpolation"); - TVM_ATTR_FIELD(coordinate_transformation_mode).set_default("half_pixel") - .describe("Describes how to transform the coordinate in the resized tensor" - "to the coordinate in the original tensor." - "Refer to the ONNX Resize operator specification for details" - "Available options are half_pixel, align_corners and asymmetric"); - } -}; - /*! \brief Attributes used for the padding operator */ struct PadAttrs : public tvm::AttrsNode { double pad_value; @@ -854,26 +763,6 @@ struct DeformableConv2DAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes used in subpixel operators */ -struct SubPixelAttrs : public tvm::AttrsNode { - int block_size; - std::string layout; - std::string mode; - - TVM_DECLARE_ATTRS(SubPixelAttrs, "relay.attrs.SubPixelAttrs") { - TVM_ATTR_FIELD(block_size) - .describe("The size of subpixel blocks to compose or decompose.") - .set_default(1); - TVM_ATTR_FIELD(layout).set_default("NCHW").describe( - "Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc." - "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" - "dimensions respectively."); - TVM_ATTR_FIELD(mode).set_default("DCR").describe( - "Indicates order in which channels are accessed. Must be one of" - "DCR or CDR."); - } -}; // struct SubPixelAttrs - } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_NN_H_ diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index 32f9c32f468a..42a01f009b10 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -63,7 +63,7 @@ using NodeRef = tvm::NodeRef; /*! * \brief Content data type. 
*/ -using DataType = ::tvm::DataType; +using DataType = ::tvm::Type; /*! * \brief Symbolic expression for tensor shape. diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index 01a73d5396cc..2aa88099a69c 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -268,15 +268,6 @@ class FunctionNode : public ExprNode { */ bool IsPrimitive() const; - /*! - * \brief Check whether the function should use the TVM default compiler to build, or - * use other compilers. - * - * \return Whether the function will be compiled using the default compiler - * (e.g. those are used in the TVM stack). - */ - bool UseDefaultCompiler() const; - TVM_DLL static Function make(tvm::Array params, Expr body, Type ret_type, @@ -597,25 +588,6 @@ std::string AsText(const NodeRef& node, bool show_meta_data = true, runtime::TypedPackedFunc annotate = nullptr); -/*! \brief namespace of the attributes that are attached to a function. */ -namespace attr { -/*! \brief Mark the function as a primitive function. */ -constexpr const char* kPrimitive = "Primitive"; -/*! - * \brief Indicate the compiler that should be used for builing this function. - * When this is unset or set to "default", the default compilation pipeline will be used. - */ -constexpr const char* kCompiler = "Compiler"; -/*! \brief Indicate if the function is a closure. */ -constexpr const char* kClosure = "Closure"; -/*! \brief Store a Var to parameter/Constant mapping on a Function. */ -constexpr const char* kParams = "__params__"; -/*! \brief Store the unique external symbol for external compilers. */ -constexpr const char* kExternalSymbol = "ExternalSymbol"; -/*! \brief Mark if the function should be avoided being optimized. */ -constexpr const char* kSkipOptimization = "SkipOptimization"; -} // namespace attr - } // namespace relay } // namespace tvm #endif // TVM_RELAY_EXPR_H_ diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h index 90f2937c929b..7f1ef456b59b 100644 --- a/include/tvm/relay/op.h +++ b/include/tvm/relay/op.h @@ -594,11 +594,12 @@ inline ValueType OpMap::get(const Expr& expr, return map_.get(expr, def_value); } + /*! - * \brief Check that an expression is a "primitive operator". + * \brief Check that an expression is a "primitive operator". * * Will return true if the expression is an operator which - * matches the form of primitive operators registered directly + * matches the form of primitive operators registered directly * by the Relay codebase. * * That is the arguments are all type variables, and there is a single diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 54ea707905e5..741e8b478828 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -29,7 +29,6 @@ #include #include #include -#include namespace tvm { namespace relay { @@ -133,22 +132,6 @@ using FTVMAlterOpLayout = runtime::TypedPackedFunc< const Array& args, const Array& tinfos)>; -/*! - * \brief Convert the layout of operators or replace the - * operator with other expressions. This function will be invoked - * in ConvertLayout pass. - * \param attrs The attribute of the original node. - * \param inputs The input symbols of the original node. - * \param tinfos An array of placeholders, use for getting the inferred shape - * and dtype of the inputs. - * \param desired_layout The desired layout. - * \return new_expr The modified expression.
- */ -using FTVMConvertOpLayout = runtime::TypedPackedFunc< - Expr(const Attrs& attrs, - const Array& args, - const Array& tinfos, - const std::string& desired_layout)>; /*! * \brief Legalizes an expression with another expression. This function will be * invoked in Legalize pass. It is a target-dependent pass. diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 52be6a0f3781..ddadbe4fc31d 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -532,26 +532,6 @@ TVM_DLL Pass CanonicalizeOps(); */ TVM_DLL Pass AlterOpLayout(); -/*! - * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data - * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one - * at the start and one at the end. - * - * This pass is not a part of relay.build and is expected to be called between framework-relay - * parser and relay.build call. This is very helpful for hardware backends that support/prefer only - * type of data layout. - * - * RFC - https://discuss.tvm.ai/t/layout-conversion-pass/4009 - * - * This pass uses most of the AlterOpLayout and InferCorrectLayout infrastructure. We can define new - * layouts for conv2d ops for now. Most of the other operators try to adapt to their input layout - * using the InferCorrectLayout infrastructure. - * - * \param desired_layout The desired layout. - * \return The pass. - */ -TVM_DLL Pass ConvertLayout(const std::string& desired_layout); - /*! * \brief Legalizes an expr with another expression. * \param legalize_map_attr_name The Op's attr name which corresponds to the legalize rule function. diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 8cb86bf725bc..5053326058bc 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -136,7 +136,7 @@ typedef DLDataType TVMType; typedef DLContext TVMContext; /*! - * \brief The tensor array structure to TVM API. + * \brief The tensor array structure to TVM API. */ typedef DLTensor TVMArray; @@ -234,6 +234,14 @@ TVM_DLL int TVMModGetFunction(TVMModuleHandle mod, int query_imports, TVMFunctionHandle *out); +/*! + * \brief Free front-end extension type resource. + * \param handle The extension handle. + * \param type_code The type code of the extension type. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMExtTypeFree(void* handle, int type_code); + /*! * \brief Free the Module * \param mod The module to be freed. diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h index 4dc07f4a3a04..dbe827812fc3 100644 --- a/include/tvm/runtime/container.h +++ b/include/tvm/runtime/container.h @@ -23,7 +23,6 @@ */ #ifndef TVM_RUNTIME_CONTAINER_H_ #define TVM_RUNTIME_CONTAINER_H_ - #include #include #include
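The runtime/ndarray.h hunks that follow replace the ObjectRef-backed NDArray with a standalone handle that owns a raw Container pointer, and both of its assignment operators are written with the copy-and-swap idiom. As a standalone illustration of that idiom (a hypothetical Handle class, not TVM code):

```cpp
#include <algorithm>
#include <cstdio>

// Hypothetical Handle class (not TVM code): the copy-and-swap idiom used by
// the restored NDArray assignment operators below.
class Handle {
 public:
  Handle() = default;
  explicit Handle(int id) : id_(new int(id)) {}
  Handle(const Handle& other)
      : id_(other.id_ ? new int(*other.id_) : nullptr) {}
  Handle(Handle&& other) noexcept : id_(other.id_) { other.id_ = nullptr; }
  ~Handle() { delete id_; }
  void swap(Handle& other) noexcept { std::swap(id_, other.id_); }
  // Taking the argument by value covers both copy- and move-assignment:
  // the temporary owns the new state, swap() transfers it into *this, and
  // the temporary's destructor releases whatever *this held before.
  Handle& operator=(Handle other) noexcept {
    swap(other);
    return *this;
  }

 private:
  int* id_{nullptr};
};

int main() {
  Handle a(1), b(2);
  a = b;          // copy-assignment path
  a = Handle(3);  // move-assignment path
  std::printf("copy-and-swap done\n");
  return 0;
}
```

The diff below spells the two operators out separately, each constructing a temporary and swapping with *this; the effect is the same, and either way the old state is released exactly once.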
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 090cacff5c3a..993295179842 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -24,13 +24,11 @@ #ifndef TVM_RUNTIME_NDARRAY_H_ #define TVM_RUNTIME_NDARRAY_H_ -#include -#include -#include - #include #include #include +#include "c_runtime_api.h" +#include "serializer.h" namespace tvm { namespace runtime { @@ -39,23 +37,72 @@ namespace runtime { * \brief Managed NDArray. * The array is backed by reference counted blocks. */ -class NDArray : public ObjectRef { +class NDArray { public: - /*! \brief ContainerBase used to back the TVMArrayHandle */ - class ContainerBase; - /*! \brief NDArray internal container type */ + // internal container type class Container; - /*! \brief Container type for Object system. */ - using ContainerType = Container; /*! \brief default constructor */ NDArray() {} /*! - * \brief constructor. - * \param data ObjectPtr to the data container. + * \brief construct an NDArray that refers to data + * \param data The data this NDArray refers to */ - explicit NDArray(ObjectPtr data) - : ObjectRef(data) {} - + explicit inline NDArray(Container* data); + /*! + * \brief copy constructor. + * + * It does not make a copy, but the reference count of the input NDArray is incremented + * + * \param other NDArray that shares internal data with the input NDArray. + */ + inline NDArray(const NDArray& other); // NOLINT(*) + /*! + * \brief move constructor + * \param other The value to be moved + */ + NDArray(NDArray&& other) // NOLINT(*) + : data_(other.data_) { + other.data_ = nullptr; + } + /*! \brief destructor */ + ~NDArray() { + this->reset(); + } + /*! + * \brief Swap this array with another NDArray + * \param other The other NDArray + */ + void swap(NDArray& other) { // NOLINT(*) + std::swap(data_, other.data_); + } + /*! + * \brief copy assignment + * \param other The value to be assigned. + * \return reference to self. + */ + NDArray& operator=(const NDArray& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(other).swap(*this); // NOLINT(*) + return *this; + } + /*! + * \brief move assignment + * \param other The value to be assigned. + * \return reference to self. + */ + NDArray& operator=(NDArray&& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(std::move(other)).swap(*this); // NOLINT(*) + return *this; + } + /*! \return If NDArray is defined */ + bool defined() const { + return data_ != nullptr; + } + /*! \return If both NDArray reference the same container */ + bool same_as(const NDArray& other) const { + return data_ == other.data_; + } /*! \brief reset the content of NDArray to be nullptr */ inline void reset(); /*! @@ -71,7 +118,7 @@ class NDArray : public ObjectRef { * \note The copy may happen asynchronously if it involves a GPU context. * TVMSynchronize is necessary. */ - inline void CopyFrom(const DLTensor* other); + inline void CopyFrom(DLTensor* other); inline void CopyFrom(const NDArray& other); /*! * \brief Copy data content into another array. @@ -141,43 +188,39 @@ class NDArray : public ObjectRef { * \param stream The stream used in copy. */ TVM_DLL static void CopyFromTo( - const DLTensor* from, DLTensor* to, TVMStreamHandle stream = nullptr); + DLTensor* from, DLTensor* to, TVMStreamHandle stream = nullptr); TVM_DLL std::vector Shape() const; + // internal namespace struct Internal; - protected: + /*! \brief Internal Data content */ + Container* data_{nullptr}; + // enable internal functions + friend struct Internal; friend class TVMPODValue_; + friend class TVMArgValue; friend class TVMRetValue; friend class TVMArgsSetter; - /*! - * \brief Get mutable internal container pointer. - * \return a mutable container pointer. - */ - inline Container* get_mutable() const; - // Helper functions for FFI handling. - /*! - * \brief Construct NDArray's Data field from array handle in FFI. - * \param handle The array handle. - * \return The corresponding ObjectPtr to the constructed container object. - * - * \note We keep a special calling convention for NDArray by passing - * ContainerBase pointer in FFI. - * As a result, the argument is compatible to DLTensor*.
- */ - inline static ObjectPtr FFIDataFromHandle(TVMArrayHandle handle); - /*! - * \brief DecRef resource managed by an FFI array handle. - * \param handle The array handle. - */ - inline static void FFIDecRef(TVMArrayHandle handle); - /*! - * \brief Get FFI Array handle from ndarray. - * \param nd The object with ndarray type. - * \return The result array handle. - */ - inline static TVMArrayHandle FFIGetHandle(const ObjectRef& nd); +}; + +/*! + * \brief The type trait indicates subclass of TVM's NDArray. + * For irrelevant classes, code = -1. + * For TVM NDArray itself, code = 0. + * All subclasses of NDArray should override code > 0. + */ +template +struct array_type_info { + /*! \brief the value of the traits */ + static const int code = -1; +}; + +// Overrides the type trait for tvm's NDArray. +template<> +struct array_type_info { + static const int code = 0; }; /*! @@ -188,14 +231,19 @@ class NDArray : public ObjectRef { inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); /*! - * \brief The container base structure - * contains all the fields except for the Object header. + * \brief Reference counted Container object used to back NDArray. * - * \note We explicitly declare this structure in order to pass - * PackedFunc argument using ContainerBase*. + * This object is DLTensor compatible: + * the pointer to the NDArrayContainer can be directly + * interpreted as a DLTensor* + * + * \note do not use this function directly, use NDArray. */ -class NDArray::ContainerBase { +class NDArray::Container { public: + // NOTE: the first part of this structure is the same as + // DLManagedTensor, note that, however, the deleter + // is only called when the reference counter goes to 0 /*! * \brief The corresponding dl_tensor field. * \note it is important that the first field is DLTensor */ DLTensor dl_tensor; /*! * \brief additional context, reserved for recycling * \note We can attach additional content here * which the current container depends on * (e.g. reference to original memory when creating views). */ void* manager_ctx{nullptr}; + /*! + * \brief Customized deleter + * + * \note The customized deleter is helpful to enable + * different ways of memory allocator that are not + * currently defined by the system. + */ + void (*deleter)(Container* self) = nullptr; protected: + friend class NDArray; + friend class TVMPODValue_; + friend class TVMArgValue; + friend class TVMRetValue; + friend class RPCWrappedFunc; + /*! + * \brief Type flag used to indicate subclass. + * Default value 0 means normal NDArray::Container. + * + * We can extend a more specialized NDArray::Container + * and use the array_type_code_ to indicate + * the specific array subclass. + */ + int32_t array_type_code_{0}; + /*! \brief The internal reference counter */ + std::atomic ref_counter_{0}; + /*! * \brief The shape container, * can be used for shape data. */ std::vector shape_; -}; -/*! - * \brief Object container class that backs NDArray. - * \note do not use this function directly, use NDArray. - */ -class NDArray::Container : - public Object, - public NDArray::ContainerBase { public: /*! \brief default constructor */ Container() { - // Initialize the type index.
- type_index_ = Container::RuntimeTypeIndex(); dl_tensor.data = data; shape_ = std::move(shape); dl_tensor.ndim = static_cast(shape_.size()); dl_tensor.shape = dmlc::BeginPtr(shape_); dl_tensor.dtype = dtype; dl_tensor.strides = nullptr; dl_tensor.byte_offset = 0; dl_tensor.ctx = ctx; } - /*! - * \brief Set the deleter field. - * \param deleter The deleter. - */ - void SetDeleter(FDeleter deleter) { - deleter_ = deleter; + + /*! \brief developer function, increases reference counter */ + void IncRef() { + ref_counter_.fetch_add(1, std::memory_order_relaxed); + } + /*! \brief developer function, decreases reference counter */ + void DecRef() { + if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + if (this->deleter != nullptr) { + (*this->deleter)(this); + } + } } +}; - // Expose DecRef and IncRef as public function - // NOTE: they are only for developer purposes only. - using Object::DecRef; - using Object::IncRef; +// implementations of inline functions +// the usages of functions are documented in place. +inline NDArray::NDArray(Container* data) + : data_(data) { + if (data != nullptr) { + data_->IncRef(); + } +} - // Information for object protocol. - static constexpr const uint32_t _type_index = TypeIndex::kDynamic; - static constexpr const uint32_t _type_child_slots = 0; - static constexpr const uint32_t _type_child_slots_can_overflow = true; - static constexpr const char* _type_key = "NDArray"; - TVM_DECLARE_BASE_OBJECT_INFO(NDArray::Container, Object); +inline NDArray::NDArray(const NDArray& other) + : data_(other.data_) { + if (data_ != nullptr) { + data_->IncRef(); + } +} - protected: - friend class RPCWrappedFunc; - friend class NDArray; -}; +inline void NDArray::reset() { + if (data_ != nullptr) { + data_->DecRef(); + data_ = nullptr; + } +} -// implementations of inline functions -/*! - * \brief return the size of data the DLTensor hold, in term of number of bytes +/*! \brief return the size of data the DLTensor holds, in terms of number of bytes * * \param arr the input DLTensor + * * \return number of bytes of data in the DLTensor.
*/ inline size_t GetDataSize(const DLTensor& arr) { @@ -295,26 +369,26 @@ inline size_t GetDataSize(const DLTensor& arr) { return size; } -inline void NDArray::CopyFrom(const DLTensor* other) { +inline void NDArray::CopyFrom(DLTensor* other) { CHECK(data_ != nullptr); - CopyFromTo(other, &(get_mutable()->dl_tensor)); + CopyFromTo(other, &(data_->dl_tensor)); } inline void NDArray::CopyFrom(const NDArray& other) { CHECK(data_ != nullptr); CHECK(other.data_ != nullptr); - CopyFromTo(&(other.get_mutable()->dl_tensor), &(get_mutable()->dl_tensor)); + CopyFromTo(&(other.data_->dl_tensor), &(data_->dl_tensor)); } inline void NDArray::CopyTo(DLTensor* other) const { CHECK(data_ != nullptr); - CopyFromTo(&(get_mutable()->dl_tensor), other); + CopyFromTo(&(data_->dl_tensor), other); } inline void NDArray::CopyTo(const NDArray& other) const { CHECK(data_ != nullptr); CHECK(other.data_ != nullptr); - CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor)); + CopyFromTo(&(data_->dl_tensor), &(other.data_->dl_tensor)); } inline NDArray NDArray::CopyTo(const DLContext& ctx) const { @@ -327,46 +401,19 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const { } inline int NDArray::use_count() const { - return data_.use_count(); + if (data_ == nullptr) return 0; + return data_->ref_counter_.load(std::memory_order_relaxed); } inline const DLTensor* NDArray::operator->() const { - return &(get_mutable()->dl_tensor); -} - -inline NDArray::Container* NDArray::get_mutable() const { - return static_cast(data_.get()); -} - -inline ObjectPtr NDArray::FFIDataFromHandle(TVMArrayHandle handle) { - return GetObjectPtr(static_cast( - reinterpret_cast(handle))); -} - -inline TVMArrayHandle NDArray::FFIGetHandle(const ObjectRef& nd) { - // NOTE: it is necessary to cast to container then to base - // so that the FFI handle uses the ContainerBase address. - return reinterpret_cast( - static_cast( - static_cast( - const_cast(nd.get())))); -} - -inline void NDArray::FFIDecRef(TVMArrayHandle handle) { - static_cast( - reinterpret_cast(handle))->DecRef(); -} - -inline Object* TVMArrayHandleToObjectHandle(TVMArrayHandle handle) { - return static_cast( - reinterpret_cast(handle)); + return &(data_->dl_tensor); } /*! \brief Magic number for NDArray file */ constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F; inline bool SaveDLTensor(dmlc::Stream* strm, - const DLTensor* tensor) { + DLTensor* tensor) { uint64_t header = kTVMNDArrayMagic, reserved = 0; strm->Write(header); strm->Write(reserved); @@ -404,7 +451,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, } else { std::vector bytes(data_byte_size); CHECK_EQ(TVMArrayCopyToBytes( - const_cast(tensor), dmlc::BeginPtr(bytes), data_byte_size), 0) + tensor, dmlc::BeginPtr(bytes), data_byte_size), 0) << TVMGetLastError(); if (!DMLC_IO_NO_ENDIAN_SWAP) { dmlc::ByteSwap(dmlc::BeginPtr(bytes), type_bytes, num_elems); @@ -415,7 +462,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, } inline void NDArray::Save(dmlc::Stream* strm) const { - SaveDLTensor(strm, operator->()); + SaveDLTensor(strm, const_cast(operator->())); } inline bool NDArray::Load(dmlc::Stream* strm) { diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 96215daf4a7a..20e6b5a0fb63 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -24,11 +24,10 @@ #define TVM_RUNTIME_OBJECT_H_ #include -#include #include #include #include - +#include "c_runtime_api.h" /*! * \brief Whether or not use atomic reference counter. 
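The restored NDArray::Container above counts references by hand rather than through the Object protocol: IncRef is a relaxed fetch_add, while DecRef pairs a release fetch_sub with an acquire fence before invoking the deleter. A minimal standalone sketch of the same protocol (hypothetical RefCounted and Blob classes, with `delete this` standing in for the diff's customizable deleter callback):

```cpp
#include <atomic>
#include <cstdio>

// Hypothetical RefCounted base (not TVM code): relaxed increments, and a
// release decrement paired with an acquire fence before destruction.
class RefCounted {
 public:
  void IncRef() { ref_counter_.fetch_add(1, std::memory_order_relaxed); }
  void DecRef() {
    if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) {
      // The fence makes all writes from other releasing threads visible
      // before the destructor runs.
      std::atomic_thread_fence(std::memory_order_acquire);
      delete this;  // the diff calls a customizable deleter here instead
    }
  }

 protected:
  virtual ~RefCounted() = default;

 private:
  std::atomic<int> ref_counter_{0};
};

class Blob : public RefCounted {
 public:
  ~Blob() override { std::printf("Blob destroyed\n"); }
};

int main() {
  Blob* blob = new Blob();
  blob->IncRef();  // first handle
  blob->IncRef();  // second handle
  blob->DecRef();  // first handle released
  blob->DecRef();  // last handle released: fence, then delete
  return 0;
}
```

The release/acquire pairing is the standard way to make the final decrementing thread observe every write the other owners made before they released their references.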
@@ -581,14 +580,6 @@ class ObjectRef { static T DowncastNoCheck(ObjectRef ref) { return T(std::move(ref.data_)); } - /*! - * \brief Clear the object ref data field without DecRef - * after we successfully moved the field. - * \param ref The reference data. - */ - static void FFIClearAfterMove(ObjectRef* ref) { - ref->data_.data_ = nullptr; - } /*! * \brief Internal helper function get data_ as ObjectPtr of ObjectType. * \note only used for internal dev purpose. @@ -657,7 +648,7 @@ struct ObjectEqual { return _GetOrAllocRuntimeTypeIndex(); \ } \ static const uint32_t _GetOrAllocRuntimeTypeIndex() { \ - static uint32_t tidx = Object::GetOrAllocRuntimeTypeIndex( \ + static uint32_t tidx = GetOrAllocRuntimeTypeIndex( \ TypeName::_type_key, \ TypeName::_type_index, \ ParentType::_GetOrAllocRuntimeTypeIndex(), \ @@ -677,19 +668,6 @@ struct ObjectEqual { TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ -/*! \brief helper macro to supress unused warning */ -#if defined(__GNUC__) -#define TVM_ATTRIBUTE_UNUSED __attribute__((unused)) -#else -#define TVM_ATTRIBUTE_UNUSED -#endif - -#define TVM_STR_CONCAT_(__x, __y) __x##__y -#define TVM_STR_CONCAT(__x, __y) TVM_STR_CONCAT_(__x, __y) - -#define TVM_OBJECT_REG_VAR_DEF \ - static TVM_ATTRIBUTE_UNUSED uint32_t __make_Object_tid - /*! * \brief Helper macro to register the object type to runtime. * Makes sure that the runtime type table is correctly populated. * Use this macro in the cc file for each terminal class. */ #define TVM_REGISTER_OBJECT_TYPE(TypeName) \ - TVM_STR_CONCAT(TVM_OBJECT_REG_VAR_DEF, __COUNTER__) = \ + static DMLC_ATTRIBUTE_UNUSED uint32_t __make_Object_tidx ## _ ## TypeName ## __ = \ TypeName::_GetOrAllocRuntimeTypeIndex() @@ -713,14 +691,14 @@ struct ObjectEqual { using ContainerType = ObjectName; #define TVM_DEFINE_OBJECT_REF_METHODS_MUT(TypeName, ParentType, ObjectName) \ - TypeName() {} \ - explicit TypeName( \ - ::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n) \ - : ParentType(n) {} \ - ObjectName* operator->() { \ - return static_cast(data_.get()); \ - } \ - operator bool() const { return data_ != nullptr; } \ + TypeName() {} \ + explicit TypeName( \ + ::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n) \ + : ParentType(n) {} \ + ObjectName* operator->() { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { return data_ != nullptr; } \ using ContainerType = ObjectName; // Implementations details below
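The TVM_REGISTER_OBJECT_TYPE variants above both rely on the same trick: the macro expands to an unused static variable whose initializer runs before main() and registers the type as a side effect, with token pasting (or __COUNTER__) keeping each variable name unique. A standalone sketch of the pattern (hypothetical DEMO_REGISTER_TYPE macro and registry, not TVM code):

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

// Hypothetical registry (not TVM code): a static variable whose initializer
// runs before main() performs the registration as a side effect, which is
// exactly what TVM_REGISTER_OBJECT_TYPE relies on.
static std::unordered_map<std::string, unsigned>& TypeTable() {
  static std::unordered_map<std::string, unsigned> table;  // init on first use
  return table;
}

static unsigned RegisterType(const char* key) {
  unsigned index = static_cast<unsigned>(TypeTable().size());
  TypeTable().emplace(key, index);
  return index;
}

// Token pasting produces a unique variable per type name, mirroring
// __make_Object_tidx ## _ ## TypeName ## __ in the hunk above;
// [[maybe_unused]] plays the role of DMLC_ATTRIBUTE_UNUSED.
#define DEMO_REGISTER_TYPE(TypeName) \
  [[maybe_unused]] static unsigned demo_tidx_##TypeName = \
      RegisterType(#TypeName)

DEMO_REGISTER_TYPE(FooNode);
DEMO_REGISTER_TYPE(BarNode);

int main() {
  for (const auto& entry : TypeTable()) {
    std::printf("%s -> %u\n", entry.first.c_str(), entry.second);
  }
  return 0;
}
```

Routing the table through a function-local static avoids the static initialization order problem, since registrations from any translation unit construct the table on first use.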
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 5650db6f909c..57c4291907c0 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -28,11 +28,6 @@ #include #endif #include -#include -#include -#include -#include -#include #include #include #include @@ -41,7 +36,10 @@ #include #include #include - +#include "c_runtime_api.h" +#include "module.h" +#include "ndarray.h" +#include "object.h" // Whether use TVM runtime in header only mode. #ifndef TVM_RUNTIME_HEADER_ONLY @@ -51,6 +49,7 @@ namespace tvm { // forward declarations class Integer; +class DataType; class Expr; namespace runtime { @@ -389,20 +388,47 @@ inline std::string TVMType2String(TVMType t); << TypeCode2Str(T) << " but get " << TypeCode2Str(CODE) \ /*! - * \brief Type traits for runtime type check during FFI conversion. - * \tparam T the type to be checked. + * \brief Type traits to mark if a class is tvm extension type. + * + * To enable an extension type in C++, it must be registered via the macro + * TVM_REGISTER_EXT_TYPE(TypeName) after defining this trait. + * + * Extension classes can be passed and returned via PackedFunc in all tvm runtimes. + * Internally the extension class is stored as T*. + * + * \tparam T the typename */ template -struct ObjectTypeChecker { - static bool Check(const Object* ptr) { - using ContainerType = typename T::ContainerType; - if (ptr == nullptr) return true; - return ptr->IsInstance(); - } - static std::string TypeName() { - using ContainerType = typename T::ContainerType; - return ContainerType::_type_key; - } +struct extension_type_info { + static const int code = 0; +}; + +/*! + * \brief Runtime function table about extension type. + */ +class ExtTypeVTable { + public: + /*! \brief function to be called to delete a handle */ + void (*destroy)(void* handle); + /*! \brief function to be called when cloning a handle */ + void* (*clone)(void* handle); + /*! + * \brief Register type + * \tparam T The type to be registered. + * \return The registered vtable. + */ + template + static inline ExtTypeVTable* Register_(); + /*! + * \brief Get a vtable based on type code. + * \param type_code The type code + * \return The registered vtable. + */ + TVM_DLL static ExtTypeVTable* Get(int type_code); + + private: + // Internal registration function. + TVM_DLL static ExtTypeVTable* RegisterInternal(int type_code, const ExtTypeVTable& vt); }; /*! @@ -451,17 +477,24 @@ class TVMPODValue_ { return static_cast(value_.v_handle); } else { if (type_code_ == kNull) return nullptr; - LOG(FATAL) << "Expect " + LOG(FATAL) << "Expected " << "DLTensor* or NDArray but get " << TypeCode2Str(type_code_); return nullptr; } } operator NDArray() const { - if (type_code_ == kNull) return NDArray(ObjectPtr(nullptr)); + if (type_code_ == kNull) return NDArray(); TVM_CHECK_TYPE_CODE(type_code_, kNDArrayContainer); - return NDArray(NDArray::FFIDataFromHandle( - static_cast(value_.v_handle))); + return NDArray(static_cast(value_.v_handle)); + } + operator ObjectRef() const { + if (type_code_ == kNull) { + return ObjectRef(ObjectPtr(nullptr)); + } + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + return ObjectRef( + ObjectPtr(static_cast(value_.v_handle))); } operator Module() const { if (type_code_ == kNull) { @@ -475,9 +508,28 @@ class TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); return value_.v_ctx; } + template::value>::type> + TNDArray AsNDArray() const { + if (type_code_ == kNull) return TNDArray(nullptr); + auto *container = static_cast(value_.v_handle); + CHECK_EQ(container->array_type_code_, array_type_info::code); + return TNDArray(container); + } + template + const TExtension& AsExtension() const { + CHECK_LT(type_code_, kExtEnd); + return static_cast(value_.v_handle)[0]; + } + template::value>::type> + inline bool IsObjectRef() const; int type_code() const { return type_code_; } + /*! * \brief return handle as specific pointer type. * \tparam T the data type.
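ExtTypeVTable above reduces an extension type to two function pointers, clone and destroy, looked up by type code whenever a value crosses the PackedFunc boundary. A standalone sketch of a two-entry vtable driving a type-erased box (hypothetical VTable and Box types, not TVM code):

```cpp
#include <cstdio>
#include <string>
#include <utility>

// Hypothetical type-erased box (not TVM code): the same two-entry vtable
// (clone + destroy) that ExtTypeVTable keeps per extension type code.
struct VTable {
  void* (*clone)(void* handle);
  void (*destroy)(void* handle);
};

template <typename T>
const VTable* VTableFor() {
  // Captureless lambdas convert to plain function pointers.
  static const VTable vt = {
      [](void* h) -> void* { return new T(*static_cast<T*>(h)); },
      [](void* h) { delete static_cast<T*>(h); },
  };
  return &vt;
}

class Box {
 public:
  template <typename T>
  explicit Box(T value)
      : handle_(new T(std::move(value))), vtable_(VTableFor<T>()) {}
  Box(const Box& other)
      : handle_(other.vtable_->clone(other.handle_)), vtable_(other.vtable_) {}
  ~Box() { vtable_->destroy(handle_); }
  Box& operator=(const Box&) = delete;  // kept minimal for the sketch

 private:
  void* handle_;
  const VTable* vtable_;
};

int main() {
  Box a(std::string("extension payload"));
  Box b = a;  // deep copy through the clone entry
  std::printf("copied through the vtable\n");
  return 0;
}
```

Storing the payload as a bare void* keeps the FFI value union small; the vtable carries just enough behavior to copy and free the value without knowing its static type.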
@@ -487,16 +539,6 @@ class TVMPODValue_ { T* ptr() const { return static_cast(value_.v_handle); } - // ObjectRef handling - template::value>::type> - inline bool IsObjectRef() const; - template - inline TObjectRef AsObjectRef() const; - // ObjectRef Specializations - inline operator tvm::Expr() const; - inline operator tvm::Integer() const; protected: friend class TVMArgsSetter; @@ -539,11 +581,9 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator TVMContext; + using TVMPODValue_::operator ObjectRef; using TVMPODValue_::operator Module; using TVMPODValue_::IsObjectRef; - using TVMPODValue_::AsObjectRef; - using TVMPODValue_::operator tvm::Expr; - using TVMPODValue_::operator tvm::Integer; // conversion operator. operator std::string() const { @@ -570,9 +610,6 @@ class TVMArgValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMType); return value_.v_type; } - operator DataType() const { - return DataType(operator DLDataType()); - } operator PackedFunc() const { if (type_code_ == kNull) return PackedFunc(); TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); @@ -585,10 +622,16 @@ class TVMArgValue : public TVMPODValue_ { const TVMValue& value() const { return value_; } + // Deferred extension handler. + template + inline TObjectRef AsObjectRef() const; template::value>::type> + std::is_class::value>::type> inline operator T() const; + inline operator tvm::DataType() const; + inline operator tvm::Expr() const; + inline operator tvm::Integer() const; }; /*! @@ -626,11 +669,9 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator DLTensor*; using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator NDArray; + using TVMPODValue_::operator ObjectRef; using TVMPODValue_::operator Module; using TVMPODValue_::IsObjectRef; - using TVMPODValue_::AsObjectRef; - using TVMPODValue_::operator tvm::Expr; - using TVMPODValue_::operator tvm::Integer; TVMRetValue(const TVMRetValue& other) : TVMPODValue_() { this->Assign(other); @@ -652,9 +693,6 @@ class TVMRetValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMType); return value_.v_type; } - operator DataType() const { - return DataType(operator DLDataType()); - } operator PackedFunc() const { if (type_code_ == kNull) return PackedFunc(); TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); @@ -707,9 +745,6 @@ class TVMRetValue : public TVMPODValue_ { value_.v_type = t; return *this; } - TVMRetValue& operator=(const DataType& other) { - return operator=(other.operator DLDataType()); - } TVMRetValue& operator=(bool value) { this->SwitchToPOD(kDLInt); value_.v_int64 = value; @@ -724,20 +759,24 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(NDArray other) { - if (other.data_ != nullptr) { - this->Clear(); - type_code_ = kNDArrayContainer; - value_.v_handle = NDArray::FFIGetHandle(other); - ObjectRef::FFIClearAfterMove(&other); - } else { - SwitchToPOD(kNull); - } + this->Clear(); + type_code_ = kNDArrayContainer; + value_.v_handle = other.data_; + other.data_ = nullptr; return *this; } + TVMRetValue& operator=(ObjectRef other) { + return operator=(std::move(other.data_)); + } TVMRetValue& operator=(Module m) { SwitchToObject(kModuleHandle, std::move(m.data_)); return *this; } + template + TVMRetValue& operator=(ObjectPtr other) { + SwitchToObject(kObjectHandle, std::move(other)); + return *this; + } TVMRetValue& operator=(PackedFunc f) { this->SwitchToClass(kFuncHandle, f); return 
*this; @@ -754,6 +793,14 @@ class TVMRetValue : public TVMPODValue_ { this->Assign(other); return *this; } + template::code != 0>::type> + TVMRetValue& operator=(const T& other) { + this->SwitchToClass( + extension_type_info::code, other); + return *this; + } /*! * \brief Move the value back to front-end via C API. * This marks the current container as null. @@ -779,15 +826,16 @@ class TVMRetValue : public TVMPODValue_ { type_code_ != kStr) << "TVMRetValue.value can only be used for POD data"; return value_; } - // ObjectRef handling - template::value>::type> - inline TVMRetValue& operator=(TObjectRef other); + // ObjectRef related extenstions: in tvm/packed_func_ext.h template::value>::type> inline operator T() const; + template + inline TObjectRef AsObjectRef() const; + // type related + inline operator tvm::DataType() const; + inline TVMRetValue& operator=(const tvm::DataType& other); private: template @@ -814,15 +862,24 @@ class TVMRetValue : public TVMPODValue_ { break; } case kObjectHandle: { - // Avoid operator ObjectRef as we already know it is not NDArray/Module - SwitchToObject( - kObjectHandle, GetObjectPtr( - static_cast(other.value_.v_handle))); + *this = other.operator ObjectRef(); break; } default: { - SwitchToPOD(other.type_code()); - value_ = other.value_; + if (other.type_code() < kExtBegin) { + SwitchToPOD(other.type_code()); + value_ = other.value_; + } else { +#if TVM_RUNTIME_HEADER_ONLY + LOG(FATAL) << "Header only mode do not support ext type"; +#else + this->Clear(); + type_code_ = other.type_code(); + value_.v_handle = + (*(ExtTypeVTable::Get(other.type_code())->clone))( + other.value().v_handle); +#endif + } break; } } @@ -861,7 +918,7 @@ class TVMRetValue : public TVMPODValue_ { case kStr: delete ptr(); break; case kFuncHandle: delete ptr(); break; case kNDArrayContainer: { - NDArray::FFIDecRef(static_cast(value_.v_handle)); + static_cast(value_.v_handle)->DecRef(); break; } case kModuleHandle: { @@ -873,6 +930,13 @@ class TVMRetValue : public TVMPODValue_ { break; } } + if (type_code_ > kExtBegin) { +#if TVM_RUNTIME_HEADER_ONLY + LOG(FATAL) << "Header only mode do not support ext type"; +#else + (*(ExtTypeVTable::Get(type_code_)->destroy))(value_.v_handle); +#endif + } type_code_ = kNull; } }; @@ -893,7 +957,7 @@ inline const char* TypeCode2Str(int type_code) { case kFuncHandle: return "FunctionHandle"; case kModuleHandle: return "ModuleHandle"; case kNDArrayContainer: return "NDArrayContainer"; - case kObjectHandle: return "Object"; + case kObjectHandle: return "ObjectCell"; default: LOG(FATAL) << "unknown type_code=" << static_cast(type_code); return ""; } @@ -917,10 +981,6 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) { // NOLINT(*) return os; } -inline std::ostream& operator<<(std::ostream& os, const DataType& dtype) { // NOLINT(*) - return os << dtype.operator DLDataType(); -} - #endif inline std::string TVMType2String(TVMType t) { @@ -1080,31 +1140,50 @@ class TVMArgsSetter { values_[i].v_type = value; type_codes_[i] = kTVMType; } - void operator()(size_t i, DataType dtype) const { - operator()(i, dtype.operator DLDataType()); - } void operator()(size_t i, const char* value) const { values_[i].v_str = value; type_codes_[i] = kStr; } - // setters for container types - void operator()(size_t i, const std::string& value) const { + // setters for container type + // They must be reference(instead of const ref) + // to make sure they are alive in the tuple(instead of getting converted) + void operator()(size_t i, const std::string& value) 
const { // NOLINT(*) values_[i].v_str = value.c_str(); type_codes_[i] = kStr; } - void operator()(size_t i, const TVMByteArray& value) const { + void operator()(size_t i, const TVMByteArray& value) const { // NOLINT(*) values_[i].v_handle = const_cast(&value); type_codes_[i] = kBytes; } - void operator()(size_t i, const PackedFunc& value) const { + void operator()(size_t i, const PackedFunc& value) const { // NOLINT(*) values_[i].v_handle = const_cast(&value); type_codes_[i] = kFuncHandle; } template - void operator()(size_t i, const TypedPackedFunc& value) const { + void operator()(size_t i, const TypedPackedFunc& value) const { // NOLINT(*) operator()(i, value.packed()); } - void operator()(size_t i, const TVMRetValue& value) const { + void operator()(size_t i, const Module& value) const { // NOLINT(*) + if (value.defined()) { + values_[i].v_handle = value.data_.data_; + type_codes_[i] = kModuleHandle; + } else { + type_codes_[i] = kNull; + } + } + void operator()(size_t i, const NDArray& value) const { // NOLINT(*) + values_[i].v_handle = value.data_; + type_codes_[i] = kNDArrayContainer; + } + void operator()(size_t i, const ObjectRef& value) const { // NOLINT(*) + if (value.defined()) { + values_[i].v_handle = value.data_.data_; + type_codes_[i] = kObjectHandle; + } else { + type_codes_[i] = kNull; + } + } + void operator()(size_t i, const TVMRetValue& value) const { // NOLINT(*) if (value.type_code() == kStr) { values_[i].v_str = value.ptr()->c_str(); type_codes_[i] = kStr; @@ -1114,11 +1193,12 @@ class TVMArgsSetter { type_codes_[i] = value.type_code(); } } - // ObjectRef handling - template::value>::type> - inline void operator()(size_t i, const TObjectRef& value) const; + extension_type_info::code != 0>::type> + inline void operator()(size_t i, const T& value) const; + inline void operator()(size_t i, const tvm::DataType& t) const; private: /*! \brief The values fields */ @@ -1230,131 +1310,78 @@ inline R TypedPackedFunc::operator()(Args... args) const { ::run(packed_, std::forward(args)...); } -// ObjectRef related conversion handling -// Object can have three possible type codes: -// kNDArrayContainer, kModuleHandle, kObjectHandle -// -// We use type traits to eliminate un-necessary checks. -template -inline void TVMArgsSetter::operator()(size_t i, const TObjectRef& value) const { - if (value.defined()) { - Object* ptr = value.data_.data_; - if (std::is_base_of::value || - (std::is_base_of::value && - ptr->IsInstance())) { - values_[i].v_handle = NDArray::FFIGetHandle(value); - type_codes_[i] = kNDArrayContainer; - } else if (std::is_base_of::value || - (std::is_base_of::value && - ptr->IsInstance())) { - values_[i].v_handle = ptr; - type_codes_[i] = kModuleHandle; - } else { - values_[i].v_handle = ptr; - type_codes_[i] = kObjectHandle; - } - } else { - type_codes_[i] = kNull; +// extension and node type handling +namespace detail { +template +struct TVMValueCast { + static T Apply(const TSrc* self) { + static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions"); + return self->template AsObjectRef(); } -} - -template -inline bool TVMPODValue_::IsObjectRef() const { - using ContainerType = typename TObjectRef::ContainerType; - // NOTE: the following code can be optimized by constant folding. 
- if (std::is_base_of::value) { - return type_code_ == kNDArrayContainer && - TVMArrayHandleToObjectHandle( - static_cast(value_.v_handle))->IsInstance(); - } - if (std::is_base_of::value) { - return type_code_ == kModuleHandle && - static_cast(value_.v_handle)->IsInstance(); - } - return - (std::is_base_of::value && type_code_ == kNDArrayContainer) || - (std::is_base_of::value && type_code_ == kModuleHandle) || - (type_code_ == kObjectHandle && - ObjectTypeChecker::Check(static_cast(value_.v_handle))); -} +}; -template -inline TObjectRef TVMPODValue_::AsObjectRef() const { - static_assert( - std::is_base_of::value, - "Conversion only works for ObjectRef"); - using ContainerType = typename TObjectRef::ContainerType; - if (type_code_ == kNull) return TObjectRef(ObjectPtr(nullptr)); - // NOTE: the following code can be optimized by constant folding. - if (std::is_base_of::value) { - // Casting to a sub-class of NDArray - TVM_CHECK_TYPE_CODE(type_code_, kNDArrayContainer); - ObjectPtr data = NDArray::FFIDataFromHandle( - static_cast(value_.v_handle)); - CHECK(data->IsInstance()) - << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); - return TObjectRef(data); - } - if (std::is_base_of::value) { - // Casting to a sub-class of Module - TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle); - ObjectPtr data = GetObjectPtr(static_cast(value_.v_handle)); - CHECK(data->IsInstance()) - << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); - return TObjectRef(data); - } - if (type_code_ == kObjectHandle) { - // normal object type check. - Object* ptr = static_cast(value_.v_handle); - CHECK(ObjectTypeChecker::Check(ptr)) - << "Expect " << ObjectTypeChecker::TypeName() - << " but get " << ptr->GetTypeKey(); - return TObjectRef(GetObjectPtr(ptr)); - } else if (std::is_base_of::value && - type_code_ == kNDArrayContainer) { - // Casting to a base class that NDArray can sub-class - ObjectPtr data = NDArray::FFIDataFromHandle( - static_cast(value_.v_handle)); - return TObjectRef(data); - } else if (std::is_base_of::value && - type_code_ == kModuleHandle) { - // Casting to a base class that Module can sub-class - return TObjectRef(GetObjectPtr(static_cast(value_.v_handle))); - } else { - TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); - return TObjectRef(ObjectPtr(nullptr)); +template +struct TVMValueCast { + static T Apply(const TSrc* self) { + return self->template AsExtension(); } -} +}; -template -inline TVMRetValue& TVMRetValue::operator=(TObjectRef other) { - const Object* ptr = other.get(); - if (ptr != nullptr) { - if (std::is_base_of::value || - (std::is_base_of::value && - ptr->IsInstance())) { - return operator=(NDArray(std::move(other.data_))); - } - if (std::is_base_of::value || - (std::is_base_of::value && - ptr->IsInstance())) { - return operator=(Module(std::move(other.data_))); - } - SwitchToObject(kObjectHandle, std::move(other.data_)); - } else { - SwitchToPOD(kNull); +template +struct TVMValueCast { + static T Apply(const TSrc* self) { + return self->template AsNDArray(); } - return *this; -} +}; + +} // namespace detail template inline TVMArgValue::operator T() const { - return AsObjectRef(); + return detail:: + TVMValueCast::code != 0), + (array_type_info::code > 0)> + ::Apply(this); } template inline TVMRetValue::operator T() const { - return AsObjectRef(); + return detail:: + TVMValueCast::code != 0), + (array_type_info::code > 0)> + ::Apply(this); +} + +template +inline void TVMArgsSetter::operator()(size_t i, const T& value) const { + 
static_assert(extension_type_info::code != 0, + "Need to have extension code"); + type_codes_[i] = extension_type_info::code; + values_[i].v_handle = const_cast(&value); +} + +// extension type handling +template +struct ExtTypeInfo { + static void destroy(void* handle) { + delete static_cast(handle); + } + static void* clone(void* handle) { + return new T(*static_cast(handle)); + } +}; + +template +inline ExtTypeVTable* ExtTypeVTable::Register_() { + const int code = extension_type_info::code; + static_assert(code != 0, + "require extension_type_info traits to be declared with non-zero code"); + ExtTypeVTable vt; + vt.clone = ExtTypeInfo::clone; + vt.destroy = ExtTypeInfo::destroy; + return ExtTypeVTable::RegisterInternal(code, vt); } inline PackedFunc Module::GetFunction(const std::string& name, bool query_imports) { diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h index e51b806ea81f..d668984f50e2 100644 --- a/include/tvm/runtime/registry.h +++ b/include/tvm/runtime/registry.h @@ -43,9 +43,9 @@ #ifndef TVM_RUNTIME_REGISTRY_H_ #define TVM_RUNTIME_REGISTRY_H_ -#include #include #include +#include "packed_func.h" namespace tvm { namespace runtime { @@ -283,9 +283,22 @@ class Registry { friend struct Manager; }; +/*! \brief helper macro to suppress unused warning */ +#if defined(__GNUC__) +#define TVM_ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define TVM_ATTRIBUTE_UNUSED +#endif + +#define TVM_STR_CONCAT_(__x, __y) __x##__y +#define TVM_STR_CONCAT(__x, __y) TVM_STR_CONCAT_(__x, __y) + #define TVM_FUNC_REG_VAR_DEF \ static TVM_ATTRIBUTE_UNUSED ::tvm::runtime::Registry& __mk_ ## TVM +#define TVM_TYPE_REG_VAR_DEF \ + static TVM_ATTRIBUTE_UNUSED ::tvm::runtime::ExtTypeVTable* __mk_ ## TVMT + /*! * \brief Register a function globally. * \code @@ -298,6 +311,15 @@ class Registry { TVM_STR_CONCAT(TVM_FUNC_REG_VAR_DEF, __COUNTER__) = \ ::tvm::runtime::Registry::Register(OpName) +/*! + * \brief Macro to register extension type. + * This must be registered in a cc file + * after the trait extension_type_info is defined. + */ +#define TVM_REGISTER_EXT_TYPE(T) \ + TVM_STR_CONCAT(TVM_TYPE_REG_VAR_DEF, __COUNTER__) = \ + ::tvm::runtime::ExtTypeVTable::Register_() + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_REGISTRY_H_ diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index f44498a0aa7a..599d6ff657d1 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -163,7 +163,7 @@ class TensorNode : public Node { /*! \brief The shape of the tensor */ Array shape; /*! \brief data type in the content of the tensor */ - DataType dtype; + Type dtype; /*! \brief the source operation, can be None */ Operation op; /*! \brief the output index from source operation */ @@ -178,7 +178,7 @@ v->Visit("value_index", &value_index); } TVM_DLL static Tensor make(Array shape, - DataType dtype, + Type dtype, Operation op, int value_index);
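Putting the pieces together, the extension_type_info trait (packed_func.h hunk above) plus the TVM_REGISTER_EXT_TYPE macro (registry.h hunk above) is all a client needs to make a C++ class passable through PackedFunc. A sketch of how a .cc file would use them at this revision; MyGraph and its type code slot are hypothetical, and the chosen code must not collide with other registered extensions:

```cpp
// Sketch only -- assumes the headers introduced in this diff
// (tvm/runtime/packed_func.h and tvm/runtime/registry.h at this revision);
// MyGraph and the chosen type code are illustrative, not part of the patch.
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

struct MyGraph {
  int node_count{0};
};

namespace tvm {
namespace runtime {
// A non-zero code marks MyGraph as an extension type; it must live in the
// extension range so the FFI does not confuse it with built-in type codes.
template <>
struct extension_type_info<MyGraph> {
  static const int code = kExtBegin + 1;  // hypothetical slot
};
}  // namespace runtime
}  // namespace tvm

// Populates the ExtTypeVTable (clone/destroy) for MyGraph so values of this
// type can be passed to and returned from PackedFunc.
TVM_REGISTER_EXT_TYPE(MyGraph);
```

Registration happens through the same unused-static-variable trick as TVM_REGISTER_FUNC, so simply linking the .cc file is enough to install the vtable.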
* @param libmod The module of the corresponding function. * @param ctx The local or remote context to deploy the module. * @return Runtime graph module that can be used to execute the graph. diff --git a/neo-tools/sync-with-dmlc.py b/neo-tools/sync-with-dmlc.py index 74881eacdf39..a1967444ffb6 100755 --- a/neo-tools/sync-with-dmlc.py +++ b/neo-tools/sync-with-dmlc.py @@ -58,7 +58,7 @@ def main(): # Add dmlc/tvm to remote 'upstream' if not repo = Repo() - add_remote(repo, 'upstream', 'https://github.com/apache/incubator-tvm.git') + add_remote(repo, 'upstream', 'git@github.com:dmlc/tvm.git') # Fetch 'upstream' remote logging.info("Fetching remote upstrean") diff --git a/nnvm/Makefile b/nnvm/Makefile index 14af3b294e73..39763cb59db8 100644 --- a/nnvm/Makefile +++ b/nnvm/Makefile @@ -30,6 +30,7 @@ TVMPATH = .. export LDFLAGS = -pthread -lm export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC +CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/3rdparty/dlpack/include -I$(TVMPATH)/3rdparty/HalideIR/src -I$(TVMPATH)/topi/include ifdef DMLC_CORE_PATH CFLAGS += -I$(DMLC_CORE_PATH)/include @@ -65,7 +66,7 @@ else NO_WHOLE_ARCH= --no-whole-archive endif -all: lib/libnnvm.a lib/libnnvm.$(SHARED_LIBRARY_SUFFIX) +all: lib/libnnvm.a lib/libnnvm_compiler.$(SHARED_LIBRARY_SUFFIX) SRC = $(wildcard src/*.cc src/c_api/*.cc src/core/*.cc src/pass/*.cc) SRC_COMPILER = $(wildcard src/top/*/*.cc wildcard src/top/vision/*/*.cc src/compiler/*.cc src/compiler/*/*.cc) @@ -86,7 +87,7 @@ lib/libnnvm.a: $(ALL_DEP) @mkdir -p $(@D) $(AR) crv $@ $(filter %.o, $?) -lib/libnnvm.$(SHARED_LIBRARY_SUFFIX): lib/libnnvm.a ${TOP_OBJ} +lib/libnnvm_compiler.$(SHARED_LIBRARY_SUFFIX): lib/libnnvm.a ${TOP_OBJ} @mkdir -p $(@D) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS) -Wl,${WHOLE_ARCH} lib/libnnvm.a -Wl,${NO_WHOLE_ARCH} diff --git a/nnvm/README.md b/nnvm/README.md index 54caa17e2ce3..e3b451d63dcd 100644 --- a/nnvm/README.md +++ b/nnvm/README.md @@ -15,8 +15,38 @@ -# NNVM +# NNVM Compiler Module of TVM Stack -NNVM is a graph level IR for neural networks. -We are moving towards Relay IR, a better unified IR that support wider range of programs. -Please use relay instead. +```python +import tvm +from tvm.contrib import graph_runtime, rpc +import nnvm.frontend +import nnvm.compiler + +# GET model from frameworks +# change xyz to supported framework name. +graph, params = nnvm.frontend.from_xyz(...) 
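+# for example, a sketch for an MXNet model (mx_sym, arg_params and
+# aux_params are hypothetical variables coming from the source framework):
+# graph, params = nnvm.frontend.from_mxnet(mx_sym, arg_params, aux_params)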
+
+# OPTIMIZE and COMPILE the graph to get a deployable module
+# target can be "opencl", "llvm", "metal" or any target supported by tvm
+target = "cuda"
+graph, lib, params = nnvm.compiler.build(graph, target, {"data": data_shape}, params=params)
+
+# DEPLOY and run on gpu(0)
+module = graph_runtime.create(graph, lib, tvm.gpu(0))
+module.set_input(**params)
+module.run(data=data_array)
+output = tvm.nd.empty(out_shape, ctx=tvm.gpu(0))
+module.get_output(0, output)
+
+# DEPLOY to REMOTE mobile/rasp/browser with minimum tvm rpc runtime
+# useful for quick experiments on mobile devices
+remote = rpc.connect(remote_host, remote_port)
+lib.export_library("mylib.so")
+remote.upload("mylib.so")
+rlib = remote.load_module("mylib.so")
+# run on remote device
+rmodule = graph_runtime.create(graph, rlib, remote.gpu(0))
+rmodule.set_input(**params)
+rmodule.run()
+```
diff --git a/nnvm/include/nnvm/base.h b/nnvm/include/nnvm/base.h
index 678ed4d4a942..2fd71c7d087e 100644
--- a/nnvm/include/nnvm/base.h
+++ b/nnvm/include/nnvm/base.h
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -46,24 +46,6 @@ using dmlc::get;
 /*!\brief "unsafe" getter function of any type */
 using dmlc::unsafe_get;

-enum TypeFlag {
-  kFloat32 = 0,
-  kFloat64 = 1,
-  kFloat16 = 2,
-  kUint8 = 3,
-  kInt32 = 4,
-  kInt8 = 5,
-  kInt64 = 6,
-  // kBool = 7,
-  // 7 is reserved for kBool, in order to keep consistency with MXNet TypeFlag defined in
-  // https://github.com/apache/incubator-mxnet/blob/master/3rdparty/mshadow/mshadow/base.h#L314
-  kInt16 = 8,
-  kUint16 = 9,
-  kUint32 = 10,
-  kUint64 = 11,
-  kBfloat16 = 12,
-};
-
 }  // namespace nnvm

 // describe op registration point
diff --git a/nnvm/include/nnvm/compiler/op_attr_types.h b/nnvm/include/nnvm/compiler/op_attr_types.h
new file mode 100644
index 000000000000..12b4415850d4
--- /dev/null
+++ b/nnvm/include/nnvm/compiler/op_attr_types.h
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file nnvm/compiler/op_attr_types.h
+ * \brief Operator attribute types used by the NNVM compiler.
+ */
+#ifndef NNVM_COMPILER_OP_ATTR_TYPES_H_
+#define NNVM_COMPILER_OP_ATTR_TYPES_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "packed_func_ext.h"
+
+namespace nnvm {
+namespace compiler {
+
+using ::tvm::Array;
+using ::tvm::Tensor;
+using ::tvm::Schedule;
+
+/*!
\brief operator pattern used in graph fusion */
+enum OpPatternKind {
+  // Elementwise operation
+  kElemWise = 0,
+  // Broadcasting operator, can always map output axis to the input in order.
+  // for example :code:`out[i, ax1, j, ax2] = input[i, j]`.
+  // Note that the axes need to be in order so transpose is not a bcast operator.
+  kBroadcast = 1,
+  // Injective operator, can always injectively map output axis to a single input axis.
+  // All injective operators can still be safely fused to injective and reduction.
+  kInjective = 2,
+  // Commutative reduction operator.
+  kCommReduce = 3,
+  // Complex operation, can still fuse elemwise operations into its output,
+  // but cannot chain another complex op
+  kOutEWiseFusable = 4,
+  // Opaque operation, cannot fuse anything.
+  kOpaque = 8
+};
+
+/*! \brief the operator pattern */
+using TOpPattern = int;
+
+/*!
+ * \brief Computation description interface
+ * \param attrs The attribute of the node.
+ * \param inputs The input tensors (placeholders)
+ * \param out_info Tensors holding shape/type information about output,
+ *  these are always placeholders.
+ * \return The output description of the tensor.
+ */
+using FTVMCompute = std::function<
+  Array(const NodeAttrs& attrs,
+        const Array& inputs,
+        const Array& out_info)>;
+
+/*!
+ * \brief Build the computation schedule for
+ *  the op whose root is at the current op.
+ * \param attrs The attribute of the node.
+ * \param outs The output tensors.
+ * \param target The build target.
+ * \return schedule The computation schedule.
+ */
+using FTVMSchedule = std::function<
+  Schedule(const NodeAttrs& attrs,
+           const Array& outs,
+           const std::string& target)>;
+
+/*!
+ * \brief Modify the op node to alter its input layout.
+ *  It is invoked in the AlterOpLayout pass.
+ * \param attrs The attribute of the original node.
+ * \param inputs The input symbols of the original node.
+ * \param tinfos The inferred shape and dtype of the inputs.
+ * \param ret The replaced operator.
+ * \return Whether to replace the current operator.
+ */
+using FTVMAlterOpLayout = std::function<
+  bool(const NodeAttrs& attrs,
+       const Symbol& inputs,
+       const Array& tinfos,
+       Symbol* ret)>;
+
+/*!
+ * \brief Transform from normal operator to vectorized operator
+ * \param node The source node.
+ * \return Transformed vectorized op.
+ */
+using FTVMVectorizedOp = std::function;
+
+}  // namespace compiler
+}  // namespace nnvm
+#endif  // NNVM_COMPILER_OP_ATTR_TYPES_H_
diff --git a/nnvm/include/nnvm/compiler/packed_func_ext.h b/nnvm/include/nnvm/compiler/packed_func_ext.h
new file mode 100644
index 000000000000..67a43a7b4104
--- /dev/null
+++ b/nnvm/include/nnvm/compiler/packed_func_ext.h
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file nnvm/compiler/packed_func_ext.h
+ * \brief Extension to enable packed functions for nnvm types
+ */
+#ifndef NNVM_COMPILER_PACKED_FUNC_EXT_H_
+#define NNVM_COMPILER_PACKED_FUNC_EXT_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nnvm {
+namespace compiler {
+
+using tvm::runtime::PackedFunc;
+
+using AttrDict = std::unordered_map;
+
+/*!
+ * \brief Get a PackedFunc from the global registry and
+ *  report an error if it does not exist
+ * \param name The name of the function.
+ * \return The created PackedFunc.
+ */
+inline const PackedFunc& GetPackedFunc(const std::string& name) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get(name);
+  CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
+  return *pf;
+}
+}  // namespace compiler
+}  // namespace nnvm
+
+// Enable the graph and symbol object exchange.
+namespace tvm {
+namespace runtime {
+
+template<>
+struct extension_type_info {
+  static const int code = 16;
+};
+
+template<>
+struct extension_type_info {
+  static const int code = 17;
+};
+
+template<>
+struct extension_type_info {
+  static const int code = 18;
+};
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // NNVM_COMPILER_PACKED_FUNC_EXT_H_
diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h
new file mode 100644
index 000000000000..f108ff131d66
--- /dev/null
+++ b/nnvm/include/nnvm/compiler/util.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file nnvm/compiler/util.h
+ * \brief Utility functions for nnvm compiler
+ */
+#ifndef NNVM_COMPILER_UTIL_H_
+#define NNVM_COMPILER_UTIL_H_
+
+#include
+#include
+
+namespace nnvm {
+namespace compiler {
+
+/*
+ * \brief Helper function to convert TShape to TVM array. Useful for
+ * passing data from NNVM param structures to TOPI ops.
+ *
+ * \param shape The shape to convert
+ *
+ * \return An Array of Expr, where each element is a constant int32
+ */
+inline tvm::Array ShapeToArray(TShape shape) {
+  tvm::Array result;
+  for (auto i : shape) {
+    result.push_back(tvm::make_const(tvm::Int(32), i));
+  }
+  return result;
+}
+
+/*
+ * \brief Helper function to convert TShape to TVM array. Useful for
+ * passing data from NNVM param structures to TOPI ops.
+ *
+ * \param shape The shape to convert
+ *
+ * \return An Array of Integer, where each element is a constant int32
+ */
+inline tvm::Array ShapeToIntArray(TShape shape) {
+  return tvm::Downcast >(ShapeToArray(shape));
+}
+}  // namespace compiler
+}  // namespace nnvm
+#endif  // NNVM_COMPILER_UTIL_H_
diff --git a/nnvm/include/nnvm/top/README b/nnvm/include/nnvm/top/README
new file mode 100644
index 000000000000..09a4d6fc387f
--- /dev/null
+++ b/nnvm/include/nnvm/top/README
@@ -0,0 +1 @@
+NNVM Core Operator and Compiler
diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
new file mode 100644
index 000000000000..f2a3e81472e1
--- /dev/null
+++ b/nnvm/include/nnvm/top/nn.h
@@ -0,0 +1,555 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file nnvm/top/nn.h
+ * \brief Auxiliary param for tensor primitive.
+ */
+#ifndef NNVM_TOP_NN_H_
+#define NNVM_TOP_NN_H_
+
+#include
+#include
+#include
+#include
+#include
+#include "tensor.h"
+
+namespace nnvm {
+namespace top {
+
+struct DenseParam : public dmlc::Parameter {
+  int units;
+  bool use_bias;
+
+  DMLC_DECLARE_PARAMETER(DenseParam) {
+    DMLC_DECLARE_FIELD(units).set_lower_bound(1)
+    .describe("Number of hidden units of the dense transformation.");
+    DMLC_DECLARE_FIELD(use_bias).set_default(true)
+    .describe("Whether to use bias parameter");
+  }
+  // constants
+  static const constexpr int kData = 0;
+  static const constexpr int kWeight = 1;
+  static const constexpr int kBias = 2;
+};
+
+struct DropoutParam : public dmlc::Parameter {
+  float rate;
+
+  DMLC_DECLARE_PARAMETER(DropoutParam) {
+    DMLC_DECLARE_FIELD(rate).set_default(0.5)
+    .set_range(0, 1)
+    .describe("Fraction of the input that gets dropped out during training time.");
+  }
+};
+
+struct BatchNormParam : public dmlc::Parameter {
+  int axis;
+  double epsilon;
+  double momentum;
+  bool center;
+  bool scale;
+
+  DMLC_DECLARE_PARAMETER(BatchNormParam) {
+    DMLC_DECLARE_FIELD(axis).set_default(1)
+    .describe("Specify which shape axis the channel is specified.");
+    DMLC_DECLARE_FIELD(epsilon).set_default(1e-5)
+    .describe("Small float added to variance to avoid dividing by zero.");
+    DMLC_DECLARE_FIELD(center).set_default(true)
+    .describe("If True, add offset of `beta` to normalized tensor."
+              "If False, `beta` is ignored.");
+    DMLC_DECLARE_FIELD(scale).set_default(true)
+    .describe("If True, multiply by `gamma`. If False, `gamma` is not used."
+              "When the next layer is piecewise linear (also e.g. 
`nn.relu`)," + "this can be disabled since the scaling" + "will be done by the next layer."); + } + // constants + static const constexpr int kData = 0; + static const constexpr int kGamma = 1; + static const constexpr int kBeta = 2; + static const constexpr int kMovingMean = 3; + static const constexpr int kMovingVariance = 4; +}; + + +// Shared by softmax and log_softmax +struct SoftmaxParam : public dmlc::Parameter { + int axis; + + DMLC_DECLARE_PARAMETER(SoftmaxParam) { + DMLC_DECLARE_FIELD(axis).set_default(-1) + .describe("The axis to sum over when computing softmax."); + } +}; + +struct LeakyReLUParam : public dmlc::Parameter { + double alpha; + + DMLC_DECLARE_PARAMETER(LeakyReLUParam) { + DMLC_DECLARE_FIELD(alpha).set_lower_bound(0.0).set_default(0.25) + .describe("slope coefficient for the negative half axis."); + } +}; + +struct PReLUParam : public dmlc::Parameter { + int axis; + DMLC_DECLARE_PARAMETER(PReLUParam) { + DMLC_DECLARE_FIELD(axis).set_default(1) + .describe("Specify which shape axis the channel is specified."); + } +}; + +struct PadParam : public dmlc::Parameter { + float pad_value; + Tuple > pad_width; + + DMLC_DECLARE_PARAMETER(PadParam) { + DMLC_DECLARE_FIELD(pad_value).set_default(0.0) + .describe("The value to be padded."); + DMLC_DECLARE_FIELD(pad_width) + .describe("Number of values padded to the edges of each axis, " + "in the format of ((before_1, after_1), ... (before_N, after_N))"); + } +}; + + +struct Conv2DParam : public dmlc::Parameter { + int channels; + TShape kernel_size; + TShape strides; + TShape padding; + TShape dilation; + int groups; + std::string layout; + std::string kernel_layout; + std::string out_layout; + int out_dtype; + bool use_bias; + + DMLC_DECLARE_PARAMETER(Conv2DParam) { + DMLC_DECLARE_FIELD(channels) + .describe("The dimensionality of the output space" + "i.e. the number of output channels in the convolution."); + DMLC_DECLARE_FIELD(kernel_size) + .describe("Specifies the dimensions of the convolution window."); + DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1})) + .describe("Specifies the strides of the convolution."); + DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "on both sides for padding number of points"); + DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1})) + .describe("Specifies the dilation rate to use for dilated convolution."); + DMLC_DECLARE_FIELD(groups).set_default(1) + .describe("Controls the connections between inputs and outputs." + "At groups=1, all inputs are convolved to all outputs." + "At groups=2, the operation becomes equivalent to having two convolution" + "layers side by side, each seeing half the input channels, and producing" + "half the output channels, and both subsequently concatenated."); + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Convolution is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(out_layout).set_default("__undef__") + .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Default to be same as input layout."); + DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW") + .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc." 
+ "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" + "dimensions respectively."); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + + DMLC_DECLARE_FIELD(use_bias).set_default(true) + .describe("Whether the layer uses a bias vector."); + } + // constants + static const constexpr int kData = 0; + static const constexpr int kWeight = 1; + static const constexpr int kBias = 2; +}; + +struct WinogradWeightTransformParam : public dmlc::Parameter { + int tile_size; + + DMLC_DECLARE_PARAMETER(WinogradWeightTransformParam) { + DMLC_DECLARE_FIELD(tile_size) + .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)"); + } + + static const constexpr int kWeight = 0; +}; + +struct WinogradNNPACKWeightTransformParam + : public dmlc::Parameter { + int convolution_algorithm; + int out_dtype; + + DMLC_DECLARE_PARAMETER(WinogradNNPACKWeightTransformParam) { + DMLC_DECLARE_FIELD(convolution_algorithm) + .describe( + "The convolution algorithm for Winograd NNPACK. " + "E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, " + "tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16"); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + } + + static const constexpr int kWeight = 0; +}; + +struct WinogradConv2DParam : public dmlc::Parameter { + int channels; + TShape kernel_size; + TShape strides; + TShape padding; + TShape dilation; + int groups; + std::string layout; + std::string kernel_layout; + std::string out_layout; + int out_dtype; + bool use_bias; + int tile_size; + + DMLC_DECLARE_PARAMETER(WinogradConv2DParam) { + DMLC_DECLARE_FIELD(channels) + .describe("The dimensionality of the output space" + "i.e. the number of output channels in the convolution."); + DMLC_DECLARE_FIELD(kernel_size) + .describe("Specifies the dimensions of the convolution window."); + DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1})) + .describe("Specifies the strides of the convolution."); + DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "on both sides for padding number of points"); + DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1})) + .describe("Specifies the dilation rate to use for dilated convolution."); + DMLC_DECLARE_FIELD(groups).set_default(1) + .describe("Controls the connections between inputs and outputs." + "At groups=1, all inputs are convolved to all outputs." + "At groups=2, the operation becomes equivalent to having two convolution" + "layers side by side, each seeing half the input channels, and producing" + "half the output channels, and both subsequently concatenated."); + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Convolution is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(out_layout).set_default("__undef__") + .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Default to be same as input layout."); + DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW") + .describe("Dimension ordering of weight. 
Can be 'OIHW', 'OIHW16o16i', etc." + "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" + "dimensions respectively."); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + DMLC_DECLARE_FIELD(use_bias).set_default(true) + .describe("Whether the layer uses a bias vector."); + DMLC_DECLARE_FIELD(tile_size) + .describe("Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3)"); + } + // constants + static const constexpr int kData = 0; + static const constexpr int kWeight = 1; + static const constexpr int kBias = 2; +}; + +struct Conv2DTransposeParam : public dmlc::Parameter { + int channels; + TShape kernel_size; + TShape strides; + TShape padding; + TShape output_padding; + TShape dilation; + int groups; + std::string layout; + std::string kernel_layout; + int out_dtype; + bool use_bias; + + DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) { + DMLC_DECLARE_FIELD(channels) + .describe("The dimensionality of the output space" + "i.e. the number of output channels in the convolution."); + DMLC_DECLARE_FIELD(kernel_size) + .describe("Specifies the dimensions of the convolution window."); + DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1})) + .describe("Specifies the strides of the convolution."); + DMLC_DECLARE_FIELD(output_padding).set_default(TShape({0, 0})) + .describe("Zero-padding added to one side of the output."); + DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "on both sides for padding number of points"); + DMLC_DECLARE_FIELD(dilation).set_default(TShape({1, 1})) + .describe("Specifies the dilation rate to use for dilated convolution."); + DMLC_DECLARE_FIELD(groups).set_default(1) + .describe("Controls the connections between inputs and outputs." + "At groups=1, all inputs are convolved to all outputs." + "At groups=2, the operation becomes equivalent to having two convolution" + "layers side by side, each seeing half the input channels, and producing" + "half the output channels, and both subsequently concatenated."); + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Convolution is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(kernel_layout).set_default("OIHW") + .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc." 
+ "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" + "dimensions respectively."); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + DMLC_DECLARE_FIELD(use_bias).set_default(true) + .describe("Whether the layer uses a bias vector."); + } + // constants + static const constexpr int kData = 0; + static const constexpr int kWeight = 1; + static const constexpr int kBias = 2; +}; + + +struct MaxPool2DParam : public dmlc::Parameter { + TShape pool_size; + TShape strides; + TShape padding; + std::string layout; + bool ceil_mode; + + DMLC_DECLARE_PARAMETER(MaxPool2DParam) { + DMLC_DECLARE_FIELD(pool_size) + .describe("Size of the pooling windows.."); + DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1})) + .describe("Specifies the strides of the convolution."); + DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "Padding support both symmetric and asymmetric as" + "one int : same padding used on all sides" + "two int : bottom, right will use same padding as top, left" + "four int : padding width in the order of (top, left, bottom, right)"); + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Convolution is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(ceil_mode).set_default(false) + .describe("When true, will use ceil instead of floor to compute the output shape."); + } +}; + + +struct AvgPool2DParam : public dmlc::Parameter { + TShape pool_size; + TShape strides; + TShape padding; + std::string layout; + bool ceil_mode; + bool count_include_pad; + + DMLC_DECLARE_PARAMETER(AvgPool2DParam) { + DMLC_DECLARE_FIELD(pool_size) + .describe("Size of the pooling windows.."); + DMLC_DECLARE_FIELD(strides).set_default(TShape({1, 1})) + .describe("Specifies the strides of the convolution."); + DMLC_DECLARE_FIELD(padding).set_default(TShape({0, 0})) + .describe("If padding is non-zero, then the input is implicitly zero-padded" + "Padding support both symmetric and asymmetric as" + "one int : same padding used on all sides" + "two int : bottom, right will use same padding as top, left" + "four int : padding width in the order of (top, left, bottom, right)"); + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Convolution is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(ceil_mode).set_default(false) + .describe("When true, will use ceil instead of floor to compute the output shape."); + DMLC_DECLARE_FIELD(count_include_pad).set_default(false) + .describe("When true, will include padding to compute the average"); + } +}; + + +struct GlobalPool2DParam : public dmlc::Parameter { + std::string layout; + + DMLC_DECLARE_PARAMETER(GlobalPool2DParam) { + DMLC_DECLARE_FIELD(layout).set_default("NCHW") + .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. 
Pooling is applied on the 'H' and"
+                "'W' dimensions.");
+  }
+};
+
+struct UpSamplingParam : public dmlc::Parameter {
+  int scale;
+  std::string layout;
+  std::string method;
+
+  DMLC_DECLARE_PARAMETER(UpSamplingParam) {
+    DMLC_DECLARE_FIELD(scale)
+      .describe("upsampling scaling factor");
+    DMLC_DECLARE_FIELD(layout)
+      .set_default("NCHW")
+      .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Upsampling is applied on the 'H' and"
+                "'W' dimensions.");
+    DMLC_DECLARE_FIELD(method)
+      .set_default("NEAREST_NEIGHBOR")
+      .describe("Specify the mode to use for scaling."
+                "NEAREST_NEIGHBOR - Nearest Neighbor"
+                "BILINEAR - Bilinear Interpolation");
+  }
+};
+
+struct LayoutTransformParam : public dmlc::Parameter {
+  std::string src_layout;
+  std::string dst_layout;
+
+  DMLC_DECLARE_PARAMETER(LayoutTransformParam) {
+    DMLC_DECLARE_FIELD(src_layout).set_default("__undef__")
+    .describe("Dimension ordering of data");
+    DMLC_DECLARE_FIELD(dst_layout).set_default("__undef__")
+    .describe("Dimension ordering of data.");
+  }
+};
+
+struct MultiBoxPriorParam : public dmlc::Parameter {
+  Tuple sizes;
+  Tuple ratios;
+  Tuple steps;
+  Tuple offsets;
+  bool clip;
+
+  DMLC_DECLARE_PARAMETER(MultiBoxPriorParam) {
+    DMLC_DECLARE_FIELD(sizes).set_default(Tuple({1.0}))
+      .describe("List of sizes of generated MultiBoxPriors.");
+    DMLC_DECLARE_FIELD(ratios).set_default(Tuple({1.0}))
+      .describe("List of aspect ratios of generated MultiBoxPriors.");
+    DMLC_DECLARE_FIELD(steps).set_default(Tuple({-1.0, -1.0}))
+      .describe("Priorbox step across y and x, -1 for auto calculation.");
+    DMLC_DECLARE_FIELD(offsets).set_default(Tuple({0.5, 0.5}))
+      .describe("Priorbox center offsets, y and x respectively.");
+    DMLC_DECLARE_FIELD(clip).set_default(false)
+      .describe("Whether to clip out-of-boundary boxes.");
+  }
+};
+
+struct MultiBoxTransformLocParam : public dmlc::Parameter {
+  bool clip;
+  float threshold;
+  Tuple variances;
+  DMLC_DECLARE_PARAMETER(MultiBoxTransformLocParam) {
+    DMLC_DECLARE_FIELD(clip).set_default(true)
+      .describe("Clip out-of-boundary boxes.");
+    DMLC_DECLARE_FIELD(threshold).set_default(0.01)
+      .describe("Threshold to be a positive prediction.");
+    DMLC_DECLARE_FIELD(variances).set_default(Tuple({0.1f, 0.1f, 0.2f, 0.2f}))
+      .describe("Variances to be decoded from box regression output.");
+  }
+};
+
+struct NonMaximumSuppressionParam : public dmlc::Parameter {
+  bool return_indices;
+  float iou_threshold;
+  bool force_suppress;
+  int top_k;
+  int id_index;
+  int coord_start;
+  int score_index;
+  int max_output_size;
+  bool invalid_to_bottom;
+  DMLC_DECLARE_PARAMETER(NonMaximumSuppressionParam) {
+    DMLC_DECLARE_FIELD(max_output_size).set_default(-1)
+      .describe("Max number of output valid boxes for each instance."
+ "By default all valid boxes are returned."); + DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) + .describe("Non-maximum suppression threshold."); + DMLC_DECLARE_FIELD(force_suppress).set_default(false) + .describe("Suppress all detections regardless of class_id."); + DMLC_DECLARE_FIELD(top_k).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(coord_start).set_default(2) + .describe("Start index of the consecutive 4 coordinates."); + DMLC_DECLARE_FIELD(score_index).set_default(1) + .describe("Index of the scores/confidence of boxes."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + DMLC_DECLARE_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); + DMLC_DECLARE_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); + } +}; + +struct LRNParam : public dmlc::Parameter { + int size; + int axis; + float alpha; + float beta; + float bias; + + DMLC_DECLARE_PARAMETER(LRNParam) { + DMLC_DECLARE_FIELD(size) + .describe("The size of the local region to be considered for normalization."); + DMLC_DECLARE_FIELD(axis) + .describe("input data layout channel axis"); + DMLC_DECLARE_FIELD(alpha) + .describe("The scaling parameter."); + DMLC_DECLARE_FIELD(beta) + .describe("The exponent parameter."); + DMLC_DECLARE_FIELD(bias) + .describe("The offset parameter."); + } + // constants + static const constexpr int kData = 0; +}; + +struct L2NormalizeParam : public dmlc::Parameter { + float eps; + Tuple axis; + + DMLC_DECLARE_PARAMETER(L2NormalizeParam) { + DMLC_DECLARE_FIELD(eps) + .describe("float type epsilon value."); + DMLC_DECLARE_FIELD(axis) + .describe("axis over the normalization applied"); + } +}; + +} // namespace top +} // namespace nnvm + +#endif // NNVM_TOP_NN_H_ diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h index b8e245dbdfe2..f2dc1b6c8b01 100644 --- a/nnvm/include/nnvm/top/tensor.h +++ b/nnvm/include/nnvm/top/tensor.h @@ -100,14 +100,10 @@ enum TypeFlag { kInt32 = 4, kInt8 = 5, kInt64 = 6, - // kBool = 7, - // 7 is reserved for kBool, in order to keep consistency with MXNet TypeFlag defined in - // https://github.com/apache/incubator-mxnet/blob/master/3rdparty/mshadow/mshadow/base.h#L314 - kInt16 = 8, - kUint16 = 9, - kUint32 = 10, - kUint64 = 11, - kBfloat16 = 12, + kInt16 = 7, + kUint16 = 8, + kUint32 = 9, + kUint64 = 10, }; enum IndicatorRuleFlag { @@ -129,8 +125,7 @@ enum IndicatorRuleFlag { .add_enum("int8", kInt8) \ .add_enum("int16", kInt16) \ .add_enum("int32", kInt32) \ - .add_enum("int64", kInt64) \ - .add_enum("bfloat16", kBfloat16) + .add_enum("int64", kInt64) struct CastParam : public dmlc::Parameter { int dtype; diff --git a/nnvm/python/.gitignore b/nnvm/python/.gitignore new file mode 100644 index 000000000000..40d7cb4cc13a --- /dev/null +++ b/nnvm/python/.gitignore @@ -0,0 +1,2 @@ +*.c +*.cpp diff --git a/nnvm/python/nnvm/__init__.py b/nnvm/python/nnvm/__init__.py new file mode 100644 index 000000000000..450058449e3a --- /dev/null +++ b/nnvm/python/nnvm/__init__.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#!/usr/bin/env python
+# coding: utf-8
+"""NNVM Python API, for ease of use and to help new frameworks establish a Python API. """
+from __future__ import absolute_import as _abs
+import warnings
+
+from . import _base
+from . import symbol as sym
+from . import symbol
+from ._base import NNVMError
+from . import frontend
+
+__version__ = _base.__version__
+
+warnings.warn("NNVM is deprecated and will be removed in a future version. Use Relay instead.",
+              FutureWarning)
diff --git a/nnvm/python/nnvm/_base.py b/nnvm/python/nnvm/_base.py
new file mode 100644
index 000000000000..420392f17e92
--- /dev/null
+++ b/nnvm/python/nnvm/_base.py
@@ -0,0 +1,215 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+# pylint: disable=invalid-name, unused-import
+""" ctypes library of nnvm and helper functions """
+from __future__ import absolute_import
+
+import os
+import sys
+import ctypes
+import numpy as np
+from . 
import libinfo
+
+try:
+    import tvm
+except ImportError:
+    pass
+
+#----------------------------
+# library loading
+#----------------------------
+if sys.version_info[0] == 3:
+    string_types = str
+    numeric_types = (float, int, np.float32, np.int32)
+    # this function is needed for python3
+    # to convert ctypes.char_p .value back to python str
+    py_str = lambda x: x.decode('utf-8')
+else:
+    string_types = basestring
+    numeric_types = (float, int, long, np.float32, np.int32)
+    py_str = lambda x: x
+
+
+class NNVMError(Exception):
+    """Error that will be thrown by all nnvm functions"""
+
+
+def _load_lib():
+    """Load library by searching possible paths."""
+    lib_path = libinfo.find_lib_path()
+    lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL)
+    # error reporting
+    lib.NNGetLastError.restype = ctypes.c_char_p
+    return lib
+
+# version number
+__version__ = libinfo.__version__
+# library instance of nnvm
+_LIB = _load_lib()
+# The FFI mode of TVM
+_FFI_MODE = os.environ.get("TVM_FFI", "auto")
+
+# type definitions
+nn_uint = ctypes.c_uint
+OpHandle = ctypes.c_void_p
+SymbolHandle = ctypes.c_void_p
+GraphHandle = ctypes.c_void_p
+
+# Global dict of str to symbol to initialize variables
+_all_var_init = {}
+
+#----------------------------
+# helper function definition
+#----------------------------
+def check_call(ret):
+    """Check the return value of a C API call
+
+    This function will raise an exception when an error occurs.
+    Wrap every API call with this function.
+
+    Parameters
+    ----------
+    ret : int
+        return value from API calls
+    """
+    if ret != 0:
+        raise NNVMError(py_str(_LIB.NNGetLastError()))
+
+def c_str(string):
+    """Create ctypes char * from a python string
+
+    Parameters
+    ----------
+    string : string type
+        python string
+
+    Returns
+    -------
+    str : c_char_p
+        A char pointer that can be passed to C API
+    """
+    return ctypes.c_char_p(string.encode('utf-8'))
+
+
+def c_array(ctype, values):
+    """Create ctypes array from a python array
+
+    Parameters
+    ----------
+    ctype : ctypes data type
+        data type of the array we want to convert to
+
+    values : tuple or list
+        data content
+
+    Returns
+    -------
+    out : ctypes array
+        Created ctypes array
+    """
+    return (ctype * len(values))(*values)
+
+def ctypes2buffer(cptr, length):
+    """Convert ctypes pointer to buffer type. 
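+
+    The bytes are copied out of the C-side buffer with ctypes.memmove,
+    so the returned bytearray stays valid after the C memory is freed.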
+
+    Parameters
+    ----------
+    cptr : ctypes.POINTER(ctypes.c_char)
+        pointer to the raw memory region
+    length : int
+        the length of the buffer
+
+    Returns
+    -------
+    buffer : bytearray
+        The raw byte memory buffer
+    """
+    if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
+        raise TypeError('expected char pointer')
+    res = bytearray(length)
+    rptr = (ctypes.c_char * length).from_buffer(res)
+    if not ctypes.memmove(rptr, cptr, length):
+        raise RuntimeError('memmove failed')
+    return res
+
+def ctypes2numpy_shared(cptr, shape):
+    """Convert a ctypes pointer to a numpy array
+
+    The result numpy array shares the memory with the pointer
+
+    Parameters
+    ----------
+    cptr : ctypes.POINTER(ctypes.c_float)
+        pointer to the memory region
+
+    shape : tuple
+        shape of target ndarray
+
+    Returns
+    -------
+    out : numpy_array
+        The converted numpy array
+    """
+    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
+        raise RuntimeError('expected float pointer')
+    size = 1
+    for s in shape:
+        size *= s
+    dbuffer = (ctypes.c_float * size).from_address(ctypes.addressof(cptr.contents))
+    return np.frombuffer(dbuffer, dtype=np.float32).reshape(shape)
+
+
+def ctypes2docstring(num_args, arg_names, arg_types, arg_descs, remove_dup=True):
+    """Convert ctypes returned doc string information into parameters docstring.
+
+    Parameters
+    ----------
+    num_args : nn_uint
+        Number of arguments.
+
+    arg_names : ctypes.POINTER(ctypes.c_char_p)
+        Argument names.
+
+    arg_types : ctypes.POINTER(ctypes.c_char_p)
+        Argument type information.
+
+    arg_descs : ctypes.POINTER(ctypes.c_char_p)
+        Argument description information.
+
+    remove_dup : boolean, optional
+        Whether to remove duplicates or not.
+
+    Returns
+    -------
+    docstr : str
+        Python docstring of parameter sections.
+    """
+    param_keys = set()
+    param_str = []
+    for i in range(num_args.value):
+        key = py_str(arg_names[i])
+        if key in param_keys and remove_dup:
+            continue
+        param_keys.add(key)
+        type_info = py_str(arg_types[i])
+        ret = '%s : %s' % (key, type_info)
+        if arg_descs[i]:
+            ret += '\n    ' + py_str(arg_descs[i])
+        param_str.append(ret)
+    doc_str = ('Parameters\n' +
+               '----------\n' +
+               '%s\n')
+    doc_str = doc_str % ('\n'.join(param_str))
+    return doc_str
diff --git a/nnvm/python/nnvm/_ctypes/README b/nnvm/python/nnvm/_ctypes/README
new file mode 100644
index 000000000000..6e82cb962f99
--- /dev/null
+++ b/nnvm/python/nnvm/_ctypes/README
@@ -0,0 +1 @@
+Ctypes specific implementation of certain modules
\ No newline at end of file
diff --git a/cmake/modules/contrib/CODEGENC.cmake b/nnvm/python/nnvm/_ctypes/__init__.py
similarity index 84%
rename from cmake/modules/contrib/CODEGENC.cmake
rename to nnvm/python/nnvm/_ctypes/__init__.py
index bb53621f1a11..ea196643ae2f 100644
--- a/cmake/modules/contrib/CODEGENC.cmake
+++ b/nnvm/python/nnvm/_ctypes/__init__.py
@@ -15,6 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.

-file(GLOB CSOURCE_RELAY_CONTRIB_SRC src/relay/backend/contrib/codegen_c/codegen.cc)
-list(APPEND COMPILER_SRCS ${CSOURCE_RELAY_CONTRIB_SRC})
-
+"""Ctypes implementation of the Symbol"""
diff --git a/nnvm/python/nnvm/_ctypes/symbol.py b/nnvm/python/nnvm/_ctypes/symbol.py
new file mode 100644
index 000000000000..8c7d58a65920
--- /dev/null
+++ b/nnvm/python/nnvm/_ctypes/symbol.py
@@ -0,0 +1,242 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+# pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines,
+# pylint: disable=len-as-condition, consider-iterating-dictionary
+"""Symbolic configuration API."""
+from __future__ import absolute_import as _abs
+
+import copy
+import ctypes
+import sys
+from .._base import _LIB
+from .._base import c_array, c_str, nn_uint, py_str
+from .._base import SymbolHandle, OpHandle
+from .._base import check_call, ctypes2docstring
+from ..name import NameManager
+from ..attribute import AttrScope
+
+class SymbolBase(object):
+    """Symbol is a symbolic graph."""
+    __slots__ = ["handle"]
+    # pylint: disable=no-member
+    def __init__(self, handle):
+        """Initialize the function with handle
+
+        Parameters
+        ----------
+        handle : SymbolHandle
+            the handle to the underlying C++ Symbol
+        """
+        self.handle = handle
+
+    def __del__(self):
+        check_call(_LIB.NNSymbolFree(self.handle))
+
+    def __call__(self, *args, **kwargs):
+        """Invoke symbol as function on inputs.
+
+        Parameters
+        ----------
+        args:
+            provide positional arguments
+
+        kwargs:
+            provide keyword arguments
+        Returns
+        -------
+        the resulting symbol
+        """
+        s = copy.deepcopy(self)
+        s._compose(*args, **kwargs)
+        return s
+
+    def _compose(self, *args, **kwargs):
+        """Compose symbol on inputs.
+
+        This call mutates the current symbol.
+
+        Parameters
+        ----------
+        args:
+            provide positional arguments
+
+        kwargs:
+            provide keyword arguments
+
+        Returns
+        -------
+        the resulting symbol
+        """
+        name = kwargs.pop('name', None)
+
+        if name:
+            name = c_str(name)
+        if len(args) != 0 and len(kwargs) != 0:
+            raise TypeError('compose only accepts input Symbols \
+                either as positional or keyword arguments, not both')
+
+        for arg in args:
+            if not isinstance(arg, SymbolBase):
+                raise TypeError('Compose expects `Symbol` as arguments')
+        for val in kwargs.values():
+            if not isinstance(val, SymbolBase):
+                raise TypeError('Compose expects `Symbol` as arguments')
+
+        num_args = len(args) + len(kwargs)
+        if len(kwargs) != 0:
+            keys = c_array(ctypes.c_char_p, [c_str(key) for key in kwargs.keys()])
+            args = c_array(SymbolHandle, [s.handle for s in kwargs.values()])
+        else:
+            keys = None
+            args = c_array(SymbolHandle, [s.handle for s in args])
+        check_call(_LIB.NNSymbolCompose(
+            self.handle, name, num_args, keys, args))
+
+    def _set_attr(self, **kwargs):
+        """Set the attribute of the symbol. 
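+
+        This mutates the symbol in place: values are stringified and passed
+        to the C API through NNSymbolSetAttrs.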
+
+        Parameters
+        ----------
+        **kwargs
+            The attributes to set
+        """
+        keys = c_array(ctypes.c_char_p,
+                       [c_str(key) for key in kwargs.keys()])
+        vals = c_array(ctypes.c_char_p,
+                       [c_str(str(val)) for val in kwargs.values()])
+        num_args = nn_uint(len(kwargs))
+        check_call(_LIB.NNSymbolSetAttrs(
+            self.handle, num_args, keys, vals))
+
+
+_symbol_cls = SymbolBase
+
+def _set_symbol_class(cls):
+    global _symbol_cls
+    _symbol_cls = cls
+
+
+def _make_atomic_symbol_function(handle, name):
+    """Create an atomic symbol function by handle and function name."""
+    real_name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = nn_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+    ret_type = ctypes.c_char_p()
+
+    check_call(_LIB.NNGetOpInfo(
+        handle, ctypes.byref(real_name), ctypes.byref(desc),
+        ctypes.byref(num_args),
+        ctypes.byref(arg_names),
+        ctypes.byref(arg_types),
+        ctypes.byref(arg_descs),
+        ctypes.byref(ret_type)))
+    param_str = ctypes2docstring(num_args, arg_names, arg_types, arg_descs)
+    func_name = name
+    desc = py_str(desc.value)
+
+    doc_str = ('%s\n\n' +
+               '%s\n' +
+               'Returns\n' +
+               '-------\n' +
+               'result: Tensor\n' +
+               '    The result Tensor.')
+    doc_str = doc_str % (desc, param_str)
+
+    def creator(*args, **kwargs):
+        """Activation Operator of Neural Net.
+        The parameters listed below can be passed in as keyword arguments.
+
+        Parameters
+        ----------
+        name : string, required.
+            Name of the resulting symbol.
+
+        Returns
+        -------
+        symbol: Symbol
+            the resulting symbol
+        """
+        param_keys = []
+        param_vals = []
+        symbol_kwargs = {}
+        name = kwargs.pop('name', None)
+        attr = kwargs.pop('attr', None)
+
+        for k, v in kwargs.items():
+            if isinstance(v, SymbolBase):
+                symbol_kwargs[k] = v
+            else:
+                param_keys.append(c_str(k))
+                param_vals.append(c_str(str(v)))
+        # create atomic symbol
+        param_keys = c_array(ctypes.c_char_p, param_keys)
+        param_vals = c_array(ctypes.c_char_p, param_vals)
+        sym_handle = SymbolHandle()
+        check_call(_LIB.NNSymbolCreateAtomicSymbol(
+            handle,
+            nn_uint(len(param_keys)),
+            param_keys, param_vals,
+            ctypes.byref(sym_handle)))
+
+        if len(args) != 0 and len(symbol_kwargs) != 0:
+            raise TypeError(
+                '%s can only accept input '
+                'Symbols either as positional or keyword arguments, not both' % func_name)
+        s = _symbol_cls(sym_handle)
+        attr = AttrScope.current.get(attr)
+        if attr:
+            s._set_attr(**attr)
+        hint = func_name.lower()
+        name = NameManager.current.get(name, hint)
+        s._compose(*args, name=name, **symbol_kwargs)
+        return s
+
+    creator.__name__ = func_name
+    creator.__doc__ = doc_str
+    return creator
+
+
+def _init_symbol_module(symbol_class, root_namespace):
+    """List and add all the atomic symbol functions to current module."""
+    _set_symbol_class(symbol_class)
+    plist = ctypes.POINTER(ctypes.c_char_p)()
+    size = ctypes.c_uint()
+
+    check_call(_LIB.NNListAllOpNames(ctypes.byref(size),
+                                     ctypes.byref(plist)))
+    op_names = []
+    for i in range(size.value):
+        op_names.append(py_str(plist[i]))
+
+    module_obj = sys.modules["%s.symbol" % root_namespace]
+    module_obj_contrib = sys.modules["%s.contrib" % root_namespace]
+    module_internal = sys.modules["%s._symbol_internal" % root_namespace]
+    for name in op_names:
+        hdl = OpHandle()
+        check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl)))
+        function = _make_atomic_symbol_function(hdl, name)
+        if function.__name__.startswith('_contrib_'):
+            setattr(module_obj_contrib, 
function.__name__.split('_contrib_')[1], function) + elif function.__name__.startswith('_'): + setattr(module_internal, function.__name__, function) + setattr(module_obj, function.__name__, function) + else: + setattr(module_obj, function.__name__, function) diff --git a/nnvm/python/nnvm/_cy2/README b/nnvm/python/nnvm/_cy2/README new file mode 100644 index 000000000000..ed4639b674a0 --- /dev/null +++ b/nnvm/python/nnvm/_cy2/README @@ -0,0 +1 @@ +This folder is by default empty and will hold DLLs generated by cython. diff --git a/nnvm/python/nnvm/_cy2/__init__.py b/nnvm/python/nnvm/_cy2/__init__.py new file mode 100644 index 000000000000..1961cd9ff613 --- /dev/null +++ b/nnvm/python/nnvm/_cy2/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for cython generated modules for python2""" diff --git a/nnvm/python/nnvm/_cy3/README b/nnvm/python/nnvm/_cy3/README new file mode 100644 index 000000000000..dc3a57603782 --- /dev/null +++ b/nnvm/python/nnvm/_cy3/README @@ -0,0 +1 @@ +This folder is by default empty and will hold DLLs generated by cython. \ No newline at end of file diff --git a/nnvm/python/nnvm/_cy3/__init__.py b/nnvm/python/nnvm/_cy3/__init__.py new file mode 100644 index 000000000000..c9a495225351 --- /dev/null +++ b/nnvm/python/nnvm/_cy3/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Cython generated modules""" diff --git a/nnvm/python/nnvm/_symbol_internal.py b/nnvm/python/nnvm/_symbol_internal.py new file mode 100644 index 000000000000..de2f85aa2f29 --- /dev/null +++ b/nnvm/python/nnvm/_symbol_internal.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Module space to register internal functions. Leave empty"""
diff --git a/nnvm/python/nnvm/attribute.py b/nnvm/python/nnvm/attribute.py
new file mode 100644
index 000000000000..14341794bb64
--- /dev/null
+++ b/nnvm/python/nnvm/attribute.py
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+"""Attribute scoping support for symbolic API."""
+from __future__ import absolute_import
+
+from ._base import string_types
+
+class AttrScope(object):
+    """Attribute manager for scoping.
+
+    Users can also inherit this object to change naming behavior.
+
+    Parameters
+    ----------
+    kwargs
+        The attributes to set for all symbol creations in the scope.
+    """
+    current = None
+
+    def __init__(self, **kwargs):
+        self._old_scope = None
+        for value in kwargs.values():
+            if not isinstance(value, string_types):
+                raise ValueError("Attributes need to be strings")
+        self._attr = kwargs
+
+    def get(self, attr):
+        """
+        Get the attribute dict given the attribute set by the symbol.
+
+        Parameters
+        ----------
+        attr : dict of string to string
+            The attribute passed in by user during symbol creation.
+
+        Returns
+        -------
+        attr : dict of string to string
+            Updated attributes to add other scope related attributes.
+        """
+        if self._attr:
+            ret = self._attr.copy()
+            if attr:
+                ret.update(attr)
+            return ret
+        return attr
+
+    def __enter__(self):
+        # pylint: disable=protected-access
+        self._old_scope = AttrScope.current
+        attr = AttrScope.current._attr.copy()
+        attr.update(self._attr)
+        self._attr = attr
+        AttrScope.current = self
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_scope
+        AttrScope.current = self._old_scope
+
+AttrScope.current = AttrScope()
diff --git a/nnvm/python/nnvm/compiler/__init__.py b/nnvm/python/nnvm/compiler/__init__.py
new file mode 100644
index 000000000000..6a3e846c4496
--- /dev/null
+++ b/nnvm/python/nnvm/compiler/__init__.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""NNVM compiler toolchain.
+
+Users only need :any:`build` and :any:`build_config` to do the compilation,
+and :any:`save_param_dict` to save the parameters into bytes.
+The other APIs are for more advanced interaction with the compiler toolchain.
+"""
+from __future__ import absolute_import
+
+import tvm
+
+from . import build_module
+from . build_module import build, optimize, build_config
+from . compile_engine import engine, graph_key
+from . param_dict import save_param_dict, load_param_dict
+
+from .. import symbol as _symbol
+from .. import graph as _graph
+
+from .. import top as _top
+
+
+tvm.register_extension(_symbol.Symbol, _symbol.Symbol)
+tvm.register_extension(_graph.Graph, _graph.Graph)
diff --git a/nnvm/python/nnvm/compiler/compile_engine.py b/nnvm/python/nnvm/compiler/compile_engine.py
new file mode 100644
index 000000000000..d7799bf7b0e7
--- /dev/null
+++ b/nnvm/python/nnvm/compiler/compile_engine.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Compiler engine interface to the internal engine
+
+You can get the engine singleton at ``nnvm.compiler.engine``
+"""
+import tvm
+
+_list_cache_items = tvm.get_global_func("nnvm.compiler.ListCacheItems")
+_clear_cache = tvm.get_global_func("nnvm.compiler.ClearCache")
+_get_cache_item = tvm.get_global_func("nnvm.compiler.GetCacheItem")
+_set_cache_item = tvm.get_global_func("nnvm.compiler.SetCacheItem")
+_graph_key_get_graph = tvm.get_global_func("nnvm.compiler.GraphKeyGetGraph")
+_make_graph_key = tvm.get_global_func("nnvm.compiler.MakeGraphKey")
+
+@tvm.register_node
+class GraphKey(tvm.node.NodeBase):
+    """Key of a graph compilation context"""
+    @property
+    def graph(self):
+        return _graph_key_get_graph(self)
+
+
+@tvm.register_node
+class GraphCacheEntry(tvm.node.NodeBase):
+    """CacheEntry of compilation into a TVM Function"""
+
+
+@tvm.register_node
+class GraphFunc(tvm.node.NodeBase):
+    """Compiled result of a graph into a TVM Function"""
+
+
+class Engine(object):
+    """Global singleton compilation engine.
+
+    You can get the singleton at ``nnvm.compiler.engine``
+    """
+    def items(self):
+        """List the available cache key value pairs. 
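+
+        The engine cache maps graph keys to compiled functions; this
+        flattens the internal (key, entry) pairs into a list.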
+
+        Returns
+        -------
+        item_list : list of (GraphKey, GraphCacheEntry)
+            The existing cache items
+        """
+        res = _list_cache_items()
+        assert len(res) % 2 == 0
+        return [(res[2*i], res[2*i+1]) for i in range(len(res) // 2)]
+
+    def clear_cache(self):
+        """Clear the existing cached functions."""
+        _clear_cache()
+
+    def __setitem__(self, key, value):
+        """Set the cache entry for the given graph key."""
+        if isinstance(value, GraphCacheEntry):
+            _set_cache_item(key, value.graph_func)
+        else:
+            _set_cache_item(key, value)
+
+    def __getitem__(self, key):
+        """Get the cached entry for the given graph key."""
+        return _get_cache_item(key)
+
+    def dump(self):
+        """Return a string representation of engine dump
+
+        Returns
+        -------
+        dump : str
+            The dumped string representation
+        """
+        items = self.items()
+        res = "====================================\n"
+        res += "CompilerEngine dump, %d items cached\n" % len(items)
+        for key, value in items:
+            res += "------------------------------------\n"
+            res += "target={}\n".format(key.target)
+            res += "inputs={}\n".format(key.inputs)
+            res += "use_count={}\n".format(value.use_count)
+            res += "func_name={}\n".format(value.graph_func.func_name)
+            res += key.graph.ir() + "\n"
+            res += "====================================\n"
+        return res
+
+engine = Engine()
+
+
+def graph_key(graph, inputs, target):
+    """Construct a new graph key.
+
+    Parameters
+    ----------
+    graph : Graph
+        The computation graph structure
+
+    inputs : list of Tensor(placeholder)
+        The input requirement to the graph.
+
+    target : str
+        The target of compilation.
+
+    Returns
+    -------
+    key : GraphKey
+        The constructed key.
+    """
+    return _make_graph_key(graph, inputs, target)
diff --git a/nnvm/python/nnvm/compiler/graph_attr.py b/nnvm/python/nnvm/compiler/graph_attr.py
new file mode 100644
index 000000000000..de557cce78b3
--- /dev/null
+++ b/nnvm/python/nnvm/compiler/graph_attr.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Utilities to access graph attributes"""
+from __future__ import absolute_import as _abs
+
+import tvm
+
+def set_shape_inputs(g, shape):
+    """Set the shape of input graph nodes in the graph attribute.
+
+    Parameters
+    ----------
+    g : Graph
+        The input graph
+
+    shape : dict of str to tuple
+        The input shape
+
+    Returns
+    -------
+    g : Graph
+        The updated graph with updated shape.
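+
+    Examples
+    --------
+    A minimal usage sketch; the graph ``g`` is assumed to have a single
+    input named "data":
+
+    .. code-block:: python
+
+        g = set_shape_inputs(g, {"data": (1, 3, 224, 224)})
+        g = g.apply("InferShape")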
+ """ + list_shape = [ + shape.get(name, ()) for name in g.index.input_names] + g._set_json_attr("shape_inputs", list_shape, 'list_shape') + return g + + +DTYPE_TO_TCODE = { + "default": -1, + "float32": 0, + "float64": 1, + "float16": 2, + "uint8": 3, + "int32": 4, + "int8": 5, + "int64": 6, + "int16": 7, + "uint16": 8, + "uint32": 9, + "uint64": 10, + "bool": 11, +} + +TCODE_TO_DTYPE = { + -1: None, + 0: "float32", + 1: "float64", + 2: "float16", + 3: "uint8", + 4: "int32", + 5: "int8", + 6: "int64", + 7: "int16", + 8: "uint16", + 9: "uint32", + 10: "uint64", + 11: "bool", +} + +def set_dtype_inputs(g, dtype): + """Set the dtype inputs of graph nodes + + Parameters + ---------- + g : Graph + The input graph + + dtype : dict of str to str or str + The input dtype + + Returns + ------- + g : Graph + The updated graph with updated dtype. + """ + if isinstance(dtype, dict): + list_dtype = [ + DTYPE_TO_TCODE[str(dtype.get(name, "default"))] + for name in g.index.input_names] + else: + list_dtype = [DTYPE_TO_TCODE[dtype]] * len(g.index.input_names) + g._set_json_attr("dtype_inputs", list_dtype, "list_int") + return g + + +def set_layout_inputs(g, layout): + """Set the layout inputs of graph nodes + + Parameters + ---------- + g : Graph + The input graph + + layout : dict of str to str or str + The input layout + + Returns + ------- + g : Graph + The updated graph with updated layout. + """ + if isinstance(layout, dict): + list_layout = [ + layout.get(name, "__undef__") for name in g.index.input_names] + elif isinstance(layout, str): + list_layout = ["__undef__"] * len(g.index.input_names) + list_layout[0] = layout + else: + raise ValueError("Input layout must be str or dict") + last_inferred_layouts = g.json_attr("layout") + if last_inferred_layouts: + input_layout = [last_inferred_layouts[g.index.entry_id(x)] for x in g.index.input_names] + for i, layout_stored in enumerate(input_layout): + list_layout[i] = list_layout[i] if list_layout[i] != '__undef__' else layout_stored + g._set_json_attr("layout_inputs", list_layout, 'list_layout') + return g + +_move_out_module = tvm.get_global_func("nnvm.graph._move_module") +_move_out_graph = tvm.get_global_func("nnvm.graph._move_graph") diff --git a/nnvm/python/nnvm/compiler/graph_pass.py b/nnvm/python/nnvm/compiler/graph_pass.py new file mode 100644 index 000000000000..a11a80e43fe4 --- /dev/null +++ b/nnvm/python/nnvm/compiler/graph_pass.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Namespace of graph pass. + +Principle: +- Graph in, graph out: always takes in graph as first argument and returns a graph +- Composable API: break graph transformation pass as segments of small transformations. 
+""" +from __future__ import absolute_import as _abs diff --git a/nnvm/python/nnvm/compiler/graph_util.py b/nnvm/python/nnvm/compiler/graph_util.py new file mode 100644 index 000000000000..3ce38dacacc3 --- /dev/null +++ b/nnvm/python/nnvm/compiler/graph_util.py @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Utility function to get information from graph.""" +from __future__ import absolute_import as _abs + +import tvm +from . import graph_attr + +from ..graph import create +from ..symbol import Group, ones_like + +def infer_shape(graph, **shape): + """Infer the shape given the shape of inputs. + + Parameters + ---------- + graph : Graph + The graph to perform shape inference from + + shape : dict of str to tuple + The specific input shape. + + Returns + ------- + in_shape : list of tuple + Shape of inputs + + out_shape: list of tuple + Shape of outputs + """ + graph = graph_attr.set_shape_inputs(graph, shape) + graph = graph.apply("InferShape") + shape = graph.json_attr("shape") + index = graph.index + input_shape = [shape[index.entry_id(x)] for x in index.input_names] + output_shape = [shape[index.entry_id(x)] for x in index.output_entries] + return input_shape, output_shape + + +def infer_dtype(graph, **dtype): + """Infer the type given the typeS of inputs. + + Parameters + ---------- + graph : Graph + The graph to perform type inference from + + dtype : dict of str to dtype + The specific input data type. + + Returns + ------- + in_dtype : list of tuple + Dtype of inputs + + out_dtype: list of tuple + Dtype of outputs + """ + graph = graph_attr.set_dtype_inputs(graph, dtype) + graph = graph.apply("InferType") + dtype = graph.json_attr("dtype") + index = graph.index + input_dtype = [graph_attr.TCODE_TO_DTYPE[dtype[index.entry_id(x)]] + for x in index.input_names] + output_dtype = [graph_attr.TCODE_TO_DTYPE[dtype[index.entry_id(x)]] + for x in index.output_entries] + return input_dtype, output_dtype + + +_deep_compare = tvm.get_global_func("nnvm.graph.DeepCompare") + +def check_graph_equal(grapha, graphb, compare_variable_attrs=False): + """Check if two graphs have equal structure. + + Parameters + ---------- + grapha : Graph + The first graph + + graphb : Graph + The second graph + + compare_variable_attrs : bool, optional + Whether we want to compare attributes(names) on variables. + Usually it is safe to skip it unless we want input name + to exactly match + + Raises + ------ + ValueError + ValueError is raised with error message when graph not equal + """ + err = _deep_compare(grapha, graphb, compare_variable_attrs) + if err: + raise ValueError("Graph compare error: " + err) + +def get_gradient_graph(ys, xs, grad_ys=None): + """Create gradient graph of ys with respect to xs. 
+ + Parameters + ---------- + ys : Symbol or list of Symbol + Symbols from which the gradient is calculated. + xs : Symbol or list of Symbol + Symbols the gradient respect to. + For group symbol, gradients for all outputs will be calculated. + grad_ys : Symbol or list of Symbol + Head gradients for ys. + + Returns + ------- + ret : Graph + Generated gradient graph. + """ + if isinstance(ys, list): + ys = Group(ys) + g = create(ys) + g._set_symbol_list_attr('grad_ys', ys) + g._set_symbol_list_attr('grad_xs', xs) + ny = len(ys.list_output_names()) + if grad_ys is None: + grad_ys = [ones_like(ys[i]) for i in range(ny)] + g._set_symbol_list_attr('grad_ys_out_grad', grad_ys) + return g.apply('Gradient') + +def gradients(ys, xs, grad_ys=None): + """Create gradient symbol of ys respect to xs. + + Parameters + ---------- + ys : Symbol or list of Symbol + Symbols from which the gradient is calculated. + xs : Symbol or list of Symbol + Symbols the gradient respect to. + For group symbol, gradients for all outputs will be calculated. + grad_ys : Symbol or list of Symbol + Head gradients for ys. + + Returns + ------- + ret : list of Symbol + Generated gradient symbol. For each xs, + all gradients from ys are merged into a single symbol. + """ + grad_g = get_gradient_graph(ys, xs, grad_ys) + nx = len(Group(xs).list_output_names()) \ + if isinstance(xs, list) else len(xs.list_output_names()) + ret = [grad_g.symbol[i] for i in range(nx)] + return ret diff --git a/nnvm/python/nnvm/compiler/lr_scheduler.py b/nnvm/python/nnvm/compiler/lr_scheduler.py new file mode 100644 index 000000000000..3a33f390b6f4 --- /dev/null +++ b/nnvm/python/nnvm/compiler/lr_scheduler.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=too-few-public-methods, no-member +"""API for scheduling learning rate.""" +from .. import symbol as sym + +class LRScheduler(object): + """Base class of a learning rate scheduler. + + A scheduler returns a new learning rate based on the number of updates that have + been performed. + + Parameters + ---------- + base_lr : float, optional + The initial learning rate. + """ + def __init__(self, base_lr=0.01, name='LRScheduler'): + self.name = name + self.base_lr = base_lr + + def __call__(self, num_update): + """Return a new learning rate based on number of updates. + + Parameters + ---------- + num_update: nnvm Symbol + the number of updates applied to weight. + """ + raise NotImplementedError("__call__ method must be overridden.") + +class FactorScheduler(LRScheduler): + """Reduce the learning rate by a factor for every *n* steps. + + It returns a new learning rate by:: + + base_lr * pow(factor, num_update/step) + + Parameters + ---------- + step : int + Changes the learning rate for every n updates. 
+    factor : float, optional
+        The factor to change the learning rate.
+    stop_factor_lr : float, optional
+        Stop updating the learning rate if it is less than this value.
+    """
+    def __init__(self, step, factor=1, stop_factor_lr=1e-8, name='FactorScheduler', **kwargs):
+        super(FactorScheduler, self).__init__(name=name, **kwargs)
+        if step < 1:
+            raise ValueError("Schedule step must be greater than or equal to 1")
+        if factor > 1.0:
+            raise ValueError("Factor must be no greater than 1 for the learning rate to decrease")
+        self.step = step
+        self.factor = factor
+        self.stop_factor_lr = stop_factor_lr
+
+    def __call__(self, num_update):
+        updated_lr = self.base_lr * self.factor ** (num_update / self.step)
+        return sym.clip(updated_lr, a_min=self.stop_factor_lr, a_max=self.base_lr)
diff --git a/nnvm/python/nnvm/compiler/optimizer.py b/nnvm/python/nnvm/compiler/optimizer.py
new file mode 100644
index 000000000000..ba739b8c7056
--- /dev/null
+++ b/nnvm/python/nnvm/compiler/optimizer.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-member, too-few-public-methods, too-many-arguments, too-many-locals, protected-access
+"""Optimizer API"""
+from . import graph_util
+from .. import symbol as sym
+
+class Optimizer(object):
+    """Base class inherited by all optimizers.
+
+    Parameters
+    ----------
+    learning_rate : float, optional
+        The initial learning rate.
+
+    lr_scheduler : LRScheduler, optional
+        The learning rate scheduler.
+
+    rescale_grad : float, optional
+        Multiply the gradient by `rescale_grad` before updating. Often
+        chosen to be ``1.0/batch_size``.
+
+    clip_gradient : float, optional
+        Clip the gradient by projecting onto the box ``[-clip_gradient, clip_gradient]``.
+
+    wd : float, optional
+        The weight decay (or L2 regularization) coefficient. Modifies the objective
+        by adding a penalty for having large weights.
+
+    name : string, optional
+        The name of the optimizer.
+    """
+    def __init__(self, learning_rate=0.01, lr_scheduler=None,
+                 rescale_grad=1, clip_gradient=None, wd=0, name="Optimizer"):
+        self.name = name
+        self.lr = learning_rate
+        self.lr_scheduler = lr_scheduler
+        self.rescale_grad = rescale_grad
+        self.clip_gradient = clip_gradient
+        self.wd = wd
+        init_update_t = sym.Variable(name+'_t', init=sym.zeros(shape=(1,), dtype="int32"))
+        self.update_t = sym._assign(init_update_t, init_update_t + 1)
+
+    def minimize(self, obj, var=None):
+        """Minimize the given obj symbol with respect to var. If var is not set,
+        all input variables of obj will be used.
+
+        Parameters
+        ----------
+        obj : nnvm Symbol or list of nnvm Symbols
+            Symbols to be minimized.
+        var : nnvm Symbol or list of nnvm Symbols, optional
+            Symbols to take the gradient with respect to.
+
+        Returns
+        -------
+        group_sym : nnvm Symbol
+            Group symbol representing the update symbols.
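+
+        Examples
+        --------
+        A minimal usage sketch; assumes ``loss`` is a scalar nnvm symbol
+        built from the network outputs:
+
+        .. code-block:: python
+
+            opt = SGD(learning_rate=0.1)
+            update = opt.minimize(loss)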
+ """ + raise NotImplementedError() + + def _get_lr(self): + """Gets the learning rate with learning rate scheduler. + + Returns + ------- + lr : float + Learning rate. + """ + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.update_t) + else: + lr = self.lr + return lr + + +class SGD(Optimizer): + """The SGD optimizer + """ + def __init__(self, name='SGD', **kwargs): + super(SGD, self).__init__(name=name, **kwargs) + + def minimize(self, obj, var=None): + variables = var or obj.list_input_variables() + if not isinstance(variables, list): + variables = [variables] + grads = graph_util.gradients(obj, variables) + updates = [] + lr_t = self._get_lr() + for v, g in zip(variables, grads): + g = self.rescale_grad * g + if self.clip_gradient is not None: + g = sym.clip(g, a_min=-1 * self.clip_gradient, a_max=self.clip_gradient) + updates.append(sym._assign(v, v - lr_t * (g + self.wd * v))) + return sym.Group(updates) + + +class Adam(Optimizer): + """The Adam optimizer. + + This class implements the optimizer described in *Adam: A Method for + Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, + epsilon=1e-8, name='Adam', **kwargs): + super(Adam, self).__init__(learning_rate=learning_rate, name=name, **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.m = [] + self.v = [] + + def minimize(self, obj, var=None): + variables = var or obj.list_input_variables() + if not isinstance(variables, list): + variables = [variables] + grads = graph_util.gradients(obj, variables) + updates = [] + for i, v in enumerate(variables): + self.m.append(sym.Variable(self.name + '_m' + str(i), init=sym.zeros_like(v))) + self.v.append(sym.Variable(self.name + '_v' + str(i), init=sym.zeros_like(v))) + rate = sym.sqrt(1 - self.beta2 ** self.update_t) / (1 - self.beta1 ** self.update_t) + lr_t = self._get_lr() * rate + for variable, g, m, v in zip(variables, grads, self.m, self.v): + g = self.rescale_grad * g + if self.clip_gradient is not None: + g = sym.clip(g, a_min=-1 * self.clip_gradient, a_max=self.clip_gradient) + update_m = sym._assign(m, self.beta1 * m + (1 - self.beta1) * g) + update_v = sym._assign(v, self.beta2 * v + (1 - self.beta2) * g * g) + update_var = sym._assign(variable, variable - lr_t * (update_m / (sym.sqrt(update_v) \ + + self.epsilon) + self.wd * variable)) + updates.append(update_var) + return sym.Group(updates) diff --git a/nnvm/python/nnvm/compiler/param_dict.py b/nnvm/python/nnvm/compiler/param_dict.py new file mode 100644 index 000000000000..a543e0a827b3 --- /dev/null +++ b/nnvm/python/nnvm/compiler/param_dict.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name
+"""Helper utility to save parameter dict"""
+import tvm
+
+_save_param_dict = tvm.get_global_func("nnvm.compiler._save_param_dict")
+_load_param_dict = tvm.get_global_func("nnvm.compiler._load_param_dict")
+
+def save_param_dict(params):
+    """Save parameter dictionary to binary bytes.
+
+    The resulting binary bytes can be loaded by the
+    GraphModule with the API "load_params".
+
+    Parameters
+    ----------
+    params : dict of str to NDArray
+        The parameter dictionary.
+
+    Returns
+    -------
+    param_bytes : bytearray
+        Serialized parameters.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        # compile the graph into a deployable module.
+        graph, lib, params = nnvm.compiler.build(
+            graph, target, shape={"data": data_shape}, params=params)
+        module = graph_runtime.create(graph, lib, tvm.gpu(0))
+        # save the parameters as byte array
+        param_bytes = nnvm.compiler.save_param_dict(params)
+        # We can serialize the param_bytes and load it back later.
+        # Pass in byte array to module to directly set parameters
+        module["load_params"](param_bytes)
+    """
+    args = []
+    for k, v in params.items():
+        args.append(k)
+        args.append(tvm.nd.array(v))
+    return _save_param_dict(*args)
+
+
+def load_param_dict(param_bytes):
+    """Load parameter dictionary from binary bytes.
+
+    Parameters
+    ----------
+    param_bytes : bytearray
+        Serialized parameters.
+
+    Returns
+    -------
+    params : dict of str to NDArray
+        The parameter dictionary.
+    """
+    if isinstance(param_bytes, (bytes, str)):
+        param_bytes = bytearray(param_bytes)
+    load_arr = _load_param_dict(param_bytes)
+    return {v.name : v.array for v in load_arr}
diff --git a/nnvm/python/nnvm/contrib.py b/nnvm/python/nnvm/contrib.py
new file mode 100644
index 000000000000..c3e943682db5
--- /dev/null
+++ b/nnvm/python/nnvm/contrib.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Module space to register contrib functions. Leave empty"""
diff --git a/nnvm/python/nnvm/cython/README b/nnvm/python/nnvm/cython/README
new file mode 100644
index 000000000000..d9deab1abca9
--- /dev/null
+++ b/nnvm/python/nnvm/cython/README
@@ -0,0 +1 @@
+Cython specific implementation of certain modules
\ No newline at end of file
diff --git a/nnvm/python/nnvm/cython/base.pyi b/nnvm/python/nnvm/cython/base.pyi
new file mode 100644
index 000000000000..40ef71a20546
--- /dev/null
+++ b/nnvm/python/nnvm/cython/base.pyi
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ctypedef void* SymbolHandle +ctypedef void* OpHandle +ctypedef unsigned nn_uint + +cdef py_str(const char* x): + if PY_MAJOR_VERSION < 3: + return x + else: + return x.decode("utf-8") + + +cdef c_str(pystr): + """Create ctypes char * from a python string + Parameters + ---------- + string : string type + python string + + Returns + ------- + str : c_char_p + A char pointer that can be passed to C API + """ + return pystr.encode("utf-8") + + +cdef CALL(int ret): + if ret != 0: + raise NNVMError(NNGetLastError()) + + +cdef const char** CBeginPtr(vector[const char*]& vec): + if (vec.size() != 0): + return &vec[0] + else: + return NULL + +cdef vector[const char*] SVec2Ptr(vector[string]& vec): + cdef vector[const char*] svec + svec.resize(vec.size()) + for i in range(vec.size()): + svec[i] = vec[i].c_str() + return svec + + +cdef BuildDoc(nn_uint num_args, + const char** arg_names, + const char** arg_types, + const char** arg_descs, + remove_dup=True): + """Convert ctypes returned doc string information into parameters docstring. + + num_args : nn_uint + Number of arguments. + + arg_names : ctypes.POINTER(ctypes.c_char_p) + Argument names. + + arg_types : ctypes.POINTER(ctypes.c_char_p) + Argument type information. + + arg_descs : ctypes.POINTER(ctypes.c_char_p) + Argument description information. + + remove_dup : boolean, optional + Whether remove duplication or not. + + Returns + ------- + docstr : str + Python docstring of parameter sections. + """ + param_keys = set() + param_str = [] + for i in range(num_args): + key = arg_names[i] + if key in param_keys and remove_dup: + continue + param_keys.add(key) + type_info = arg_types[i] + ret = '%s : %s' % (key, type_info) + if len(arg_descs[i]) != 0: + ret += '\n ' + py_str(arg_descs[i]) + param_str.append(ret) + doc_str = ('Parameters\n' + + '----------\n' + + '%s\n') + doc_str = doc_str % ('\n'.join(param_str)) + return doc_str diff --git a/nnvm/python/nnvm/cython/symbol.pyx b/nnvm/python/nnvm/cython/symbol.pyx new file mode 100644 index 000000000000..eedf2afbbc2a --- /dev/null +++ b/nnvm/python/nnvm/cython/symbol.pyx @@ -0,0 +1,233 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+from __future__ import absolute_import as _abs
+
+import sys as _sys
+import ctypes as _ctypes
+from numbers import Number as _Number
+from .._base import NNVMError
+from ..name import NameManager
+from ..attribute import AttrScope
+from libcpp.vector cimport vector
+from libcpp.string cimport string
+from cpython.version cimport PY_MAJOR_VERSION
+
+include "./base.pyi"
+
+cdef extern from "nnvm/c_api.h":
+    const char* NNGetLastError();
+    int NNListAllOpNames(nn_uint *out_size,
+                         const char ***out_array);
+    int NNGetOpHandle(const char *op_name,
+                      OpHandle *handle);
+    int NNGetOpInfo(OpHandle op,
+                    const char **name,
+                    const char **description,
+                    nn_uint *num_doc_args,
+                    const char ***arg_names,
+                    const char ***arg_type_infos,
+                    const char ***arg_descriptions,
+                    const char **return_type);
+    int NNListOpNames(nn_uint *out_size,
+                      const char ***out_array);
+    int NNSymbolCreateAtomicSymbol(OpHandle op,
+                                   nn_uint num_param,
+                                   const char **keys,
+                                   const char **vals,
+                                   SymbolHandle *out);
+    int NNSymbolFree(SymbolHandle symbol);
+    int NNSymbolSetAttrs(SymbolHandle symbol,
+                         nn_uint num_param,
+                         const char** keys,
+                         const char** values);
+    int NNSymbolCompose(SymbolHandle sym,
+                        const char* name,
+                        nn_uint num_args,
+                        const char** keys,
+                        SymbolHandle* args);
+
+cdef class SymbolBase:
+    """Symbol is symbolic graph."""
+    # handle for symbolic operator.
+    cdef SymbolHandle handle
+
+    def __init__(self, handle):
+        cdef unsigned long ptr
+        if handle is None:
+            self.handle = NULL
+        else:
+            ptr = handle.value
+            self.handle = <SymbolHandle>(ptr)
+
+    def __dealloc__(self):
+        CALL(NNSymbolFree(self.handle))
+
+    @property
+    def handle(self):
+        return _ctypes.cast(<unsigned long>self.handle, _ctypes.c_void_p)
+
+    def _set_attr(self, **kwargs):
+        """Set the attribute of the symbol.
+
+        Parameters
+        ----------
+        **kwargs
+            The attributes to set
+        """
+        SymbolSetAttr(self.handle, kwargs)
+
+
+cdef SymbolSetAttr(SymbolHandle handle, dict kwargs):
+    cdef vector[string] sparam_keys
+    cdef vector[string] sparam_vals
+    cdef nn_uint num_args
+    for k, v in kwargs.items():
+        sparam_keys.push_back(c_str(k))
+        sparam_vals.push_back(c_str(str(v)))
+    # keep strings in vector
+    cdef vector[const char*] param_keys = SVec2Ptr(sparam_keys)
+    cdef vector[const char*] param_vals = SVec2Ptr(sparam_vals)
+    num_args = param_keys.size()
+    CALL(NNSymbolSetAttrs(
+        handle, num_args, CBeginPtr(param_keys), CBeginPtr(param_vals)))
+
+
+_symbol_cls = SymbolBase
+
+cdef _set_symbol_class(cls):
+    global _symbol_cls
+    _symbol_cls = cls
+
+cdef NewSymbol(SymbolHandle handle):
+    """Create a new symbol given handle"""
+    sym = _symbol_cls(None)
+    (<SymbolBase>sym).handle = handle
+    return sym
+
+cdef _make_atomic_symbol_function(OpHandle handle, string name):
+    """Create an atomic symbol function by handle and function name."""
+    cdef const char *real_name
+    cdef const char *desc
+    cdef nn_uint num_args
+    cdef const char** arg_names
+    cdef const char** arg_types
+    cdef const char** arg_descs
+    cdef const char* return_type
+
+    CALL(NNGetOpInfo(
+        handle, &real_name, &desc,
+        &num_args, &arg_names,
+        &arg_types, &arg_descs,
+        &return_type))
+
+    param_str = BuildDoc(num_args, arg_names, arg_types, arg_descs)
+    func_name = py_str(name.c_str())
+    doc_str = ('%s\n\n' +
+               '%s\n' +
+               'Returns\n' +
+               '-------\n' +
+               'result: Tensor\n' +
+               '    The result Tensor.')
+    doc_str = doc_str % (desc, param_str)
+    func_hint = func_name.lower()
+
+    def creator(*args, **kwargs):
+        cdef vector[string] sparam_keys
+        cdef vector[string] sparam_vals
+        cdef vector[SymbolHandle] symbol_args
+        cdef vector[string] ssymbol_keys
+        cdef SymbolHandle ret_handle
+
+        name = kwargs.pop("name", None)
+        attr = kwargs.pop("attr", None)
+
+        if len(kwargs) != 0:
+            for k, v in kwargs.items():
+                if isinstance(v, SymbolBase):
+                    ssymbol_keys.push_back(c_str(k))
+                    symbol_args.push_back((<SymbolBase>v).handle)
+                else:
+                    sparam_keys.push_back(c_str(k))
+                    sparam_vals.push_back(c_str(str(v)))
+
+        if len(args) != 0:
+            if symbol_args.size() != 0:
+                raise TypeError("compose only accepts input Symbols either as "
+                                "positional or keyword arguments, not both")
+            for v in args:
+                if not isinstance(v, SymbolBase):
+                    raise TypeError('Compose expect `Symbol` as arguments')
+                symbol_args.push_back((<SymbolBase>v).handle)
+
+        cdef vector[const char*] param_keys = SVec2Ptr(sparam_keys)
+        cdef vector[const char*] param_vals = SVec2Ptr(sparam_vals)
+        cdef vector[const char*] symbol_keys = SVec2Ptr(ssymbol_keys)
+
+        CALL(NNSymbolCreateAtomicSymbol(
+            handle,
+            param_keys.size(),
+            CBeginPtr(param_keys),
+            CBeginPtr(param_vals),
+            &ret_handle))
+        num_args = <nn_uint>(symbol_args.size())
+
+        attr = AttrScope.current.get(attr)
+        if attr:
+            SymbolSetAttr(ret_handle, attr)
+        name = NameManager.current.get(name, func_hint)
+
+        cdef const char* c_name = NULL
+
+        if name:
+            name = c_str(name)
+            c_name = name
+
+        CALL(NNSymbolCompose(
+            ret_handle,
+            c_name,
+            num_args,
+            &symbol_keys[0] if symbol_keys.size() != 0 else NULL,
+            &symbol_args[0] if symbol_args.size() != 0 else NULL))
+        return NewSymbol(ret_handle)
+
+    creator.__name__ = func_name
+    creator.__doc__ = doc_str
+    return creator
+
+
+def _init_symbol_module(symbol_class, root_namespace):
+    """List and add all the atomic symbol functions to current module."""
+    cdef const char** op_name_ptrs
+    cdef nn_uint size
+    cdef vector[string] op_names
+    cdef OpHandle handle
+
+    _set_symbol_class(symbol_class)
+    CALL(NNListAllOpNames(&size, &op_name_ptrs))
+    for i in range(size):
+        op_names.push_back(string(op_name_ptrs[i]));
+    module_obj = _sys.modules["%s.symbol" % root_namespace]
+    module_internal = _sys.modules["%s._symbol_internal" % root_namespace]
+    for i in range(op_names.size()):
+        CALL(NNGetOpHandle(op_names[i].c_str(), &handle))
+        function = _make_atomic_symbol_function(handle, op_names[i])
+        if function.__name__.startswith('_'):
+            setattr(module_internal, function.__name__, function)
+            setattr(module_obj, function.__name__, function)
+        else:
+            setattr(module_obj, function.__name__, function)
diff --git a/nnvm/python/nnvm/frontend/__init__.py b/nnvm/python/nnvm/frontend/__init__.py
new file mode 100644
index 000000000000..61c294f2606f
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/__init__.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +"""NNVM frontends.""" +from __future__ import absolute_import +from .mxnet import from_mxnet +from .onnx import from_onnx +from .coreml import from_coreml +from .keras import from_keras +from .darknet import from_darknet +from .tensorflow import from_tensorflow +from .caffe2 import from_caffe2 diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py new file mode 100644 index 000000000000..f951db66b5a6 --- /dev/null +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -0,0 +1,471 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-self, invalid-name, line-too-long, unused-argument +"""Caffe2 frontend""" +from __future__ import absolute_import as _abs +import tvm +from nnvm import symbol as _sym +from .common import get_nnvm_op, Renamer, AttrConverter as AttrCvt +from .onnx_caffe2_utils import dimension_picker, dimension_constraint, infer_channels, revert_caffe2_pad +from . import onnx + +__all__ = ['from_caffe2'] + + +def _clean_up_pool_args(args): + """ A helper function to clean up common arguments in conv and pooling ops. + """ + assert isinstance(args, dict) + + if 'stride_h' in args and 'stride_w' in args: + assert 'stride' not in args and 'strides' not in args + args['strides'] = [args['stride_h'], args['stride_w']] + args.pop('stride_h') + args.pop('stride_w') + elif 'stride' in args: + args['strides'] = [args['stride'], args['stride']] + args.pop('stride') + + # rename 'kernel', 'kernels', to 'kernel_shape' + if 'kernel_h' in args and 'kernel_w' in args: + assert 'kernel' not in args and 'kernels' not in args + args['kernel_shape'] = [args['kernel_h'], args['kernel_w']] + args.pop('kernel_h') + args.pop('kernel_w') + elif 'kernel' in args: + args['kernel_shape'] = [args['kernel'], args['kernel']] + args.pop('kernel') + elif 'kernels' in args: + args['kernel_shape'] = args['kernels'] + args.pop('kernels') + + if 'pad_t' in args and 'pad_l' in args and 'pad_b' in args and 'pad_r' in args: + assert 'pad' not in args and 'pads' not in args + args['pads'] = [ + args['pad_t'], args['pad_l'], args['pad_b'], args['pad_r'] + ] + for pad in ['pad_t', 'pad_l', 'pad_b', 'pad_r']: + args.pop(pad) + elif 'pad' in args: + args['pads'] = [args['pad'], args['pad']] + args.pop('pad') + + if 'dilation_h' in args and 'dilation_w' in args: + assert 'dilation' not in args and 'dilations' not in args + args['dilations'] = [args['dilation_h'], args['dilation_w']] + args.pop('dilation_h') + args.pop('dilation_w') + elif 'dilation' in args: + args['dilations'] = [args['dilation'], args['dilation']] + args.pop('dilation') + + return args + + +class Caffe2OpConverter(object): + """ A helper class for holding Caffe2 op converters. + """ + + @classmethod + def get_converter(cls): + """ Get converter. 
+ + :return: converter, which should be `_impl`. + """ + + if hasattr(cls, '_impl'): + return getattr(cls, '_impl') + raise tvm.error.OpNotImplemented( + 'Operator {} is not implemented in frontend Caffe2.'.format(cls.__name__)) + + +_caffe2_internal_args = { + # nnpack args + 'algo', + 'convolution_transform_strategy', + 'float16_compute', + 'shared_buffer', + + # training args + 'init_params', + 'cudnn_exhaustive_search', + 'exhaustive_search', + + # training args + 'adj', + 'hwgq', + + # args that we don't care + 'legacy_pad', +} + + +class Pool(Caffe2OpConverter): + """ A helper class for pool op converters. + """ + + name = '' + + @classmethod + def _impl(cls, inputs, args, params): + _clean_up_pool_args(args) + if 'global_pooling' in args and args['global_pooling'] == 1: + op_name = dimension_picker('global_' + cls.name) + return get_nnvm_op(op_name(args))(*inputs) + + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'strides': 'strides', + }, + excludes={ + # TVM poolop does not support dilation + 'dilations', + }, + ignores=_caffe2_internal_args | {'global_pooling', 'order'}, + custom_check=dimension_constraint())(inputs, args, params) + + +class AveragePool(Pool): + name = 'avg_pool' + + +class MaxPool(Pool): + name = 'max_pool' + + +class Conv(Caffe2OpConverter): + """ Operator converter for Conv. + """ + + @classmethod + def _impl(cls, inputs, args, params): + # get number of channels + channels = infer_channels(inputs[1], params) + args['channels'] = channels + _clean_up_pool_args(args) + return AttrCvt( + op_name=dimension_picker('conv'), + transforms={ + 'group': ('groups', 1), + 'kernel_shape': + 'kernel_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'strides': + 'strides', + 'dilations': ('dilation', (1, 1)), + 'order': + ('layout', ("NCHW"), + lambda x: x if isinstance(x, str) else x.decode('UTF-8')), + }, + excludes={}, + ignores=_caffe2_internal_args, + extras={'use_bias': len(inputs) == 3}, + custom_check=dimension_constraint())(inputs, args, params) + + +class Concat(Caffe2OpConverter): + """ Operator converter for Concat. + """ + + @classmethod + def _impl(cls, inputs, args, params): + def _get_axis_from_order_str(order): + order = order if isinstance(order, str) else order.decode('UTF-8') + if order == 'NCHW': + return 1 + if order == 'NHWC': + return 3 + raise tvm.error.OpAttributeInvalid('Value {} in attribute {} of operator {} is not valid.'.format(order, 'order', 'Concat')) + + return AttrCvt( + op_name='concatenate', + transforms={ + 'order': ('axis', (1), _get_axis_from_order_str), + }, + excludes={ + 'add_axis', + })(inputs, args, params) + + +class NormalizePlanarYUV(Caffe2OpConverter): + """ Operator converter for NormalizePlanarYUV. + caffe2 definition: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/norm_planar_yuv_op.cc + """ + + @classmethod + def _impl(cls, inputs, args, params): + assert len(inputs) == 3 + mean = _sym.expand_dims(inputs[1], axis=2, num_newaxis=2) + std = _sym.expand_dims(inputs[2], axis=2, num_newaxis=2) + + return _sym.broadcast_div(_sym.broadcast_sub(inputs[0], mean), std) + + +class ResizeNearest(Caffe2OpConverter): + """ Operator converter for Upsample (nearest mode). 
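+    Only equal ``width_scale`` and ``height_scale`` are supported, since both
+    are mapped onto the single ``upsampling`` scale below.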
+ """ + + @classmethod + def _impl(cls, inputs, args, params): + width_scale = args['width_scale'] if 'width_scale' in args else 1 + height_scale = args['height_scale'] if 'height_scale' in args else 1 + assert width_scale == height_scale + + return _sym.upsampling( + inputs[0], scale=int(width_scale), method="NEAREST_NEIGHBOR") + + +class FC(Caffe2OpConverter): + """ Operator converter for FC. + """ + + @classmethod + def _impl(cls, inputs, args, params): + inputs[0] = _sym.flatten(inputs[0]) + args['units'] = infer_channels(inputs[1], params) + return AttrCvt( + 'dense', + ignores=['axis', 'axis_w'], + extras={'use_bias': len(inputs) == 3}, + )(inputs, args, params) + + +class SpatialBN(Caffe2OpConverter): + """ Operator converter for SpatialBN. + """ + + @classmethod + def _impl(cls, inputs, args, params): + return AttrCvt( + op_name='batch_norm', + disables=['momentum'], + ignores=[ + 'order', 'spatial', 'is_test', 'consumed_inputs', 'num_batches' + ])(inputs, args, params) + + +# compatible operators that do NOT require any conversion. +_identity_list = [] + +# _convert_map defines maps of name to converter functor(callable) +# for 1 to 1 mapping, use Renamer if nothing but name is different +# use AttrCvt if attributes need to be converted +# for 1 to N mapping(composed), use custom callable functions +# for N to 1 mapping, currently not supported(?) + +# Minimal set of ops for squeezenet and resnet50 +def _get_convert_map(): + return { + # caffe2/onnx common operators + 'Add': onnx.Add.get_converter(opset=1), + 'Sum': onnx.Sum.get_converter(opset=1), + 'Softmax': onnx.Softmax.get_converter(opset=1), + + # nn + 'AveragePool': AveragePool.get_converter(), + 'MaxPool': MaxPool.get_converter(), + 'Conv': Conv.get_converter(), + 'Concat': Concat.get_converter(), + 'FC': FC.get_converter(), + 'SpatialBN': SpatialBN.get_converter(), + 'ResizeNearest': ResizeNearest.get_converter(), + 'Relu': AttrCvt('relu', {}, ignores=['order']), + 'Sigmoid': Renamer('sigmoid'), + 'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']), + + # c2 image preprocessing ops + 'NormalizePlanarYUV': NormalizePlanarYUV.get_converter(), + } + + +class Caffe2NetDef(object): + """A helper class for handling nnvm graph copying from pb2.GraphProto. + Definition: https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto + """ + + def __init__(self): + self._nodes = {} + self._params = {} + self._visited_nodes = set() + self._ops = {} + + def from_caffe2(self, init_net, predict_net): + """Construct nnvm nodes from caffe2 graph. 
+ + Parameters + ---------- + workspace : Caffe2 workspace + predict_net : protobuf object + + Returns + ------- + sym : nnvm.sym.Symbol + The returned nnvm symbol + params : dict + A dict of name: tvm.nd.array pairs, used as pretrained weights + """ + from caffe2.python import workspace + workspace.RunNetOnce(init_net) + + # Input + input_name = predict_net.op[0].input[0] + + # Params + self._params = {} + used_blobs = set() + for c2_op in predict_net.op: + for i in c2_op.input: + used_blobs.add(i) + for blob in workspace.Blobs(): + if blob in used_blobs and blob != input_name: + self._params[blob] = tvm.nd.array(workspace.FetchBlob(blob)) + + # Variables + self._nodes = {} + for blob in predict_net.external_input: + self._nodes[blob] = _sym.Variable(name=blob) + + # Ops + for c2_op in predict_net.op: + for blob in c2_op.output: + self._ops[blob] = c2_op + for c2_op in predict_net.op: + self._process_op(c2_op) + + # Outputs + out = [] + for blob in predict_net.external_output: + out.append(self._nodes[blob]) + + if len(out) > 1: + sym = _sym.Group(out) + else: + sym = out[0] + + return sym, self._params + + def _get_node(self, blob): + """Get the nnvm Symbol of blob and detect cyclic dependency in the graph.""" + if blob in self._nodes: + return self._nodes[blob] + + assert blob not in self._visited_nodes, 'Cyclic dependency in the graph (in {})'.format( + blob) + self._visited_nodes.add(blob) + + self._process_op(self._ops[blob]) + return self._nodes[blob] + + def _process_op(self, c2_op): + op_type = c2_op.type + args = self._parse_arg(c2_op.arg) + inputs = [self._get_node(i) for i in c2_op.input] + tvm_op = self._convert_operator(op_type, inputs, args) + # Ignore all outputs except the first one + self._nodes[c2_op.output[0]] = tvm_op[0] + + def _parse_arg(self, arg): + """Convert a list of Argument to a dict, with names as keys.""" + args = {} + for a in arg: + for f in ['f', 'i', 's']: + if a.HasField(f): + args[a.name] = getattr(a, f) + for f in ['floats', 'ints', 'strings']: + if list(getattr(a, f)): + assert a.name not in args, "Only one type of attr is allowed" + args[a.name] = tuple(getattr(a, f)) + for f in ['n']: + if a.HasField(f): + raise NotImplementedError( + "Field {} is not supported in nnvm.".format(f)) + for f in ['nets']: + if list(getattr(a, f)): + raise NotImplementedError( + "Field {} is not supported in nnvm.".format(f)) + if a.name not in args: + raise ValueError("Cannot parse attribute: \n{}\n.".format(a)) + return args + + def _convert_operator(self, + op_type, + inputs, + args, + identity_list=None, + convert_map=None): + """Convert from Caffe2 operator to nnvm operator. + The converter must specify conversions explicitly for incompatible name, and + apply handlers to operator attributes. + + Parameters + ---------- + op_type : str + Operator name, such as Convolution, FullyConnected + inputs : list of nnvm.Symbol + List of input symbols. 
+        args : dict
+            Dict of operator attributes
+        identity_list : list
+            List of operators that don't require conversion
+        convert_map : dict
+            Dict of name : callable, where name is the name of an op that
+            requires conversion to nnvm, and the callables are functions which
+            take args and return (new_op_type, new_args)
+
+        Returns
+        -------
+        sym : nnvm.Symbol
+            Converted nnvm Symbol
+        """
+        identity_list = identity_list if identity_list else _identity_list
+        convert_map = convert_map if convert_map else _get_convert_map()
+        if op_type in identity_list:
+            sym = get_nnvm_op(op_type)(*inputs, **args)
+        elif op_type in convert_map:
+            # Add a sanitizing step to convert all byte strings in args to strings
+            sym = convert_map[op_type](inputs, args, self._params)
+        else:
+            raise tvm.error.OpNotImplemented(
+                'Operator {} is not supported in frontend Caffe2.'.format(op_type))
+        return sym
+
+
+def from_caffe2(init_net, predict_net):
+    """Load caffe2 graph which contains init_net and predict_net into nnvm graph.
+
+    Parameters
+    ----------
+    init_net : protobuf object
+        Caffe2 NetDef containing the weights
+
+    predict_net : protobuf object
+        Caffe2 NetDef containing the graph
+
+    Returns
+    -------
+    sym : nnvm.Symbol
+        Compatible nnvm symbol
+
+    params : dict of str to tvm.ndarray
+        Dict of converted parameters stored in tvm.ndarray format
+    """
+
+    caffe2 = Caffe2NetDef()
+    return caffe2.from_caffe2(init_net, predict_net)
diff --git a/nnvm/python/nnvm/frontend/common.py b/nnvm/python/nnvm/frontend/common.py
new file mode 100644
index 000000000000..0e09a2c43323
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/common.py
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Shared functions and classes for frontends."""
+from __future__ import absolute_import as _abs
+import logging
+from tvm.error import OpNotImplemented, OpAttributeRequired
+from nnvm import sym as _sym
+from .._base import string_types
+
+def get_nnvm_op(op_name):
+    op = getattr(_sym, op_name, None)
+    if not op:
+        raise OpNotImplemented(
+            'Operator {} is not supported.'.format(op_name))
+    return op
+
+def required_attr(attr, key, op_name):
+    assert isinstance(attr, dict)
+    if key not in attr:
+        raise OpAttributeRequired(
+            'Required attribute {} not found in operator {}'.format(key, op_name))
+    return attr[key]
+
+def parse_tshape(tshape):
+    """Parse tshape in string."""
+    return [int(x.strip()) for x in tshape.strip('()').split(',')]
+
+def parse_bool_str(attr, key, default='False'):
+    """Parse bool string to boolean."""
+    return attr.get(key, default).strip().lower() in ['true', '1', 't', 'y', 'yes']
+
+class Renamer(object):
+    """A simple renamer for operators.
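+
+    For example, ``Renamer('sigmoid')`` is used by the Caffe2 convert map to
+    map the ``Sigmoid`` op directly onto ``nnvm.sym.sigmoid``.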
+ + Parameters + ---------- + new_name : str + The new name for the operator + """ + def __init__(self, new_name): + self._new_name = new_name + + def __call__(self, inputs, attrs, *args): + return get_nnvm_op(self._new_name)(*inputs, **attrs) + + +class AttrConverter(object): + """Common attribute converter. An AttrConverter instance is a callable: + ``` + attr_converter = AttrConverter(op_name, transforms={'a':'b', 'c':('d', 1)}) + new_op_name, new_attr = attr_converter(attrs) + ``` + + Parameters + ---------- + op_name : str or callable + If set as str, returned operator name is the str. + If set as callable, returned operator is the str returned by calling: + `op_name = func(attr)` + transforms : dict of `new_name, or (new_name, default_value, transform function)` + If only a new_name is provided, it's like renaming the attribute name. + If default_value if provided, then the attribute is considered as optional. + If transform function is provided, the original attribute value is handled + by transform function. + excludes : list + A list of excluded attributes that should `NOT` appear. + Raise NotImplementedError if occurred. + disables : list + A list of attributes that is disabled in nnvm. Log warnings. + ignores : list + A list of attributes that is ignored in nnvm. Debug level logging. + extras : dict + A series of additional attributes should be added anyway to the returned + attribute dict. + custom_check : callable + A custom function takes attribute, and return True/False. + Raise RuntimeError if not bool(True) returned. + """ + def __init__(self, op_name, transforms=None, + excludes=None, disables=None, ignores=None, + extras=None, custom_check=None): + self._op_name = op_name + self._transforms = transforms if transforms else {} + self._excludes = excludes if excludes else [] + self._disables = disables if disables else [] + self._ignores = ignores if ignores else [] + self._extras = extras if extras else {} + self._custom_check = custom_check + + def __call__(self, inputs, attrs, *args): + # apply custom check + if self._custom_check: + func, msg = self._custom_check + if not func(attrs): + raise RuntimeError("Check failed: {}".format(msg)) + # get new op_name + if isinstance(self._op_name, string_types): + op_name = self._op_name + else: + assert callable(self._op_name), "op_name can either be string or callable" + op_name = self._op_name(attrs) + # convert attributes + new_attrs = {} + for k in attrs.keys(): + if k in self._excludes: + raise NotImplementedError("Attribute {} not supported yet.".format(k)) + elif k in self._disables: + logging.warning("Attribute %s is disabled in nnvm.sym.%s", k, op_name) + elif k in self._ignores: + logging.debug("Attribute %s is ignored in nnvm.sym.%s", k, op_name) + elif k in self._transforms: + new_name, defaults, transform = self._parse_default(self._transforms[k]) + if defaults is None: + new_attr = self._required_attr(attrs, k) + else: + new_attr = attrs.get(k, None) + if new_attr is None: + new_attrs[new_name] = defaults + else: + new_attrs[new_name] = transform(new_attr) + else: + # copy + new_attrs[k] = attrs[k] + # add extras + new_attrs.update(self._extras) + return get_nnvm_op(op_name)(*inputs, **new_attrs) + + def _parse_default(self, target): + """Helper function to parse default values.""" + if not isinstance(target, (list, tuple)): + k, v, t = target, None, lambda x: x + elif len(target) == 1: + k, v, t = target[0], None, lambda x: x + elif len(target) == 2: + k, v, t = target[0], target[1], lambda x: x + elif len(target) 
> 2: + k, v, t = target[0], target[1], target[2] + else: + k = None # should raise + if not isinstance(k, string_types): + msg = "{} is not a valid target, (name, default) expected.".format(target) + raise ValueError(msg) + return k, v, t + + def _parse_bool(self, value): + """Helper function to parse default boolean values.""" + if isinstance(value, string_types): + return value.strip().lower() in ['true', '1', 't', 'y', 'yes'] + return bool(value) + + def _required_attr(self, attr, key): + """Wrapper for getting required attributes.""" + assert isinstance(attr, dict) + if key not in attr: + raise AttributeError("Required attribute {} not found.".format(key)) + return attr[key] + + +class SymbolTable(object): + """Table storing symbols by names.""" + def __init__(self): + self.vars = {} + self.params = {} + self.const_ctr = 1 + self.in_padding = False + self.paddings = [0, 0] + + def new_const(self, value): + name = "_param_%d" % (self.const_ctr) + self.const_ctr += 1 + self.params[name] = value + self.vars[name] = _sym.Variable(name=name) + return self.vars[name] + + def get_var(self, name, must_contain=True): + if must_contain: + assert name in self.vars + if name not in self.vars: + self.vars[name] = _sym.Variable(name=name) + return self.vars[name] + + def set_var(self, name, sym): + assert isinstance(sym, _sym.Symbol) + self.vars[name] = sym + + def set_padding(self, paddings): + self.paddings = paddings + self.in_padding = True + + def clear_padding(self): + self.in_padding = False diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py new file mode 100644 index 000000000000..c5b0c0a799ec --- /dev/null +++ b/nnvm/python/nnvm/frontend/coreml.py @@ -0,0 +1,431 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""CoreML frontend.""" +from __future__ import absolute_import as _abs +import numpy as np +import tvm +from .common import SymbolTable +from .. 
import symbol as _sym
+from .._base import string_types
+
+__all__ = ['from_coreml']
+
+
+def NeuralNetworkImageScaler(op, insym, symtab):
+    # this changes the symbol
+    biases = np.array([op.blueBias, op.greenBias, op.redBias]).reshape([3, 1, 1])
+    bias = symtab.new_const(biases)
+    ret = _sym.__mul_scalar__(insym, scalar=op.channelScale)
+    ret = _sym.broadcast_add(ret, bias)
+    return ret
+
+
+def NeuralNetworkMeanImage(op, insym, symtab):
+    # this changes the symbol
+    ret = _sym.elemwise_sub(insym, scalar=op.meanImage)
+    return ret
+
+
+def ConvolutionLayerParams(op, insym, symtab):
+    """Convolution layer params."""
+    weights = symtab.new_const(np.array(list(op.weights.floatValue)).reshape(
+        tuple([op.outputChannels, op.kernelChannels] + list(op.kernelSize))))
+    if op.hasBias:
+        biases = symtab.new_const(list(op.bias.floatValue))
+    dilation = list(op.dilationFactor)
+    if not dilation:
+        dilation = [1, 1]
+    params = {'channels':op.outputChannels,
+              'kernel_size':list(op.kernelSize),
+              'strides':list(op.stride),
+              'dilation': dilation,
+              'use_bias': op.hasBias,
+              'groups':op.nGroups}
+
+    if op.WhichOneof('ConvolutionPaddingType') == 'valid':
+        valid = op.valid
+        padding = [b.startEdgeSize for b in valid.paddingAmounts.borderAmounts]
+        padding2 = [b.endEdgeSize for b in valid.paddingAmounts.borderAmounts]
+        for i, j in zip(padding, padding2):
+            assert i == j, "Asymmetric padding is not supported"
+        if padding:
+            params['padding'] = padding
+    elif op.WhichOneof('ConvolutionPaddingType') == 'same':
+        kernel = params['kernel_size']
+        pad_h = kernel[0] - 1
+        pad_w = kernel[1] - 1
+        pad_t = pad_h // 2
+        pad_l = pad_w // 2
+        pad_b = pad_h - pad_t
+        pad_r = pad_w - pad_l
+        assert pad_t == pad_b and pad_l == pad_r, "Asymmetric padding is not supported"
+        params['padding'] = [pad_t, pad_l]
+    else:
+        raise NotImplementedError("Only 'valid' and 'same' convolution padding are implemented")
+
+    if op.hasBias:
+        pos = [insym, weights, biases]
+    else:
+        pos = [insym, weights]
+
+    # consume padding layer
+    if symtab.in_padding:
+        params['padding'] = [sum(x) for x in zip(params.get('padding', [0, 0]), symtab.paddings)]
+        symtab.clear_padding()
+
+    if op.isDeconvolution:
+        ret = _sym.conv2d_transpose(*pos, **params)
+    else:
+        ret = _sym.conv2d(*pos, **params)
+    return ret
+
+def BatchnormLayerParams(op, insym, symtab):
+    """Get batchnorm layer parameters."""
+    # this changes the symbol
+    if op.instanceNormalization:
+        msg = 'Operator "instance normalization" is not supported in frontend CoreML.'
+ raise tvm.error.OpNotImplemented(msg) + else: + params = {'gamma':symtab.new_const(list(op.gamma.floatValue)), + 'beta':symtab.new_const(list(op.beta.floatValue)), + 'moving_mean':symtab.new_const(list(op.mean.floatValue)), + 'moving_var': symtab.new_const(list(op.variance.floatValue)), + 'epsilon': op.epsilon} + return _sym.batch_norm(data=insym, **params) + +def ActivationParams(op, insym, symtab): + """Get activation parameters""" + whichActivation = op.WhichOneof('NonlinearityType') + par = getattr(op, whichActivation) + if whichActivation == 'linear': + return _sym.__add_scalar__(_sym.__mul_scalar__(insym, scalar=par.alpha), scalar=par.beta) + if whichActivation == 'ReLU': + return _sym.relu(insym) + if whichActivation == 'leakyReLU': + return _sym.leaky_relu(insym, alpha=par.alpha) + if whichActivation == 'thresholdedReLU': + alpha_tensor = _sym.full_like(insym, fill_value=float(par.alpha)) + return _sym.elemwise_mul(insym, _sym.greater(insym, alpha_tensor)) + if whichActivation == 'PReLU': + return _sym.prelu(insym, alpha=par.alpha) + if whichActivation == 'tanh': + return _sym.tanh(insym) + if whichActivation == 'scaledTanh': + return _sym.__mul_scalar__(_sym.tanh(_sym.__mul_scalar__( + insym, scalar=par.beta)), scalar=par.alpha) + if whichActivation == 'sigmoid': + return _sym.sigmoid(insym) + if whichActivation == 'sigmoidHard': + transformX = (par.alpha * insym) + par.beta + return _sym.clip(transformX, a_min=0, a_max=1) + if whichActivation == 'ELU': + return _sym.__mul_scalar__(_sym.__add_scalar__( + _sym.exp(insym), scalar=-1), scalar=par.alpha) + if whichActivation == 'softsign': + return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) + if whichActivation == 'softplus': + return _sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) + if whichActivation == 'parametricSoftplus': + alpha = list(par.alpha.floatValue) + beta = list(par.alpha.floatValue) + if len(alpha) == 1: + return _sym.__mul_scalar__(_sym.log(_sym.__add_scalar__( + _sym.exp(insym), scalar=beta[0])), scalar=alpha[0]) + alpha = np.array(alpha).reshape((len(alpha), 1, 1)) + beta = np.array(beta).reshape((len(beta), 1, 1)) + alphasym = symtab.new_const(alpha) + betasym = symtab.new_const(beta) + return _sym.broadcast_mul(_sym.log(_sym.broadcast_add( + _sym.exp(insym), betasym)), alphasym) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(whichActivation)) + +def ScaleLayerParams(op, insym, symtab): + """Scale layer params.""" + scale = symtab.new_const(np.array(list(op.scale.floatValue)).reshape( + tuple(list(op.shapeScale) + [1, 1]))) + # scale = _sym.reshape(scale, shape=tuple(list(op.shapeScale) + [1,1])) + ret = _sym.broadcast_mul(insym, scale) + if op.hasBias: + bias = symtab.new_const(np.array(list(op.bias.floatValue)).reshape( + tuple(list(op.shapeBias) + [1, 1]))) + # bias = _sym.reshape(bias, shape=tuple(list(op.shapeBias) + [1,1])) + ret = _sym.broadcast_add(ret, bias) + return ret + +def PoolingLayerParams(op, insym, symtab): + """get pooling parameters""" + if op.globalPooling: + if op.type == 0: + return _sym.global_max_pool2d(insym) + if op.type == 1: + return _sym.global_avg_pool2d(insym) + raise tvm.error.OpNotImplemented( + 'Operator pooling (not max or average) is not supported in frontend CoreML.') + + else: + params = {'pool_size':list(op.kernelSize), + 'strides':list(op.stride)} + + if op.WhichOneof('PoolingPaddingType') == 'valid': + valid = op.valid + padding = [b.startEdgeSize for b in valid.paddingAmounts.borderAmounts] 
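+ # borderAmounts holds one (startEdgeSize, endEdgeSize) pair per spatial axis; + # e.g. a symmetric one-pixel border arrives as start=[1, 1], end=[1, 1] and + # becomes padding=[1, 1]. Asymmetric borders are rejected below.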
+ padding2 = [b.endEdgeSize for b in valid.paddingAmounts.borderAmounts] + for i, j in zip(padding, padding2): + assert i == j + params['padding'] = padding + elif op.WhichOneof('PoolingPaddingType') == 'includeLastPixel': + # I don't know if this is correct + valid = op.includeLastPixel + padding = list(valid.paddingAmounts) + params['padding'] = padding + params['ceil_mode'] = True + else: + msg = 'Value {} in attribute PoolingPaddingType of operator Pooling is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(op.WhichOneof('PoolingPaddingType'))) + + # consume padding layer + if symtab.in_padding: + params['padding'] = [sum(x) for x in zip( + params.get('padding', [0, 0]), symtab.paddings)] + symtab.clear_padding() + + if op.type == 0: + return _sym.max_pool2d(insym, **params) + if op.type == 1: + return _sym.avg_pool2d(insym, **params) + msg = 'Operator pooling (not max or average) is not supported in frontend CoreML.' + raise tvm.error.OpNotImplemented(msg) + +def SoftmaxLayerParams(op, insym, symtab): + return _sym.softmax(_sym.flatten(insym)) + +def InnerProductLayerParams(op, insym, symtab): + weights = symtab.new_const(np.array(op.weights.floatValue).reshape( + (op.outputChannels, op.inputChannels))) + par = {'weight':weights, 'use_bias':False, 'units':op.outputChannels} + if op.hasBias: + bias = symtab.new_const(np.array(op.bias.floatValue)) + par['bias'] = bias + par['use_bias'] = True + return _sym.dense(data=insym, **par) + +def AddLayerParams(op, insyms, symtab): + if not isinstance(insyms, list): + insyms = [insyms] + ret = insyms[0] + for i in range(1, len(insyms)): + ret = _sym.elemwise_add(ret, insyms[i]) + if op.alpha > 0: + ret = _sym.__add_scalar__(ret, scalar=op.alpha) + return ret + +def MultiplyLayerParams(op, insyms, symtab): + if not isinstance(insyms, list): + insyms = [insyms] + ret = insyms[0] + for i in range(1, len(insyms)): + ret = _sym.elemwise_mul(ret, insyms[i]) + if op.alpha != 1: + ret = _sym.__mul_scalar__(ret, scalar=op.alpha) + return ret + +def ConcatLayerParams(op, insyms, symtab): + if not isinstance(insyms, list): + insyms = [insyms] + if op.sequenceConcat: + raise tvm.error.OpNotImplemented( + 'Operator Sequence Concat is not supported in frontend CoreML.') + ret = _sym.concatenate(*insyms, axis=1) + return ret + +def FlattenLayerParams(op, insym, symtab): + if op.mode == 1: + insym = _sym.transpose(_sym.reshape(insym, shape=(0, 0, -1)), axes=(0, 2, 1)) + return _sym.flatten(insym) + +def PaddingLayerParams(op, insym, symtab): + """Hacking for padding layer params.""" + if op.WhichOneof('PaddingType') == 'constant': + constant = op.constant + if constant.value != 0: + msg = 'Value {} in attribute "padding value" of operator Padding is not valid.' 
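+ # Only zero-valued constant padding is representable here: the layer is not + # emitted as its own operator but recorded via symtab.set_padding() and + # folded into the padding of the following conv/pool layer.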
+ raise tvm.error.OpAttributeInvalid(msg.format(constant.value)) + padding = [b.startEdgeSize for b in op.paddingAmounts.borderAmounts] + padding2 = [b.endEdgeSize for b in op.paddingAmounts.borderAmounts] + for i, j in zip(padding, padding2): + assert i == j + symtab.set_padding(padding) + else: + raise tvm.error.OpNotImplemented( + 'Operator "non-constant padding" is not supported in frontend CoreML.') + return insym + +def PermuteLayerParams(op, insym, symtab): + axes = tuple(op.axis) + return _sym.transpose(insym, axes=axes) + +def UpsampleLayerParams(op, insym, symtab): + if op.scalingFactor[0] != op.scalingFactor[1]: + raise tvm.error.OpAttributeInvalid( + 'Height and width scaling factors of Upsample operator must be equal.') + interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR' + return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode) + +def L2NormalizeLayerParams(op, insym, symtab): + return _sym.l2_normalize(insym, eps=op.epsilon, axis=1) + +def LRNLayerParams(op, insym, symtab): + par = {} + par['size'] = op.localSize + par['bias'] = op.k + par['alpha'] = op.alpha + par['beta'] = op.beta + par['axis'] = 1 #default layout is nchw + return _sym.lrn(data=insym, **par) + +def AverageLayerParams(op, insyms, symtab): + if not isinstance(insyms, list) or len(insyms) < 2: + raise ValueError("Expect minimum 2 inputs") + count = len(insyms) + _sum = insyms[0] + for i in range(1, count): + _sum = _sym.broadcast_add(_sum, insyms[i]) + return _sum / count + +def MaxLayerParams(op, insyms, symtab): + if not isinstance(insyms, list) or len(insyms) < 2: + raise ValueError("Expect minimum 2 inputs") + _max = insyms[0] + for i in range(1, len(insyms)): + _max = _sym.broadcast_max(_max, insyms[i]) + return _max + +def MinLayerParams(op, insyms, symtab): + if not isinstance(insyms, list) or len(insyms) < 2: + raise ValueError("Expect minimum 2 inputs") + _min = insyms[0] + for i in range(1, len(insyms)): + _min = _sym.broadcast_min(_min, insyms[i]) + return _min + +_convert_map = { + 'NeuralNetworkMeanImage': NeuralNetworkMeanImage, + 'NeuralNetworkImageScaler': NeuralNetworkImageScaler, + 'ConvolutionLayerParams':ConvolutionLayerParams, + 'BatchnormLayerParams':BatchnormLayerParams, + 'ActivationParams':ActivationParams, + 'ScaleLayerParams':ScaleLayerParams, + 'PoolingLayerParams':PoolingLayerParams, + 'SoftmaxLayerParams':SoftmaxLayerParams, + 'InnerProductLayerParams':InnerProductLayerParams, + 'AddLayerParams':AddLayerParams, + 'MultiplyLayerParams':MultiplyLayerParams, + 'FlattenLayerParams':FlattenLayerParams, + 'ConcatLayerParams':ConcatLayerParams, + 'PaddingLayerParams':PaddingLayerParams, + 'PermuteLayerParams':PermuteLayerParams, + 'UpsampleLayerParams':UpsampleLayerParams, + 'L2NormalizeLayerParams':L2NormalizeLayerParams, + 'LRNLayerParams':LRNLayerParams, + 'AverageLayerParams':AverageLayerParams, + 'MaxLayerParams':MaxLayerParams, + 'MinLayerParams':MinLayerParams, +} + +def coreml_op_to_nnvm(op, inname, outname, symtab): + """Convert coreml layer to nnvm layer. 
+ + Parameters + ---------- + op: a CoreML protobuf layer message + + inname: str or list of str + Name(s) of the input symbol(s) to read from the symbol table + + outname: str + Name to register the converted symbol under + + symtab: the shared SymbolTable + + Returns + ------- + None + The converted symbol is stored in symtab under outname. + """ + classname = type(op).__name__ + if classname not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(classname)) + if isinstance(inname, string_types): + insym = symtab.get_var(inname) + else: + insym = [symtab.get_var(i) for i in inname] + ret = _convert_map[classname](op, insym, symtab) + if outname: + symtab.set_var(outname, ret) + if classname != 'PaddingLayerParams': + assert not symtab.in_padding, "Previous padding not consumed by conv/pool" + +def from_coreml(model): + """Convert from coreml model into NNVM format. + + Parameters + ---------- + model: + coremltools.models.MLModel of a NeuralNetworkClassifier + + Returns + ------- + sym : nnvm.Symbol + Compatible nnvm symbol + + params : dict of str to tvm.NDArray + The parameter dict to be used by nnvm + """ + try: + import coremltools as cm + except ImportError: + raise ImportError('The coremltools package must be installed') + + assert isinstance(model, cm.models.MLModel) + spec = model.get_spec() + modeltype = spec.WhichOneof('Type') + assert modeltype in ['neuralNetworkClassifier', 'neuralNetwork', 'neuralNetworkRegressor'] + cc = getattr(spec, modeltype) + + symtab = SymbolTable() + for i in spec.description.input: + symtab.get_var(i.name, must_contain=False) + + for pp in cc.preprocessing: + whichpp = pp.WhichOneof('preprocessor') + ppmethod = getattr(pp, whichpp) + # the NeuralNetworkImageScaler doesn't seem to have a featureName? + if whichpp == 'scaler': + for i in spec.description.input: + coreml_op_to_nnvm(ppmethod, i.name, i.name, symtab) + else: + coreml_op_to_nnvm(ppmethod, pp.featureName, pp.featureName, symtab) + + for l in cc.layers: + layertype = l.WhichOneof('layer') + layerop = getattr(l, layertype) + assert len(l.output) == 1 + if len(l.input) == 1: + coreml_op_to_nnvm(layerop, l.input[0], l.output[0], symtab) + else: + coreml_op_to_nnvm(layerop, list(l.input), l.output[0], symtab) + returns = [symtab.get_var(i.name, must_contain=False) for i in spec.description.output] + tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()} + # for now return first output + return returns[0], tvmparams diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py new file mode 100644 index 000000000000..8c6020500b45 --- /dev/null +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -0,0 +1,979 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +DarkNet symbol frontend. +""" + +from __future__ import absolute_import as _abs +import numpy as np +import tvm +from ..
import symbol as _sym +from .common import get_nnvm_op, required_attr, parse_tshape, parse_bool_str + +class LAYERTYPE(object): + """Darknet LAYERTYPE Class constant.""" + CONVOLUTIONAL = 0 + DECONVOLUTIONAL = 1 + CONNECTED = 2 + MAXPOOL = 3 + SOFTMAX = 4 + DETECTION = 5 + DROPOUT = 6 + CROP = 7 + ROUTE = 8 + COST = 9 + NORMALIZATION = 10 + AVGPOOL = 11 + LOCAL = 12 + SHORTCUT = 13 + ACTIVE = 14 + RNN = 15 + GRU = 16 + LSTM = 17 + CRNN = 18 + BATCHNORM = 19 + NETWORK = 20 + XNOR = 21 + REGION = 22 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 + +class ACTIVATION(object): + """Darknet ACTIVATION Class constant.""" + LOGISTIC = 0 + RELU = 1 + RELIE = 2 + LINEAR = 3 + RAMP = 4 + TANH = 5 + PLSE = 6 + LEAKY = 7 + ELU = 8 + LOGGY = 9 + STAIR = 10 + HARDTAN = 11 + LHTAN = 12 + +__all__ = ['from_darknet'] + +def _darknet_maxpooling(inputs, attrs): + """Process the max pool 2d operation.""" + kernel = parse_tshape(required_attr(attrs, 'kernel', 'maxpool')) + if len(kernel) != 1: + raise tvm.error.OpAttributeUnImplemented( + 'Non-2D kernels for Max Pooling are not supported in frontend Darknet.') + + op_name, new_attrs = 'max_pool2d', {} + strides = int(attrs.get('stride', (1, 1))) + pads = int(attrs.get('pad', (0, 0))) + new_attrs['pool_size'] = [kernel[0], kernel[0]] + new_attrs['strides'] = str((strides, strides)) + new_attrs['padding'] = str((pads, pads)) + extra_pad_size = attrs.get('extra_pad_size', 0) + if extra_pad_size: + pad_width = ((0, 0), (0, 0), (0, extra_pad_size), (0, extra_pad_size)) + inputs = _sym.pad(*inputs, pad_width=pad_width, pad_value=np.finfo(np.float32).min) + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_avgpooling(inputs, attrs): + """Process the average pool 2d operation.""" + kernel = parse_tshape(required_attr(attrs, 'kernel', 'avgpool')) + if len(kernel) != 1: + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels for Average Pooling are not supported in frontend Darknet.') + + op_name, new_attrs = 'avg_pool2d', {} + strides = int(attrs.get('stride', (1, 1))) + pads = int(attrs.get('pad', (0, 0))) + new_attrs['pool_size'] = [kernel[0], kernel[0]] + new_attrs['strides'] = str((strides, strides)) + new_attrs['padding'] = str((pads, pads)) + + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_batch_norm(inputs, attrs): + """Process the batchnormalization operation.""" + op_name, new_attrs = 'darknet_batch_norm', {} + new_attrs['axis'] = attrs.get('axis', 1) + new_attrs['epsilon'] = attrs.get('eps', 0.000001) + new_attrs['center'] = True + new_attrs['scale'] = True + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_conv2d(inputs, attrs): + """Process the convolution 2d operation.""" + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d')) + if len(kernel) != 1: + raise tvm.error.OpAttributeUnimplemented('Non-2D kernels for Conv2D are unsupported ' + 'in frontend Darknet.') + layout = attrs.get('layout', 'NCHW') + if layout not in ['NCHW', 'NHWC']: + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "layout" of operator Conv2D is not valid.'.format(layout)) + strides = int(attrs.get('stride', (1, 1))) + pads = int(attrs.get('pad', (0, 0))) + + op_name, new_attrs = 'conv2d', {} + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d') + new_attrs['kernel_size'] = [kernel[0], kernel[0]] + new_attrs['strides'] = (strides, strides) + new_attrs['padding'] = (pads, pads) + new_attrs['dilation'] = attrs.get('dilate', (1, 1)) + 
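# Illustrative mapping (example values): darknet's scalar attrs, e.g. + # {'kernel': '3', 'stride': '1', 'pad': '1'}, become kernel_size=[3, 3], + # strides=(1, 1), padding=(1, 1) in the call below. +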
new_attrs['groups'] = attrs.get('num_group', 1) + new_attrs['layout'] = layout + if attrs.get('use_batchNorm', False) is True: + new_attrs['use_bias'] = False + else: + new_attrs['use_bias'] = True + out_name = {} + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) + out_name[0] = sym.list_output_names()[0].replace('_output', '') + + if attrs.get('use_batchNorm', False) is True: + op_name, new_attrs = 'batch_norm', {} + new_attrs['epsilon'] = 0.000001 + sym = get_nnvm_op(op_name)(*sym, **new_attrs) + out_name[1] = sym.list_output_names()[0].replace('_output', '') + if 'activation' in attrs: + new_attrs = {} + new_attrs['activation'] = attrs['activation'] + new_attrs['slope'] = 0.1 + sym, _ = _darknet_activations(sym, new_attrs) + return sym, out_name + + +def _darknet_conv2d_transpose(inputs, attrs): + """Process the convolution 2d transpose operation.""" + if 'target_shape' in attrs: + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "target_shape" is not supported in operator Conv2D-transpose.') + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d_transpose')) + if len(kernel) != 2: + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported in operator Conv2D-transpose.') + layout = attrs.get('layout', 'NCHW') + if layout not in ['NCHW', 'NHWC']: + msg = 'Value {} in attribute "layout" of operator Conv2D-transpose is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(layout)) + op_name, new_attrs = 'conv2d_transpose', {} + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d_transpose') + new_attrs['kernel_size'] = kernel + new_attrs['strides'] = attrs.get('stride', (1, 1)) + new_attrs['output_padding'] = attrs.get('adj', (0, 0)) + new_attrs['padding'] = attrs.get('pad', (0, 0)) + new_attrs['dilation'] = attrs.get('dilate', (1, 1)) + new_attrs['groups'] = attrs.get('num_group', 1) + new_attrs['layout'] = layout + new_attrs['use_bias'] = not parse_bool_str(attrs, 'no_bias') + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_shortcut(inputs, attrs): + """Process the shortcut operation.""" + op_name, new_attrs = 'elemwise_add', {} + input_0 = inputs[0] + input_1 = inputs[1] + input_0_channel = int(attrs['out_channel']) + input_1_channel = int(attrs['add_out_channel']) + input_0_size = int(attrs['out_size']) + input_1_size = int(attrs['add_out_size']) + + if input_0_size > input_1_size: + scale = int(input_0_size/input_1_size) + input_1 = _sym.upsampling(input_1, scale=scale, name="_upsampling") + elif input_0_size < input_1_size: + stride = int(input_1_size/input_0_size) + input_1 = _sym.avg_pool2d(input_1, pool_size=(1, 1), + strides=(stride, stride), padding=(0, 0), name="_downsampling") + + if input_0_channel != input_1_channel: + pad_channel = input_0_channel - input_1_channel + input_1 = _sym.pad(input_1, pad_width=((0, 0), (0, pad_channel), (0, 0), (0, 0)), + pad_value=0.) 
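+ # A shortcut may join maps differing in both spatial size and channels; e.g. + # merging 26x26x256 with 13x13x128 upsamples the smaller map by scale 2 and + # zero-pads 128 extra channels before the elemwise_add below.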
+ + new_inputs = _as_list([input_0, input_1]) + sym = get_nnvm_op(op_name)(*new_inputs, **new_attrs) + out_name = sym.list_output_names()[0].replace('_output', '') + if 'activation' in attrs: + new_attrs['activation'] = attrs['activation'] + sym, _ = _darknet_activations(sym, new_attrs) + return sym, out_name + +def _darknet_dense(inputs, attrs): + """Process the dense operation.""" + op_name, new_attrs = 'dense', {} + new_attrs['units'] = required_attr(attrs, 'num_hidden', 'dense') + out_name = {} + new_attrs['use_bias'] = attrs.get('use_bias', False) + if attrs.get('use_flatten', False) is True: + inputs[0] = _sym.flatten(inputs[0]) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) + out_name[0] = sym.list_output_names()[0].replace('_output', '') + if 'use_batchNorm' in attrs: + op_name, new_attrs = 'batch_norm', {} + new_attrs['epsilon'] = 0.000001 + sym = get_nnvm_op(op_name)(*sym, **new_attrs) + out_name[1] = sym.list_output_names()[0].replace('_output', '') + if 'activation' in attrs: + new_attrs = {} + new_attrs['activation'] = attrs['activation'] + sym, _ = _darknet_activations(sym, new_attrs) + return sym, out_name + +def _darknet_dropout(inputs, attrs): + """Process the dropout operation, its a blank operation.""" + op_name, new_attrs = 'dropout', {} + new_attrs['rate'] = attrs.get('p', 0.5) + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_reshape(inputs, attrs): + """Process the reshape operation.""" + if parse_bool_str(attrs, 'reverse'): + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "reverse" is not supported in operator Reshape.') + op_name, new_attrs = 'reshape', {} + new_attrs['shape'] = required_attr(attrs, 'shape', 'reshape') + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_upsampling(inputs, attrs): + """Process the upsampling operation.""" + op_name, new_attrs = 'upsampling', {} + new_attrs['scale'] = attrs.get('scale', 1) + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_l2normalize(inputs, attrs): + """Process the l2 normalization operation.""" + op_name, new_attrs = 'l2_normalize', {} + new_attrs['eps'] = attrs.get('eps', 0) + new_attrs['axis'] = attrs.get('axis', 1) + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_softmax_output(inputs, attrs): + """Process the softmax operation.""" + temperature = attrs.get('temperature', 1) + if temperature != 1: + inputs[0] = inputs[0] / float(temperature) + op_name, new_attrs = 'softmax', {} + if parse_bool_str(attrs, 'multi_output'): + new_attrs['axis'] = 1 + + if attrs.get('use_flatten', False) is True: + inputs[0] = _sym.flatten(inputs[0]) + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_route(inputs, attrs): + """Process the route operation, which is equivalent to concat.""" + op_name = 'concatenate' + new_attrs = {'axis': attrs.get('dim', 1)} + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_reorg(inputs, attrs): + """Process the reorg operation.""" + op_name, new_attrs = 'yolo_reorg', {} + if 'stride' in attrs: + new_attrs = {'stride': attrs.get('stride', 1)} + return get_nnvm_op(op_name)(*inputs, **new_attrs), None + +def _darknet_region(inputs, attrs): + """Process the region operation.""" + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + coords = attrs.get('coords', 0) + background = attrs.get('background', 0) + softmax = attrs.get('softmax', True) + input_shape = attrs.get('shape') + + split_size = classes + coords + 1 + intermediate_shape = 
(input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = _sym.reshape(inputs[0], shape=intermediate_shape) + split_indices = (2, 4, 5) + split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = _sym.sigmoid(split_res[0]) + if not background: + split_res2 = _sym.sigmoid(split_res[2]) + else: + split_res2 = split_res[2] + if softmax: + split_res3 = _sym.softmax(split_res[3], axis=2) + concat_list = [split_res0, split_res[1], split_res2, split_res3] + out = _sym.concatenate(*concat_list, axis=2) + return _sym.reshape(out, shape=input_shape), None + + +def _darknet_yolo(inputs, attrs): + """Process the yolo operation.""" + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + input_shape = attrs.get('shape') + split_size = classes + 5 + intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = _sym.reshape(inputs[0], shape=intermediate_shape) + split_indices = (2, 4) + split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = _sym.sigmoid(split_res[0]) + split_res2 = _sym.sigmoid(split_res[2]) + concat_list = [split_res0, split_res[1], split_res2] + out = _sym.concatenate(*concat_list, axis=2) + return _sym.reshape(out, shape=input_shape), None + +def _darknet_activations(inputs, attrs): + """Process the activation function.""" + act = required_attr(attrs, 'activation', 'activations') + if ACTIVATION.LOGISTIC == act: + act_type = 'sigmoid' + elif ACTIVATION.RELU == act: + act_type = 'relu' + elif ACTIVATION.TANH == act: + act_type = 'tanh' + elif ACTIVATION.LINEAR == act: + return inputs, None + elif ACTIVATION.LEAKY == act: + act_type = 'leaky_relu' + elif ACTIVATION.ELU == act: + act_type = 'elu' + else: + raise tvm.error.OpNotImplemented( + 'Operator act: {} is not supported in framework Darknet.'.format(act)) + + if act_type in ['relu', 'tanh']: + op_name, new_attrs = act_type, {} + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) + elif act_type in ['leaky_relu']: + op_name, new_attrs = act_type, {} + new_attrs['alpha'] = attrs.get('slope', 0.1) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) + elif act_type in ['elu']: + sym = -1 * _sym.relu(1 - _sym.exp(*inputs)) + _sym.relu(*inputs) + elif act_type in ['sigmoid']: + op_name, new_attrs = act_type, {} + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) + else: + raise tvm.error.OpNotImplemented( + 'Operator act: {} is not supported in framework Darknet.'.format(act)) + return sym, None + +def _darknet_op_not_support(inputs, attrs): + """Raise exception if the operation is not supported.""" + err = "{} is not supported in {}.".format(attrs, inputs) + raise NotImplementedError(err) + +_DARKNET_CONVERT_MAP = { + LAYERTYPE.CONVOLUTIONAL : _darknet_conv2d, + LAYERTYPE.DECONVOLUTIONAL : _darknet_conv2d_transpose, + LAYERTYPE.CONNECTED : _darknet_dense, + LAYERTYPE.MAXPOOL : _darknet_maxpooling, + LAYERTYPE.SOFTMAX : _darknet_softmax_output, + LAYERTYPE.DROPOUT : _darknet_dropout, + LAYERTYPE.AVGPOOL : _darknet_avgpooling, + LAYERTYPE.BATCHNORM : _darknet_batch_norm, + LAYERTYPE.ROUTE : _darknet_route, + LAYERTYPE.REORG : _darknet_reorg, + LAYERTYPE.REGION : _darknet_region, + LAYERTYPE.SHORTCUT : _darknet_shortcut, + LAYERTYPE.UPSAMPLE : _darknet_upsampling, + LAYERTYPE.L2NORM : _darknet_l2normalize, + LAYERTYPE.YOLO : _darknet_yolo, + LAYERTYPE.DETECTION : _darknet_op_not_support, + LAYERTYPE.CROP : _darknet_op_not_support, + LAYERTYPE.COST : _darknet_op_not_support, + LAYERTYPE.NORMALIZATION : 
_darknet_op_not_support, + LAYERTYPE.LOCAL : _darknet_op_not_support, + LAYERTYPE.ACTIVE : _darknet_op_not_support, + LAYERTYPE.RNN : _darknet_op_not_support, + LAYERTYPE.GRU : _darknet_op_not_support, + LAYERTYPE.LSTM : _darknet_op_not_support, + LAYERTYPE.CRNN : _darknet_op_not_support, + LAYERTYPE.NETWORK : _darknet_op_not_support, + LAYERTYPE.XNOR : _darknet_op_not_support, + LAYERTYPE.BLANK : _darknet_op_not_support, +} + +def _darknet_convert_symbol(op_name, inputs, attrs): + """Convert from darknet op to nnvm op. + The converter must specify some conversions explicitly to + support gluon format ops such as conv2d... + + Parameters + ---------- + op_name : str + Operator name, such as Convolution, Connected, etc + inputs : list of nnvm.Symbol + List of input symbols. + attrs : dict + Dict of operator attributes + + Returns + ------- + out_name : converted out name of operation + sym : nnvm.Symbol + Converted nnvm Symbol + """ + + if op_name in _DARKNET_CONVERT_MAP: + sym, out_name = _DARKNET_CONVERT_MAP[op_name](inputs, attrs) + else: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Darknet.'.format(op_name)) + if out_name is None: + out_name = sym.list_output_names()[0].replace('_output', '') + return out_name, sym + + +def _as_list(arr): + """Force being a list, ignore if already is.""" + if isinstance(arr, list): + return arr + return [arr] + + +class GraphProto(object): + """A helper class for handling nnvm graph copying from darknet model. + """ + + def __init__(self, net, dtype='float32'): + self.net = net + self.dtype = dtype + self._sym_array = {} + self._tvmparams = {} + self._outs = [] + self._state_ctr = {} + self._state_ctr['rnn'] = 0 + self._state_ctr['crnn'] = 0 + self._state_ctr['lstm'] = 0 + self._state_ctr['cell_state'] = 0 + self._state_ctr['gru'] = 0 + + def _read_memory_buffer(self, shape, data, dtype=None): + if dtype is None: + dtype = self.dtype + length = 1 + for x in shape: + length *= x + data_np = np.zeros(length, dtype=dtype) + for i in range(length): + data_np[i] = data[i] + return data_np.reshape(shape) + + def _get_convolution_weights(self, layer, opname): + """Get the convolution layer weights and biases.""" + if layer.nweights == 0: + return + + if layer.n * layer.c * layer.size * layer.size != layer.nweights: + msg = 'nweights ({}) != n * c * h * w ({}) in operator {}' + msg = msg.format(layer.nweights, layer.n * layer.c * layer.size ** 2, opname) + raise tvm.error.OpAttributeInvalid(msg) + + shape = (layer.n, layer.c, layer.size, layer.size) + weights = self._read_memory_buffer(shape, layer.weights) + + biases = self._read_memory_buffer((layer.n, ), layer.biases) + + k = self._get_tvm_params_name(opname[0], 'weight') + self._tvmparams[k] = tvm.nd.array(weights) + + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + self._get_batchnorm_weights(layer, opname[1], layer.n) + k = self._get_tvm_params_name(opname[1], 'beta') + self._tvmparams[k] = tvm.nd.array(biases) + else: + k = self._get_tvm_params_name(opname[0], 'bias') + self._tvmparams[k] = tvm.nd.array(biases) + + def _get_connected_weights(self, layer, opname): + """Parse the weights and biases for fully connected or dense layer.""" + size = layer.outputs * layer.inputs + if size == 0: + return + + weights = self._read_memory_buffer((layer.outputs, layer.inputs), layer.weights) + biases = self._read_memory_buffer((layer.outputs, ), layer.biases) + + k = self._get_tvm_params_name(opname[0], 'weight') + self._tvmparams[k] = tvm.nd.array(weights) + + if 
layer.batch_normalize == 1 and layer.dontloadscales != 1: + self._get_batchnorm_weights(layer, opname[1], layer.outputs) + k = self._get_tvm_params_name(opname[1], 'beta') + self._tvmparams[k] = tvm.nd.array(biases) + else: + k = self._get_tvm_params_name(opname[0], 'bias') + self._tvmparams[k] = tvm.nd.array(biases) + + def _get_region_weights(self, layer, opname): + """Parse the biases for region layer.""" + biases = self._read_memory_buffer((layer.n*2, ), layer.biases) + attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w, + layer.classes, layer.coords, layer.background], + dtype=np.int32) + k = self._get_tvm_params_name(opname, 'bias') + self._tvmparams[k] = tvm.nd.array(biases) + k = self._get_tvm_params_name(opname, 'attr') + self._tvmparams[k] = tvm.nd.array(attributes) + + def _get_yolo_weights(self, layer, opname): + """Parse the biases and mask for yolo layer.""" + biases = self._read_memory_buffer((layer.total*2, ), layer.biases) + mask = self._read_memory_buffer((layer.n, ), layer.mask, dtype='int32') + attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w, + layer.classes, layer.total], + dtype=np.int32) + k = self._get_tvm_params_name(opname, 'bias') + self._tvmparams[k] = tvm.nd.array(biases) + k = self._get_tvm_params_name(opname, 'mask') + self._tvmparams[k] = tvm.nd.array(mask) + k = self._get_tvm_params_name(opname, 'attr') + self._tvmparams[k] = tvm.nd.array(attributes) + + def _get_batchnorm_weights(self, layer, opname, size): + """Parse the weights for batchnorm, which includes, scales, moving mean + and moving variances.""" + scales = self._read_memory_buffer((size, ), layer.scales) + rolling_mean = self._read_memory_buffer((size, ), layer.rolling_mean) + rolling_variance = self._read_memory_buffer((size, ), layer.rolling_variance) + + k = self._get_tvm_params_name(opname, 'moving_mean') + self._tvmparams[k] = tvm.nd.array(rolling_mean) + k = self._get_tvm_params_name(opname, 'moving_var') + self._tvmparams[k] = tvm.nd.array(rolling_variance) + k = self._get_tvm_params_name(opname, 'gamma') + self._tvmparams[k] = tvm.nd.array(scales) + + def _get_darknet_attrs(self, layer, layer_num): + """Parse attributes of each layer and return.""" + attr = {} + use_flatten = True + if LAYERTYPE.CONVOLUTIONAL == layer.type: + attr.update({'layout' : 'NCHW'}) + attr.update({'pad' : str(layer.pad)}) + attr.update({'num_group' : str(layer.groups)}) + attr.update({'num_filter' : str(layer.n)}) + attr.update({'stride' : str(layer.stride)}) + attr.update({'kernel' : str(layer.size)}) + attr.update({'activation' : (layer.activation)}) + + if layer.nbiases == 0: + attr.update({'use_bias' : False}) + else: + attr.update({'use_bias' : True}) + + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + attr.update({'use_batchNorm' : True}) + attr.update({'use_scales' : True}) + + elif LAYERTYPE.CONNECTED == layer.type: + attr.update({'num_hidden' : str(layer.outputs)}) + attr.update({'activation' : (layer.activation)}) + if layer_num != 0: + layer_prev = self.net.layers[layer_num - 1] + if (layer_prev.out_h == layer.h and + layer_prev.out_w == layer.w and + layer_prev.out_c == layer.c): + use_flatten = False + attr.update({'use_flatten' : use_flatten}) + attr.update({'use_bias' : True}) + if layer.batch_normalize == 1 and layer.dontloadscales != 1: + attr.update({'use_batchNorm' : True}) + attr.update({'use_scales' : True}) + attr.update({'use_bias' : False}) + + elif LAYERTYPE.MAXPOOL == layer.type: + attr.update({'pad' : str(layer.pad)}) + 
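# Darknet computes the maxpool output as out = (w - size + 2*pad)/stride + 1; + # when its own rounding reports a larger out_w (checked below), the gap is + # closed with a one-sided extra_pad_size of (out_w - out)*stride pixels. +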
attr.update({'stride' : str(layer.stride)}) + attr.update({'kernel' : str(layer.size)}) + max_output = (layer.w - layer.size + 2 * layer.pad)/float(layer.stride) + 1 + if max_output < layer.out_w: + extra_pad = (layer.out_w - max_output)*layer.stride + attr.update({'extra_pad_size' : int(extra_pad)}) + elif LAYERTYPE.AVGPOOL == layer.type: + attr.update({'pad' : str(layer.pad)}) + if layer.stride == 0: + attr.update({'stride' : str(1)}) + else: + attr.update({'stride' : str(layer.stride)}) + if layer.size == 0 and layer.h == layer.w: + attr.update({'kernel' : str(layer.h)}) + else: + attr.update({'kernel' : str(layer.size)}) + + elif LAYERTYPE.DROPOUT == layer.type: + attr.update({'p' : str(layer.probability)}) + + elif LAYERTYPE.SOFTMAX == layer.type: + attr.update({'axis' : 1}) + attr.update({'use_flatten' : True}) + if layer.temperature: + attr.update({'temperature' : str(layer.temperature)}) + + elif LAYERTYPE.SHORTCUT == layer.type: + add_layer = self.net.layers[layer.index] + attr.update({'activation' : (layer.activation)}) + attr.update({'out_channel' : (layer.out_c)}) + attr.update({'out_size' : (layer.out_h)}) + attr.update({'add_out_channel' : (add_layer.out_c)}) + attr.update({'add_out_size' : (add_layer.out_h)}) + + elif LAYERTYPE.ROUTE == layer.type: + pass + + elif LAYERTYPE.COST == layer.type: + pass + + elif LAYERTYPE.REORG == layer.type: + attr.update({'stride' : layer.stride}) + + elif LAYERTYPE.REGION == layer.type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + attr.update({'coords' : layer.coords}) + attr.update({'background' : layer.background}) + attr.update({'softmax' : layer.softmax}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) + + elif LAYERTYPE.YOLO == layer.type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) + + elif LAYERTYPE.UPSAMPLE == layer.type: + attr.update({'scale' : layer.stride}) + + elif LAYERTYPE.L2NORM == layer.type: + pass + + else: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Darknet.'.format(layer.type)) + + return attr + + def _get_tvm_params_name(self, opname, arg_name): + """Makes the params name for the k,v pair.""" + return opname + '_'+ arg_name + + def _get_darknet_params(self, layer, opname): + """To parse and get the darknet params.""" + if LAYERTYPE.CONVOLUTIONAL == layer.type: + self._get_convolution_weights(layer, opname) + + elif LAYERTYPE.CONNECTED == layer.type: + self._get_connected_weights(layer, opname) + + elif LAYERTYPE.REGION == layer.type: + self._get_region_weights(layer, opname) + + elif LAYERTYPE.YOLO == layer.type: + self._get_yolo_weights(layer, opname) + def _preproc_layer(self, layer, layer_num): + """To preprocess each darknet layer, some layer doesnt need processing.""" + if layer_num == 0: + name = 'data' + attribute = {} + sym = [_sym.Variable(name, **attribute)] + else: + sym = self._sym_array[layer_num - 1] + skip_layer = False + + if LAYERTYPE.ROUTE == layer.type: + sym = [] + for j in range(layer.n): + sym.append(self._sym_array[layer.input_layers[j]]) + if layer.n == 1: + skip_layer = True + + elif LAYERTYPE.COST == layer.type: + skip_layer = True + + elif LAYERTYPE.SHORTCUT == layer.type: + sym = [sym, self._sym_array[layer.index]] + + elif LAYERTYPE.BLANK == layer.type: + skip_layer = True + + if skip_layer is True: + self._sym_array[layer_num] = sym + + return skip_layer, sym + + def _get_opname(self, layer): + """Returs the layer name.""" + 
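# layer.type is the integer LAYERTYPE constant; it doubles as the lookup + # key into _DARKNET_CONVERT_MAP inside _darknet_convert_symbol. +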
return layer.type + + def _new_rnn_state_sym(self, state=None, name='rnn'): + """Returs a symbol for state""" + sym_name = name + "%d_state" % self._state_ctr[name] + self._state_ctr[name] += 1 + return _sym.Variable(name=sym_name, init=state) + + def _get_rnn_state_buffer(self, layer, name): + """Get the state buffer for rnn.""" + buffer = np.zeros((1, layer.outputs), self.dtype) + return self._new_rnn_state_sym(buffer, name) + + def _get_darknet_rnn_attrs(self, layer, sym): + """Get the rnn converted symbol from attributes.""" + attr = self._get_darknet_attrs(layer, 0) + op_name = self._get_opname(layer) + layer_name, sym = _darknet_convert_symbol(op_name, _as_list(sym), attr) + self._get_darknet_params(layer, layer_name) + return sym + + def _handle_darknet_rnn_layers(self, layer_num, sym): + """Parse attributes and handle the rnn layers.""" + attr = {} + layer = self.net.layers[layer_num] + processed = False + + if LAYERTYPE.RNN == layer.type: + attr.update({'n' : layer.n}) + attr.update({'batch' : layer.batch}) + attr.update({'num_hidden' : str(layer.outputs)}) + + state = self._get_rnn_state_buffer(layer, 'rnn') + + for _ in range(layer.steps): + input_layer = layer.input_layer + sym = self._get_darknet_rnn_attrs(input_layer, sym) + + self_layer = layer.self_layer + state = self._get_darknet_rnn_attrs(self_layer, state) + + op_name, new_attrs = 'elemwise_add', {} + new_inputs = _as_list([sym, state]) + state = get_nnvm_op(op_name)(*new_inputs, **new_attrs) + self._outs.append(state) + + output_layer = layer.output_layer + sym = self._get_darknet_rnn_attrs(output_layer, state) + + self._sym_array[layer_num] = sym + processed = True + + elif LAYERTYPE.CRNN == layer.type: + attr.update({'n' : layer.n}) + attr.update({'batch' : layer.batch}) + attr.update({'num_hidden' : str(layer.outputs)}) + + state = self._get_rnn_state_buffer(layer, 'crnn') + + for _ in range(layer.steps): + input_layer = layer.input_layer + sym = self._get_darknet_rnn_attrs(input_layer, sym) + + self_layer = layer.self_layer + state = self._get_darknet_rnn_attrs(self_layer, state) + + op_name, new_attrs = 'elemwise_add', {} + new_inputs = _as_list([sym, state]) + state = get_nnvm_op(op_name)(*new_inputs, **new_attrs) + self._outs.append(state) + + output_layer = layer.output_layer + sym = self._get_darknet_rnn_attrs(output_layer, state) + + self._sym_array[layer_num] = sym + processed = True + + elif LAYERTYPE.LSTM == layer.type: + if layer.steps > 1: + raise tvm.error.OpAttributeInvalid( + 'Number of steps {} of RNN is not valid.'.format(layer.steps)) + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + h_state = self._get_rnn_state_buffer(layer, 'lstm') + c_state = self._get_rnn_state_buffer(layer, 'cell_state') + for _ in range(layer.steps): + sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state) + sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state) + sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state) + sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state) + + input_sym = sym + sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym) + sym_ui = self._get_darknet_rnn_attrs(layer.ui, input_sym) + sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym) + sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym) + + new_inputs = _as_list([sym_wf, sym_uf]) + add_f = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wi, sym_ui]) + add_i = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wg, sym_ug]) + 
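# Standard LSTM cell assembled gate by gate: f, i, o = sigmoid(W*h + U*x) and + # g = tanh(W*h + U*x) with per-gate weights, then c' = f*c + i*g and + # h' = o * tanh(c'). +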
add_g = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wo, sym_uo]) + add_o = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_f, _ = _darknet_activations(_as_list(add_f), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_i, _ = _darknet_activations(_as_list(add_i), act_attr) + + act_attr['activation'] = ACTIVATION.TANH + act_g, _ = _darknet_activations(_as_list(add_g), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_o, _ = _darknet_activations(_as_list(add_o), act_attr) + + new_inputs = _as_list([act_i, act_g]) + mul_t = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([act_f, c_state]) + c_state = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([mul_t, c_state]) + c_state = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.TANH + h_state, _ = _darknet_activations(_as_list(c_state), act_attr) + + new_inputs = _as_list([act_o, h_state]) + h_state = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + self._outs = self._outs + [c_state, h_state] + sym = h_state + self._sym_array[layer_num] = sym + processed = True + + elif LAYERTYPE.GRU == layer.type: + if layer.steps > 1: + raise tvm.error.OpAttributeInvalid( + 'Number of steps {} is not valid in RNN.'.format(layer.steps)) + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + state = self._get_rnn_state_buffer(layer, "gru") + for _ in range(layer.steps): + sym_wz = self._get_darknet_rnn_attrs(layer.wz, state) + sym_wr = self._get_darknet_rnn_attrs(layer.wr, state) + + input_sym = sym + sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym) + sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym) + sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym) + + new_inputs = _as_list([sym_uz, sym_wz]) + add_z = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_ur, sym_wr]) + add_r = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_z, _ = _darknet_activations(_as_list(add_z), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_r, _ = _darknet_activations(_as_list(add_r), act_attr) + + new_inputs = _as_list([act_r, state]) + forgot = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot) + + new_inputs = _as_list([sym_uh, sym_wh]) + h_state = get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + if layer.tanh == 1: + act_attr['activation'] = ACTIVATION.TANH + else: + act_attr['activation'] = ACTIVATION.LOGISTIC + h_state, _ = _darknet_activations(_as_list(h_state), act_attr) + + sym = act_z * state + (1 - act_z) * h_state + + self._outs = self._outs + [sym] + self._sym_array[layer_num] = sym + processed = True + + return processed, sym + + def _make_outlist(self, sym, op_name, layer, layer_num): + if layer.type == LAYERTYPE.REGION: + k = self._get_tvm_params_name(op_name, 'attr') + self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy())) + k = self._get_tvm_params_name(op_name, 'bias') + self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy())) + if layer_num != self.net.n-1: + self._outs.insert(0, sym) + + elif layer.type == LAYERTYPE.YOLO: + k = self._get_tvm_params_name(op_name, 'attr') + self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy())) + k = self._get_tvm_params_name(op_name, 'bias') + 
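# Region/YOLO parameters (attributes, biases, masks) are exposed as extra + # graph outputs so detection post-processing can read them back. +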
self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy())) + k = self._get_tvm_params_name(op_name, 'mask') + self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy())) + if layer_num != self.net.n-1: + self._outs.insert(0, sym) + + def from_darknet(self): + """To convert the darknet symbol to nnvm symbols.""" + for i in range(self.net.n): + layer = self.net.layers[i] + need_skip, sym = self._preproc_layer(layer, i) + if need_skip is True: + continue + + processed, sym = self._handle_darknet_rnn_layers(i, sym) + if processed is True: + continue + + attr = self._get_darknet_attrs(layer, i) + op_name = self._get_opname(layer) + layer_name, sym = _darknet_convert_symbol(op_name, _as_list(sym), attr) + self._get_darknet_params(self.net.layers[i], layer_name) + self._sym_array[i] = sym + self._make_outlist(sym, layer_name, layer, i) + + self._outs = _as_list(sym) + self._outs + if isinstance(self._outs, list): + sym = _sym.Group(self._outs) + return sym, self._tvmparams + +def from_darknet(net, dtype='float32'): + """Convert from darknet's model into compatible NNVM format. + Reconstruct a nnvm symbol by traversing the darknet input. + + Parameters + ---------- + net : ctype Pointer to network + Darknet parsed symbols + + dtype : str + Datatype of the input net structure, default is float32 + + Returns + ------- + sym : nnvm.Symbol + Compatible nnvm symbol + + params : dict of str to tvm.NDArray + The parameter dict to be used by nnvm + """ + + return GraphProto(net, dtype).from_darknet() diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py new file mode 100644 index 000000000000..f647a644bd2b --- /dev/null +++ b/nnvm/python/nnvm/frontend/keras.py @@ -0,0 +1,727 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, import-self +"""Keras frontend.""" +from __future__ import absolute_import as _abs +import sys +import numpy as np +import tvm +from .. import symbol as _sym +from .common import SymbolTable + +__all__ = ['from_keras'] + + +def _check_data_format(keras_layer): + if hasattr(keras_layer, ('data_format')): + if keras_layer.data_format != 'channels_last': + raise ValueError("Keras frontend currently supports data_format = channels_last only.") + + +def _get_pad_pair(input1d, kernel1d, stride1d): + out1d = (input1d + stride1d - 1) // stride1d + pad = np.maximum((out1d - 1) * stride1d + kernel1d - input1d, 0) + pad_before = pad // 2 + pad_after = pad - pad_before + return [pad_before, pad_after] + +def _get_elu(insym, alpha): + """ A helper method for elu. 
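+ Implements elu(x) = alpha*(exp(x) - 1) for x < 0 and x otherwise, using + the equivalent form -alpha * relu(1 - exp(x)) + relu(x).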
+ """ + return -alpha * _sym.relu(1 - _sym.exp(insym)) + _sym.relu(insym) + +def _convert_recurrent_activation(insym, keras_layer): + act_type = keras_layer.recurrent_activation.__name__ + return _convert_activation(insym, act_type, None) + +def _convert_activation(insym, keras_layer, _): + if isinstance(keras_layer, str): + act_type = keras_layer + else: + if sys.version_info.major < 3: + act_type = keras_layer.activation.func_name + else: + act_type = keras_layer.activation.__name__ + if act_type == 'linear': + if isinstance(keras_layer, str): + return insym + alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 + beta = keras_layer.beta if hasattr(keras_layer, "beta") else 0 + return _sym.__add_scalar__(_sym.__mul_scalar__(insym, \ + scalar=alpha), scalar=beta) + if act_type == 'softmax': + return _sym.softmax(insym, axis=1) + if act_type == 'sigmoid': + return _sym.sigmoid(insym) + if act_type == 'tanh': + return _sym.tanh(insym) + if act_type == 'relu': + return _sym.relu(insym) + if act_type == 'softplus': + return _sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) + if act_type == 'elu': + alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 + return _get_elu(insym, alpha) + if act_type == 'selu': + # Alpha, Gamma values, obtained from https://arxiv.org/abs/1706.02515 + alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") \ + else 1.6732632423543772848170429916717 + gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") \ + else 1.0507009873554804934193349852946 + return gamma * _get_elu(insym, alpha) + if act_type == 'relu6': + return _sym.clip(insym, a_min=0, a_max=6) + if act_type == 'softsign': + return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) + if act_type == 'hard_sigmoid': + transformX = (0.2 * insym) + 0.5 + return _sym.clip(transformX, a_min=0, a_max=1) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) + + +def _convert_advanced_activation(insym, keras_layer, symtab): + act_type = type(keras_layer).__name__ + if act_type == 'ReLU': + if keras_layer.max_value: + return _sym.clip(insym, a_min=0, a_max=keras_layer.max_value) + return _sym.relu(insym) + if act_type == 'LeakyReLU': + return _sym.leaky_relu(insym, alpha=keras_layer.alpha) + if act_type == 'ELU': + alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 + return _get_elu(insym, alpha) + if act_type == 'PReLU': + assert hasattr(keras_layer, "alpha"), \ + "alpha required for PReLU." 
+ _check_data_format(keras_layer) + size = len(keras_layer.alpha.shape) + return -symtab.new_const(keras_layer.get_weights()[0] \ + .transpose(np.roll(range(size), 1))) \ + * _sym.relu(-insym) + _sym.relu(insym) + if act_type == 'ThresholdedReLU': + theta = keras_layer.theta if hasattr(keras_layer, "theta") else 1.0 + theta_tensor = _sym.full_like(insym[0], fill_value=float(theta)) + return _sym.elemwise_mul(insym[0], _sym.greater(insym[0], theta_tensor, out_type="float32")) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) + + +def _convert_merge(insym, keras_layer, _): + merge_type = type(keras_layer).__name__ + ret = insym[0] + for i in range(1, len(insym)): + if merge_type == 'Add': + ret = _sym.elemwise_add(ret, insym[i]) + elif merge_type == 'Subtract': + ret = _sym.elemwise_sub(ret, insym[i]) + elif merge_type == 'Multiply': + ret = _sym.elemwise_mul(ret, insym[i]) + else: + raise tvm.error.OpNotImplemented( + 'Operator {} Merge is not supported in frontend Keras.'.format(merge_type)) + return ret + + +def _convert_dense(insym, keras_layer, symtab): + weightList = keras_layer.get_weights() + weight = symtab.new_const(weightList[0].transpose([1, 0])) + params = {'weight':weight, 'use_bias':False, 'units':weightList[0].shape[1]} + if keras_layer.use_bias: + params['use_bias'] = True + params['bias'] = symtab.new_const(weightList[1]) + input_shape = keras_layer.input_shape + input_dim = len(input_shape) + # In case of RNN dense, input shape will be (1, 1, n) + if input_dim > 2: + input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0]) + if input_dim != 3 or input_shape[0] != 1 or input_shape[1] != 1: + msg = 'Value {} in attribute "input_shape" of operator Dense is not valid.' 
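+ # Only the (1, 1, n) shape produced by the RNN cells can be squeezed down + # to the 2-D input that nnvm's dense expects; reject anything else.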
+ raise tvm.error.OpAttributeInvalid(msg.format(input_shape)) + insym = _sym.squeeze(insym, axis=0) + out = _sym.dense(data=insym, **params) + # defuse activation + if sys.version_info.major < 3: + act_type = keras_layer.activation.func_name + else: + act_type = keras_layer.activation.__name__ + if act_type != 'linear': + out = _convert_activation(out, act_type, symtab) + if input_dim > 2: + out = _sym.expand_dims(out, axis=0) + return out + + +def _convert_convolution(insym, keras_layer, symtab): + _check_data_format(keras_layer) + is_deconv = type(keras_layer).__name__ == 'Conv2DTranspose' + is_depthconv = type(keras_layer).__name__ == 'DepthwiseConv2D' + weightList = keras_layer.get_weights() + if is_deconv: + kernel_h, kernel_w, n_filters, in_channels = weightList[0].shape + weight = weightList[0].transpose([3, 2, 0, 1]) + elif is_depthconv: + kernel_h, kernel_w, in_channels, depth_mult = weightList[0].shape + weight = weightList[0].transpose([2, 3, 0, 1]) + else: + kernel_h, kernel_w, in_channels, n_filters = weightList[0].shape + weight = weightList[0].transpose([3, 2, 0, 1]) + if isinstance(keras_layer.dilation_rate, (list, tuple)): + dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]] + else: + dilation = [keras_layer.dilation_rate, keras_layer.dilation_rate] + dilated_kernel_h = (kernel_h - 1) * dilation[0] + 1 + dilated_kernel_w = (kernel_w - 1) * dilation[1] + 1 + stride_h, stride_w = keras_layer.strides + params = {'weight': symtab.new_const(weight), + 'kernel_size': [kernel_h, kernel_w], + 'strides': [stride_h, stride_w], + 'dilation': dilation, + 'padding': [0, 0], + 'use_bias': False} + if is_depthconv: + params['channels'] = in_channels * depth_mult + params['groups'] = in_channels + else: + params['channels'] = n_filters + if keras_layer.use_bias: + params['use_bias'] = True + params['bias'] = symtab.new_const(weightList[1]) + if keras_layer.padding == 'valid': + pass + # we insert a separate pad operator + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, dilated_kernel_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, dilated_kernel_w, stride_w) + if pad_t == pad_b and pad_l == pad_r: + params['padding'] = (pad_t, pad_l) + else: + insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) + else: + msg = 'Value {} in attribute "padding" of operator Convolution is not valid.' 
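+ # For reference, 'same' padding above solves out = ceil(in / stride); e.g. + # in=224, dilated kernel 3, stride 2 gives out=112 and a total pad of + # max((112-1)*2 + 3 - 224, 0) = 1, i.e. the asymmetric pair (0, 1), which is + # why an explicit _sym.pad is emitted instead of a conv padding attribute.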
+ raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) + if is_deconv: + out = _sym.conv2d_transpose(data=insym, **params) + else: + out = _sym.conv2d(data=insym, **params) + # defuse activation + if sys.version_info.major < 3: + act_type = keras_layer.activation.func_name + else: + act_type = keras_layer.activation.__name__ + if act_type != 'linear': + out = _convert_activation(out, act_type, symtab) + return out + + +def _convert_separable_convolution(insym, keras_layer, symtab): + _check_data_format(keras_layer) + weightList = keras_layer.get_weights() + # depthwise conv + kernel_h, kernel_w, in_channels, depth_mult = weightList[0].shape + stride_h, stride_w = keras_layer.strides + weight0 = weightList[0].transpose([2, 3, 0, 1]) + params0 = {'weight': symtab.new_const(weight0), + 'channels': in_channels * depth_mult, + 'groups': in_channels, + 'kernel_size': [kernel_h, kernel_w], + 'strides': [stride_h, stride_w], + 'dilation': [1, 1], + 'padding': [0, 0], + 'use_bias': False} + if keras_layer.padding == 'valid': + pass + # we insert a separate pad operator + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, kernel_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w) + insym = _sym.pad(data=insym, pad_width=( + (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) + else: + msg = 'Value {} in attribute "padding" of operator Separable Convolution is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) + depthconv = _sym.conv2d(data=insym, **params0) + # pointwise conv + weight1 = weightList[1].transpose([3, 2, 0, 1]) + params1 = {'weight': symtab.new_const(weight1), + 'channels': weight1.shape[0], + 'groups': 1, + 'kernel_size': [1, 1], + 'strides': [1, 1], + 'dilation': [1, 1], + 'use_bias': False} + if keras_layer.use_bias: + params1['use_bias'] = True + params1['bias'] = symtab.new_const(weightList[2]) + out = _sym.conv2d(data=depthconv, **params1) + # defuse activation + if sys.version_info.major < 3: + act_type = keras_layer.activation.func_name + else: + act_type = keras_layer.activation.__name__ + if act_type != 'linear': + out = _convert_activation(out, act_type, symtab) + return out + + +def _convert_flatten(insym, keras_layer, _): + _check_data_format(keras_layer) + # NCHW -> NHWC so that dense can be correctly converted + insym = _sym.transpose(insym, axes=[0, 2, 3, 1]) + return _sym.flatten(insym) + + +def _convert_pooling(insym, keras_layer, symtab): + _check_data_format(keras_layer) + pool_type = type(keras_layer).__name__ + # global pool in keras = global pool + flatten in nnvm + if pool_type == 'GlobalMaxPooling2D': + return _convert_flatten(_sym.global_max_pool2d(insym), keras_layer, symtab) + if pool_type == 'GlobalAveragePooling2D': + return _convert_flatten(_sym.global_avg_pool2d(insym), keras_layer, symtab) + pool_h, pool_w = keras_layer.pool_size + stride_h, stride_w = keras_layer.strides + params = {'pool_size': [pool_h, pool_w], + 'strides': [stride_h, stride_w], + 'padding': [0, 0]} + if keras_layer.padding == 'valid': + pass + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) + params['padding'] = [pad_t, pad_l, pad_b, pad_r] + else: + msg = 'Value {} in attribute "padding" of operator Pooling is not valid.' 
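+ # Unlike the conv path, pooling takes the 4-element padding + # [pad_t, pad_l, pad_b, pad_r] above, so asymmetric 'same' padding needs no + # separate pad operator here.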
+        raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding))
+    if pool_type == 'MaxPooling2D':
+        return _sym.max_pool2d(insym, **params)
+    if pool_type == 'AveragePooling2D':
+        # TODO: Keras excludes padded zeros from the average, which is not modeled here
+        return _sym.avg_pool2d(insym, **params)
+    msg = 'Operator {} is not supported in frontend Keras.'
+    raise tvm.error.OpNotImplemented(msg.format(pool_type))
+
+
+def _convert_upsample(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    upsample_type = type(keras_layer).__name__
+    if upsample_type == "UpSampling1D":
+        h = keras_layer.size
+        params = {'scale': h}
+    elif upsample_type == "UpSampling2D":
+        h, w = keras_layer.size
+        if h != w:
+            raise tvm.error.OpAttributeInvalid(
+                'Upsample height ({}) must equal width ({}).'.format(h, w))
+        params = {'scale': h}
+    elif upsample_type == "UpSampling3D":
+        h, w, d = keras_layer.size
+        if h != w or w != d:
+            raise tvm.error.OpAttributeInvalid(
+                'Upsample height ({}), width ({}), and depth ({}) must be equal.'.format(h, w, d))
+        params = {'scale': h}
+    else:
+        msg = 'Operator {} is not supported in frontend Keras.'
+        raise tvm.error.OpNotImplemented(msg.format(upsample_type))
+    return _sym.upsampling(insym, **params)
+
+
+def _convert_cropping(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    crop_type = type(keras_layer).__name__
+    if crop_type == "Cropping2D":
+        (_, in_h, in_w, _) = keras_layer.input_shape
+        ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping
+    else:
+        raise tvm.error.OpNotImplemented(
+            'Operator {} is not supported in frontend Keras.'.format(crop_type))
+    int32_max = np.iinfo(np.int32).max
+    return _sym.strided_slice(insym, begin=[0, 0, crop_t, crop_l],
+                              end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r])
+
+
+def _convert_batchnorm(insym, keras_layer, symtab):
+    params = {'scale': False,
+              'center': False,
+              'epsilon': keras_layer.epsilon}
+    idx = 0
+    if keras_layer.scale:
+        params['scale'] = True
+        gamma = keras_layer.get_weights()[idx]
+        params['gamma'] = symtab.new_const(gamma)
+        idx += 1
+    if keras_layer.center:
+        params['center'] = True
+        beta = keras_layer.get_weights()[idx]
+        params['beta'] = symtab.new_const(beta)
+        idx += 1
+    moving_mean = keras_layer.get_weights()[idx]
+    moving_var = keras_layer.get_weights()[idx + 1]
+    params['moving_mean'] = symtab.new_const(moving_mean)
+    params['moving_var'] = symtab.new_const(moving_var)
+    return _sym.batch_norm(data=insym, **params)
+
+
+def _convert_padding(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    padding_type = type(keras_layer).__name__
+    padding = keras_layer.padding
+    top = left = bottom = right = 0
+    if padding_type == 'ZeroPadding2D':
+        if isinstance(padding, int):
+            top = left = bottom = right = padding
+        elif isinstance(padding, tuple):
+            if isinstance(padding[0], int):
+                top, left = padding
+                bottom, right = padding
+            elif isinstance(padding[0], tuple):
+                top, bottom = padding[0]
+                left, right = padding[1]
+            else:
+                msg = 'Value {} in attribute "padding" of operator {} is not valid.'
+                raise tvm.error.OpAttributeInvalid(msg.format(str(padding), padding_type))
+        else:
+            msg = 'Value {} in attribute "padding" of operator {} is not valid.'
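+            # ZeroPadding2D padding must be an int, a (sym_h, sym_w) pair,
+            # or a ((top, bottom), (left, right)) pair of pairs.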
+            raise tvm.error.OpAttributeInvalid(msg.format(str(padding), padding_type))
+    else:
+        raise tvm.error.OpNotImplemented(
+            'Operator {} is not supported in frontend Keras.'.format(padding_type))
+    return _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (top, bottom), (left, right)))
+
+
+def _convert_concat(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        insym = [insym]
+    return _sym.concatenate(*insym, axis=1)
+
+
+def _convert_reshape(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    ch = keras_layer.input_shape[-1]
+    assert ch == keras_layer.target_shape[-1], \
+        "Only target shapes whose last dimension equals the input " \
+        "tensor's channel count are supported."
+    shape = (-1, ch) + keras_layer.target_shape[:-1]
+    return _sym.reshape(insym, shape=shape)
+
+def _convert_lstm(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        c_sym = symtab.new_const(buffer)
+        h_sym = symtab.new_const(buffer)
+        insym = [insym, h_sym, c_sym]
+
+    in_data = insym[0]
+    next_h = insym[1]
+    next_c = insym[2]
+
+    weightList = keras_layer.get_weights()
+    inp_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.input_shape)[0])
+
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+
+    units = list(weightList[0].shape)[1]
+
+    time_steps = inp_shape[1]
+    in_data = _sym.squeeze(in_data, axis=0)
+    in_data = _sym.split(in_data, indices_or_sections=time_steps, axis=0)
+    # loop over the time steps
+    for data in in_data:
+        ixh1 = _sym.dense(data, kernel_wt, use_bias=False, units=units)
+        ixh2 = _sym.dense(next_h, recurrent_wt, in_bias, use_bias=True, units=units)
+        gate = ixh1 + ixh2
+        gates = _sym.split(gate, indices_or_sections=4, axis=1)
+        in_gate = _convert_recurrent_activation(gates[0], keras_layer)
+        in_transform = _convert_recurrent_activation(gates[1], keras_layer)
+        next_c = in_transform * next_c + in_gate * _convert_activation(gates[2], keras_layer, None)
+        out_gate = _convert_recurrent_activation(gates[3], keras_layer)
+        next_h = out_gate * _convert_activation(next_c, keras_layer, None)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    out = _sym.reshape(next_h, shape=out_shape)
+    return [out, next_h, next_c]
+
+def _convert_simple_rnn(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        prev_sym = symtab.new_const(buffer)
+        insym = [insym, prev_sym]
+    in_data = insym[0]
+    prev_sym = insym[1]
+
+    weightList = keras_layer.get_weights()
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    ixh = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units)
+    prev_sym = _sym.flatten(prev_sym)
+    ixh2 = _sym.dense(prev_sym, recurrent_wt, use_bias=False, units=units)
+    output = ixh + ixh2
+    output = _convert_activation(output, keras_layer, None)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    output = _sym.reshape(output, shape=out_shape)
+
+    return [output, output]
+
+def _convert_gru(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not
isinstance(insym, list): + buffer = np.zeros((1, keras_layer.units), 'float32') + h_tm1 = symtab.new_const(buffer) + insym = [insym, h_tm1] + in_data = insym[0] + h_tm1_sym = insym[1] + + weightList = keras_layer.get_weights() + kernel_wt = symtab.new_const(weightList[0].transpose([1, 0])) + recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0])) + in_bias = symtab.new_const(weightList[2]) + + units = list(weightList[0].shape)[1] + + in_data = _sym.flatten(in_data) + matrix_x = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units) + + # inputs projected by all gate matrices at once + split_indices = [keras_layer.units, 2 * keras_layer.units] + gates = _sym.split(matrix_x, indices_or_sections=split_indices, axis=1) + x_z = gates[0] + x_r = gates[1] + x_h = gates[2] + + # hidden state projected separately for update/reset and new + units = 2 * keras_layer.units + split_indices = [units] + rec_wts = _sym.split(recurrent_wt, indices_or_sections=split_indices, axis=0) + + h_tm1_sym = _sym.flatten(h_tm1_sym) + matrix_inner = _sym.dense(h_tm1_sym, rec_wts[0], use_bias=False, units=units) + + split_indices = [keras_layer.units] + recurrent = _sym.split(matrix_inner, indices_or_sections=split_indices, axis=1) + recurrent_z = recurrent[0] + recurrent_r = recurrent[1] + + rec_act_z = _convert_recurrent_activation(x_z + recurrent_z, keras_layer) + rec_act_r = _convert_recurrent_activation(x_r + recurrent_r, keras_layer) + + units = keras_layer.units + recurrent_h = _sym.dense(rec_act_r * h_tm1_sym, rec_wts[1], use_bias=False, units=units) + act_hh = _convert_activation(x_h + recurrent_h, keras_layer, None) + + # previous and candidate state mixed by update gate + output = rec_act_z * h_tm1_sym + (1 - rec_act_z) * act_hh + + out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0]) + output = _sym.reshape(output, shape=out_shape) + return [output, output] + +def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument + """Layers that can be skipped because they are train time only.""" + return insym + + +_convert_map = { + 'Dense' : _convert_dense, + 'Activation' : _convert_activation, + 'ReLU' : _convert_advanced_activation, + 'LeakyReLU' : _convert_advanced_activation, + 'PReLU' : _convert_advanced_activation, + 'ELU' : _convert_advanced_activation, + 'ThresholdedReLU' : _convert_advanced_activation, + + 'AveragePooling2D' : _convert_pooling, + 'MaxPooling2D' : _convert_pooling, + 'GlobalAveragePooling2D' : _convert_pooling, + 'GlobalMaxPooling2D' : _convert_pooling, + 'Conv2D' : _convert_convolution, + 'Conv2DTranspose' : _convert_convolution, + 'DepthwiseConv2D' : _convert_convolution, + 'SeparableConv2D' : _convert_separable_convolution, + + 'Flatten' : _convert_flatten, + 'Reshape' : _convert_reshape, + 'Concatenate' : _convert_concat, + 'BatchNormalization' : _convert_batchnorm, + + 'Add' : _convert_merge, + 'Subtract' : _convert_merge, + 'Multiply' : _convert_merge, + 'ZeroPadding2D' : _convert_padding, + 'UpSampling2D' : _convert_upsample, + 'Cropping2D' : _convert_cropping, + + # 'ZeroPadding1D' : _convert_padding, + # 'AveragePooling1D' : _convert_pooling, + # 'MaxPooling1D' : _convert_pooling, + # 'GlobalAveragePooling1D' : _convert_pooling, + # 'GlobalMaxPooling1D' : _convert_pooling, + # 'Cropping1D' : _convert_cropping, + # 'UpSampling1D' : _convert_upsample, + # 'UpSampling3D' : _convert_upsample, + # 'Conv1D' : _convert_convolution1d, + + 'SimpleRNN' : _convert_simple_rnn, + 'LSTM' : _convert_lstm, + 'GRU' : 
_convert_gru, + # 'Bidirectional' : _convert_bidirectional, + # 'TimeDistributed' : _default_skip, + + # 'Average' : _convert_merge, + # 'Maximum' : _convert_merge, + # 'Dot' : _convert_merge, + # 'Permute' : _convert_permute, + # 'Embedding' : _convert_embedding, + # 'RepeatVector' : _convert_repeat_vector, + + 'InputLayer' : _default_skip, + 'Dropout' : _default_skip, + 'SpatialDropout2D' : _default_skip, + 'SpatialDropout1D' : _default_skip, +} + + +def _check_unsupported_layers(model): + for layer in model.layers: + op_name = type(layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(op_name)) + +def _as_list(arr): + """Force being a list, ignore if already is.""" + if isinstance(arr, list): + return arr + return [arr] + +def keras_op_to_nnvm(insym, keras_layer, outname, symtab): + """Convert keras layer to nnvm symbol, and update symtab. + + Parameters + ---------- + insym : nnvm.symbol.Symbol or a list of it + The input nnvm symbol(s) + + keras_layer : keras.layers + The keras layer to be converted + + outname : str + Name of the output nnvm symbol + + symtab : nnvm.frontend.common.SymbolTable + The global symbol table to be updated + """ + op_name = type(keras_layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(op_name)) + outs = _convert_map[op_name](insym, keras_layer, symtab) + outs = _as_list(outs) + + for t_idx, out in enumerate(outs): + name = outname + ":" + str(t_idx) + symtab.set_var(name, out) + +def from_keras(model): + """Convert keras model to NNVM format. + + Parameters + ---------- + model : keras.engine.training.Model + The keras model to be converted + + Returns + ------- + sym : nnvm.Symbol + Compatible nnvm symbol + + params : dict of str to tvm.NDArray + The parameter dict to be used by nnvm + """ + try: + import keras + except ImportError: + raise ImportError('Keras must be installed') + + assert isinstance(model, keras.engine.training.Model) + if keras.backend.backend() != 'tensorflow': + raise ValueError("Keras frontend currently supports tensorflow backend only.") + if keras.backend.image_data_format() != 'channels_last': + raise ValueError("Keras frontend currently supports data_format = channels_last only.") + _check_unsupported_layers(model) + + symtab = SymbolTable() + for keras_layer in model.layers: + if isinstance(keras_layer, keras.engine.InputLayer): + symtab.get_var(keras_layer.name, must_contain=False) + else: + inbound_nodes = keras_layer.inbound_nodes if hasattr(keras_layer, 'inbound_nodes') \ + else keras_layer._inbound_nodes if hasattr(keras_layer, '_inbound_nodes') \ + else None + if inbound_nodes is None: + raise TypeError("Unknown layer type or unsupported Keras version : {}" + .format(keras_layer)) + for node_idx, node in enumerate(inbound_nodes): + # If some nodes in imported model is not relevant to the current model, + # skip such layers. model._network_nodes contains keys of all nodes relevant + # to the current model. + if not model._node_key(keras_layer, node_idx) in model._network_nodes: + continue + + insym = [] + + # Since Keras allows creating multiple layers from the same name instance, + # we append node index to the symbol name to make it unique. + # The one exception is InputLayer. Changing input variable names after conversion + # would confuse users, so we should keep them as far as possible. 
Fortunately, + # they are named uniquely to input_1, input_2, input_3 ... by default. + zip_node = zip(node.node_indices, node.tensor_indices, node.inbound_layers) + for n_idx, t_idx, layer in zip_node: + if isinstance(layer, keras.engine.InputLayer): + sym = symtab.get_var(layer.name, must_contain=True) + else: + sym_name = layer.name + ':' + str(n_idx) + ':' + str(t_idx) + sym = symtab.get_var(sym_name, must_contain=True) + insym.append(sym) + + if len(insym) == 1: + insym = insym[0] + keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(node_idx), symtab) + + #model._output_coordinates contains out_node(oc[0]), node_index(oc[1]) and tensor index(oc[2]) + #Get all output nodes in symtab using the name made from above values. The out symbols + #were added to symtab in keras_op_to_nnvm using this name. For multiple outputs, make a list + #with these output symbols and Group them. + outsym = [symtab.get_var(oc[0].name + ":" + str(oc[1]) + ":" + str(oc[2])) + for oc in model._output_coordinates] + + tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()} + return _sym.Group(outsym), tvmparams diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py new file mode 100644 index 000000000000..8a92821476a5 --- /dev/null +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -0,0 +1,1038 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines +"""ONNX: Open Neural Network Exchange frontend.""" +from __future__ import absolute_import as _abs +import numpy as np +import tvm +from .. import symbol as _sym +from .common import get_nnvm_op, Renamer, SymbolTable, AttrConverter as AttrCvt +from .onnx_caffe2_utils import dimension_picker, dimension_constraint, \ + infer_channels, revert_caffe2_pad + +__all__ = ['from_onnx'] + + +def onnx_storage_order2layout(storage_order): + if storage_order not in (0, 1): + raise tvm.error.OpAttributeInvalid('Mode of storage_order must be either 0 or 1') + + return 'NCHW' if storage_order == 0 else 'NHWC' + + +class OnnxOpConverter(object): + """ A helper class for holding onnx op converters. + """ + + @classmethod + def get_converter(cls, opset): + """ Get converter matches given opset. + + :param opset: opset from model. + :return: converter, which should be `_impl_vx`. Number x is the biggest + number smaller than or equal to opset belongs to all support versions. 
+ """ + versions = [ + int(d.replace('_impl_v', '')) for d in dir(cls) if '_impl_v' in d + ] + versions = sorted(versions + [opset]) + version = versions[ + max([i for i, v in enumerate(versions) if v == opset]) - 1] + if hasattr(cls, '_impl_v{}'.format(version)): + return getattr(cls, '_impl_v{}'.format(version)) + raise NotImplementedError( + 'opset version {} of {} not implemented'.format( + version, cls.__name__)) + + +class Elemwise(OnnxOpConverter): + """ A helper class for elemwise op converters. + """ + + name = '' + + @classmethod + def _math_name_picker(cls, suffix): + + def _impl(attr): + if attr.get('broadcast', 0): + return 'broadcast_' + suffix + return 'elemwise_' + suffix + + return _impl + + @classmethod + def _impl_v1(cls, inputs, attr, params): + assert len(inputs) == 2, "Math op take 2 inputs, {} given".format( + len(inputs)) + op_name = cls._math_name_picker(cls.name)(attr) + axis = int(attr.get('axis', 0)) + conv_ops = ["conv2d", "conv2d_transpose"] + if op_name == 'broadcast_add' and inputs[0].attr('op_name') in conv_ops: + # TODO(zhreshold): remove hard coded infershape + inputs[1] = _sym.expand_dims(inputs[1], axis=axis, num_newaxis=2) + return get_nnvm_op(op_name)(*inputs) + + +class Pool(OnnxOpConverter): + """ A helper class for pool op converters. + """ + + name = '' + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad) + }, + # very weird attributes here in onnx, force check + ignores=['dilations'], + # TODO(zhreshold): make sure ceil_mode in onnx, and layout? + extras={'ceil_mode': False}, + custom_check=dimension_constraint())(inputs, attr, params) + + +class Absolute(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return _sym.relu(inputs[0]) + _sym.relu(_sym.negative(inputs[0])) + + +class Add(Elemwise): + name = 'add' + + +class AveragePool(Pool): + name = 'avg_pool' + + +class BatchNorm(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # TODO(zhreshold): 'spatial' is not properly handled here. 
+ return AttrCvt( + op_name='batch_norm', + disables=['momentum'], + ignores=['spatial', 'is_test', 'consumed_inputs'])(inputs, attr, + params) + + +class Conv(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # get number of channels + channels = infer_channels(inputs[1], params) + attr['channels'] = channels + return AttrCvt( + op_name=dimension_picker('conv'), + transforms={ + 'kernel_shape': 'kernel_size', + 'dilations': ('dilation', (0, 0)), + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'group': ('groups', 1) + }, + extras={'use_bias': len(inputs) == 3}, + custom_check=dimension_constraint())(inputs, attr, params) + + +class ConvTranspose(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # get number of channels + channels = infer_channels(inputs[1], params, True) + attr['channels'] = channels + groups = attr.pop('group') + attr['groups'] = groups + return AttrCvt( + op_name=dimension_picker('conv', '_transpose'), + transforms={ + 'kernel_shape': 'kernel_size', + 'dilations': ('dilation', (0, 0)), + 'pads': ('padding', (0, 0), revert_caffe2_pad) + }, + disables=['output_shape'], + extras={'use_bias': len(inputs) == 3}, + custom_check=dimension_constraint())(inputs, attr, params) + + +class Div(Elemwise): + name = 'div' + + +class Elu(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = float(attr.get('alpha', 1.0)) + return -alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu( + inputs[0]) + + +class Gemm(OnnxOpConverter): + """ Operator converter for Gemm. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + assert len(inputs) == 3, "Gemm op take 3 inputs, {} given".format( + len(inputs)) + # Y = alpha * A * B + beta * C + alpha = float(attr.get('alpha', 1.0)) + beta = float(attr.get('beta', 1.0)) + transA = int(attr.get('transA', 0)) + transB = int(attr.get('transB', 0)) + # get number of channels + channels = infer_channels(inputs[1], params, not transB) + if transA: + inputs[0] = _sym.transpose(inputs[0], axes=(1, 0)) + if not transB: + inputs[1] = _sym.transpose(inputs[1], axes=(1, 0)) + inputs[0] = _sym.flatten(inputs[0]) + return _sym.dense( + alpha * inputs[0], inputs[1], beta * inputs[2], units=channels) + + +class MaxPool(Pool): + """ Operator converter for MaxPool + """ + name = 'max_pool' + + @classmethod + def _impl_v8(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + # TODO(higumachan): make sure ceil_mode in onnx, and layout? + extras={'ceil_mode': False}, + custom_check=dimension_constraint())(inputs, attr, params) + + @classmethod + def _impl_v10(cls, inputs, attr, params): + return AttrCvt( + op_name=dimension_picker(cls.name), + transforms={ + 'kernel_shape': 'pool_size', + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'storage_order': ('layout', 'NCHW', onnx_storage_order2layout), + 'ceil_mode': 'ceil_mode' + }, + # very weird attributes here in onnx, force check + ignores=['dilations', 'auto_pad'], + custom_check=dimension_constraint())(inputs, attr, params) + +class Mul(Elemwise): + name = 'mul' + + +class Pad(OnnxOpConverter): + """ Operator converter for Pad. 
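+
+    ONNX gives flattened pads as (x1_begin, x2_begin, ..., x1_end, x2_end, ...);
+    these are regrouped into per-axis (before, after) pairs for the nnvm pad
+    operator. Only constant-mode padding is supported.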
+ """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + pad_width = [] + pads = attr.pop('paddings') + dims = int(len(pads) / 2) + for i in range(dims): + pad_width.append((pads[i], pads[i+dims])) + attr['pad_width'] = pad_width + + return AttrCvt( + op_name='pad', + transforms={ + 'value': 'pad_value', + }, + ignores=['mode'], + custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant', + 'split mode != constant'))(inputs, attr, params) + + @classmethod + def _impl_v2(cls, inputs, attr, params): + pad_width = [] + pads = attr.pop('pads') + dims = int(len(pads) / 2) + for i in range(dims): + pad_width.append((pads[i], pads[i+dims])) + attr['pad_width'] = pad_width + + return AttrCvt( + op_name='pad', + transforms={ + 'value': 'pad_value', + }, + ignores=['mode'], + custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant', + 'split mode != constant'))(inputs, attr, params) + + +class ParametricSoftPlus(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = float(attr.get('alpha', 1.0)) + beta = float(attr.get('beta', 1.0)) + return _sym.log(_sym.exp(beta * inputs[0]) + 1) * alpha + + +class Prelu(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format( + len(inputs)) + return _sym.prelu(inputs[0], inputs[1]) + + +class Reciprocal(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return 1.0 / inputs[0] + + +class Reshape(OnnxOpConverter): + """ Operator converter for Reshape. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return _sym.reshape(inputs[0], shape=attr['shape']) + + @classmethod + def _impl_v5(cls, inputs, attr, params): + if inputs[1].list_output_names()[0] in params: + shape = tuple(params[inputs[1].list_output_names()[0]].asnumpy()) + out = _sym.reshape(inputs[0], shape=shape) + else: + out = _sym.reshape_like(inputs[0], inputs[1]) + + return out + +class Scale(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + scale = float(attr.get('scale', 1.0)) + return inputs[0] * scale + + +class Selu(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = float(attr.get('alpha', 1.6732)) + gamma = float(attr.get('gamma', 1.0507)) + return gamma * ( + -alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0])) + + +class ScaledTanh(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = float(attr.get('alpha', 1.0)) + beta = float(attr.get('beta', 1.0)) + return _sym.tanh(beta * inputs[0]) * alpha + + +class SoftPlus(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return _sym.log(_sym.exp(inputs[0]) + 1) + + +class Softsign(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return inputs[0] / (1 + Absolute.get_converter(1)(inputs, attr, params)) + + +class Sub(Elemwise): + name = 'sub' + + +class Sum(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # Onnx Sum Operator + for in_index in range(len(inputs) - 1): + inputs[in_index + 1] = _sym.broadcast_add(inputs[in_index], + inputs[in_index + 1]) + + return inputs[len(inputs) - 1] + + +class ThresholdedRelu(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = float(attr.get('alpha', 1.0)) + alpha_tensor = _sym.full_like(inputs[0], fill_value=float(alpha)) + return 
_sym.elemwise_mul(inputs[0], _sym.greater(inputs[0], alpha_tensor)) + +class ImageScaler(OnnxOpConverter): + + @classmethod + def _impl_v1(cls, inputs, attr, params): + channelScale = attr['scale'] + bias_attr = attr['bias'] + bias = SymbolTable().new_const(np.array(bias_attr).reshape([3, 1, 1])) + scaledChannel = _sym.__mul_scalar__(inputs[0], scalar=channelScale) + ret = _sym.broadcast_add(scaledChannel, bias) + return ret + + +def _broadcast_constraint(): + + def _broadcast_check(attrs): + if attrs.get('axis', None): + return False + return True + + return _broadcast_check, "Specifying broadcast axis not allowed." + + +def _fully_connected(opset): + + def _impl(inputs, attr, params): + # get number of channels + channels = infer_channels(inputs[1], params) + attr['units'] = channels + return AttrCvt('dense', ignores=['axis', 'axis_w'])(inputs, attr) + + return _impl + + +class Upsample(OnnxOpConverter): + """ Operator converter for Upsample (nearest mode). + """ + + @classmethod + def _impl_v9(cls, inputs, attr, params): + scales = attr.get('scales') + if not scales: + #Here we are going to higher OPSET version. + assert len(inputs) == 2, "Upsample op take 2 inputs, {} given".format(len(inputs)) + input_name = inputs[1].list_input_names()[0] + scales = params[input_name].asnumpy() + inputs = inputs[:1] + assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3] + mode = attr.get('mode') + if mode == b'nearest': + method = "NEAREST_NEIGHBOR" + elif mode == b'linear': + method = "BILINEAR" + else: + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "mode" of operator Upsample is not valid.'.format(mode)) + return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW') + + +class Shape(OnnxOpConverter): + """ Operator converter for Shape. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # Result of this operator is prominently used by reshape operator. + # Just pass the input as it is so that reshape_like can be used there. + print("Shape: Differently implemented in NNVM as a bypass (dummy operator)") + return inputs[0] + +class Cast(OnnxOpConverter): + """ Operator converter for Cast. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + return AttrCvt(op_name='cast', transforms={'to': 'dtype'})(inputs, attr) + + @classmethod + def _impl_v5(cls, inputs, attr, params): + try: + from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE + attr['to'] = TENSOR_TYPE_TO_NP_TYPE[attr['to']] + except ImportError as e: + raise ImportError( + "Unable to import onnx.mapping which is required {}".format(e)) + return AttrCvt(op_name='cast', transforms={'to': 'dtype'})(inputs, attr) + + +class Unsqueeze(OnnxOpConverter): + """ Operator converter for Unsqueeze. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + for axes in attr['axes']: + inputs[0] = _sym.expand_dims(inputs[0], axis=axes, num_newaxis=1) + return inputs[0] + + +class Split(OnnxOpConverter): + """ Operator converter for Split. + """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + attr['indices_or_sections'] = [] + index = 0 + for i in attr['split'][:-1]: + index += i + attr['indices_or_sections'].append(index) + return AttrCvt( + op_name='split', + ignores=['split'])(inputs, attr, params) + + +class Slice(OnnxOpConverter): + """ Operator converter for Slice. 
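+
+    When 'axes' is sparse, starts/ends are densified over every axis up to
+    max(axes) (untouched axes get [0, INT32_MAX)) before lowering to
+    strided_slice.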
+ """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + if isinstance(attr['starts'], int): + attr['starts'] = (attr['starts'],) + attr['ends'] = (attr['ends'],) + + try: + # Update the starts and ends according to axes if required. + if isinstance(attr['axes'], int): + attr['axes'] = (attr['axes'],) + + if (max(attr['axes']) + 1) != len(attr['axes']): + new_axes = [] + new_starts = [] + new_ends = [] + pop_index = 0 + for i in range(max(attr['axes']) + 1): + if i in attr['axes']: + new_axes.append(i) + new_starts.append(attr['starts'][pop_index]) + new_ends.append(attr['ends'][pop_index]) + pop_index += 1 + else: + new_axes.append(i) + new_starts.append(0) + new_ends.append(np.iinfo(np.int32).max) + attr['axes'] = new_axes + attr['starts'] = new_starts + attr['ends'] = new_ends + except KeyError: + pass + + return AttrCvt(op_name='strided_slice', + transforms={'starts': 'begin', + 'ends': 'end'}, + ignores=['axes'])(inputs, attr) + +class Gather(OnnxOpConverter): + """ Operator converter for Gather. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + axis = attr.get('axis', 0) + return AttrCvt(op_name='take', + extras={'axis':axis})(inputs, attr) + +class LRN(OnnxOpConverter): + """ Operator converter for Local Response Normalization. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + """LRN support only NCHW format + https://github.com/onnx/onnx/blob/master/docs/Operators.md#LRN + """ + axis = 1 + alpha = attr.get('alpha', 0.0001) + beta = attr.get('beta', 0.75) + bias = attr.get('bias', 1.0) + nsize = attr.get('size') + return _sym.lrn(inputs[0], size=nsize, axis=axis, + alpha=alpha, beta=beta, bias=bias) + +class Maximum(OnnxOpConverter): + """ Operator converter for Maximum. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + if not isinstance(inputs, list) or len(inputs) < 2: + raise ValueError("Expect minimum 2 inputs") + _max = inputs[0] + for i in range(1, len(inputs)): + _max = AttrCvt(op_name='broadcast_max')([_max, inputs[i]], {}) + return _max + +class Minimum(OnnxOpConverter): + """ Operator converter for Minimum. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + if not isinstance(inputs, list) or len(inputs) < 2: + raise ValueError("Expect minimum 2 inputs") + _min = inputs[0] + for i in range(1, len(inputs)): + _min = AttrCvt(op_name='broadcast_min')([_min, inputs[i]], {}) + return _min + +class Mean(OnnxOpConverter): + """ Operator converter for Mean. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + if not isinstance(inputs, list) or len(inputs) < 2: + raise ValueError("Expect minimum 2 inputs") + count = len(inputs) + _sum = inputs[0] + for i in range(1, count): + _sum = AttrCvt(op_name='broadcast_add')([_sum, inputs[i]], {}) + return _sum / count + +class HardSigmoid(OnnxOpConverter): + """ Operator converter for HardSigmoid. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + alpha = attr.get('alpha', 0.2) + beta = attr.get('beta', 0.5) + transformX = (inputs[0] * alpha) + beta + attr = {'a_min':0, 'a_max':1} + return AttrCvt(op_name='clip')([transformX], attr) + +class ArgMax(OnnxOpConverter): + """ Operator converter for ArgMax. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + axis = attr.get('axis', 0) + keepdims = attr.get('keepdims', True) + attr = {'axis':axis, 'keepdims':keepdims} + return AttrCvt(op_name='argmax')(inputs, attr) + +class ArgMin(OnnxOpConverter): + """ Operator converter for ArgMin. 
+ """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + axis = attr.get('axis', 0) + keepdims = attr.get('keepdims', True) + attr = {'axis':axis, 'keepdims':keepdims} + return AttrCvt(op_name='argmin')(inputs, attr) + +class Softmax(OnnxOpConverter): + """ Operator converter for Softmax. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + # set default value when axis is not set in the model + if 'axis' not in attr: + attr['axis'] = 1 + return AttrCvt( + op_name='softmax', + transforms={ + 'axis': ('axis', 1), + })(inputs, attr, params) + +class ConstantFill(OnnxOpConverter): + """ Operator converter for ConstantFill. + """ + @classmethod + def _impl_v1(cls, inputs, attr, params): + is_full = True + num_inputs = len(inputs) + if 'shape' in attr: + if num_inputs > 0: + raise ImportError( + "Can't set shape and input tensor at a time") + shape = attr.pop('shape') + else: + if num_inputs == 0: + raise ImportError( + "Either shape attribute or input should be set") + if 'input_as_shape' in attr and attr['input_as_shape']: + shape = params[inputs[0].list_output_names()[0]].asnumpy() + else: + is_full = False + + if not is_full: + if 'extra_shape' in attr: + raise ImportError( + "Extra Shape not supported with fill_like") + + out = AttrCvt( + op_name='full_like', + transforms={'value': 'fill_value'}, + ignores=['dtype'])(inputs, attr) + return _sym.cast(out, dtype=attr['dtype'].decode("utf-8")) + if 'extra_shape' in attr: + shape = shape + attr.pop('extra_shape') + + return AttrCvt( + op_name='full', + transforms={'value': 'fill_value'}, + extras={'shape':shape})(inputs, attr) + +# compatible operators that do NOT require any conversion. +_identity_list = [] + + +# _convert_map defines maps of name to converter functor(callable) +# for 1 to 1 mapping, use Renamer if nothing but name is different +# use AttrCvt if attributes need to be converted +# for 1 to N mapping(composed), use custom callable functions +# for N to 1 mapping, currently not supported(?) 
+def _get_convert_map(opset): + return { + # defs/experimental + 'Identity': Renamer('copy'), + # 'Affine' + 'ThresholdedRelu': ThresholdedRelu.get_converter(opset), + 'ScaledTanh': ScaledTanh.get_converter(opset), + 'ParametricSoftplus': ParametricSoftPlus.get_converter(opset), + 'ConstantFill': ConstantFill.get_converter(opset), + # 'GivenTensorFill' + 'FC': AttrCvt('dense', ignores=['axis', 'axis_w']), + 'Scale': Scale.get_converter(opset), + # 'GRUUnit' + # 'ATen' + 'ImageScaler': ImageScaler.get_converter(opset), + # 'MeanVarianceNormalization' + # 'Crop' + # 'Embedding' + 'Upsample' : Upsample.get_converter(opset), + 'SpatialBN': BatchNorm.get_converter(opset), + + # defs/generator + # 'Constant' # Implemented + # 'RandomUniform' + # 'RandomNormal' + # 'RandomUniformLike' + # 'RandomNormalLike' + + # defs/logical + + # defs/math + 'Add': Add.get_converter(opset), + 'Sub': Sub.get_converter(opset), + 'Mul': Mul.get_converter(opset), + 'Div': Div.get_converter(opset), + 'Neg': Renamer('negative'), + 'Abs': Absolute.get_converter(opset), + 'Reciprocal': Reciprocal.get_converter(opset), + 'Floor': Renamer('floor'), + 'Ceil': Renamer('ceil'), + 'Sqrt': Renamer('sqrt'), + 'Relu': Renamer('relu'), + 'LeakyRelu': Renamer('leaky_relu'), + 'Selu': Selu.get_converter(opset), + 'Elu': Elu.get_converter(opset), + 'Exp': Renamer('exp'), + 'Log': Renamer('log'), + 'Tanh': Renamer('tanh'), + 'Pow': Renamer('broadcast_pow'), + 'PRelu': Prelu.get_converter(opset), + 'Sigmoid': Renamer('sigmoid'), + 'HardSigmoid': HardSigmoid.get_converter(opset), + 'Max': Maximum.get_converter(opset), + 'Min': Minimum.get_converter(opset), + 'Sum': Sum.get_converter(opset), + 'Mean': Mean.get_converter(opset), + 'Clip': AttrCvt('clip', transforms={'min': 'a_min', 'max': 'a_max'}), + # softmax default axis is different in onnx + 'Softmax': Softmax.get_converter(opset), + 'LogSoftmax': AttrCvt('log_softmax', {'axis': ('axis', 1)}), + # 'Hardmax' + 'Softsign': Softsign.get_converter(opset), + 'SoftPlus': SoftPlus.get_converter(opset), + 'Gemm': Gemm.get_converter(opset), + 'MatMul': Renamer('matmul'), + + # defs/nn + 'AveragePool': AveragePool.get_converter(opset), + 'MaxPool': MaxPool.get_converter(opset), + 'Conv': Conv.get_converter(opset), + 'ConvTranspose': ConvTranspose.get_converter(opset), + 'GlobalAveragePool': Renamer('global_avg_pool2d'), + 'GlobalMaxPool': Renamer('global_max_pool2d'), + 'BatchNormalization': BatchNorm.get_converter(opset), + # 'InstanceNormalization' + # 'LpNormalization' + 'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']), + 'Flatten': Renamer('flatten'), + 'LRN': LRN.get_converter(opset), + + # defs/reduction + 'ReduceMax': AttrCvt('max', {'axes': 'axis'}), + 'ReduceMin': AttrCvt('min', {'axes': 'axis'}), + 'ReduceSum': AttrCvt('sum', {'axes': 'axis'}), + 'ReduceMean': AttrCvt('mean', {'axes': 'axis'}), + # 'ReduceProd' + # 'ReduceLogSumExp' + 'ArgMax': ArgMax.get_converter(opset), + 'ArgMin': ArgMin.get_converter(opset), + + # defs/tensor + 'Cast': Cast.get_converter(opset), + 'Reshape': Reshape.get_converter(opset), + 'Concat': Renamer('concatenate'), + 'Split': Split.get_converter(opset), + 'Slice': Slice.get_converter(opset), + 'Transpose': AttrCvt('transpose', {'perm': 'axes'}), + 'Gather': Gather.get_converter(opset), + 'Squeeze': AttrCvt('squeeze', {'axes': 'axis'}), + 'Unsqueeze': Unsqueeze.get_converter(opset), + 'Pad': Pad.get_converter(opset), + 'Shape': Shape.get_converter(opset), + } + + +class GraphProto(object): + """A helper class for handling nnvm graph 
copying from pb2.GraphProto. + Definition: https://github.com/onnx/onnx/blob/master/onnx/onnx.proto + """ + + def __init__(self): + self._nodes = {} + self._params = {} + self._renames = {} + self._num_input = 0 + self._num_param = 0 + + def from_onnx(self, graph, opset): + """Construct nnvm nodes from onnx graph. + The inputs from onnx graph is vague, only providing "1", "2"... + For convenience, we rename the `real` input names to "input_0", + "input_1"... And renaming parameters to "param_0", "param_1"... + + Parameters + ---------- + graph : onnx protobuf object + The loaded onnx graph + opset : opset version + + Returns + ------- + sym : nnvm.sym.Symbol + The returned nnvm symbol + params : dict + A dict of name: tvm.nd.array pairs, used as pretrained weights + """ + # parse network inputs to nnvm, aka parameters + for init_tensor in graph.initializer: + if not init_tensor.name.strip(): + raise ValueError("Tensor's name is required.") + self._params[init_tensor.name] = self._parse_array(init_tensor) + for i in graph.input: + # from onnx v0.2, GraphProto.input has type ValueInfoProto, + # and the name is 'i.name' + i_name = self._parse_value_proto(i) + if i_name in self._params: + # i is a param instead of input + self._num_param += 1 + self._params[i_name] = self._params.pop(i_name) + self._nodes[i_name] = _sym.Variable( + name=i_name, shape=self._params[i_name].shape) + else: + self._num_input += 1 + self._nodes[i_name] = _sym.Variable(name=i_name) + # get list of unsupported ops + convert_map = _get_convert_map(opset) + unsupported_ops = set() + for node in graph.node: + op_name = node.op_type + if op_name not in convert_map and \ + op_name != 'Constant' and \ + op_name not in _identity_list: + unsupported_ops.add(op_name) + if unsupported_ops: + msg = 'The following operators are not supported for frontend ONNX: ' + msg += ', '.join(unsupported_ops) + raise tvm.error.OpNotImplemented(msg) + # construct nodes, nodes are stored as directed acyclic graph + for node in graph.node: + op_name = node.op_type + attr = self._parse_attr(node.attribute) + inputs = [self._nodes[self._renames.get(i, i)] for i in node.input] + if op_name == "Constant": + t_proto = self._parse_attr(node.attribute)["value"] + self._num_param += 1 + self._params[node.output[0]] = self._parse_array(t_proto) + self._nodes[node.output[0]] = _sym.Variable(name=node.output[0], + shape=list(t_proto.dims)) + else: + op = self._convert_operator(op_name, inputs, attr, opset) + node_output = self._fix_outputs(op_name, node.output) + assert len(node_output) == len(op.list_output_names()), ( + "Number of output mismatch {} vs {} in {}.".format( + len(node_output), len(op.list_output_names()), op_name)) + for k, i in zip(list(node_output), range(len(node_output))): + self._nodes[k] = op[i] + # now return the outputs + out = [self._nodes[self._parse_value_proto(i)] for i in graph.output] + if len(out) > 1: + out = _sym.Group(out) + else: + out = out[0] + return out, self._params + + def _parse_value_proto(self, value_proto): + """Parse ValueProto or raw str.""" + try: + name = value_proto.name + except AttributeError: + name = value_proto + return name + + def _parse_array(self, tensor_proto): + """Grab data in TensorProto and convert to numpy array.""" + try: + from onnx.numpy_helper import to_array + except ImportError as e: + raise ImportError( + "Unable to import onnx which is required {}".format(e)) + np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims)) + return tvm.nd.array(np_array) + + def 
_parse_attr(self, attr_proto):
+        """Convert a list of AttributeProto to a dict, with names as keys."""
+        attrs = {}
+        for a in attr_proto:
+            for f in ['f', 'i', 's']:
+                if a.HasField(f):
+                    attrs[a.name] = getattr(a, f)
+            for f in ['floats', 'ints', 'strings']:
+                if list(getattr(a, f)):
+                    assert a.name not in attrs, "Only one type of attr is allowed"
+                    attrs[a.name] = tuple(getattr(a, f))
+            for f in ['t']:
+                if a.HasField(f):
+                    attrs[a.name] = getattr(a, f)
+            for f in ['tensors']:
+                if list(getattr(a, f)):
+                    assert a.name not in attrs, "Only one type of attr is allowed"
+                    attrs[a.name] = tuple(getattr(a, f))
+            for f in ['g']:
+                if a.HasField(f):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            for f in ['graphs']:
+                if list(getattr(a, f)):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            if a.name not in attrs:
+                raise ValueError("Cannot parse attribute: \n{}\n.".format(a))
+        return attrs
+
+    def _convert_operator(self,
+                          op_name,
+                          inputs,
+                          attrs,
+                          opset,
+                          identity_list=None,
+                          convert_map=None):
+        """Convert from onnx operator to nnvm operator.
+        The converter must specify conversions explicitly for incompatible names,
+        and apply handlers to operator attributes.
+
+        Parameters
+        ----------
+        op_name : str
+            Operator name, such as Convolution, FullyConnected
+        inputs : list of nnvm.Symbol
+            List of input symbols.
+        attrs : dict
+            Dict of operator attributes
+        opset : int
+            Opset version
+        identity_list : list
+            List of operators that don't require conversion
+        convert_map : dict
+            Dict of name : callable, where name is the op's name that
+            requires conversion to nnvm, and callables are functions which
+            take attrs and return (new_op_name, new_attrs)
+
+        Returns
+        -------
+        sym : nnvm.Symbol
+            Converted nnvm Symbol
+        """
+        identity_list = identity_list if identity_list else _identity_list
+        convert_map = convert_map if convert_map else _get_convert_map(opset)
+        if op_name in identity_list:
+            sym = get_nnvm_op(op_name)(*inputs, **attrs)
+        elif op_name in convert_map:
+            sym = convert_map[op_name](inputs, attrs, self._params)
+        else:
+            raise tvm.error.OpNotImplemented(
+                'Operator {} is not supported in frontend ONNX.'.format(op_name))
+        return sym
+
+    def _fix_outputs(self, op_name, outputs):
+        """A hack to handle dropout or similar operators that have more than
+        one output in ONNX.
+        """
+        if op_name == 'Dropout':
+            if len(outputs) == 1:
+                return outputs
+            # TODO(zhreshold): support dropout mask?
+            outputs = outputs[:-1]
+        return outputs
+
+
+def from_onnx(model):
+    """Load onnx graph which is a python protobuf object into nnvm graph.
+    The companion parameters will be handled automatically.
+    The input names from the onnx graph are vague, only providing "1", "2"...
+    For convenience, we rename the `real` input names to "input_0",
+    "input_1"... and rename parameters to "param_0", "param_1"...
+ + Parameters + ---------- + model : protobuf object + ONNX ModelProto after ONNX v1.1.0 + + Returns + ------- + sym : nnvm.Symbol + Compatible nnvm symbol + + params : dict of str to tvm.ndarray + Dict of converted parameters stored in tvm.ndarray format + """ + g = GraphProto() + graph = model.graph + try: + opset = model.opset_import[0].version if model.opset_import else 1 + except AttributeError: + opset = 1 + sym, params = g.from_onnx(graph, opset) + return sym, params diff --git a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py new file mode 100644 index 000000000000..18f9263ecc0b --- /dev/null +++ b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Util functions shared by the ONNX and Caffe2 frontends.""" +from __future__ import absolute_import as _abs +from nnvm import graph as _graph +from nnvm.compiler import graph_util + + +def dimension_picker(prefix, surfix=''): + def _impl(attr): + kernel = attr['kernel_shape'] + if len(kernel) == 2: + return prefix + '2d' + surfix + raise NotImplementedError("Only 2d kernel supported.") + + return _impl + + +def dimension_constraint(): + def _dim_check(attrs): + if len(attrs['kernel_shape']) == 2: + return True + return False + + return _dim_check, "Only 2d kernel supported." + + +def infer_channels(inputs, params, transpose=False): + """A hack for getting 'channels' or 'units' since caffe2 don't provide + these attributes. We check the shape of weights provided to get the number. + """ + g = _graph.create(inputs) + shape_dict = {k: v.shape for k, v in params.items()} + _, out_shapes = graph_util.infer_shape(g, **shape_dict) + channels = out_shapes[0][0] if not transpose else out_shapes[0][1] + return channels + + +def revert_caffe2_pad(pads): + """Caffe2 require two times the normal padding.""" + if len(pads) == 4: + pads = pads[:2] + elif len(pads) == 2: + pass + else: + raise ValueError("Invalid caffe2 type padding: {}".format(pads)) + return pads diff --git a/nnvm/python/nnvm/graph.py b/nnvm/python/nnvm/graph.py new file mode 100644 index 000000000000..0d1e70f4e0f6 --- /dev/null +++ b/nnvm/python/nnvm/graph.py @@ -0,0 +1,288 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+# pylint: disable=invalid-name, protected-access, too-many-arguments, too-many-lines
+"""NNVM Graph IR API.
+
+This is a developer API that is used to manipulate and transform graphs.
+"""
+from __future__ import absolute_import as _abs
+
+import ctypes
+import json
+from ._base import _LIB
+from ._base import c_array, c_str, nn_uint, py_str, string_types
+from ._base import GraphHandle, SymbolHandle
+from ._base import check_call
+from .symbol import Variable, Symbol, Group as _Group
+
+class GraphIndex(object):
+    """Index for quickly accessing graph attributes.
+
+    Parameters
+    ----------
+    graph : Graph
+        The graph to build the index for.
+    """
+    def __init__(self, graph):
+        jgraph = json.loads(create(graph).apply("SaveJSON").json_attr("json"))
+        self.nodes = jgraph["nodes"]
+        self.entry_ptr = jgraph["node_row_ptr"]
+        self._name2nodeid = {n["name"]: i for i, n in enumerate(self.nodes)}
+        self.input_names = graph.symbol.list_input_names()
+        self.output_entries = jgraph["heads"]
+
+    @property
+    def num_nodes(self):
+        """Number of nodes in graph."""
+        return len(self.entry_ptr) - 1
+
+    @property
+    def num_node_entries(self):
+        """Number of node entries in graph."""
+        return self.entry_ptr[-1]
+
+    def node_id(self, key):
+        """Get the node index for a given key.
+
+        Parameters
+        ----------
+        key : str or int
+            The node key or index
+
+        Returns
+        -------
+        index : int
+            The node index
+        """
+        return self._name2nodeid[key]
+
+    def entry_id(self, key, value_index=0):
+        """Get the entry id of a node entry.
+
+        Parameters
+        ----------
+        key : str or int
+            The node key or index
+
+        value_index : int
+            The value index of output
+
+        Returns
+        -------
+        index : int
+            The entry index
+        """
+        if isinstance(key, (list, tuple)):
+            if len(key) != 3:
+                raise ValueError("Expect entry index to be tuple of 3 elems")
+            key, value_index, _ = key
+        idx = self.node_id(key) if isinstance(key, str) else key
+        assert value_index < self.entry_ptr[idx + 1]
+        return self.entry_ptr[idx] + value_index
+
+
+class Graph(object):
+    """Graph is the graph object that can be used to apply optimization passes.
+
+    It contains additional graph-wise attributes besides the internal symbol.
+    """
+    _tvm_tcode = 17
+
+    # pylint: disable=no-member
+    def __init__(self, handle):
+        """Initialize the function with handle
+
+        Parameters
+        ----------
+        handle : GraphHandle
+            the handle to the underlying C++ Graph
+        """
+        self.handle = handle
+        self._index = None
+
+    def __del__(self):
+        check_call(_LIB.NNGraphFree(self.handle))
+
+    def json_attr(self, key):
+        """Get attribute string from the graph.
+
+        Parameters
+        ----------
+        key : str
+            The key to get attribute from.
+
+        Returns
+        -------
+        value : str
+            The attribute value of the key; returns None if the attribute
+            does not exist.
+        """
+        ret = ctypes.c_char_p()
+        success = ctypes.c_int()
+        check_call(_LIB.NNGraphGetJSONAttr(
+            self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success)))
+        if success.value != 0:
+            json_str = py_str(ret.value)
+            return json.loads(json_str)[1]
+        return None
+
+    def _set_symbol_list_attr(self, key, value):
+        """Set the attribute of the graph.
+
+        Parameters
+        ----------
+        key : string
+            The key of the attribute
+        value : Symbol or list of Symbol
+            The symbol (or symbols, grouped automatically) to attach
+        """
+        if isinstance(value, list):
+            value = _Group(value)
+        if not isinstance(value, Symbol):
+            raise ValueError("value needs to be a grouped symbol")
+        check_call(_LIB.NNGraphSetNodeEntryListAttr_(
+            self.handle, c_str(key), value.handle))
+
+    def _set_json_attr(self, key, value, type_name=None):
+        """Set the attribute of the graph.
+
+        Parameters
+        ----------
+        key : string
+            The key of the attribute
+        value : value
+            Any type that can be dumped to json
+        type_name : string
+            The typename registered on c++ side.
+        """
+        if isinstance(value, string_types):
+            type_name = 'str'
+        elif type_name is None:
+            raise ValueError("Need to specify type_name")
+        json_value = json.dumps([type_name, value])
+        check_call(_LIB.NNGraphSetJSONAttr(
+            self.handle, c_str(key), c_str(json_value)))
+
+    @property
+    def _tvm_handle(self):
+        return self.handle.value
+
+    @property
+    def symbol(self):
+        shandle = SymbolHandle()
+        check_call(_LIB.NNGraphGetSymbol(self.handle, ctypes.byref(shandle)))
+        return Symbol(shandle)
+
+    def json(self):
+        """Get JSON representation of the graph
+
+        Returns
+        -------
+        json : str
+            JSON representation of the graph
+        """
+        return self.apply("SaveJSON").json_attr("json")
+
+    def _tvm_graph_json(self):
+        """Get TVM graph json"""
+        return self.json()
+
+    @property
+    def index(self):
+        if not self._index:
+            self._index = GraphIndex(self)
+        return self._index
+
+    def ir(self, join_entry_attrs=None, join_node_attrs=None):
+        """Get text form of graph ir.
+
+        Parameters
+        ----------
+        join_entry_attrs : list of str
+            List of graph NodeEntry attributes to be
+            printed along each operator.
+
+        join_node_attrs : list of str
+            List of graph node attributes to be
+            printed along each operator.
+        """
+        if join_entry_attrs:
+            self._set_json_attr("join_entry_attrs", join_entry_attrs, "list_str")
+        if join_node_attrs:
+            self._set_json_attr("join_node_attrs", join_node_attrs, "list_str")
+        return self.apply("PrintGraphIR").json_attr("graphir")
+
+    def apply(self, passes):
+        """Apply passes to the graph
+
+        Parameters
+        ----------
+        passes : str or list of str
+            The passes to be applied
+
+        Returns
+        -------
+        g : Graph
+            The transformed graph.
+        """
+        if isinstance(passes, string_types):
+            passes = [passes]
+        cpass = c_array(ctypes.c_char_p, [c_str(key) for key in passes])
+        ghandle = GraphHandle()
+        npass = nn_uint(len(passes))
+        check_call(_LIB.NNGraphApplyPasses(self.handle, npass, cpass, ctypes.byref(ghandle)))
+        return Graph(ghandle)
+
+
+def load_json(json_str):
+    """Create a new graph by loading from json
+
+    Parameters
+    ----------
+    json_str : str
+        The json string
+
+    Returns
+    -------
+    graph : Graph
+        The loaded graph
+    """
+    ret = create(Variable("x"))
+    ret._set_json_attr("json", json_str)
+    return ret.apply("LoadJSON")
+
+
+def create(symbol):
+    """Create a new graph from symbol.
+
+    Parameters
+    ----------
+    symbol : Symbol
+        The symbolic graph used to create Graph object.
+
+    Returns
+    -------
+    graph : Graph
+        A generated new graph object.
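+
+    Examples
+    --------
+    A minimal usage sketch (assuming the nnvm shared library is loadable):
+
+    >>> x = Variable("x")
+    >>> g = create(x)
+    >>> ir_text = g.ir()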
+ """ + ghandle = GraphHandle() + check_call(_LIB.NNGraphCreate( + symbol.handle, ctypes.byref(ghandle))) + return Graph(ghandle) diff --git a/nnvm/python/nnvm/libinfo.py b/nnvm/python/nnvm/libinfo.py new file mode 100644 index 000000000000..b3bfc753b9c2 --- /dev/null +++ b/nnvm/python/nnvm/libinfo.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# coding: utf-8 +"""Information about nnvm.""" +from __future__ import absolute_import +import sys +import os +import platform + +if sys.version_info[0] == 3: + import builtins as __builtin__ +else: + import __builtin__ + +def find_lib_path(): + """Find NNNet dynamic library files. + + Returns + ------- + lib_path : list(string) + List of all found path to the libraries + """ + if hasattr(__builtin__, "NNVM_BASE_PATH"): + base_path = __builtin__.NNVM_BASE_PATH + else: + base_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + + if hasattr(__builtin__, "NNVM_LIBRARY_NAME"): + lib_name = __builtin__.NNVM_LIBRARY_NAME + else: + lib_name = "nnvm_compiler" if sys.platform.startswith('win32') else "libnnvm_compiler" + + api_path = os.path.join(base_path, '..', '..', 'lib') + cmake_build_path_win = os.path.join(base_path, '..', '..', '..', 'build', 'Release') + cmake_build_path = os.path.join(base_path, '..', '..', '..', 'build') + install_path = os.path.join(base_path, '..', '..', '..') + dll_path = [base_path, api_path, cmake_build_path_win, cmake_build_path, + install_path] + + if sys.platform.startswith('linux') and os.environ.get('LD_LIBRARY_PATH', None): + dll_path.extend([p.strip() for p in os.environ['LD_LIBRARY_PATH'].split(":")]) + elif sys.platform.startswith('darwin') and os.environ.get('DYLD_LIBRARY_PATH', None): + dll_path.extend([p.strip() for p in os.environ['DYLD_LIBRARY_PATH'].split(":")]) + elif sys.platform.startswith('win32') and os.environ.get('PATH', None): + dll_path.extend([p.strip() for p in os.environ['PATH'].split(";")]) + + if sys.platform.startswith('win32'): + vs_configuration = 'Release' + if platform.architecture()[0] == '64bit': + dll_path.append(os.path.join(base_path, '..', '..', '..', 'build', vs_configuration)) + dll_path.append(os.path.join(base_path, '..', '..', '..', 'windows', 'x64', + vs_configuration)) + else: + dll_path.append(os.path.join(base_path, '..', '..', '..', 'build', vs_configuration)) + dll_path.append(os.path.join(base_path, '..', '..', '..', 'windows', vs_configuration)) + dll_path = [os.path.join(p, '%s.dll' % lib_name) for p in dll_path] + elif sys.platform.startswith('darwin'): + dll_path = [os.path.join(p, '%s.dylib' % lib_name) for p in dll_path] + else: + dll_path = [os.path.join(p, '%s.so' % lib_name) for p in dll_path] + + lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] + if not lib_path: + 
+        raise RuntimeError('Cannot find the NNVM library.\n' +
+                           'List of candidates:\n' + str('\n'.join(dll_path)))
+    return lib_path
+
+
+# current version
+__version__ = "0.8.0"
diff --git a/nnvm/python/nnvm/name.py b/nnvm/python/nnvm/name.py
new file mode 100644
index 000000000000..fe3d8311f1a6
--- /dev/null
+++ b/nnvm/python/nnvm/name.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+"""Automatic naming support for symbolic API."""
+from __future__ import absolute_import as _abs
+
+class NameManager(object):
+    """NameManager to do automatic naming.
+
+    Users can also inherit this class to change the naming behavior.
+    """
+    current = None
+
+    def __init__(self):
+        self._counter = {}
+        self._old_manager = None
+
+    def get(self, name, hint):
+        """Get the canonical name for a symbol.
+
+        This is the default implementation.
+        When the user specifies a name, that name will be used.
+
+        Otherwise, a name is generated automatically
+        based on the hint string.
+
+        Parameters
+        ----------
+        name : str or None
+            The name the user specified.
+
+        hint : str
+            A hint string, which can be used to generate a name.
+
+        Returns
+        -------
+        full_name : str
+            A canonical name for the symbol.
+        """
+        if name:
+            return name
+        if hint not in self._counter:
+            self._counter[hint] = 0
+        name = '%s%d' % (hint, self._counter[hint])
+        self._counter[hint] += 1
+        return name
+
+    def __enter__(self):
+        self._old_manager = NameManager.current
+        NameManager.current = self
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_manager
+        NameManager.current = self._old_manager
+
+
+class Prefix(NameManager):
+    """A name manager that always attaches a prefix to all names.
+
+    Examples
+    --------
+    >>> import nnvm as nn
+    >>> data = nn.symbol.Variable('data')
+    >>> with nn.name.Prefix('mynet_'):
+    ...     net = nn.symbol.FullyConnected(data, num_hidden=10, name='fc1')
+    >>> net.list_arguments()
+    ['data', 'mynet_fc1_weight', 'mynet_fc1_bias']
+    """
+    def __init__(self, prefix):
+        super(Prefix, self).__init__()
+        self._prefix = prefix
+
+    def get(self, name, hint):
+        name = super(Prefix, self).get(name, hint)
+        return self._prefix + name
+
+# initialize the default name manager
+NameManager.current = NameManager()
diff --git a/nnvm/python/nnvm/symbol.py b/nnvm/python/nnvm/symbol.py
new file mode 100644
index 000000000000..297d2ba7405a
--- /dev/null
+++ b/nnvm/python/nnvm/symbol.py
@@ -0,0 +1,405 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-import, protected-access
+"""Symbolic graph construction API.
+
+This namespace contains most of the registered operators.
+For a detailed list of operators, check out ``Core Tensor Operators``.
+"""
+from __future__ import absolute_import as _abs
+import sys as _sys
+import os as _os
+import ctypes as _ctypes
+from numbers import Number as _Number
+
+import numpy as np
+
+from . import _base
+from ._base import _LIB, check_call as _check_call, _FFI_MODE, _all_var_init
+from .attribute import AttrScope
+from . import _symbol_internal as _internal
+from . import contrib
+
+# Use a different version of SymbolBase.
+# When possible, use cython to speed up part of the computation.
+
+IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError
+
+try:
+    if _FFI_MODE == "ctypes":
+        raise ImportError()
+    if _sys.version_info >= (3, 0):
+        from ._cy3.symbol import SymbolBase, _init_symbol_module
+    else:
+        from ._cy2.symbol import SymbolBase, _init_symbol_module
+except IMPORT_EXCEPT:
+    # pylint: disable=wrong-import-position
+    from ._ctypes.symbol import SymbolBase, _init_symbol_module
+
+
+class Symbol(SymbolBase):
+    """Symbol is the basic operation unit for symbolic graph composition."""
+    # Disable dictionary storage; a Symbol also has no parent type.
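+    # __slots__ keeps instances lightweight: all state lives in the underlying
+    # C handle, and _tvm_tcode is the type code used when a Symbol is passed
+    # through the TVM FFI.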
+    __slots__ = []
+
+    _tvm_tcode = 16
+
+    @property
+    def _tvm_handle(self):
+        return self.handle.value
+
+    def __add__(self, other):
+        """x.__add__(y) <=> x+y"""
+        if isinstance(other, Symbol):
+            return __add_symbol__(self, other)
+        if isinstance(other, _Number):
+            return __add_scalar__(self, scalar=other)
+        raise TypeError("type %s not supported" % str(type(other)))
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        """x.__sub__(y) <=> x-y"""
+        if isinstance(other, Symbol):
+            return __sub_symbol__(self, other)
+        if isinstance(other, _Number):
+            return __sub_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __rsub__(self, other):
+        if isinstance(other, _Number):
+            return __rsub_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __mul__(self, other):
+        """x.__mul__(y) <=> x*y"""
+        if isinstance(other, Symbol):
+            return __mul_symbol__(self, other)
+        if isinstance(other, _Number):
+            return __mul_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __div__(self, other):
+        """x.__div__(y) <=> x/y"""
+        if isinstance(other, Symbol):
+            return __div_symbol__(self, other)
+        if isinstance(other, _Number):
+            return __div_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __rdiv__(self, other):
+        if isinstance(other, _Number):
+            return __rdiv_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __lshift__(self, other):
+        """x.__lshift__(y) <=> x << y"""
+        if isinstance(other, _Number):
+            return __lshift_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __rshift__(self, other):
+        """x.__rshift__(y) <=> x >> y"""
+        if isinstance(other, _Number):
+            return __rshift_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __truediv__(self, other):
+        return self.__div__(other)
+
+    def __rtruediv__(self, other):
+        return self.__rdiv__(other)
+
+    def __pow__(self, other):
+        """x.__pow__(y) <=> x**y"""
+        if isinstance(other, Symbol):
+            return __pow_symbol__(self, other)
+        if isinstance(other, _Number):
+            return __pow_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __rpow__(self, other):
+        if isinstance(other, _Number):
+            return __rpow_scalar__(self, scalar=other)
+        raise TypeError('type %s not supported' % str(type(other)))
+
+    def __neg__(self):
+        """x.__neg__() <=> -x"""
+        return self.__mul__(-1.0)
+
+    def __copy__(self):
+        return self.__deepcopy__()
+
+    def __deepcopy__(self, _=None):
+        """Returns a deep copy of the input object."""
+        handle = _base.SymbolHandle()
+        _base.check_call(_LIB.NNSymbolCopy(self.handle,
+                                           _ctypes.byref(handle)))
+        return Symbol(handle)
+
+    def __getitem__(self, index):
+        if isinstance(index, _base.string_types):
+            idx = None
+            for i, name in enumerate(self.list_output_names()):
+                if name == index:
+                    if idx is not None:
+                        raise ValueError('There are multiple outputs with name \"%s\"' % index)
+                    idx = i
+            if idx is None:
+                raise ValueError('Cannot find output that matches name \"%s\"' % index)
+            index = idx
+        if not isinstance(index, int):
+            raise TypeError('Symbol only supports an integer index to fetch the i-th output')
+        handle = _base.SymbolHandle()
+        _check_call(_LIB.NNSymbolGetOutput(
+            self.handle, _base.nn_uint(index), _ctypes.byref(handle)))
+        return Symbol(handle=handle)
+
+    def __iter__(self):
+        return (self[i] for i in self.list_output_names())
+
+    def attr(self, key):
+        """Get an attribute string from the symbol; this function only works
+        for non-grouped symbols.
+
+        Parameters
+        ----------
+        key : str
+            The key to get the attribute from.
+
+        Returns
+        -------
+        value : str
+            The attribute value of the key; returns None if the attribute
+            does not exist.
+        """
+        ret = _ctypes.c_char_p()
+        success = _ctypes.c_int()
+        _check_call(_LIB.NNSymbolGetAttr(
+            self.handle, _base.c_str(key), _ctypes.byref(ret), _ctypes.byref(success)))
+        if success.value != 0:
+            return _base.py_str(ret.value)
+        return None
+
+    def list_attr(self, recursive=False):
+        """Get all attributes from the symbol.
+
+        Parameters
+        ----------
+        recursive : bool
+            Default `False`. When `recursive` is `True`, recursively list all
+            the attributes in the descendants. The attribute names are
+            prepended with the symbol names to avoid conflicts. If `False`,
+            then only attributes that belong to this symbol are returned, and
+            the attribute names will **not** be prepended with the symbol name.
+        """
+        size = _base.nn_uint()
+        pairs = _ctypes.POINTER(_ctypes.c_char_p)()
+        option = _ctypes.c_int(0) if recursive else _ctypes.c_int(1)
+        _check_call(_LIB.NNSymbolListAttrs(
+            self.handle, option, _ctypes.byref(size), _ctypes.byref(pairs)))
+        return {_base.py_str(pairs[i*2]): _base.py_str(pairs[i*2+1]) for i in range(size.value)}
+
+    def get_internals(self):
+        """Get a new grouped symbol whose output contains all the internal outputs of this symbol.
+
+        Returns
+        -------
+        sgroup : Symbol
+            The internals of the symbol.
+        """
+        handle = _base.SymbolHandle()
+        _check_call(_LIB.NNSymbolGetInternals(
+            self.handle, _ctypes.byref(handle)))
+        return Symbol(handle=handle)
+
+    def get_children(self):
+        """Gets a new grouped symbol whose output contains
+        inputs to output nodes of the original symbol."""
+        handle = _base.SymbolHandle()
+        _check_call(_LIB.NNSymbolGetChildren(
+            self.handle, _ctypes.byref(handle)))
+        ret = Symbol(handle=handle)
+        if not ret.list_output_names():
+            return None
+        return ret
+
+    def _get_list_copt(self, option):
+        """internal function to get list option"""
+        if option == 'all':
+            return _ctypes.c_int(0)
+        if option == 'read_only':
+            return _ctypes.c_int(1)
+        if option == 'aux_state':
+            return _ctypes.c_int(2)
+        raise ValueError("option needs to be in {'all', 'read_only', 'aux_state'}")
+
+    def list_input_variables(self, option='all'):
+        """List all the input variables in the symbol.
+
+        Parameters
+        ----------
+        option : {'all', 'read_only', 'aux_state'}, optional
+            The listing option
+            - 'all' will list all the arguments.
+            - 'read_only' lists arguments that are read by the graph.
+            - 'aux_state' lists arguments that are mutated by the graph as state.
+        Returns
+        -------
+        vars : list of symbol
+            List of all the variables
+        """
+        size = _ctypes.c_uint()
+        sarr = _ctypes.POINTER(_base.SymbolHandle)()
+        _check_call(_LIB.NNSymbolListInputVariables(
+            self.handle, self._get_list_copt(option),
+            _ctypes.byref(size), _ctypes.byref(sarr)))
+        return [Symbol(_base.SymbolHandle(sarr[i])) for i in range(size.value)]
+
+    def list_input_names(self, option='all'):
+        """List all the inputs in the symbol.
+
+        Parameters
+        ----------
+        option : {'all', 'read_only', 'aux_state'}, optional
+            The listing option
+            - 'all' will list all the arguments.
+            - 'read_only' lists arguments that are read by the graph.
+            - 'aux_state' lists arguments that are mutated by the graph as state.
+        Returns
+        -------
+        args : list of string
+            List of all the arguments.
+        """
+        size = _ctypes.c_uint()
+        sarr = _ctypes.POINTER(_ctypes.c_char_p)()
+        _check_call(_LIB.NNSymbolListInputNames(
+            self.handle, self._get_list_copt(option),
+            _ctypes.byref(size), _ctypes.byref(sarr)))
+        return [_base.py_str(sarr[i]) for i in range(size.value)]
+
+    def list_output_names(self):
+        """List all outputs in the symbol.
+
+        Returns
+        -------
+        returns : list of string
+            List of all the outputs.
+        """
+        size = _ctypes.c_uint()
+        sarr = _ctypes.POINTER(_ctypes.c_char_p)()
+        _check_call(_LIB.NNSymbolListOutputNames(
+            self.handle, _ctypes.byref(size), _ctypes.byref(sarr)))
+        return [_base.py_str(sarr[i]) for i in range(size.value)]
+
+    def debug_str(self):
+        """Get a debug string.
+
+        Returns
+        -------
+        debug_str : string
+            Debug string of the symbol.
+        """
+        debug_str = _ctypes.c_char_p()
+        _check_call(_LIB.NNSymbolPrint(
+            self.handle, _ctypes.byref(debug_str)))
+        return _base.py_str(debug_str.value)
+
+    def _add_control_deps(self, deps):
+        """Add control flow dependencies.
+        This makes the current op depend on the deps.
+        Only use when necessary; this function mutates the current symbol node.
+
+        Parameters
+        ----------
+        deps : Symbol or list of Symbol
+            The dependencies.
+        """
+        if isinstance(deps, list):
+            deps = Group(deps)
+        _check_call(_LIB.NNAddControlDeps(
+            self.handle, deps.handle))
+
+
+def Variable(name, init=None, **kwargs):
+    """Create a symbolic variable with the specified name.
+
+    Parameters
+    ----------
+    name : str
+        Name of the variable.
+    init : Symbol or numpy.ndarray
+        Symbol or numpy ndarray of the initial value for the variable.
+        Note that a symbolic initialization value must be definable through
+        InferShape, such as sym.zeros_like(v), in which v is an input or
+        parameter. Otherwise, pass a numpy ndarray instead.
+    kwargs : dict of string -> string
+        Additional attributes to set on the variable.
+
+    Returns
+    -------
+    variable : Symbol
+        The created variable symbol.
+    """
+    if not isinstance(name, _base.string_types):
+        raise TypeError('Expect a string for variable `name`')
+    handle = _base.SymbolHandle()
+    _base.check_call(_LIB.NNSymbolCreateVariable(
+        _base.c_str(name), _ctypes.byref(handle)))
+    ret = Symbol(handle)
+    attr = AttrScope.current.get(kwargs)
+    if attr:
+        ret._set_attr(**attr)
+    if init is not None:
+        if not isinstance(init, (Symbol, np.ndarray)):
+            raise TypeError('Expect a Symbol or numpy ndarray '
+                            'for variable `init`')
+        _all_var_init[name] = init
+    return ret
+
+
+def Group(symbols):
+    """Create a symbol that groups symbols together.
+
+    Parameters
+    ----------
+    symbols : list
+        List of symbols to be grouped.
+
+    Returns
+    -------
+    sym : Symbol
+        The created group symbol.
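+
+    Examples
+    --------
+    >>> import nnvm.symbol as sym
+    >>> a = sym.Variable('a')
+    >>> b = sym.Variable('b')
+    >>> c = sym.Group([a, b])
+    >>> len(c.list_output_names())
+    2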
+ """ + ihandles = [] + for sym in symbols: + if not isinstance(sym, Symbol): + raise TypeError('Expect Symbols in the list input') + ihandles.append(sym.handle) + handle = _base.SymbolHandle() + _check_call(_LIB.NNSymbolCreateGroup( + _base.nn_uint(len(ihandles)), + _base.c_array(_base.SymbolHandle, ihandles), + _ctypes.byref(handle))) + return Symbol(handle) + +# Set the real symbol class to Symbol +_init_symbol_module(Symbol, "nnvm") diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py new file mode 100644 index 000000000000..506a9e9aa68b --- /dev/null +++ b/nnvm/python/nnvm/testing/__init__.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Utilities for testing and benchmarks""" +from __future__ import absolute_import as _abs + +from .config import ctx_list +from .utils import create_workload +from . import mobilenet +from . import mobilenet_v2 +from . import mlp +from . import resnet +from . import vgg +from . import densenet +from . import squeezenet +from . import inception_v3 +from . import dcgan +from . import dqn +from . import check_computation diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py new file mode 100644 index 000000000000..63b3a17880a2 --- /dev/null +++ b/nnvm/python/nnvm/testing/check_computation.py @@ -0,0 +1,573 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
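+#
+# The helpers below back the NNVM operator tests: infer_shapes_dtypes runs
+# shape and dtype inference on a graph, graph_to_function compiles a graph
+# into a plain callable for a given target, and check_function compares a
+# symbol's outputs and gradients against reference NumPy implementations and
+# against numerically computed gradients.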
+# pylint: disable=cell-var-from-loop,no-else-return +"""Helper utilities to check functions and their gradients.""" +from __future__ import absolute_import as _abs + +import logging +import numpy as np + +import tvm +from tvm.contrib import graph_runtime +from tvm.testing import check_numerical_grads +from tvm import relay + +import nnvm +from nnvm.compiler import graph_util +from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE +from nnvm.to_relay import to_relay +from .config import ctx_list + +def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None): + """Runs dtype and shape inference passes on a graph and returns the resulting graph + along with the inferred information. + + Parameters + ---------- + graph : nnvm.graph.Graph + A graph we want to run inference on. + + shape : Dict[str, Tuple[int]] or Tuple[int], optional + A dict mapping input variable names to shapes. + By default shapes will be inferred from variables' attributes. + Note that this parameter takes precedence over variables' attributes. + + dtype : Dict[str, str] or str, optional + A dict mapping input variable names to dtypes, or just a single dtype. + By default dtypes will be inferred from variables' attributes. + Note that this parameter takes precedence over variables' attributes. + + fallback_dtype : str, optional + A dtype that will be used for variables whose dtype can't be inferred from other + variables' dtypes. + + Returns + ------- + graph : nnvm.graph.Graph + The resulting graph with dtype and shape information on its nodes. + + input_shapes : Dict[str, Tuple[int]] + The inferred shapes of input variables merged with the `shape` dictionary. + + input_dtypes : Dict[str, str] + The inferred dtypes of input variables merged with the `dtype` dictionary. + + output_shapes : List[Tuple[int]] + The inferred shapes of outputs. + + output_dtypes : List[str] + The inferred dtypes of outputs. 
+    """
+    # Preprocess input parameters
+    if shape is None:
+        provided_shapes = {}
+    elif isinstance(shape, dict):
+        provided_shapes = shape
+    else:
+        provided_shapes = {x: shape for x in graph.symbol.list_input_variables()}
+
+    if dtype is None:
+        provided_dtypes = {}
+    elif isinstance(dtype, dict):
+        provided_dtypes = dtype
+    else:
+        provided_dtypes = {x: dtype for x in graph.symbol.list_input_variables()}
+
+    provided_shapes = _dict_var_to_dict_str(provided_shapes)
+    provided_dtypes = _dict_var_to_dict_str(provided_dtypes)
+
+    # The graph may already contain shape and dtype info, so extract it and merge with
+    # the user-specified shapes and dtypes (the user-specified ones win on conflict)
+    preexisting_shapes = graph.json_attr('shape')
+    preexisting_dtypes = graph.json_attr('dtype')
+
+    if preexisting_shapes:
+        for x in graph.index.input_names:
+            if x not in provided_shapes:
+                x_shape = tuple(preexisting_shapes[graph.index.entry_id(x)])
+                provided_shapes[x] = x_shape
+
+    if preexisting_dtypes:
+        for x in graph.index.input_names:
+            if x not in provided_dtypes:
+                x_dtype = TCODE_TO_DTYPE[preexisting_dtypes[graph.index.entry_id(x)]]
+                provided_dtypes[x] = x_dtype
+
+    # Perform inference
+    nnvm.compiler.graph_attr.set_shape_inputs(graph, provided_shapes)
+    nnvm.compiler.graph_attr.set_dtype_inputs(graph, provided_dtypes)
+
+    graph = graph.apply('InferShape').apply('InferType')
+
+    inferred_shapes = graph.json_attr('shape')
+    inferred_dtypes = graph.json_attr('dtype')
+
+    index = graph.index
+
+    output_shapes = [tuple(inferred_shapes[index.entry_id(entry)])
+                     for entry in index.output_entries]
+    output_dtypes = [TCODE_TO_DTYPE[inferred_dtypes[index.entry_id(entry)]]
+                     for entry in index.output_entries]
+
+    # Postprocess the results
+    input_shapes = provided_shapes.copy()
+    input_dtypes = provided_dtypes.copy()
+
+    for x in graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_entry_id = graph.index.entry_id(x_name)
+        input_shapes[x_name] = tuple(inferred_shapes[x_entry_id])
+        input_dtypes[x_name] = TCODE_TO_DTYPE[inferred_dtypes[x_entry_id]]
+
+    # Merge the original user-specified shapes in case some of them are specified for
+    # non-existing variables
+    for x_name, x_shape in provided_shapes.items():
+        x_shape = tuple(x_shape)
+        if input_shapes.get(x_name, x_shape) != x_shape:
+            raise RuntimeError("Inferred shape differs from the provided shape.\n"
+                               "Provided shapes: {}\nInferred shapes: {}"
+                               .format(provided_shapes, input_shapes))
+        else:
+            input_shapes[x_name] = x_shape
+
+    # Merge the original user-specified dtypes
+    for x_name, x_dtype in provided_dtypes.items():
+        if not isinstance(x_dtype, str):
+            x_dtype = TCODE_TO_DTYPE[x_dtype]
+        if input_dtypes.get(x_name, x_dtype) != x_dtype:
+            raise RuntimeError("Inferred dtype differs from the provided dtype.\n"
+                               "Provided dtypes: {}\nInferred dtypes: {}"
+                               .format(provided_dtypes, input_dtypes))
+        else:
+            input_dtypes[x_name] = x_dtype
+
+    # If some dtypes weren't inferred and there is a fallback dtype, assign it to those
+    # variables and repeat the inference
+    if fallback_dtype is not None and not all(input_dtypes.values()):
+        input_dtypes = {x: input_dtypes[x] if input_dtypes[x] else fallback_dtype
+                        for x in input_dtypes}
+        return infer_shapes_dtypes(graph, input_shapes, input_dtypes, fallback_dtype=None)
+
+    return graph, input_shapes, input_dtypes, output_shapes, output_dtypes
+
+def graph_to_function(graph, target, ctx, shape=None, dtype=None):
+    """Convert a graph to a function taking keyword args and returning a
list of results
+    (both args and results are numpy arrays).
+
+    Example::
+
+        fun = graph_to_function(graph, 'llvm', tvm.cpu(0))
+        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to convert to a function.
+
+    target : str or :any:`tvm.target.Target`
+        The build target
+
+    ctx : TVMContext
+        The context to deploy the module.
+
+    shape : Dict[str, Tuple[int]], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    Returns
+    -------
+    function : Callable[..., List[numpy.ndarray]]
+    """
+    # Infer missing shapes and dtypes
+    graph, shape, dtype, output_shapes, output_dtypes = \
+        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)
+
+    if None in dtype.values():
+        raise ValueError("Input variables with no type: {}".format(dtype))
+
+    if not all(shape.values()):
+        raise ValueError("Input variables with no shape: {}".format(shape))
+
+    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
+    module = graph_runtime.create(compute_graph, lib, ctx)
+
+    if params:
+        module.set_input(**params)
+
+    def run(**kwargs):
+        module.run(**kwargs)
+        res = []
+        for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
+            res.append(module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
+        return res
+
+    return run
+
+def _dict_var_to_dict_str(dictionary):
+    """Convert a Dict[nnvm.Symbol, T] to Dict[str, T]"""
+    if isinstance(dictionary, dict):
+        return {s.attr('name') if isinstance(s, nnvm.symbol.Symbol) else s:
+                dictionary[s] for s in dictionary}
+    else:
+        return dictionary
+
+def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
+                   shape=None, dtype=None, in_range=None, values=None,
+                   exclude_targets=None, only_targets=None,
+                   additional_params=None,
+                   numerical_grads=None, numerical_grads_params=None,
+                   atol=1e-5, rtol=1e-5, quiet=False):
+    """Compute the function and/or its gradients on a random input and raise
+    an exception if the result doesn't match the reference implementation.
+
+    Parameters
+    ----------
+    symbol : nnvm.Symbol
+        A symbol representing the output.
+
+    forward : Callable[..., List[numpy.ndarray]], optional
+        A reference implementation to compare with.
+
+    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
+        A reference implementation of gradients. Besides the normal inputs, it
+        should also accept head_grads, which is a list of gradients of some
+        scalar wrt the outputs, or just a single gradient if there is only one
+        output. Should return either a dict mapping input variable names to
+        the respective gradients or a list of gradients wrt variables from
+        grad_input_vars in exactly the same order (in alphabetical order by
+        default).
+
+    grad_input_vars : List[nnvm.Symbol or str], optional
+        A list of variables with respect to which the gradients will be computed.
+        None (default) means that all input variables will be used in alphabetical order.
+
+    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes, or just a single shape.
+ By default shapes will be inferred from variables' attributes (see the Examples). + Note that this parameter takes precedence over variables' attributes. + + dtype : Dict[nnvm.Symbol or str, str] or str, optional + A dict mapping input variable names to dtypes, or just a single dtype. + By default dtypes will be inferred from variables' attributes (see the Examples). + If dtypes cannot be inferred for some variables then float32 will be used as a fallback. + Note that this parameter takes precedence over variables' attributes. + + in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional + A dict mapping input variable names to ranges or just a single range + (the same for all variables). Input values will be generated from + uniform distributions on these ranges. `head_grads` can also be + assigned a range this way. + + values : Dict[nnvm.Symbol or str, numpy.ndarray], optional + A dict explicitly providing values for some variables instead of random generation. + + exclude_targets : Set[str], optional + Skip compiling and running anything for these targets. + + only_targets : Set[str], optional + Test only for those targets from `ctx_list()` that are also in this set. + + additional_params : dict, optional + A dict of additional parameters which will be passed to forward and backward. + + numerical_grads : bool or 'if_possible', optional + Whether to additionally check against numerically computed gradients. If 'if_possible' or + None is passed (which is the default) then it will try to create a gradient computation + graph and then check gradients numerically only if this graph can be created (i.e. if there + are some operations with unimplemented gradients, it will just issue a warning). + Checking against numerical gradients is done via the `check_numerical_grads` function. + + numerical_grads_params : dict, optional + Additional parameters for `check_numerical_grads`. + + atol : float, optional + Absolute tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients. + + rtol : float, optional + Relative tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients. + + quiet : bool, optional + Don't dump additional information to stdout on failure. + + Examples + -------- + .. code-block:: python + + x = sym.Variable("x", shape=(1, 2)) + y = sym.Variable("y", shape=(1, 2)) + + # check the function and its gradients both numerically and using a reference function + check_function(x + 2*y, + lambda x, y: x + 2*y, + lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads}) + + # just check gradients numerically + check_function(x + 2*y, numerical_grads=True) + + # just check the forward computation + check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False) + + # specifying dtype + check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64') + + # dtypes can also be specified during variable creation with dtype codes + x = sym.Variable("x", dtype=0) + check_function(x + 1, shape=(2, 2), numerical_grads=True) + """ + # validate and preprocess the input params + if numerical_grads is None and forward is None and backward is None: + raise ValueError("No reference function was passed to check_function. 
If you only want to " + "check gradients numerically, pass numerical_grads=True explicitly.") + + if numerical_grads is None: + numerical_grads = 'if_possible' + + if numerical_grads not in [False, True, 'if_possible']: + raise ValueError("numerical_grads must be a bool or 'if_possible', not {}" + .format(numerical_grads)) + + if additional_params is None: + additional_params = {} + + input_vars = symbol.list_input_variables() + input_dict = {x.attr('name'): x for x in input_vars} + + if grad_input_vars is None: + grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name')) + else: + grad_input_vars = [input_dict[x] if isinstance(x, str) else x for x in grad_input_vars] + + in_range = _dict_var_to_dict_str(in_range) + values = _dict_var_to_dict_str(values) + + out_len = len(symbol.list_output_names()) + + # Infer the output shapes and dtypes, and preprocess the shape and dtype params + forward_graph, shape, dtype, out_shapes, out_dtypes = \ + infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype, + fallback_dtype='float32') + + if not all(out_shapes) or not all(out_dtypes): + if not quiet: + print(forward_graph.ir(join_node_attrs=['shape', 'dtype'])) + raise ValueError("Could not infer shapes or dtypes for outputs.\n" + "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes)) + + backward_graph = None + + # If we want gradients, we have to recreate the graph, but now with gradient computations + # Note that here we need out_shapes for defining the shape of head grads, so we have to + # create the graph twice + if backward is not None or numerical_grads: + try: + head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i), + shape=out_shapes[i], + dtype=DTYPE_TO_TCODE[out_dtypes[i]]) + for i in range(out_len)] + grad_symbols = graph_util.gradients([symbol], grad_input_vars, + grad_ys=head_grads_symbols) + # Sometimes grads do not depend on head_grads, so head_grads does not appear + # in the variable list; adding it manually prevents this, making things a bit easier + backward_graph = \ + nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols + head_grads_symbols)) + + backward_graph, shape, dtype, out_shapes, out_dtypes = \ + infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype, + fallback_dtype='float32') + except nnvm._base.NNVMError as err: + if backward is None and numerical_grads == "if_possible": + logging.warning("Won't check gradients because: %s", str(err).split('\n', 1)[0]) + numerical_grads = False + backward_graph = None + else: + raise + + main_graph = backward_graph if backward_graph is not None else forward_graph + + # Generate random data for inputs (including head_grads) + + np_inputs = {} + + for x in main_graph.symbol.list_input_variables(): + x_name = x.attr('name') + x_shape = shape[x_name] + x_dtype = dtype[x_name] + + if values is not None and x_name in values: + np_inputs[x_name] = values[x_name].astype(x_dtype) + continue + + low = -1.0 + high = 1.0 + if in_range is not None: + if isinstance(in_range, dict): + if x_name in in_range: + low = in_range[x_name][0] + high = in_range[x_name][1] + else: + low = in_range[0] + high = in_range[1] + + np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype) + + np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs + if not k.startswith('head_grads_')} + + nothing_was_done = True + + # Compute and compare the results + for target, ctx in ctx_list(): + if exclude_targets is not None: + if target in exclude_targets or str(target) in 
exclude_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+        if only_targets is not None:
+            if target not in only_targets and str(target) not in only_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+
+        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)
+
+        debug_stage = None
+
+        try:
+            nnvm_res = None
+
+            debug_stage = "compiling"
+            main_function = graph_to_function(main_graph, target, ctx)
+
+            # nnvm_res contains the output and gradients (if they are needed)
+            debug_stage = "running"
+            nnvm_res = main_function(**np_inputs)
+
+            try:
+                logging.debug("checking to_relay conversion")
+                inputs = np_inputs_without_head_grads.copy()
+                func, inputs = to_relay(main_graph, shape, dtype, params=inputs)
+                with relay.build_config(opt_level=3):
+                    graph, lib, params = relay.build(func, target=target)
+                m = graph_runtime.create(graph, lib, ctx)
+                m.set_input(**inputs)
+                m.set_input(**params)
+                m.run()
+                for i in range(out_len):
+                    relay_out = m.get_output(i).asnumpy()
+                    tvm.testing.assert_allclose(nnvm_res[i], relay_out, atol=atol, rtol=rtol)
+            except NotImplementedError as err:
+                # the NNVM operator is not supported yet
+                logging.warning(err)
+
+            if backward_graph is not None:
+                grad_var_names = [x.attr('name') for x in grad_input_vars]
+                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}
+
+            if forward is not None:
+                nothing_was_done = False
+                debug_stage = "checking forward computation"
+                logging.debug(debug_stage)
+
+                params = {}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_res = forward(**params)
+
+                if isinstance(numpy_res, tuple):
+                    numpy_res = list(numpy_res)
+
+                if not isinstance(numpy_res, list):
+                    numpy_res = [numpy_res]
+
+                if len(numpy_res) != out_len:
+                    raise ValueError("Forward function returned {} values, but "
+                                     "the nnvm graph returns {} values"
+                                     .format(len(numpy_res), out_len))
+
+                for i in range(out_len):
+                    tvm.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
+
+            if backward is not None:
+                nothing_was_done = False
+                debug_stage = "checking gradients"
+                logging.debug(debug_stage)
+
+                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]
+
+                if out_len == 1:
+                    np_head_grads = np_head_grads[0]
+
+                params = {'head_grads': np_head_grads}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_grads = backward(**params)
+
+                if not isinstance(numpy_grads, dict):
+                    if isinstance(numpy_grads, tuple):
+                        numpy_grads = list(numpy_grads)
+                    if not isinstance(numpy_grads, list):
+                        numpy_grads = [numpy_grads]
+                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
+                    if len(numpy_grads) != len(grad_var_names):
+                        raise ValueError("The backward function returns a list of gradients which "
+                                         "does not contain gradients for these variables: {}"
+                                         .format(set(grad_var_names) - set(numpy_grads)))
+
+                for x_name in numpy_grads:
+                    tvm.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
+                                                atol=atol, rtol=rtol)
+
+            if numerical_grads:
+                nothing_was_done = False
+                debug_stage = "checking gradients numerically"
+                logging.debug(debug_stage)
+
+                forward_function = graph_to_function(forward_graph, target, ctx)
+
+                # Since the result may be non-scalar, we have to put another operation on top,
+                # so we just multiply by the randomly generated head_grads and then sum everything.
+                # This way we can reuse the gradient values which have already been computed.
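+                # Concretely, scalar_function computes s = sum_i dot(head_grads_i, f_i(x));
+                # the analytical gradient of s wrt each input is exactly the
+                # head_grads-weighted gradient already stored in nnvm_grads, so the
+                # numerical and symbolic results are directly comparable.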
+ def scalar_function(**kwargs): + res = forward_function(**kwargs) + return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel()) + for i in range(out_len)]) + + if numerical_grads_params is None: + numerical_grads_params = {} + + check_numerical_grads( + scalar_function, + input_values=np_inputs_without_head_grads, + grad_values=nnvm_grads, + **numerical_grads_params) + + except: + if not quiet: + print("\ncheck_function failed while {}, here is the main graph" + .format(debug_stage)) + print(main_graph.ir(join_node_attrs=['shape', 'dtype'])) + if nnvm_res is not None: + print("Generated inputs:") + print(np_inputs) + print() + raise + + if nothing_was_done: + logging.warning("Nothing was done in check_function. Check ctx_list().") diff --git a/nnvm/python/nnvm/testing/config.py b/nnvm/python/nnvm/testing/config.py new file mode 100644 index 000000000000..175478b6e14a --- /dev/null +++ b/nnvm/python/nnvm/testing/config.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Configuration about tests""" +from __future__ import absolute_import as _abs + +import os +import tvm + +def ctx_list(): + """Get context list for testcases""" + device_list = os.environ.get("NNVM_TEST_TARGETS", "") + device_list = (device_list.split(",") if device_list + else ["llvm", "cuda"]) + device_list = set(device_list) + res = [(device, tvm.context(device, 0)) for device in device_list] + return [x for x in res if x[1].exist] diff --git a/nnvm/python/nnvm/testing/dcgan.py b/nnvm/python/nnvm/testing/dcgan.py new file mode 100644 index 000000000000..714b3fbb1301 --- /dev/null +++ b/nnvm/python/nnvm/testing/dcgan.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +Symbol of the generator of DCGAN + +Adopted from: +https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py + +Reference: +Radford, Alec, Luke Metz, and Soumith Chintala. +"Unsupervised representation learning with deep convolutional generative adversarial networks." 
+arXiv preprint arXiv:1511.06434 (2015). +""" +from .. import symbol as sym +from . utils import create_workload + +def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)): + """a deconv layer that enlarges the feature map""" + target_shape = (oshape[-2], oshape[-1]) + + pad_y = (kshape[0] - 1) // 2 + pad_x = (kshape[1] - 1) // 2 + adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0] + adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1] + + net = sym.conv2d_transpose(data, + kernel_size=kshape, + strides=stride, + channels=oshape[0], + padding=(pad_y, pad_x), + output_padding=(adj_y, adj_x), + use_bias=False, + name=name) + return net + +def deconv2d_bn_relu(data, prefix, **kwargs): + """a block of deconv + batch norm + relu""" + eps = 1e-5 + 1e-12 + net = deconv2d(data, name="%s_deconv" % prefix, **kwargs) + net = sym.batch_norm(net, epsilon=eps, name="%s_bn" % prefix) + net = sym.relu(net, name="%s_act" % prefix) + return net + +def get_symbol(oshape, ngf=128, code=None): + """get symbol of dcgan generator""" + assert oshape[-1] == 64, "Only support 64x64 image" + assert oshape[-2] == 64, "Only support 64x64 image" + + code = sym.Variable("data") if code is None else code + net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False) + net = sym.relu(net) + # 4 x 4 + net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4)) + # 8 x 8 + net = deconv2d_bn_relu( + net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2") + # 16x16 + net = deconv2d_bn_relu( + net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3") + # 32x32 + net = deconv2d_bn_relu( + net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4") + # 64x64 + net = deconv2d( + net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv") + net = sym.tanh(net) + return net + + +def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"): + """Get benchmark workload for a DCGAN generator + + Parameters + ---------- + batch_size : int + The batch size used in the model + oshape : tuple, optional + The shape of output image, layout="CHW" + ngf: int, optional + The number of final feature maps in the generator + random_len : int, optional + The length of random input + dtype : str, optional + The data type + + Returns + ------- + net : nnvm.symbol + The computational graph + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(oshape=oshape, ngf=ngf) + return create_workload(net, batch_size, (random_len, ), dtype) diff --git a/nnvm/python/nnvm/testing/densenet.py b/nnvm/python/nnvm/testing/densenet.py new file mode 100644 index 000000000000..92ba2bf46a8f --- /dev/null +++ b/nnvm/python/nnvm/testing/densenet.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+DenseNet, load model from gluon model zoo
+
+Reference:
+Huang, Gao, et al. "Densely Connected Convolutional Networks." CVPR 2017
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, num_layers=121, dtype="float32"):
+    """Get benchmark workload for DenseNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    num_layers : int, optional
+        The number of layers
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision import get_model
+
+    image_shape = (1, 3, 224, 224)
+
+    block = get_model('densenet%d' % num_layers, classes=num_classes, pretrained=False)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/nnvm/python/nnvm/testing/dqn.py b/nnvm/python/nnvm/testing/dqn.py
new file mode 100644
index 000000000000..b04475efa32a
--- /dev/null
+++ b/nnvm/python/nnvm/testing/dqn.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Symbol of Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+from .. import symbol as sym
+from . 
utils import create_workload + +def get_symbol(num_actions=18): + """get symbol of nature dqn""" + data = sym.Variable(name='data') + net = sym.conv2d(data, kernel_size=(8, 8), strides=(4, 4), padding=(0, 0), + channels=32, name='conv1') + net = sym.relu(net, name='relu1') + net = sym.conv2d(net, kernel_size=(4, 4), strides=(2, 2), padding=(0, 0), + channels=64, name='conv2') + net = sym.relu(net, name='relu2') + net = sym.conv2d(net, kernel_size=(3, 3), strides=(1, 1), padding=(0, 0), + channels=64, name='conv3') + net = sym.relu(net, name='relu3') + net = sym.flatten(net, name='flatten') + net = sym.dense(net, units=512, name='fc4') + net = sym.relu(net, name='relu4') + net = sym.dense(net, units=num_actions, name='fc5') + + return net + + +def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"): + """Get benchmark workload for a Deep Q Network + + Parameters + ---------- + batch_size : int + The batch size used in the model + num_actions : int, optional + Number of actions + image_shape : tuple, optional + The input image shape + dtype : str, optional + The data type + + Returns + ------- + net : nnvm.symbol + The computational graph + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_actions=num_actions) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py new file mode 100644 index 000000000000..e1614d7a9fed --- /dev/null +++ b/nnvm/python/nnvm/testing/inception_v3.py @@ -0,0 +1,270 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Inception V3, suitable for images with around 299 x 299 + +Reference: +Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." +arXiv preprint arXiv:1512.00567 (2015). + +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +# pylint: disable=invalid-name,missing-docstring,unused-argument +from .. 
import symbol as sym +from .utils import create_workload + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel, + strides=stride, padding=pad, use_bias=False, + name='%s%s_conv2d' % (name, suffix)) + bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5) + act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix)) + return act + +def Pooling(data, kernel, stride, pad, pool_type, name): + if pool_type == 'max': + return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name) + if pool_type == 'avg': + return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name, + count_include_pad=True) + raise ValueError("Invalid pooling type: " + pool_type) + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), + suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), + name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max", + name=('max_pool_%s_pool' % name)) + concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), 
pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_4') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), + name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, + num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), + suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), + name=('%s_tower_1' % name), suffix='_conv_3') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0), + name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), + suffix='_conv') + # concat + concat = sym.concatenate( + *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + + +def get_symbol(num_classes=1000, **kwargs): + data = sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 
= Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") + pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool1") + + # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0), + name="global_pool") + flatten = sym.flatten(data=pool, name="flatten") + fc1 = sym.dense(data=flatten, units=num_classes, name='fc1') + softmax = sym.softmax(data=fc1, name='softmax') + return softmax + +def get_workload(batch_size=1, num_classes=1000, + image_shape=(3, 299, 299), dtype="float32", **kwargs): + """Get benchmark workload for InceptionV3 + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/init.py b/nnvm/python/nnvm/testing/init.py new file mode 100644 index 000000000000..611c81e69483 --- /dev/null +++ b/nnvm/python/nnvm/testing/init.py @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
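Each of these testing modules exposes the same `get_workload` contract, returning a symbol plus an initialized parameter dict. A minimal sketch of how a benchmark script might consume the InceptionV3 workload above (assuming a built `nnvm_compiler` library and its `tvm` dependency are importable; the `llvm` target string is illustrative):

```python
import nnvm.compiler
import nnvm.testing

# get_workload returns the symbolic net and randomly initialized parameters.
net, params = nnvm.testing.inception_v3.get_workload(batch_size=1)

# Compile for a CPU target; the shape follows the (3, 299, 299) default image shape.
shape = {"data": (1, 3, 299, 299)}
graph, lib, params = nnvm.compiler.build(
    net, target="llvm", shape=shape, params=params)
```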
+"""Initializer of parameters.""" +import numpy as np + +class Initializer(object): + """The base class of an initializer.""" + def __init__(self, **kwargs): + self._kwargs = kwargs + + def __call__(self, desc, arr): + """Initialize an array + + Parameters + ---------- + desc : str + Initialization pattern descriptor. + + arr : NDArray + The array to be initialized. + """ + if desc.endswith('weight'): + self._init_weight(desc, arr) + elif desc.endswith('bias'): + self._init_bias(desc, arr) + elif desc.endswith('gamma'): + self._init_gamma(desc, arr) + elif desc.endswith('beta'): + self._init_beta(desc, arr) + elif desc.endswith('mean'): + self._init_mean(desc, arr) + elif desc.endswith('var'): + self._init_var(desc, arr) + else: + self._init_default(desc, arr) + + def _init_bias(self, _, arr): + arr[:] = 0.0 + + def _init_gamma(self, _, arr): + arr[:] = 1.0 + + def _init_beta(self, _, arr): + arr[:] = 0.0 + + def _init_mean(self, _, arr): + arr[:] = 0.0 + + def _init_var(self, _, arr): + arr[:] = 1.0 + + def _init_weight(self, name, arr): + """Abstract method to Initialize weight.""" + raise NotImplementedError("Must override it") + + def _init_default(self, name, _): + raise ValueError( + 'Unknown initialization pattern for %s. ' \ + 'Default initialization is now limited to '\ + '"weight", "bias", "gamma" (1.0), and "beta" (0.0).' \ + 'Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern' % name) + + +class Xavier(Initializer): + """ "Xavier" initialization for weights + + Parameters + ---------- + rnd_type: str, optional + Random generator type, can be ``'gaussian'`` or ``'uniform'``. + + factor_type: str, optional + Can be ``'avg'``, ``'in'``, or ``'out'``. + + magnitude: float, optional + Scale of random number. + """ + def __init__(self, rnd_type="uniform", factor_type="avg", magnitude=3): + super(Xavier, self).__init__(rnd_type=rnd_type, + factor_type=factor_type, + magnitude=magnitude) + self.rnd_type = rnd_type + self.factor_type = factor_type + self.magnitude = float(magnitude) + + def _init_weight(self, name, arr): + shape = arr.shape + hw_scale = 1. + if len(shape) < 2: + raise ValueError('Xavier initializer cannot be applied to vector {0}. It requires at' + ' least 2D.'.format(name)) + if len(shape) > 2: + hw_scale = np.prod(shape[2:]) + fan_in, fan_out = shape[1] * hw_scale, shape[0] * hw_scale + factor = 1. + if self.factor_type == "avg": + factor = (fan_in + fan_out) / 2.0 + elif self.factor_type == "in": + factor = fan_in + elif self.factor_type == "out": + factor = fan_out + else: + raise ValueError("Incorrect factor type") + # Hack for mobilenet, because there is less connectivity + if "depthwise" in name: + factor = 3 * 3 + scale = np.sqrt(self.magnitude / factor) + if self.rnd_type == "uniform": + arr[:] = np.random.uniform(-scale, scale, size=arr.shape) + else: + raise ValueError("Unknown random type") diff --git a/nnvm/python/nnvm/testing/mlp.py b/nnvm/python/nnvm/testing/mlp.py new file mode 100644 index 000000000000..1b6975661fe4 --- /dev/null +++ b/nnvm/python/nnvm/testing/mlp.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +A simple multilayer perceptron. +""" +from .. import symbol as sym +from . utils import create_workload + +def get_symbol(num_classes=1000): + data = sym.Variable('data') + data = sym.flatten(data=data) + fc1 = sym.dense(data=data, name='fc1', units=128) + act1 = sym.relu(data=fc1, name='relu1') + fc2 = sym.dense(data=act1, name='fc2', units=64) + act2 = sym.relu(data=fc2, name='relu2') + fc3 = sym.dense(data=act2, name='fc3', units=num_classes) + mlp = sym.softmax(data=fc3, name='softmax') + return mlp + +def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype="float32"): + """Get benchmark workload for a simple multilayer perceptron + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + Returns + ------- + net : nnvm.symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/mobilenet.py b/nnvm/python/nnvm/testing/mobilenet.py new file mode 100644 index 000000000000..e505ff499a54 --- /dev/null +++ b/nnvm/python/nnvm/testing/mobilenet.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Helper utility to get mobilenet workload for testing.""" +# pylint: disable=invalid-name +from __future__ import absolute_import as _abs + +from .. import symbol as sym +from .
utils import create_workload + +def conv_block(data, name, channels, + kernel_size=(3, 3), strides=(1, 1), padding=(1, 1), + epsilon=1e-5): + """Helper function to construct conv-bn-relu""" + # convolution + bn + relu + conv = sym.conv2d(data=data, channels=channels, + kernel_size=kernel_size, strides=strides, + padding=padding, use_bias=False, + layout="NCHW", name=name + "_conv") + bn = sym.batch_norm(data=conv, epsilon=epsilon, name=name + "_bn") + act = sym.relu(data=bn, name=name + "_relu") + return act + +def separable_conv_block(data, name, depthwise_channels, + pointwise_channels, kernel_size=(3, 3), + downsample=False, padding=(1, 1), + epsilon=1e-5): + """Helper function to get a separable conv block""" + if downsample: + strides = (2, 2) + else: + strides = (1, 1) + # depthwise convolution + bn + relu + conv1 = sym.conv2d(data=data, channels=depthwise_channels, + groups=depthwise_channels, kernel_size=kernel_size, strides=strides, + padding=padding, use_bias=False, layout="NCHW", + name=name + "_depthwise_conv1") + bn1 = sym.batch_norm(data=conv1, epsilon=epsilon, name=name + "_bn1") + act1 = sym.relu(data=bn1, name=name + "_relu1") + # pointwise convolution + bn + relu + conv2 = sym.conv2d(data=act1, channels=pointwise_channels, kernel_size=(1, 1), strides=(1, 1), + padding=(0, 0), use_bias=False, layout="NCHW", name=name + "_conv2") + bn2 = sym.batch_norm(data=conv2, epsilon=epsilon, name=name + "_bn2") + act2 = sym.relu(data=bn2, name=name + "_relu2") + return act2 + +def mobile_net(num_classes=1000, alpha=1.0, is_shallow=False): + """Function to construct a MobileNet""" + data = sym.Variable("data") + body = conv_block(data, "conv_block_1", int(32*alpha), strides=(2, 2)) + body = separable_conv_block(body, "separable_conv_block_1", + int(32*alpha), int(64*alpha)) + body = separable_conv_block(body, "separable_conv_block_2", + int(64*alpha), int(128*alpha), downsample=True) + body = separable_conv_block(body, "separable_conv_block_3", + int(128*alpha), int(128*alpha)) + body = separable_conv_block(body, "separable_conv_block_4", + int(128*alpha), int(256*alpha), downsample=True) + body = separable_conv_block(body, "separable_conv_block_5", + int(256*alpha), int(256*alpha)) + body = separable_conv_block(body, "separable_conv_block_6", + int(256*alpha), int(512*alpha), downsample=True) + if is_shallow: + body = separable_conv_block(body, "separable_conv_block_7", + int(512*alpha), int(1024*alpha), downsample=True) + body = separable_conv_block(body, "separable_conv_block_8", + int(1024*alpha), int(1024*alpha)) + else: + for i in range(7, 12): + body = separable_conv_block(body, "separable_conv_block_%d" % i, + int(512*alpha), int(512*alpha)) + body = separable_conv_block(body, "separable_conv_block_12", + int(512*alpha), int(1024*alpha), downsample=True) + body = separable_conv_block(body, "separable_conv_block_13", + int(1024*alpha), int(1024*alpha)) + pool = sym.global_avg_pool2d(data=body, name="pool") + flatten = sym.flatten(data=pool, name="flatten") + fc = sym.dense(data=flatten, units=num_classes, use_bias=False, name="fc") + softmax = sym.softmax(data=fc, name="softmax") + return softmax + + +def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), dtype="float32"): + """Get benchmark workload for mobilenet + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + Returns + 
------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = mobile_net(num_classes=num_classes, alpha=1.0, is_shallow=False) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/mobilenet_v2.py b/nnvm/python/nnvm/testing/mobilenet_v2.py new file mode 100644 index 000000000000..87c4a2c7e9f5 --- /dev/null +++ b/nnvm/python/nnvm/testing/mobilenet_v2.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +MobileNetV2, load model from gluon model zoo + +Reference: +Inverted Residuals and Linear Bottlenecks: +Mobile Networks for Classification, Detection and Segmentation +https://arxiv.org/abs/1801.04381 +""" + +from .utils import create_workload +from ..frontend.mxnet import _from_mxnet_impl + +def get_workload(batch_size, num_classes=1000, multiplier=1.0, dtype="float32"): + """Get benchmark workload for MobileNetV2 + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + multiplier : float, optional + The channel width multiplier of the model + + dtype : str, optional + The data type + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + import mxnet as mx + from mxnet.gluon.model_zoo.vision.mobilenet import MobileNetV2 + + image_shape = (1, 3, 224, 224) + + block = MobileNetV2(multiplier=multiplier, classes=num_classes) + + data = mx.sym.Variable('data') + sym = block(data) + sym = mx.sym.SoftmaxOutput(sym) + + net = _from_mxnet_impl(sym, {}) + + return create_workload(net, batch_size, image_shape[1:], dtype) diff --git a/nnvm/python/nnvm/testing/resnet.py b/nnvm/python/nnvm/testing/resnet.py new file mode 100644 index 000000000000..e63ceff7c3f0 --- /dev/null +++ b/nnvm/python/nnvm/testing/resnet.py @@ -0,0 +1,224 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
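The Initializer base class added above dispatches on the parameter-name suffix: __call__ routes *_weight to _init_weight (which Xavier overrides), *_bias to a zero fill, and so on. A hedged sketch of driving it directly, using plain numpy arrays in place of NDArrays (the methods only rely on slice assignment and .shape, so this suffices for illustration):

import numpy as np
from nnvm.testing.init import Xavier  # module path as added by this patch

init = Xavier(rnd_type="uniform", factor_type="avg", magnitude=3)
weight = np.zeros((64, 3, 3, 3), dtype="float32")
init("conv0_weight", weight)  # 'weight' suffix -> Xavier._init_weight
bias = np.zeros((64,), dtype="float32")
init("conv0_bias", bias)      # 'bias' suffix -> zero fill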
+ +''' +Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py +Original author Wei Wu + +Implements the following paper: + +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks" +''' +# pylint: disable=unused-argument +from .. import symbol as sym +from . utils import create_workload + +def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : Symbol + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same; + otherwise they differ + name : str + Base name of the operators + """ + if bottle_neck: + bn1 = sym.batch_norm(data=data, epsilon=2e-5, name=name + '_bn1') + act1 = sym.relu(data=bn1, name=name + '_relu1') + conv1 = sym.conv2d( + data=act1, channels=int(num_filter*0.25), kernel_size=(1, 1), + strides=stride, padding=(0, 0), use_bias=False, name=name + '_conv1') + bn2 = sym.batch_norm(data=conv1, epsilon=2e-5, name=name + '_bn2') + act2 = sym.relu(data=bn2, name=name + '_relu2') + conv2 = sym.conv2d( + data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3), + strides=(1, 1), padding=(1, 1), use_bias=False, name=name + '_conv2') + bn3 = sym.batch_norm(data=conv2, epsilon=2e-5, name=name + '_bn3') + act3 = sym.relu(data=bn3, name=name + '_relu3') + conv3 = sym.conv2d( + data=act3, channels=num_filter, kernel_size=(1, 1), + strides=(1, 1), padding=(0, 0), use_bias=False, name=name + '_conv3') + if dim_match: + shortcut = data + else: + shortcut = sym.conv2d( + data=act1, channels=num_filter, kernel_size=(1, 1), + strides=stride, use_bias=False, name=name+'_sc') + return sym.elemwise_add(conv3, shortcut) + else: + bn1 = sym.batch_norm(data=data, epsilon=2e-5, name=name + '_bn1') + act1 = sym.relu(data=bn1, name=name + '_relu1') + conv1 = sym.conv2d( + data=act1, channels=num_filter, kernel_size=(3, 3), + strides=stride, padding=(1, 1), use_bias=False, name=name + '_conv1') + bn2 = sym.batch_norm(data=conv1, epsilon=2e-5, name=name + '_bn2') + act2 = sym.relu(data=bn2, name=name + '_relu2') + conv2 = sym.conv2d( + data=act2, channels=num_filter, kernel_size=(3, 3), + strides=(1, 1), padding=(1, 1), use_bias=False, name=name + '_conv2') + if dim_match: + shortcut = data + else: + shortcut = sym.conv2d( + data=act1, channels=num_filter, kernel_size=(1, 1), + strides=stride, use_bias=False, name=name+'_sc') + return sym.elemwise_add(conv2, shortcut) + +def resnet(units, num_stages, filter_list, num_classes, image_shape, + bottle_neck=True): + """Return ResNet symbol + Parameters + ---------- + units : list + Number of units in each stage + num_stages : int + Number of stages + filter_list : list + Channel size of each stage + num_classes : int + Output size of symbol + dataset : str + Dataset type; only cifar10 and imagenet are supported + """ + num_unit = len(units) + assert num_unit == num_stages + data = sym.Variable(name='data') + data = sym.batch_norm(data=data, epsilon=2e-5, scale=False, name='bn_data') + (_, height, _) = image_shape + if height <= 32: # such as cifar10 + body = sym.conv2d( + data=data, channels=filter_list[0], kernel_size=(3, 3), + strides=(1, 1), padding=(1, 1), use_bias=False, name="conv0") + else: # often expected to be 224 such as imagenet + body = sym.conv2d( + data=data,
channels=filter_list[0], kernel_size=(7, 7), + strides=(2, 2), padding=(3, 3), use_bias=False, name="conv0") + body = sym.batch_norm(data=body, epsilon=2e-5, name='bn0') + body = sym.relu(data=body, name='relu0') + body = sym.max_pool2d(data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1)) + + for i in range(num_stages): + body = residual_unit( + body, filter_list[i+1], (1 if i == 0 else 2, 1 if i == 0 else 2), + False, name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck) + for j in range(units[i]-1): + body = residual_unit( + body, filter_list[i+1], (1, 1), True, + name='stage%d_unit%d' % (i + 1, j + 2), bottle_neck=bottle_neck) + bn1 = sym.batch_norm(data=body, epsilon=2e-5, name='bn1') + relu1 = sym.relu(data=bn1, name='relu1') + # Although kernel is not used here when global_pool=True, we should put one + pool1 = sym.global_avg_pool2d(data=relu1, name='pool1') + flat = sym.flatten(data=pool1) + fc1 = sym.dense(data=flat, units=num_classes, name='fc1') + return sym.softmax(data=fc1, name='softmax') + +def get_symbol(num_classes, num_layers=50, image_shape=(3, 224, 224), **kwargs): + """ + Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py + Original author Wei Wu + """ + (_, height, _) = image_shape + if height <= 28: + num_stages = 3 + if (num_layers-2) % 9 == 0 and num_layers >= 164: + per_unit = [(num_layers-2)//9] + filter_list = [16, 64, 128, 256] + bottle_neck = True + elif (num_layers-2) % 6 == 0 and num_layers < 164: + per_unit = [(num_layers-2)//6] + filter_list = [16, 16, 32, 64] + bottle_neck = False + else: + raise ValueError("no experiments done on num_layers {}".format(num_layers)) + units = per_unit * num_stages + else: + if num_layers >= 50: + filter_list = [64, 256, 512, 1024, 2048] + bottle_neck = True + else: + filter_list = [64, 64, 128, 256, 512] + bottle_neck = False + num_stages = 4 + if num_layers == 18: + units = [2, 2, 2, 2] + elif num_layers == 34: + units = [3, 4, 6, 3] + elif num_layers == 50: + units = [3, 4, 6, 3] + elif num_layers == 101: + units = [3, 4, 23, 3] + elif num_layers == 152: + units = [3, 8, 36, 3] + elif num_layers == 200: + units = [3, 24, 36, 3] + elif num_layers == 269: + units = [3, 30, 48, 8] + else: + raise ValueError("no experiments done on num_layers {}".format(num_layers)) + + return resnet(units=units, + num_stages=num_stages, + filter_list=filter_list, + num_classes=num_classes, + image_shape=image_shape, + bottle_neck=bottle_neck) + +def get_workload(batch_size=1, num_classes=1000, num_layers=18, + image_shape=(3, 224, 224), dtype="float32", **kwargs): + """Get benchmark workload for resnet + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + num_layers : int, optional + Number of layers + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. 
+ """ + net = get_symbol(num_classes=num_classes, num_layers=num_layers, + image_shape=image_shape, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py new file mode 100644 index 000000000000..eab2cf06fee6 --- /dev/null +++ b/nnvm/python/nnvm/testing/squeezenet.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=unused-argument + +""" +Symbol of SqueezeNet + +Reference: +Iandola, Forrest N., et al. +"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016). +""" + +from .. import symbol as sym +from . utils import create_workload + +# Helpers +def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels): + net = _make_fire_conv(net, squeeze_channels, 1, 0) + + left = _make_fire_conv(net, expand1x1_channels, 1, 0) + right = _make_fire_conv(net, expand3x3_channels, 3, 1) + # NOTE : Assume NCHW layout here + net = sym.concatenate(left, right, axis=1) + + return net + +def _make_fire_conv(net, channels, kernel_size, padding=0): + net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size), + padding=(padding, padding)) + net = sym.relu(net) + return net + +# Net +def get_symbol(num_classes, version, **kwargs): + """Get symbol of SqueezeNet + + Parameters + ---------- + num_classes: int + The number of classification results + + version : str, optional + "1.0" or "1.1" of SqueezeNet + """ + assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version)) + net = sym.Variable("data") + if version == '1.0': + net = sym.conv2d(net, channels=96, kernel_size=(7, 7), strides=(2, 2), padding=(3, 3)) + net = sym.relu(net) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 32, 128, 128) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 32, 128, 128) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 64, 256, 256) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 64, 256, 256) + else: + net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2), padding=(1, 1)) + net = sym.relu(net) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 16, 64, 64) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 32, 128, 128) + net = _make_fire(net, 32, 128, 128) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 48, 
192, 192) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 64, 256, 256) + net = _make_fire(net, 64, 256, 256) + net = sym.dropout(net, rate=0.5) + net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1)) + net = sym.relu(net) + net = sym.global_avg_pool2d(net) + net = sym.flatten(net) + return sym.softmax(net) + +def get_workload(batch_size=1, num_classes=1000, version='1.0', + image_shape=(3, 224, 224), dtype="float32", **kwargs): + """Get benchmark workload for SqueezeNet + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + version : str, optional + "1.0" or "1.1" of SqueezeNet + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, version=version, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/utils.py b/nnvm/python/nnvm/testing/utils.py new file mode 100644 index 000000000000..0bffc81a0663 --- /dev/null +++ b/nnvm/python/nnvm/testing/utils.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Helper utility to create common workload for testing.""" +from __future__ import absolute_import as _abs + +import numpy as np +import tvm +from ..compiler import graph_util +from ..import graph +from . init import Xavier + +def create_workload(net, batch_size, image_shape=(3, 224, 224), + dtype="float32", initializer=None, seed=0): + """Helper function to create benchmark workload for input network + + Parameters + ---------- + net : nnvm.Symbol + The selected network symbol to use + + batch_size : int + The batch size used in the model + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + initializer : Initializer + The initializer used + + seed : int + The seed used in initialization. + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. 
+ """ + if image_shape is None: + image_shape = (3, 224, 224) + data_shape = (batch_size,) + image_shape + params = {} + g = graph.create(net) + input_shapes, _ = graph_util.infer_shape(g, data=data_shape) + shape_dict = dict(zip(g.index.input_names, input_shapes)) + np.random.seed(seed) + initializer = initializer if initializer else Xavier() + for k, v in shape_dict.items(): + if k == "data": + continue + init_value = np.zeros(v).astype(dtype) + initializer(k, init_value) + params[k] = tvm.nd.array(init_value, ctx=tvm.cpu(0)) + return net, params diff --git a/nnvm/python/nnvm/testing/vgg.py b/nnvm/python/nnvm/testing/vgg.py new file mode 100644 index 000000000000..2c290bdc3c68 --- /dev/null +++ b/nnvm/python/nnvm/testing/vgg.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""References: + +Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for +large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). +""" +from .. import symbol as sym +from . utils import create_workload + +def get_feature(internel_layer, layers, filters, batch_norm=False): + """Get VGG feature body as stacks of convoltions.""" + for i, num in enumerate(layers): + for j in range(num): + internel_layer = sym.conv2d( + data=internel_layer, kernel_size=(3, 3), padding=(1, 1), + channels=filters[i], name="conv%s_%s"%(i + 1, j + 1)) + if batch_norm: + internel_layer = sym.batch_norm( + data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) + internel_layer = sym.relu(data=internel_layer, name="relu%s_%s" %(i + 1, j + 1)) + internel_layer = sym.max_pool2d( + data=internel_layer, pool_size=(2, 2), strides=(2, 2), name="pool%s"%(i + 1)) + return internel_layer + +def get_classifier(input_data, num_classes): + """Get VGG classifier layers as fc layers.""" + flatten = sym.flatten(data=input_data, name="flatten") + fc6 = sym.dense(data=flatten, units=4096, name="fc6") + relu6 = sym.relu(data=fc6, name="relu6") + drop6 = sym.dropout(data=relu6, rate=0.5, name="drop6") + fc7 = sym.dense(data=drop6, units=4096, name="fc7") + relu7 = sym.relu(data=fc7, name="relu7") + drop7 = sym.dropout(data=relu7, rate=0.5, name="drop7") + fc8 = sym.dense(data=drop7, units=num_classes, name="fc8") + return fc8 + +def get_symbol(num_classes, num_layers=11, batch_norm=False): + """ + Parameters + ---------- + num_classes : int, default 1000 + Number of classification classes. + num_layers : int + Number of layers for the variant of densenet. Options are 11, 13, 16, 19. + batch_norm : bool, default False + Use batch normalization. 
+ """ + vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]), + 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]), + 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), + 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])} + if num_layers not in vgg_spec: + raise ValueError("Invalide num_layers {}. Choices are 11,13,16,19.".format(num_layers)) + layers, filters = vgg_spec[num_layers] + data = sym.Variable(name="data") + feature = get_feature(data, layers, filters, batch_norm) + classifier = get_classifier(feature, num_classes) + symbol = sym.softmax(data=classifier, name='softmax') + return symbol + +def get_workload(batch_size, num_classes=1000, image_shape=(3, 224, 224), + dtype="float32", **kwargs): + """Get benchmark workload for VGG nets. + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of claseses + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py new file mode 100644 index 000000000000..94a736dabe70 --- /dev/null +++ b/nnvm/python/nnvm/to_relay.py @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-argument +"""Convert an NNVM graph to Relay.""" +import numpy + +from tvm import relay, nd +from tvm.relay import op, expr, var +from tvm.relay.frontend.common import StrAttrsDict +from tvm.relay.frontend.nnvm_common import _rename, _binop_scalar, _rbinop_scalar, \ + _elemwise_sum, _softmax_op, _compare, _reduce +from .symbol import Symbol +from .compiler import graph_attr +from .graph import create as graph_create + +def _nn_batch_flatten(children, attrs, odtype='float32'): + assert len(children) == 1 + return op.nn.batch_flatten(children[0]) + + +def _dense(children, attrs, odtype='float32'): + use_bias = attrs.get_bool('use_bias', True) + units = attrs.get_int('units') + dense = op.nn.dense(children[0], children[1], units=units) + if use_bias: + return op.nn.bias_add(dense, children[2]) + else: + return dense + +def _conv2d(children, attrs, odtype='float32'): + use_bias = attrs.get_bool('use_bias', True) + + if use_bias: + data, weight, bias = children + else: + data, weight = children + + kernel_size = attrs.get_int_tuple('kernel_size') + channels = attrs.get_int('channels') + strides = attrs.get_int_tuple('strides', (1, 1)) + padding = attrs.get_int_tuple('padding', (0, 0)) + dilation = attrs.get_int_tuple('dilation', (1, 1)) + groups = attrs.get_int('groups', 1) + data_layout = attrs.get_str('layout', 'NCHW') + kernel_layout = attrs.get_str('kernel_layout', 'OIHW') + out_layout = '' + out_dtype = attrs.get_str('out_dtype', '') + + conv_out = op.nn.conv2d( + data, + weight, + kernel_size=kernel_size, + channels=channels, + strides=strides, + padding=padding, + dilation=dilation, + groups=groups, + data_layout=data_layout, + kernel_layout=kernel_layout, + out_layout=out_layout, + out_dtype=out_dtype) + + if use_bias: + return op.nn.bias_add(conv_out, bias) + else: + return conv_out + + +def _conv2d_transpose(children, attrs, odtype='float32'): + use_bias = attrs.get_bool('use_bias', False) + + if use_bias: + data, weight, bias = children + else: + data, weight = children + + strides = attrs.get_int_tuple('strides', (1, 1)) + padding = attrs.get_int_tuple('padding', (0, 0)) + dilation = attrs.get_int_tuple('dilation', (1, 1)) + groups = attrs.get_int('groups', 1) + data_layout = attrs.get_str('layout', 'NCHW') + kernel_layout = attrs.get_str('kernel_layout', 'OIHW') + out_dtype = attrs.get_str('out_dtype', '') + + out_conv2d = op.nn.conv2d_transpose( + data, + weight, + strides=strides, + padding=padding, + dilation=dilation, + groups=groups, + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype=out_dtype) + + if use_bias: + return op.nn.bias_add(out_conv2d, bias) + else: + return out_conv2d + + +def _batch_norm(children, attrs, odtype='float32'): + data, gamma, beta, moving_mean, moving_view = children + axis = attrs.get_int('axis', 1) + epsilon = attrs.get_float('epsilon', 1e-05) + center = attrs.get_bool('center', True) + scale = attrs.get_bool('scale', True) + + return op.nn.batch_norm( + data, + gamma, + beta, + moving_mean, + moving_view, + axis=axis, + epsilon=epsilon, + center=center, + scale=scale)[0] + + +def _max_pool2d(children, attrs, odtype='float32'): + assert len(children) == 1 + data = children[0] + pool_size = attrs.get_int_tuple('pool_size', (1, 1)) + strides = attrs.get_int_tuple('strides', (1, 1)) + padding = attrs.get_int_tuple('padding', (0, 0)) + layout = attrs.get_str('layout', 'NCHW') + ceil_mode = attrs.get_bool('ceil_mode', False) + + return op.nn.max_pool2d( + 
data, + pool_size=pool_size, + strides=strides, + padding=padding, + layout=layout, + ceil_mode=ceil_mode) + + +def _reshape(children, attrs, odtype='float32'): + data = children[0] + shape = attrs.get_int_list('shape') + return op.reshape(data, shape) + + +def _transpose(children, attrs, odtype='float32'): + axes = attrs.get_int_list('axes', None) + return op.transpose(children[0], axes=axes) + + +def _clip(children, attrs, odtype='float32'): + a_min = attrs.get_float('a_min') + a_max = attrs.get_float('a_max') + return op.clip(children[0], a_min, a_max) + + +def _cast(children, attrs, odtype='float32'): + data = children[0] + dtype = attrs.get_str('dtype') + return data.astype(dtype) + + +def _expand_dims(children, attrs, odtype='float32'): + data = children[0] + axis = attrs.get_int('axis') + num_newaxis = attrs.get_int('num_newaxis', 1) + return op.transform.expand_dims(data, axis, num_newaxis=num_newaxis) + + +def broadcast_to(children, attrs, odtype='float32'): + # TODO(@jroesch) export broadcast to? + data = children[0] + shape = attrs.get_int_tuple('shape') + array = numpy.zeros(shape).astype(odtype) + rconst = relay.Constant(nd.array(array)) + return op.broadcast_to_like(data, rconst) + + +def _global_avg_pool2d(children, attrs, odtype='float32'): + data = children[0] + layout = attrs.get_str('layout', "NCHW") + return op.nn.global_avg_pool2d(data, layout) + + +def _avg_pool2d(children, attrs, odtype='float32'): + data = children[0] + pool_size = attrs.get_int_tuple('pool_size', (1, 1)) + strides = attrs.get_int_tuple('strides', (1, 1)) + padding = attrs.get_int_tuple('padding', (0, 0)) + layout = attrs.get_str('layout', "NCHW") + ceil_mode = attrs.get_bool('ceil_mode', False) + count_include_pad = attrs.get_bool('count_include_pad', False) + return op.nn.avg_pool2d( + data, + pool_size=pool_size, + strides=strides, + padding=padding, + layout=layout, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad) + + +def _upsampling(children, attrs, odtype='float32'): + scale = attrs.get_int('scale') + layout = attrs.get_str('layout', 'NCHW') + method = attrs.get_str('method', 'NEAREST_NEIGHBOR') + return op.nn.upsampling( + children[0], + scale_h=scale, + scale_w=scale, + layout=layout, + method=method) + + +def _pad(children, attrs, odtype='float32'): + pad_value = attrs.get_float('pad_value', 0.0) + pad_width = attrs.get_tuple_tuple_int('pad_width') + return op.nn.pad(children[0], pad_width, pad_value=pad_value) + +def _leaky_relu(children, attrs, odtype='float32'): + alpha = attrs.get_float('alpha') + return op.nn.leaky_relu(children[0], alpha) + + +def _full_like(children, attrs, odtype='float32'): + fill_value = relay.const(attrs.get_float('fill_value'), dtype='float32') + return op.full_like(children[0], fill_value) + + +def _strided_slice(children, attrs, odtype='float32'): + begin = attrs.get_int_list('begin') + end = attrs.get_int_list('end') + strides = attrs.get_int_list('stride', None) + return op.strided_slice(children[0], begin, end, strides=strides) + + +def _split(children, attrs, odtype='float32'): + indices_or_sections = None + try: + indices_or_sections = attrs.get_int('indices_or_sections', None) + except ValueError: + indices_or_sections = indices_or_sections or attrs.get_int_tuple( + 'indices_or_sections') + + axis = attrs.get_int('axis', 0) + + return op.split(children[0], indices_or_sections, axis) + +def _squeeze(children, attrs, odtype='float32'): + axis = attrs.get_int_tuple('axis', None) + axis = [axis] if isinstance(axis, int) else axis + + return
op.squeeze(children[0], axis) + +def _concatenate(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', 1) + return op.concatenate(children, axis) + +def _dropout(children, attrs, odtype='float32'): + rate = attrs.get_float('rate', 0.5) + return op.nn.dropout(children[0], rate) + +def _mean(children, attrs, odtype='float32'): + axis = attrs.get_int_tuple('axis', None) + keepdims = attrs.get_bool('keepdims') + + return op.mean(children[0], axis, keepdims) + + +def _prelu(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', 1) + return op.nn.prelu(children[0], children[1], axis) + + +def _lrn(children, attrs, odtype='float32'): + size = attrs.get_int("size", 5) + axis = attrs.get_int("axis", 1) + bias = attrs.get_float("bias", 2) + alpha = attrs.get_float("alpha", 1e-05) + beta = attrs.get_float("beta", 0.75) + return op.nn.lrn(children[0], size, axis, bias, alpha, beta) + + +def _l2_nomalize(children, attrs, odtype='float32'): + eps = attrs.get_float('eps') + axis = attrs.get_int_tuple('axis', None) + return op.nn.l2_normalize(children[0], eps, axis) + + +def _take(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', None) + return op.take(children[0], children[1], axis) + + +def _matmul(children, attrs, odtype='float32'): + input_1_t = op.transpose(children[1], axes=(1, 0)) + return op.nn.dense(children[0], input_1_t) + + +def _collapse_sum(children, attrs, odtype='float32'): + for key in ["axis", "keepdims", "exclude"]: + if key in attrs.attrs: + raise NotImplementedError("Parameter '" + key + "' is not supported.") + return op.collapse_sum_like(children[0], children[1]) + + +def _not_implemented(new_op): + def _impl(children, attrs, odtype='float32'): + raise NotImplementedError(str(new_op) + " is not implemented.") + return _impl + + +NNVM_OP_2_RELAY_OP = { + 'flatten': _nn_batch_flatten, + 'dense': _dense, + 'softmax': _softmax_op(op.nn.softmax), + 'log_softmax': _softmax_op(op.nn.log_softmax), + 'conv2d': _conv2d, + 'batch_norm': _batch_norm, + 'max_pool2d': _max_pool2d, + 'reshape': _reshape, + 'transpose': _transpose, + 'dropout': _dropout, + 'mean': _mean, + # Addition + '__add_scalar__': _binop_scalar(op.add), + 'broadcast_add' : _rename(op.add), + 'elemwise_add' : _rename(op.add), + # Subtraction + '__sub_scalar__' : _binop_scalar(op.subtract), + '__rsub_scalar__': _rbinop_scalar(op.subtract), + 'broadcast_sub' : _rename(op.subtract), + 'elemwise_sub' : _rename(op.subtract), + # Multiply + '__mul_scalar__': _binop_scalar(op.multiply), + 'broadcast_mul' : _rename(op.multiply), + 'elemwise_mul' : _rename(op.multiply), + # Division + '__div_scalar__': _binop_scalar(op.divide), + 'broadcast_div' : _rename(op.divide), + 'elemwise_div' : _rename(op.divide), + 'broadcast_mod' : _rename(op.mod), + # Negative + 'negative': _rename("negative"), + # Power + '__pow_scalar__': _binop_scalar(op.power), + '__rpow_scalar__': _rbinop_scalar(op.power), + 'broadcast_pow': _rename(op.power), + # Sum + 'sum': _reduce(op.sum), + 'elemwise_sum': _elemwise_sum, + 'collapse_sum': _collapse_sum, + 'broadcast_max': _rename(op.maximum), + 'broadcast_min': _rename(op.minimum), + + # Comparsion + 'greater': _compare(op.greater), + 'broadcast_greater': _compare(op.greater), + 'greater_equal': _compare(op.greater_equal), + 'broadcast_greater_equal': _compare(op.greater_equal), + 'less': _compare(op.less), + 'broadcast_less': _compare(op.less), + 'less_equal': _compare(op.less_equal), + 'broadcast_less_equal': _compare(op.less_equal), + 'broadcast_equal': 
_compare(op.equal), + 'broadcast_not_equal': _compare(op.not_equal), + + # Activations + 'sigmoid': _rename('sigmoid'), + 'relu': _rename('nn.relu'), + 'exp': _rename('exp'), + 'log': _rename('log'), + 'tanh': _rename('tanh'), + 'leaky_relu': _leaky_relu, + 'prelu': _prelu, + 'clip': _clip, + 'round': _rename('round'), + 'cast': _cast, + 'expand_dims': _expand_dims, + 'broadcast_to': broadcast_to, + '__lshift_scalar__': _binop_scalar(op.left_shift), + '__rshift_scalar__': _binop_scalar(op.right_shift), + 'broadcast_left_shift': _rename(op.left_shift), + 'broadcast_right_shift': _rename(op.right_shift), + 'copy': _rename(op.copy), + 'global_avg_pool2d': _global_avg_pool2d, + 'avg_pool2d': _avg_pool2d, + 'conv2d_transpose': _conv2d_transpose, + 'upsampling': _upsampling, + 'pad': _pad, + 'full_like': _full_like, + 'strided_slice': _strided_slice, + 'split': _split, + 'squeeze': _squeeze, + 'concatenate': _concatenate, + 'abs': _rename(op.abs), + 'ceil': _rename(op.ceil), + 'floor': _rename(op.floor), + 'trunc': _rename(op.trunc), + 'take': _take, + 'lrn': _lrn, + 'l2_normalize': _l2_nomalize, + 'matmul': _matmul, + 'zeros_like': _rename(op.zeros_like), + 'reshape_like': _rename(op.reshape_like), + 'ones_like': _rename(op.ones_like), + + 'expand_like': _not_implemented("expand_like"), + 'gather_nd': _not_implemented("gather_nd"), + 'block_grad': _not_implemented("block_grad"), +} + + +def to_relay(graph, shape_dict, dtype_dict, params): + """Convert an NNVM graph into the corresponding Relay expression. + + Parameters + ---------- + graph : Graph + The input graph. + + shape_dict : dict of str to shape + The input shapes. + + dtype_dict : dict of str to str/dtype + The input dtypes. + + params : dict of str to array + The parameters. + + Returns + ------- + (expr, params) : Tuple[relay.Expr, dict of str to array] + The corresponding Relay expression and parameters.
+ """ + if isinstance(graph, Symbol): + graph = graph_create(graph) + + param_shapes = dict((k, params[k].shape) for k in params) + shape_dict = shape_dict.copy() + shape_dict.update(param_shapes) + graph = graph_attr.set_shape_inputs(graph, shape_dict) + graph = graph_attr.set_dtype_inputs(graph, dtype_dict) + graph = graph.apply(["InferShape", "InferType"]) + shape = graph.json_attr("shape") + dtype = [graph_attr.TCODE_TO_DTYPE[di] for di in graph.json_attr("dtype")] + + gidx = graph.index + relay_map = {} + fn_params = [] + + for nid, node in enumerate(gidx.nodes): + children = [] + for i in node['inputs']: + child = relay_map[i[0]] + if isinstance(child, expr.TupleWrapper): + children.append(child[i[1]]) + else: + children.append(child) + + oshape = shape[gidx.entry_id(nid, 0)] + odtype = dtype[gidx.entry_id(nid, 0)] + attrs = node.get("attrs", {}) + node_name = node["name"] + op_name = node["op"] + + if op_name == "null": + v = var(node_name, shape=oshape, dtype=odtype) + fn_params.append(v) + relay_map[nid] = v + else: + if op_name in NNVM_OP_2_RELAY_OP: + str_attrs = StrAttrsDict(attrs) + call = NNVM_OP_2_RELAY_OP[op_name](children, str_attrs, odtype) + relay_map[nid] = call + else: + raise Exception( + "nnvm.to_relay: unsupported operator: {0}".format(op_name)) + + outputs = [] + for nid, idx, _ in gidx.output_entries: + output = relay_map[nid] + if isinstance(output, expr.TupleWrapper): + outputs.append(output[idx]) + else: + outputs.append(output) + + if len(outputs) == 1: + body = outputs[0] + else: + body = expr.Tuple(outputs) + + func = relay.Function(fn_params, body) + return func, params diff --git a/cmake/modules/contrib/DNNL.cmake b/nnvm/python/nnvm/top/__init__.py similarity index 63% rename from cmake/modules/contrib/DNNL.cmake rename to nnvm/python/nnvm/top/__init__.py index 3fd3f7cbc887..db80df03e269 100644 --- a/cmake/modules/contrib/DNNL.cmake +++ b/nnvm/python/nnvm/top/__init__.py @@ -15,14 +15,17 @@ # specific language governing permissions and limitations # under the License. -if(USE_DNNL_CODEGEN STREQUAL "ON") - file(GLOB DNNL_RELAY_CONTRIB_SRC src/relay/backend/contrib/dnnl/codegen.cc) - list(APPEND COMPILER_SRCS ${DNNL_RELAY_CONTRIB_SRC}) +"""Tensor operator property registry - find_library(EXTERN_LIBRARY_DNNL dnnl) - list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_DNNL}) - file(GLOB DNNL_CONTRIB_SRC src/runtime/contrib/dnnl/*) - list(APPEND RUNTIME_SRCS ${DNNL_CONTRIB_SRC}) - message(STATUS "Build with DNNL codegen: " ${EXTERN_LIBRARY_DNNL}) -endif() +Provide information to lower and schedule tensor operators. +""" +from .attr_dict import AttrDict +from . import tensor +from . import nn +from . import transform +from . import reduction +from . import vision +from . import image +from .registry import OpPattern +from .registry import register_compute, register_schedule, register_pattern diff --git a/nnvm/python/nnvm/top/attr_dict.py b/nnvm/python/nnvm/top/attr_dict.py new file mode 100644 index 000000000000..5082a587d5a0 --- /dev/null +++ b/nnvm/python/nnvm/top/attr_dict.py @@ -0,0 +1,175 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Attr dictionary object used by schedule functions""" +import tvm + +_dict_get = tvm.get_global_func("nnvm.compiler._dict_get") +_dict_size = tvm.get_global_func("nnvm.compiler._dict_size") +_dict_keys = tvm.get_global_func("nnvm.compiler._dict_keys") + +class AttrDict(object): + """Attribute dictionary in nnvm. + + Used by python registration of compute and schedule function. + AttrDict is passed as the first argument to schedule and compute function. + """ + _tvm_tcode = 18 + + def __init__(self, handle): + self.handle = handle + + def __del__(self): + tvm.nd.free_extension_handle(self.handle, 18) + + @property + def _tvm_handle(self): + return self.handle.value + + def __getitem__(self, key): + return _dict_get(self, key) + + def keys(self): + """Get list of keys in the dict. + + Returns + ------- + keys : list of str + List of keys + """ + return [x.value for x in _dict_keys(self)] + + def get_int_tuple(self, key): + """Get tuple of integer from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + tuple : tuple of int + The result tuple + """ + return tuple(int(x) for x in self[key][1:-1].split(",") if x) + + def get_int_pair_tuple(self, key): + """Get tuple of integer pairs from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + tuple : tuple of int pairs + The result tuple + """ + flat = [int(x.strip(' [] ')) for x in self[key][1:-1].split(",")] + return tuple((flat[i], flat[i+1]) for i in range(0, len(flat), 2)) + + def get_int(self, key): + """Get integer from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + value : int + The result value + """ + return int(self[key]) + + def get_float_tuple(self, key): + """Get tuple of float from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + tuple : tuple of float + The result tuple + """ + return tuple(float(x) for x in self[key][1:-1].split(",") if x) + + def get_float(self, key): + """Get float from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + value : float + The result value + """ + return float(self[key]) + + def get_bool(self, key): + """Get bool from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + value : bool + The result value + """ + lowercase = self[key].lower() + if lowercase == "1": + return True + if lowercase == "0": + return False + if lowercase == "true": + return True + if lowercase == "false": + return False + raise ValueError("Wrong bool format for key %s" % key) + + def get_str(self, key): + """Get string from attr dict + + Parameters + ---------- + key : str + The attr key + + Returns + ------- + value : str + The result value + """ + return self[key] + + def __repr__(self): + return str({k : self[k] for k in self.keys()}) + + +tvm.register_extension(AttrDict, AttrDict) diff --git a/nnvm/python/nnvm/top/image.py b/nnvm/python/nnvm/top/image.py new file mode 100644 index 000000000000..4367d982985c --- /dev/null +++ b/nnvm/python/nnvm/top/image.py 
@@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Definition of image ops""" +from __future__ import absolute_import + +import tvm +import topi +from . import registry as reg +from .registry import OpPattern + +# resize +@reg.register_schedule("resize") +def schedule_resize(_, outs, target): + """Schedule definition of resize""" + with tvm.target.create(target): + return topi.generic.schedule_injective(outs) + +reg.register_pattern("resize", OpPattern.INJECTIVE) diff --git a/nnvm/python/nnvm/top/reduction.py b/nnvm/python/nnvm/top/reduction.py new file mode 100644 index 000000000000..ce14d0d28831 --- /dev/null +++ b/nnvm/python/nnvm/top/reduction.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Reduction ops""" +from __future__ import absolute_import + +import tvm +import topi +import topi.cuda +from . 
import registry as reg +from .registry import OpPattern + +def _schedule_reduce(_, outs, target): + """Generic schedule for reduce""" + with tvm.target.create(target): + return topi.generic.schedule_reduce(outs) + + +_fschedule_reduce = tvm.convert(_schedule_reduce) + +def _compute_reduce(f): + """auxiliary function""" + def _compute(attrs, inputs, out_info): + axis = attrs.get_int_tuple("axis") + keepdims = attrs.get_bool("keepdims") + if axis: + return f(inputs[0], axis=axis, keepdims=keepdims) + return f(inputs[0], keepdims=keepdims) + return _compute + +# sum +reg.register_pattern("sum", OpPattern.COMM_REDUCE) +reg.register_schedule("sum", _fschedule_reduce) + +# max +reg.register_pattern("max", OpPattern.COMM_REDUCE) +reg.register_schedule("max", _fschedule_reduce) + +# min +reg.register_pattern("min", OpPattern.COMM_REDUCE) +reg.register_schedule("min", _fschedule_reduce) + +# collapse sum +reg.register_pattern("collapse_sum", OpPattern.COMM_REDUCE) +reg.register_schedule("collapse_sum", _fschedule_reduce) + +# argmax +reg.register_pattern("argmax", OpPattern.COMM_REDUCE) +reg.register_schedule("argmax", _fschedule_reduce) + +# argmin +reg.register_pattern("argmin", OpPattern.COMM_REDUCE) +reg.register_schedule("argmin", _fschedule_reduce) + +# mean +reg.register_pattern("mean", OpPattern.COMM_REDUCE) +reg.register_schedule("mean", _fschedule_reduce) + +# product +reg.register_pattern("prod", OpPattern.COMM_REDUCE) +reg.register_schedule("prod", _fschedule_reduce) diff --git a/nnvm/python/nnvm/top/registry.py b/nnvm/python/nnvm/top/registry.py new file mode 100644 index 000000000000..7ad10620f304 --- /dev/null +++ b/nnvm/python/nnvm/top/registry.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Information registry to register operator information for compiler""" +import tvm + +class OpPattern(object): + """Operator generic patterns + + See Also + -------- + top.tag : Contains explanation of the tag type. 
+ """ + # Elementwise operator + ELEMWISE = 0 + # Broadcast operator + BROADCAST = 1 + # Injective mapping + INJECTIVE = 2 + # Comunication + COMM_REDUCE = 3 + # Complex op, can still fuse ewise into it + OUT_ELEMWISE_FUSABLE = 4 + # Not fusable opaque op + OPAQUE = 8 + +_register_compute = tvm.get_global_func("nnvm._register_compute") +_register_schedule = tvm.get_global_func("nnvm._register_schedule") +_register_pattern = tvm.get_global_func("nnvm._register_pattern") +_register_alter_op_layout = tvm.get_global_func("nnvm.compiler._register_alter_op_layout") + +def register_compute(op_name, f=None, level=10): + """Register compute function for operator + + Parameters + ---------- + op_name : str + The name of operator + + f : function + The schedule function + + level : int + The priority level + + Returns + ------- + fregister : function + Register function if f is not specified. + """ + def register(myf): + """internal register function""" + _register_compute(op_name, myf, level) + return myf + return register(f) if f else register + + +def register_schedule(op_name, f=None, level=10): + """Register schedule function for operator + + Parameters + ---------- + op_name : str + The name of operator + + f : function + The schedule function + + level : int + The priority level + + Returns + ------- + fregister : function + Register function if f is not specified. + """ + def register(myf): + """internal register function""" + _register_schedule(op_name, myf, level) + return myf + return register(f) if f else register + + +def register_pattern(op_name, pattern, level=10): + """Register pattern code for operator + + Parameters + ---------- + op_name : str + The name of operator + + pattern : int + The pattern code. + + level : int + The priority level + """ + _register_pattern(op_name, pattern, level) + + +def register_alter_op_layout(op_name, f=None, level=10): + """Register alter layout function for operator + + Parameters + ---------- + op_name : str + The name of operator + + f : function + The schedule function + + level : int + The priority level + + Returns + ------- + fregister : function + Register function if f is not specified. + """ + def register(myf): + """internal register function""" + _register_alter_op_layout(op_name, myf, level) + return myf + return register(f) if f else register diff --git a/nnvm/python/nnvm/top/tensor.py b/nnvm/python/nnvm/top/tensor.py new file mode 100644 index 000000000000..9f12e3245e3a --- /dev/null +++ b/nnvm/python/nnvm/top/tensor.py @@ -0,0 +1,306 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Tensor ops""" +from __future__ import absolute_import + +import tvm +import topi +import topi.cuda +from . 
import registry as reg +from .registry import OpPattern + +def _schedule_injective(_, outs, target): + """Generic schedule for binary bcast""" + with tvm.target.create(target): + return topi.generic.schedule_injective(outs) + +def _compute_binary_scalar(f): + """auxiliary function""" + @tvm.tag_scope(topi.tag.ELEMWISE) + def _compute(attrs, x, _): + x = x[0] + scalar = attrs.get_float("scalar") + scalar = tvm.const(scalar, x.dtype) + return tvm.compute(x.shape, lambda *i: f(x(*i), scalar)) + return _compute + + +def _compute_unary(f): + """auxiliary function""" + def _compute(attrs, x, _): + return f(x[0]) + return _compute + + +def _compute_binary(f): + """auxiliary function""" + def _compute(attrs, x, _): + return f(x[0], x[1]) + return _compute + + +_fschedule_injective = tvm.convert(_schedule_injective) +_fschedule_broadcast = _fschedule_injective +_fschedule_elemwise = _fschedule_injective + +# Assign requires special treatment in the compiler +# The compute and schedule are designed as +# copy from rhs to output +reg.register_pattern("_assign", OpPattern.OPAQUE) +reg.register_schedule("_assign", _fschedule_broadcast) + +# copy +reg.register_pattern("copy", OpPattern.ELEMWISE) +reg.register_schedule("copy", _fschedule_broadcast) + +# cast +reg.register_pattern("cast", OpPattern.ELEMWISE) +reg.register_schedule("cast", _fschedule_broadcast) + +# floor +reg.register_pattern("floor", OpPattern.ELEMWISE) +reg.register_schedule("floor", _fschedule_broadcast) + +# ceil +reg.register_pattern("ceil", OpPattern.ELEMWISE) +reg.register_schedule("ceil", _fschedule_broadcast) + +# round +reg.register_pattern("round", OpPattern.ELEMWISE) +reg.register_schedule("round", _fschedule_broadcast) + +# abs +reg.register_pattern("abs", OpPattern.ELEMWISE) +reg.register_schedule("abs", _fschedule_broadcast) + +# trunc +reg.register_pattern("trunc", OpPattern.ELEMWISE) +reg.register_schedule("trunc", _fschedule_broadcast) + +# exp +reg.register_pattern("exp", OpPattern.ELEMWISE) +reg.register_schedule("exp", _fschedule_broadcast) + +# sqrt +reg.register_pattern("sqrt", OpPattern.ELEMWISE) +reg.register_schedule("sqrt", _fschedule_broadcast) + +# log +reg.register_pattern("log", OpPattern.ELEMWISE) +reg.register_schedule("log", _fschedule_broadcast) + +# tanh +reg.register_pattern("tanh", OpPattern.ELEMWISE) +reg.register_schedule("tanh", _fschedule_broadcast) + +# negative +reg.register_pattern("negative", OpPattern.ELEMWISE) +reg.register_schedule("negative", _fschedule_broadcast) + +# sigmoid +reg.register_pattern("sigmoid", OpPattern.ELEMWISE) +reg.register_schedule("sigmoid", _fschedule_broadcast) + +# add_scalar +reg.register_pattern("__add_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__add_scalar__", _fschedule_broadcast) + +# sub_calar +reg.register_pattern("__sub_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__sub_scalar__", _fschedule_broadcast) + +# rsub_scalar +reg.register_pattern("__rsub_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__rsub_scalar__", _fschedule_broadcast) + +# mul_scalar +reg.register_pattern("__mul_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__mul_scalar__", _fschedule_broadcast) + +# div_scalar +reg.register_pattern("__div_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__div_scalar__", _fschedule_broadcast) + +# rdiv_scalar +reg.register_pattern("__rdiv_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__rdiv_scalar__", _fschedule_broadcast) + +# pow_scalar +reg.register_pattern("__pow_scalar__", OpPattern.ELEMWISE) 
+reg.register_schedule("__pow_scalar__", _fschedule_broadcast) + +# rpow_scalar +reg.register_pattern("__rpow_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__rpow_scalar__", _fschedule_broadcast) + +# lshift_scalar +reg.register_pattern("__lshift_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__lshift_scalar__", _fschedule_broadcast) + +# rshift_scalar +reg.register_pattern("__rshift_scalar__", OpPattern.ELEMWISE) +reg.register_schedule("__rshift_scalar__", _fschedule_broadcast) + +# logical_and +reg.register_pattern("logical_and", OpPattern.ELEMWISE) +reg.register_schedule("logical_and", _fschedule_broadcast) + +# logical_or +reg.register_pattern("logical_or", OpPattern.ELEMWISE) +reg.register_schedule("logical_or", _fschedule_broadcast) + +# logical_not +reg.register_pattern("logical_not", OpPattern.ELEMWISE) +reg.register_schedule("logical_not", _fschedule_broadcast) + +# elemwise_add +reg.register_pattern("elemwise_add", OpPattern.BROADCAST) +reg.register_schedule("elemwise_add", _fschedule_broadcast) + +# elemwise_sub +reg.register_pattern("elemwise_sub", OpPattern.BROADCAST) +reg.register_schedule("elemwise_sub", _fschedule_broadcast) + +# elemwise_mul +reg.register_pattern("elemwise_mul", OpPattern.BROADCAST) +reg.register_schedule("elemwise_mul", _fschedule_broadcast) + +# elemwise_div +reg.register_pattern("elemwise_div", OpPattern.BROADCAST) +reg.register_schedule("elemwise_div", _fschedule_broadcast) + +# elemwise_mod +reg.register_pattern("elemwise_mod", OpPattern.BROADCAST) +reg.register_schedule("elemwise_mod", _fschedule_broadcast) + +# elemwise_pow +reg.register_pattern("elemwise_pow", OpPattern.BROADCAST) +reg.register_schedule("elemwise_pow", _fschedule_broadcast) + +# broadcast_add +reg.register_pattern("broadcast_add", OpPattern.BROADCAST) +reg.register_schedule("broadcast_add", _fschedule_broadcast) + +# broadcast_sub +reg.register_pattern("broadcast_sub", OpPattern.BROADCAST) +reg.register_schedule("broadcast_sub", _fschedule_broadcast) + +# broadcast_mul +reg.register_pattern("broadcast_mul", OpPattern.BROADCAST) +reg.register_schedule("broadcast_mul", _fschedule_broadcast) + +# broadcast_div +reg.register_pattern("broadcast_div", OpPattern.BROADCAST) +reg.register_schedule("broadcast_div", _fschedule_broadcast) + +# broadcast mod +reg.register_pattern("broadcast_mod", OpPattern.BROADCAST) +reg.register_schedule("broadcast_mod", _fschedule_broadcast) + +# broadcast max +reg.register_pattern("broadcast_max", OpPattern.BROADCAST) +reg.register_schedule("broadcast_max", _fschedule_broadcast) + +# broadcast min +reg.register_pattern("broadcast_min", OpPattern.BROADCAST) +reg.register_schedule("broadcast_min", _fschedule_broadcast) + +# broadcast pow +reg.register_pattern("broadcast_pow", OpPattern.BROADCAST) +reg.register_schedule("broadcast_pow", _fschedule_broadcast) + +# broadcast left_shift +reg.register_pattern("broadcast_left_shift", OpPattern.BROADCAST) +reg.register_schedule("broadcast_left_shift", _fschedule_broadcast) + +# broadcast right_shift +reg.register_pattern("broadcast_right_shift", OpPattern.BROADCAST) +reg.register_schedule("broadcast_right_shift", _fschedule_broadcast) + +# broadcast greater +reg.register_pattern("broadcast_greater", OpPattern.BROADCAST) +reg.register_schedule("broadcast_greater", _fschedule_broadcast) + +# broadcast less +reg.register_pattern("broadcast_less", OpPattern.BROADCAST) +reg.register_schedule("broadcast_less", _fschedule_broadcast) + +# broadcast equal +reg.register_pattern("broadcast_equal", 
OpPattern.BROADCAST) +reg.register_schedule("broadcast_equal", _fschedule_broadcast) + +# broadcast not_equal +reg.register_pattern("broadcast_not_equal", OpPattern.BROADCAST) +reg.register_schedule("broadcast_not_equal", _fschedule_broadcast) + +# broadcast greater_equal +reg.register_pattern("broadcast_greater_equal", OpPattern.BROADCAST) +reg.register_schedule("broadcast_greater_equal", _fschedule_broadcast) + +# broadcast less_equal +reg.register_pattern("broadcast_less_equal", OpPattern.BROADCAST) +reg.register_schedule("broadcast_less_equal", _fschedule_broadcast) + +# broadcast_to +reg.register_pattern("broadcast_to", OpPattern.BROADCAST) +reg.register_schedule("broadcast_to", _fschedule_broadcast) + +# clip +reg.register_pattern("clip", OpPattern.ELEMWISE) +reg.register_schedule("clip", _fschedule_elemwise) + +# elemwise sum +reg.register_pattern("elemwise_sum", OpPattern.ELEMWISE) +reg.register_schedule("elemwise_sum", _fschedule_elemwise) + +# full +reg.register_pattern("full", OpPattern.OUT_ELEMWISE_FUSABLE) +reg.register_schedule("full", _fschedule_elemwise) + +# full_like +reg.register_pattern("full_like", OpPattern.ELEMWISE) +reg.register_schedule("full_like", _fschedule_elemwise) + +# zeros +reg.register_pattern("zeros", OpPattern.OUT_ELEMWISE_FUSABLE) +reg.register_schedule("zeros", _fschedule_elemwise) + +# zeros_like +reg.register_pattern("zeros_like", OpPattern.ELEMWISE) +reg.register_schedule("zeros_like", _fschedule_elemwise) + +# ones +reg.register_pattern("ones", OpPattern.OUT_ELEMWISE_FUSABLE) +reg.register_schedule("ones", _fschedule_elemwise) + +# ones_like +reg.register_pattern("ones_like", OpPattern.ELEMWISE) +reg.register_schedule("ones_like", _fschedule_elemwise) + +# greater +reg.register_pattern("greater", OpPattern.ELEMWISE) +reg.register_schedule("greater", _fschedule_elemwise) + +# less +reg.register_pattern("less", OpPattern.ELEMWISE) +reg.register_schedule("less", _fschedule_elemwise) + +# block_grad +reg.register_compute("block_grad", _compute_unary(topi.identity)) +reg.register_pattern("block_grad", OpPattern.ELEMWISE) +reg.register_schedule("block_grad", _fschedule_elemwise) diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py new file mode 100644 index 000000000000..e9051309734a --- /dev/null +++ b/nnvm/python/nnvm/top/transform.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Tensor transformation ops""" +from __future__ import absolute_import + +import tvm +import topi +from .tensor import _fschedule_broadcast, _fschedule_injective +from . 
import registry as reg +from .registry import OpPattern + +# expand_dims +reg.register_pattern("expand_dims", OpPattern.BROADCAST) +reg.register_schedule("expand_dims", _fschedule_broadcast) + +# expand_like +@reg.register_compute("expand_like") +def compute_expand_like(attrs, inputs, _): + """Compute definition of expand_like""" + if len(inputs[0].shape) == len(inputs[1].shape): + # If the number of dimensions is not changed then it is just a broadcasting + return topi.broadcast_to(inputs[0], inputs[1].shape) + + exclude = attrs.get_bool("exclude") + axis = attrs.get_int_tuple("axis") + if exclude: + exclude_axis = (axis,) if isinstance(axis, int) else axis + axis = [] + for item in range(len(inputs[1].shape)): + if item not in exclude_axis: + axis.append(item) + axis = tuple(axis) + + return topi.transform.expand_like(inputs[0], inputs[1], axis) +reg.register_pattern("expand_like", OpPattern.BROADCAST) +reg.register_schedule("expand_like", _fschedule_broadcast) + +# reshape_like +@reg.register_compute("reshape_like") +def compute_reshape_like(attrs, inputs, out_info): + """Compute definition of reshape_like""" + return topi.reshape(inputs[0], inputs[1].shape) +reg.register_pattern("reshape_like", OpPattern.INJECTIVE) +reg.register_schedule("reshape_like", _fschedule_injective) + +# transpose +reg.register_pattern("transpose", OpPattern.INJECTIVE) +reg.register_schedule("transpose", _fschedule_injective) + +# flip +reg.register_pattern("flip", OpPattern.INJECTIVE) +reg.register_schedule("flip", _fschedule_injective) + +# reshape +reg.register_pattern("reshape", OpPattern.INJECTIVE) +reg.register_schedule("reshape", _fschedule_injective) + +# squeeze +reg.register_pattern("squeeze", OpPattern.INJECTIVE) +reg.register_schedule("squeeze", _fschedule_injective) + +# concatenate +@reg.register_schedule("concatenate") +def schedule_concatenate(_, outs, target): + """Schedule definition of concatenate""" + with tvm.target.create(target): + return topi.generic.schedule_concatenate(outs) + +reg.register_pattern("concatenate", OpPattern.INJECTIVE) + +# split +reg.register_pattern("split", OpPattern.INJECTIVE) +reg.register_schedule("split", _fschedule_injective) + +# take +reg.register_pattern("take", OpPattern.INJECTIVE) +reg.register_schedule("take", _fschedule_injective) + +# strided_slice +reg.register_pattern("strided_slice", OpPattern.INJECTIVE) +reg.register_schedule("strided_slice", _fschedule_injective) + +# slice_like +reg.register_pattern("slice_like", OpPattern.INJECTIVE) +reg.register_schedule("slice_like", _fschedule_injective) + +# where +reg.register_pattern("where", OpPattern.INJECTIVE) +reg.register_schedule("where", _fschedule_injective) + +# gather_nd +reg.register_pattern("gather_nd", OpPattern.INJECTIVE) +reg.register_schedule("gather_nd", _fschedule_injective) diff --git a/nnvm/python/setup.py b/nnvm/python/setup.py new file mode 100644 index 000000000000..f89ac33a2e39 --- /dev/null +++ b/nnvm/python/setup.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import sys
+from setuptools import find_packages
+from distutils.core import setup
+
+def config_cython():
+    # Temporarily disable Cython for now,
+    # as NNVM uses a local DLL build.
+    return []
+    try:
+        from Cython.Build import cythonize
+        from distutils.extension import Extension
+        if sys.version_info >= (3, 0):
+            subdir = "_cy3"
+        else:
+            subdir = "_cy2"
+        ret = []
+        path = "nnvm/cython"
+
+        for fn in os.listdir(path):
+            if not fn.endswith(".pyx"):
+                continue
+            ret.append(Extension(
+                "nnvm/%s/%s" % (subdir, fn[:-4]),
+                ["nnvm/cython/%s" % fn],
+                include_dirs=["../include/"],
+                language="c++"))
+        return cythonize(ret)
+    except ImportError:
+        print("Cython is not installed, will compile without cython module")
+        return []
+
+# We cannot import `libinfo.py` in setup.py directly, since __init__.py
+# will be invoked, which introduces dependencies.
+CURRENT_DIR = os.path.dirname(__file__)
+libinfo_py = os.path.join(CURRENT_DIR, './nnvm/libinfo.py')
+libinfo = {'__file__': libinfo_py}
+exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo)
+
+__version__ = libinfo['__version__']
+if not os.getenv('CONDA_BUILD'):
+    LIB_PATH = libinfo['find_lib_path']()
+    _, LIB_NAME = os.path.split(LIB_PATH[0])
+    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    rpath = os.path.relpath(LIB_PATH[0], curr_path)
+    setup_kwargs = dict(
+        include_package_data=True,
+        data_files=[('nnvm', [rpath])]
+    )
+else:
+    setup_kwargs = {}
+
+setup(name='nnvm',
+      version=__version__,
+      description="NNVM: Open Compiler for AI Frameworks",
+      zip_safe=False,
+      install_requires=[
+          'numpy'
+      ],
+      packages=find_packages(),
+      url='https://github.com/dmlc/nnvm',
+      **setup_kwargs)
diff --git a/nnvm/src/README.md b/nnvm/src/README.md
index 64fd1371719a..c1b66260625e 100644
--- a/nnvm/src/README.md
+++ b/nnvm/src/README.md
@@ -23,3 +23,8 @@ The following components are operator invariant.
 - c_api: NNVM C API
 - core: NNVM core data structure
 - pass: NNVM pass
+
+The following components implement the generic NNVM compiler and define the tensor operator set:
+
+- top: NNVM core tensor operators
+- compiler: NNVM compiler toolchain
diff --git a/nnvm/src/compiler/alter_op_layout.cc b/nnvm/src/compiler/alter_op_layout.cc
new file mode 100644
index 000000000000..abc0022c2a79
--- /dev/null
+++ b/nnvm/src/compiler/alter_op_layout.cc
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file alter_op_layout.cc + * \brief Alter the operator layouts. Keep inferred layouts (if any) from previous stages. + * e.g., convolution may calculates faster with NCHW16c layout. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "compile_engine.h" +#include "graph_transform.h" + +namespace nnvm { +namespace compiler { +namespace { + +tvm::Array GetTensorInfo(const IndexedGraph& idx_graph, + const uint32_t nid, + const ShapeVector& shape_vec, + const DTypeVector& dtype_vec) { + tvm::Array vec; + for (uint32_t i = 0; i < idx_graph[nid].source->num_outputs(); ++i) { + tvm::Array shape; + for (int64_t x : shape_vec[idx_graph.entry_id(nid, i)]) { + CHECK_LE(x, static_cast(std::numeric_limits::max())); + shape.push_back(tvm::make_const(tvm::Int(32), x)); + } + vec.push_back(tvm::placeholder( + shape, GetTVMType(dtype_vec[idx_graph.entry_id(nid, i)]))); + } + return vec; +} + +Graph AlterOpLayout(const Graph& src) { + static auto& falter_op_layout = + Op::GetAttr("FTVMAlterOpLayout"); + + const ShapeVector& shape_vec = src.GetAttr("shape"); + const DTypeVector& dtype_vec = src.GetAttr("dtype"); + const IndexedGraph& idx_graph = src.indexed_graph(); + + std::vector > in_layouts_of_node(idx_graph.num_nodes()); + std::vector > out_layouts_of_node(idx_graph.num_nodes()); + std::unordered_map unchanged_nodes; + + if (src.HasAttr("layout")) { + // record layouts so that LayoutTransform pass can fix layouts correctly, + // e.g., conv2d can be replaced by some contrib implement + // whose layout is different from the original one + // (which was imported from a model file). + const auto& layouts = src.GetAttr >("layout"); + for (uint32_t nid = 0; nid < idx_graph.num_nodes(); ++nid) { + const auto &inode = idx_graph[nid]; + // record input layouts for all nodes, + // while replaced nodes will ignore the records here and have undefined input layouts. + std::vector in_layout; + for (const auto& e : inode.inputs) { + in_layout.emplace_back(layouts[idx_graph.entry_id(e)]); + } + in_layouts_of_node[nid] = in_layout; + + std::vector out_layout; + for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { + out_layout.emplace_back(layouts[idx_graph.entry_id(nid, i)]); + } + out_layouts_of_node[nid] = out_layout; + } + } + + auto transform = [&](uint32_t nid, + const NodePtr& n, + std::vector* ret) { + nnvm::compiler::FTVMAlterOpLayout fn_alter_op_layout = + falter_op_layout.get(n->op(), nullptr); + if (fn_alter_op_layout == nullptr) { + // will restore the original input layouts later. + unchanged_nodes[n.get()] = nid; + return false; + } + + // construct parameters for registered function + std::vector op_inputs; + tvm::Array tensor_infos; + CHECK_EQ(n->num_inputs(), idx_graph[nid].inputs.size()); + for (uint32_t i = 0; i < n->num_inputs(); ++i) { + const nnvm::NodeEntry& input = n->inputs[i]; + // input operator + Symbol op_input; + op_input.outputs.push_back(input); + op_inputs.push_back(op_input); + + // input tinfo, extract from the original graph + // because it was where infer_shape & infer_type applied. + tvm::Array op_output_tinfos = + GetTensorInfo(idx_graph, idx_graph[nid].inputs[i].node_id, + shape_vec, dtype_vec); + tensor_infos.push_back(op_output_tinfos[input.index]); + } + // callback registered function to get a new operator. 
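+    // Note: the registered callback receives the node attributes, the
+    // grouped input symbols and their input tensor placeholders, and
+    // writes a replacement expression into `op`; it returns true only
+    // when it actually rewrote the operator.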
+ Symbol op; + bool do_alter = + fn_alter_op_layout(n->attrs, Symbol::CreateGroup(op_inputs), tensor_infos, &op); + + if (do_alter) { + *ret = op.outputs; + } else { + // will restore the original input layouts later. + unchanged_nodes[n.get()] = nid; + } + return do_alter; + }; + + Graph ret = nnvm::compiler::GraphTransform(src, transform); + + if (src.HasAttr("layout")) { + // restore the layouts to return graph + const auto& ret_idx = ret.indexed_graph(); + std::vector ret_layouts(ret_idx.num_node_entries(), Layout::Undef()); + for (uint32_t nid = 0; nid < ret_idx.num_nodes(); ++nid) { + const auto& inode = ret_idx[nid]; + if (unchanged_nodes.count(inode.source)) { + const std::vector& in_layouts = + in_layouts_of_node[unchanged_nodes[inode.source]]; + for (uint32_t i = 0; i < inode.inputs.size(); ++i) { + const auto& e = inode.inputs[i]; + ret_layouts[ret_idx.entry_id(e)] = in_layouts[i]; + } + const std::vector& out_layouts = + out_layouts_of_node[unchanged_nodes[inode.source]]; + for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { + ret_layouts[ret_idx.entry_id(nid, i)] = out_layouts[i]; + } + } + } + + // cannot call indexed_graph() before return the origin Graph, + // thus create a new one. + nnvm::Graph new_ret; + new_ret.outputs = ret.outputs; + new_ret.attrs["layout"] = std::make_shared(std::move(ret_layouts)); + return new_ret; + } + + return ret; +} + +// register pass +NNVM_REGISTER_PASS(AlterOpLayout) +.set_body(AlterOpLayout) +.set_change_graph(true); + +} // namespace +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc index 3fe10b53c4a2..c2c0aa82b902 100644 --- a/nnvm/src/compiler/compile_engine.cc +++ b/nnvm/src/compiler/compile_engine.cc @@ -47,52 +47,52 @@ using namespace tvm; * \param type the tvm type. * \return corresponding DLDataType */ -int GetTypeFlag(tvm::DataType type) { - if (type == tvm::DataType::Float(32)) return 0; - if (type == tvm::DataType::Float(64)) return 1; - if (type == tvm::DataType::Float(16)) return 2; - if (type == tvm::DataType::UInt(8)) return 3; - if (type == tvm::DataType::Int(32)) return 4; - if (type == tvm::DataType::Int(8)) return 5; - if (type == tvm::DataType::Int(64)) return 6; - if (type == tvm::DataType::Int(16)) return 7; - if (type == tvm::DataType::UInt(16)) return 8; - if (type == tvm::DataType::UInt(32)) return 9; - if (type == tvm::DataType::UInt(64)) return 10; - if (type == tvm::DataType::UInt(1)) return 11; +int GetTypeFlag(tvm::Type type) { + if (type == tvm::Float(32)) return 0; + if (type == tvm::Float(64)) return 1; + if (type == tvm::Float(16)) return 2; + if (type == tvm::UInt(8)) return 3; + if (type == tvm::Int(32)) return 4; + if (type == tvm::Int(8)) return 5; + if (type == tvm::Int(64)) return 6; + if (type == tvm::Int(16)) return 7; + if (type == tvm::UInt(16)) return 8; + if (type == tvm::UInt(32)) return 9; + if (type == tvm::UInt(64)) return 10; + if (type == tvm::UInt(1)) return 11; LOG(FATAL) << "cannot convert " << type; return 0; } // convert from type flag to tvm type. 
-DataType GetTVMType(int type_flag) { +Type GetTVMType(int type_flag) { switch (type_flag) { case 0: - return tvm::DataType::Float(32); + return tvm::Float(32); case 1: - return tvm::DataType::Float(64); + return tvm::Float(64); case 2: - return tvm::DataType::Float(16); + return tvm::Float(16); case 3: - return tvm::DataType::UInt(8); + return tvm::UInt(8); case 4: - return tvm::DataType::Int(32); + return tvm::Int(32); case 5: - return tvm::DataType::Int(8); + return tvm::Int(8); case 6: - return tvm::DataType::Int(64); + return tvm::Int(64); case 7: - return tvm::DataType::Int(16); + return tvm::Int(16); case 8: - return tvm::DataType::UInt(16); + return tvm::UInt(16); case 9: - return tvm::DataType::UInt(32); + return tvm::UInt(32); case 10: - return tvm::DataType::UInt(64); + return tvm::UInt(64); case 11: - return tvm::DataType::UInt(1); + return tvm::UInt(1); default: LOG(FATAL) << "unknown type_flag=" << type_flag; - return DataType::Float(32); + return Float(32); } } @@ -218,7 +218,7 @@ class CompileEngine { Array shape; for (int64_t x : shape_vec[idx.entry_id(nid, i)]) { CHECK_LE(x, static_cast(std::numeric_limits::max())); - shape.push_back(make_const(DataType::Int(32), x)); + shape.push_back(make_const(Int(32), x)); } out_info.push_back( placeholder(shape, diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h new file mode 100644 index 000000000000..8151f6ced478 --- /dev/null +++ b/nnvm/src/compiler/compile_engine.h @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file compile_engine.h + * \brief Internal engine to compile a subgraph fragment and cache compilation. + */ +#ifndef NNVM_COMPILER_COMPILE_ENGINE_H_ +#define NNVM_COMPILER_COMPILE_ENGINE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "graph_hash.h" + +namespace nnvm { +namespace compiler { + +/*! \brief A TVM Node to represent compiled graph function */ +struct GraphFuncNode : public tvm::Node { + /* \brief compiled target */ + std::string target; + /*! \brief Function name */ + std::string func_name; + /* \brief The inputs to the function */ + tvm::Array inputs; + /* \brief The outputs to the function */ + tvm::Array outputs; + /*! \brief The lowered functions */ + tvm::Array funcs; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("target", &target); + v->Visit("func_name", &func_name); + v->Visit("inputs", &inputs); + v->Visit("outputs", &outputs); + v->Visit("funcs", &funcs); + } + + static constexpr const char* _type_key = "GraphFunc"; + TVM_DECLARE_NODE_TYPE_INFO(GraphFuncNode, tvm::Node); +}; + +TVM_DEFINE_NODE_REF(GraphFunc, GraphFuncNode); + +/*! 
\brief Cache Entry in the graph */
+struct GraphCacheEntryNode : public tvm::Node {
+  /*! \brief The graph function */
+  GraphFunc graph_func;
+  /*! \brief Usage statistics */
+  int use_count{0};
+  /*! \brief Index of the master node for calling schedule */
+  int master_idx;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("graph_func", &graph_func);
+    v->Visit("use_count", &use_count);
+    v->Visit("master_idx", &master_idx);
+  }
+  static constexpr const char* _type_key = "GraphCacheEntry";
+  TVM_DECLARE_NODE_TYPE_INFO(GraphCacheEntryNode, tvm::Node);
+};
+
+class GraphCacheEntry : public ::tvm::NodeRef {
+ public:
+  GraphCacheEntry() {}
+  explicit GraphCacheEntry(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {}
+  GraphCacheEntryNode* operator->() {
+    return static_cast<GraphCacheEntryNode*>(get_mutable());
+  }
+  using ContainerType = GraphCacheEntryNode;
+};
+
+/*!
+ * \brief Call compile engine to lower a graph with given inputs.
+ *
+ * \param graph The graph to be compiled
+ * \param inputs The input specification.
+ * \param target The build target
+ * \param master_idx The index of master node for calling schedule
+ *
+ * \return func A lowered tvm function.
+ */
+GraphFunc GraphLower(Graph graph,
+                     const Array<tvm::Tensor>& inputs,
+                     const std::string& target,
+                     int master_idx);
+
+/*!
+ * \brief Get type flag from TVM Type
+ *
+ * \param type the tvm type
+ * \return the corresponding type flag
+ */
+int GetTypeFlag(tvm::Type type);
+
+/*!
+ * \brief Get TVM Type from type flag
+ *
+ * \param type_flag the type flag
+ * \return corresponding TVM type
+ */
+tvm::Type GetTVMType(int type_flag);
+
+}  // namespace compiler
+}  // namespace nnvm
+
+#endif  // NNVM_COMPILER_COMPILE_ENGINE_H_
diff --git a/nnvm/src/compiler/fold_scale_axis.cc b/nnvm/src/compiler/fold_scale_axis.cc
new file mode 100644
index 000000000000..6e5e73788c4c
--- /dev/null
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -0,0 +1,602 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file fold_scale_axis.cc
+ * \brief Fold scaling parameter of axis into weight of conv/dense
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include "pattern_util.h"
+#include "graph_transform.h"
+
+namespace nnvm {
+namespace compiler {
+
+enum FoldScaleKind {
+  // No folding is applied
+  kNone,
+  // The folding decision is pending, we can fold on a state.
+  kPending,
+  // The original operator that contains the scale.
+  kProvider,
+  // The final consumer of axis scale using multiply.
+  // Likely a conv or dense operator.
+  kMulConsumer,
+  // The final consumer of axis scale using division
+  kDivConsumer
+};
+
+struct FoldChainInfo {
+  // Entry kind
+  FoldScaleKind kind{kNone};
+  // The output axis to be folded
+  int axis{0};
+  // Source node in the fold chain
+  int source{0};
+};
+
+// The entry of a folding chain on which
+// we should perform folding
+struct FoldChainEntry {
+  // Fold information
+  FoldChainInfo info;
+  // Number of outgoing forks
+  // in forward propagation.
+  int fork_count{0};
+  // Following field only used by provider.
+  // The input index
+  int fold_input_index{1};
+  // The scale entry
+  NodeEntry scale_entry;
+};
+
+// Try to propagate axis scaling backward,
+// given that we know the status of the current fold axis.
+// Returns whether the signal is consumed.
+using FScaleAxisBackward = std::function<
+  bool(const NodeAttrs& attrs,
+       const std::vector<TShape>& in_shape,
+       const std::vector<TShape>& out_shape,
+       const FoldChainInfo& out_info,
+       std::vector<FoldChainInfo>* in_info)>;
+
+
+// Try to propagate axis scaling forward,
+// given that we know the status of one of its inputs to be pending;
+// also update the other input info.
+// Returns whether the signal is consumed.
+using FScaleAxisForward = std::function<
+  bool(const NodeAttrs& attrs,
+       const std::vector<TShape>& in_shape,
+       const std::vector<TShape>& out_shape,
+       std::vector<FoldChainInfo>* in_info,
+       FoldChainInfo* out_info)>;
+
+
+// Detect whether an axis scaling is happening
+bool DetectScaleAxis(const IndexedGraph& idx,
+                     uint32_t nid,
+                     const ShapeVector& shape_vec,
+                     const std::vector<uint32_t>& ref_count,
+                     bool is_forward,
+                     std::vector<FoldChainEntry>* chain) {
+  const IndexedGraph::Node& inode = idx[nid];
+  static const Op* bcast_mul = Op::Get("broadcast_mul");
+  static const Op* expand_dims = Op::Get("expand_dims");
+  if (inode.source->op() != bcast_mul) return false;
+  const TShape& oshape = shape_vec[idx.entry_id(nid, 0)];
+  CHECK_NE(oshape.ndim(), 0);
+  if (oshape.ndim() <= 1) return false;
+  for (int i = 0; i < 2; ++i) {
+    const IndexedGraph::NodeEntry& a = inode.inputs[i];
+    const IndexedGraph::NodeEntry& b = inode.inputs[1 - i];
+    std::pair<int, int> axis =
+        MatchBroadcast1DAxis(oshape, shape_vec[idx.entry_id(a)]);
+    if (axis.first != -1 &&
+        shape_vec[idx.entry_id(b)] == oshape) {
+      if (ref_count[a.node_id] != 1) return false;
+      if (is_forward && ref_count[nid] != 1) return false;
+      if (!is_forward && ref_count[b.node_id] != 1) return false;
+      const IndexedGraph::Node& anode = idx[a.node_id];
+      // mark the current entry.
+      FoldChainEntry& e = (*chain)[nid];
+      if (anode.source->is_variable()) {
+        e.fold_input_index = 1 - i;
+        e.scale_entry = inode.source->inputs[1 - i];
+      } else if (anode.source->op() == expand_dims &&
+                 shape_vec[idx.entry_id(anode.source->inputs[0])].ndim() == 1) {
+        e.fold_input_index = 1 - i;
+        e.scale_entry = anode.source->inputs[0];
+      } else {
+        return false;
+      }
+      e.info.axis = axis.first;
+      e.info.kind = kPending;
+      e.info.source = nid;
+      e.fork_count = 1;
+      // In the backward message passing
+      // we need to eagerly pass it to the input.
+      // In the forward message passing
+      // we will "pull" the message from input.
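+      // Example: for broadcast_mul(x, expand_dims(scale)), the scale
+      // entry and its broadcast axis are recorded on this multiply node
+      // so that a consumer such as conv2d/dense found later along the
+      // chain can absorb the multiplication.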
+ if (!is_forward) { + FoldChainEntry& enext = (*chain)[b.node_id]; + enext.info.axis = e.info.axis; + enext.info.kind = kPending; + enext.info.source = nid; + } + return true; + } + } + return false; +} + +Graph FoldScaleAxis(Graph src) { + // Operator pattern + static auto& fbackward = + nnvm::Op::GetAttr("FScaleAxisBackward"); + static auto& fforward = + nnvm::Op::GetAttr("FScaleAxisForward"); + const IndexedGraph& idx = src.indexed_graph(); + const ShapeVector& shape_vec = src.GetAttr("shape"); + std::vector ref_count = GetNodeRefCounts(idx); + std::vector bwd_chain(idx.num_nodes()); + std::vector fwd_chain(idx.num_nodes()); + // shape hint for the inference. + std::vector in_shape, out_shape; + + // perform backward folding. + for (uint32_t i = idx.num_nodes(); i != 0; --i) { + uint32_t nid = i - 1; + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + if (DetectScaleAxis(idx, nid, shape_vec, + ref_count, false, &bwd_chain)) continue; + if (bwd_chain[nid].info.kind != kPending) continue; + // if referred by multiple node, cannot do propagation + if (ref_count[nid] != 1 || !fbackward.count(inode.source->op())) { + bwd_chain[nid].info.kind = kNone; continue; + } + // get input shape and output shape. + in_shape.clear(); out_shape.clear(); + for (const IndexedGraph::NodeEntry& e : inode.inputs) { + in_shape.push_back(shape_vec[idx.entry_id(e)]); + } + for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { + out_shape.push_back(shape_vec[idx.entry_id(nid, i)]); + } + std::vector in_info(in_shape.size(), FoldChainInfo()); + bool consumed = fbackward[inode.source->op()]( + inode.source->attrs, + in_shape, + out_shape, + bwd_chain[nid].info, + &in_info); + CHECK_EQ(in_info.size(), in_shape.size()); + // propagate back. + bool can_prop = true; + for (size_t i = 0; i < in_info.size(); ++i) { + const IndexedGraph::NodeEntry& e = inode.inputs[i]; + if (ref_count[e.node_id] != 1 || + idx[e.node_id].source->num_outputs() != 1) { + can_prop = false; break; + } + } + if (!can_prop) continue; + for (size_t i = 0; i < in_info.size(); ++i) { + const IndexedGraph::NodeEntry& e = inode.inputs[i]; + bwd_chain[e.node_id].info = in_info[i]; + } + // mark consumed by making the source as provider. + if (consumed) { + bwd_chain[bwd_chain[nid].info.source].info.kind = kProvider; + } + } + + + // perform forward folding. + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + // skip scales that are already folded in backward. + if (bwd_chain[nid].info.kind == kProvider) continue; + if (DetectScaleAxis(idx, nid, shape_vec, + ref_count, true, &fwd_chain)) continue; + if (inode.source->num_outputs() != 1) continue; + // Do state update + // get input shape and output shape. 
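+    // A scale is only pulled forward when exactly one input entry
+    // carries a pending fold; otherwise the pending signal is dropped
+    // by the check below.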
+ std::vector in_info; + FoldChainInfo out_info; + int num_inpending = 0; + in_shape.clear(); out_shape.clear(); + for (const IndexedGraph::NodeEntry& e : inode.inputs) { + in_shape.push_back(shape_vec[idx.entry_id(e)]); + // input information + in_info.push_back(fwd_chain[e.node_id].info); + if (fwd_chain[e.node_id].info.kind == kPending) { + ++num_inpending; + } + } + for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { + out_shape.push_back(shape_vec[idx.entry_id(nid, i)]); + } + if (num_inpending != 1 || + !fforward.count(inode.source->op())) continue; + bool consumed = fforward[inode.source->op()]( + inode.source->attrs, + in_shape, + out_shape, + &in_info, + &out_info); + // update input info + for (size_t i = 0; i < in_info.size(); ++i) { + fwd_chain[inode.inputs[i].node_id].info = in_info[i]; + } + if (consumed) { + fwd_chain[nid].info = out_info; + for (size_t i = 0; i < in_info.size(); ++i) { + if (in_info[i].kind == kPending) { + if (--fwd_chain[in_info[i].source].fork_count == 0) { + fwd_chain[in_info[i].source].info.kind = kProvider; + } + } + } + } else { + // can propagate condition + if (inode.source->num_outputs() == 1) { + fwd_chain[nid].info = out_info; + if (out_info.kind == kPending) { + // When there is multiple reference to input + // every path have to be consumed + fwd_chain[out_info.source].fork_count += ref_count[nid] - 1; + } + } + } + } + + auto transform = [&](uint32_t nid, const NodePtr& n, std::vector* ret) { + NodeEntry rvalue = NodeEntry{n, 0, 0}; + { + // Backward chain + const FoldChainEntry& e = bwd_chain[nid]; + if (e.info.kind == kMulConsumer && + bwd_chain[e.info.source].info.kind == kProvider) { + const FoldChainEntry& se = bwd_chain[e.info.source]; + CHECK_EQ(n->num_outputs(), 1); + NodeEntry scale = ExpandBiasToMatchAxis( + se.scale_entry, + shape_vec[idx.entry_id(nid, 0)].ndim(), + shape_vec[idx.entry_id(se.scale_entry)].ndim(), + e.info.axis); + rvalue = MakeNode("broadcast_mul", n->attrs.name + "_sc", + {rvalue, scale}); + } else if (e.info.kind == kProvider) { + rvalue = n->inputs[e.fold_input_index]; + } + } + // Note that the value might get transformed twice if it + // folds value from both fwd and backward chain. + { + // forward chain + const FoldChainEntry& e = fwd_chain[nid]; + if (e.info.kind == kMulConsumer && + fwd_chain[e.info.source].info.kind == kProvider) { + const FoldChainEntry& se = fwd_chain[e.info.source]; + CHECK_EQ(n->num_outputs(), 1); + NodeEntry scale = ExpandBiasToMatchAxis( + se.scale_entry, + shape_vec[idx.entry_id(nid, 0)].ndim(), + shape_vec[idx.entry_id(se.scale_entry)].ndim(), + e.info.axis); + rvalue = MakeNode("broadcast_mul", n->attrs.name + "_sc", + {rvalue, scale}); + } else if (e.info.kind == kDivConsumer && + fwd_chain[e.info.source].info.kind == kProvider) { + const FoldChainEntry& se = fwd_chain[e.info.source]; + CHECK_EQ(n->num_outputs(), 1); + NodeEntry scale = ExpandBiasToMatchAxis( + se.scale_entry, + shape_vec[idx.entry_id(nid, 0)].ndim(), + shape_vec[idx.entry_id(se.scale_entry)].ndim(), + e.info.axis); + rvalue = MakeNode("broadcast_div", n->attrs.name + "_sc", + {rvalue, scale}); + } else if (e.info.kind == kProvider) { + rvalue = n->inputs[e.fold_input_index]; + } + } + if (rvalue.node == n) { + return false; + } else { + *ret = {rvalue}; + return true; + } + }; + return GraphTransform(src, transform); +} + +NNVM_REGISTER_PASS(FoldScaleAxis) +.set_body(FoldScaleAxis); + +// property registration. 
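+// Each registration below tells the pass how a pending scale moves
+// through one operator: FScaleAxisBackward pushes the fold state from
+// the output entry to the inputs, while FScaleAxisForward pulls it
+// from a pending input entry to the output.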
+bool ReluScaleAxisBackward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + const FoldChainInfo& out_info, + std::vector* in_axis) { + (*in_axis)[0] = out_info; + return false; +} + +bool ReluScaleAxisForward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + std::vector* in_info, + FoldChainInfo* out_info) { + *out_info = (*in_info)[0]; + return false; +} + +NNVM_REGISTER_OP(relu) +.set_attr("FScaleAxisBackward", ReluScaleAxisBackward); + +NNVM_REGISTER_OP(leaky_relu) +.set_attr("FScaleAxisBackward", ReluScaleAxisBackward); + +NNVM_REGISTER_OP(relu) +.set_attr("FScaleAxisForward", ReluScaleAxisForward); + +NNVM_REGISTER_OP(leaky_relu) +.set_attr("FScaleAxisForward", ReluScaleAxisForward); + +// property registration. +template +bool Pool2DBackward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + const FoldChainInfo& out_info, + std::vector* in_axis) { + const T& param = nnvm::get(attrs.parsed); + if (out_info.axis == 1 && param.layout == "NCHW") { + (*in_axis)[0] = out_info; + } + return false; +} + +template +bool Pool2DForward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + std::vector* in_info, + FoldChainInfo* out_info) { + const T& param = nnvm::get(attrs.parsed); + if ((*in_info)[0].axis == 1 && param.layout == "NCHW") { + *out_info = (*in_info)[0]; + } + return false; +} + +NNVM_REGISTER_OP(max_pool2d) +.set_attr("FScaleAxisBackward", Pool2DBackward); + +NNVM_REGISTER_OP(avg_pool2d) +.set_attr("FScaleAxisBackward", Pool2DBackward); + +NNVM_REGISTER_OP(max_pool2d) +.set_attr("FScaleAxisForward", Pool2DForward); + +NNVM_REGISTER_OP(avg_pool2d) +.set_attr("FScaleAxisForward", Pool2DForward); + + + +bool BroadcastAddSubScaleAxisBackward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + const FoldChainInfo& out_info, + std::vector* in_axis) { + if (out_info.kind != kPending) return false; + for (int i = 0; i < 2; ++i) { + std::pair m = MatchBroadcast1DAxis(out_shape[0], in_shape[1 - i]); + if (m.second != -1 && + in_shape[i] == out_shape[0] && + m.first == out_info.axis) { + (*in_axis)[i].kind = kPending; + (*in_axis)[i].axis = out_info.axis; + (*in_axis)[i].source = out_info.source; + (*in_axis)[1 - i].kind = kMulConsumer; + (*in_axis)[1 - i].axis = m.second; + (*in_axis)[1 - i].source = out_info.source; + return false; + } + } + return false; +} + +bool BroadcastAddSubScaleAxisForward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + std::vector* in_info, + FoldChainInfo* out_info) { + for (int i = 0; i < 2; ++i) { + if ((*in_info)[i].kind == kPending) { + std::pair m = MatchBroadcast1DAxis(out_shape[0], in_shape[1 - i]); + if (m.second != -1 && + in_shape[i] == out_shape[0] && + m.first == (*in_info)[i].axis) { + out_info->kind = kPending; + out_info->axis = m.first; + out_info->source = (*in_info)[i].source; + (*in_info)[1 - i].kind = kDivConsumer; + (*in_info)[1 - i].axis = m.second; + (*in_info)[1 - i].source = (*in_info)[i].source; + return false; + } + } + } + return false; +} + +NNVM_REGISTER_OP(broadcast_add) +.set_attr("FScaleAxisBackward", BroadcastAddSubScaleAxisBackward); + +NNVM_REGISTER_OP(broadcast_sub) +.set_attr("FScaleAxisBackward", BroadcastAddSubScaleAxisBackward); + +NNVM_REGISTER_OP(broadcast_add) +.set_attr("FScaleAxisForward", BroadcastAddSubScaleAxisForward); + +NNVM_REGISTER_OP(broadcast_sub) 
+.set_attr("FScaleAxisForward", BroadcastAddSubScaleAxisForward); + +bool Conv2DScaleAxisBackward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + const FoldChainInfo& out_info, + std::vector* in_axis) { + using top::Conv2DParam; + const Conv2DParam& param = nnvm::get(attrs.parsed); + if (out_info.kind != kPending) return false; + // only optimize for kernel layout OIHW for now + if (param.kernel_layout == "OIHW" && out_info.axis == 1) { + (*in_axis)[1].kind = kMulConsumer; + (*in_axis)[1].axis = 0; + (*in_axis)[1].source = out_info.source; + if (param.use_bias) { + (*in_axis)[2].kind = kMulConsumer; + (*in_axis)[2].axis = 0; + (*in_axis)[2].source = out_info.source; + } + return true; + } else { + return false; + } +} + +bool Conv2DScaleAxisForward( + const NodeAttrs& attrs, + const std::vector& in_shape, + const std::vector& out_shape, + std::vector* in_info, + FoldChainInfo* out_info) { + using top::Conv2DParam; + const Conv2DParam& param = nnvm::get(attrs.parsed); + if ((*in_info)[0].kind != kPending) return false; + // only optimize for nchw for now + if (param.kernel_layout == "OIHW" && (*in_info)[0].axis == 1) { + // Check whether it is depthwise conv2d + if (param.use_bias) { + CHECK_EQ(in_shape.size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape.size(), 2U) << "Input:[data, weight]"; + } + + auto dshape = in_shape.at(0); + CHECK_EQ(dshape.ndim(), 4U) << "Input data shape should be 4D"; + + // TODO(FrozenGene): Currently, we don't support conv2d's groups != in channels. + if (param.groups > 1 && dshape[1] != param.groups) { + LOG(WARNING) << "FoldScaleAxis optimization doesn't support conv2d " + << "with groups != in channels. We will skip FoldScaleAxis " + << "optimization for this op."; + return false; + } + + + // input channel equals to groups, which means depthwise conv2d + bool is_depthwise_conv2d = (dshape[1] == param.groups); + + // if it is depthwise convolution, the weight fold axis should along to axis 0. + // For example: + // data shape [1,54,63,127] weights shape [54,1,3,3], scale shape [54] + // depthwise convolution's weights shape means we have divided the data shape's channel + // to groups parties. Here, we divide 54 channels into 54 parties. Every part size is 1. + // weights shape's first dimision means how many parties we have divided (mapping to + // input shape's channel). So, in the depthwise convolution, we shouldn't do like + // traditional convolution(i.e. OIHW) + + // Backgroud of this algorithm: + + // Original Graph: + // Graph(%x, + // %in_scale, + // %weight, + // %bias, + // %out_scale) { + // %1 = __add_scalar__(%x, scalar='1') + // %3 = expand_dims(%in_scale, num_newaxis='2', axis='1') + // %4 = broadcast_mul(%1, %3) + // %7 = conv2d(%4, %weight, %bias, padding='(1, 1)', kernel_size='(3, 3)', channels='2') + // %8 = relu(%7) + // %10 = expand_dims(%out_scale, num_newaxis='2', axis='1') + // %11 = broadcast_mul(%8, %10) + // ret %11 + // } + + // Optimized Graph: + // Graph(%x, + // %weight, + // %out_scale, + // %in_scale, + // %bias) { + // %1 = __add_scalar__(%x, scalar='1') + // %4 = expand_dims(%out_scale, num_newaxis='3', axis='1') + // %5 = broadcast_mul(%weight, %4) + // %7 = expand_dims(%in_scale, num_newaxis='2', axis='1') + // %8 = broadcast_mul(%5, %7) + // %10 = broadcast_mul(%bias, %out_scale) + // %11 = conv2d(%1, %8, %10, padding='(1, 1)', kernel_size='(3, 3)', channels='2') + // %12 = relu(%11) + // ret %12 + // } + + // Conv2DScaleAxisForward will need in_scale. 
Conv2DScaleAxisBackward will need out_scale. + // in_scale will apply into input data's channel (in_channel). out_scale will apply in + // conv2d's result, which will apply in weight's output channel. + // So, default Conv2DScaleAxisForward will fold axis 1 (weights' input channel). + // Conv2DScaleAxisBackward will fold axis 0 (weights' output channel). + // But depthwise convolution is another story as said previously. + (*in_info)[1].kind = kMulConsumer; + (*in_info)[1].axis = is_depthwise_conv2d ? 0 : 1; + (*in_info)[1].source = (*in_info)[0].source; + return true; + } else { + return false; + } +} + +NNVM_REGISTER_OP(conv2d) +.set_attr("FScaleAxisBackward", Conv2DScaleAxisBackward); + +NNVM_REGISTER_OP(conv2d) +.set_attr("FScaleAxisForward", Conv2DScaleAxisForward); + +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc new file mode 100644 index 000000000000..1b4a8e117555 --- /dev/null +++ b/nnvm/src/compiler/graph_fuse.cc @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_fuse.cc + * \brief Fuse the operators together. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "graph_fuse.h" +#include "graph_runtime.h" +#include "pattern_util.h" + +namespace nnvm { +namespace compiler { +using namespace tvm; + +// Partition the graph into segments +// Each segment will be compiled into one operator. +// Also mark the property of the segment. +nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { + const IndexedGraph& idx = g.indexed_graph(); + int opt_level = 2; + if (g.attrs.count("opt_level") != 0) { + opt_level = g.MoveCopyAttr("opt_level"); + } + + // Get attributes from the graph + const ShapeVector& shape_vec = g.GetAttr("shape"); + + // Reference counter of each op node + // For now, always store result when an op is referred more than once. + std::vector ref_count = GetNodeRefCounts(idx); + for (const auto& e : idx.outputs()) { + // this line will realize all the outputs + ref_count[e.node_id] += 1; + } + // Pattern for the subgraph + PatternVec pattern_vec(idx.num_nodes(), kOpaque); + // Whether node can be fused to parent. + std::vector fuse_vec(idx.num_nodes(), FuseRule::kUknown); + // Master node id of fusion segment. 
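+  // The master of a fused segment is the node whose schedule drives
+  // code generation for the whole segment (e.g. the conv2d in a
+  // conv2d + relu group); -1 means no master has been chosen yet.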
+ std::vector master_vec(idx.num_nodes(), -1); + // Operator pattern + static auto& op_pattern = nnvm::Op::GetAttr("TOpPattern"); + + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) { + fuse_vec[nid] = FuseRule::kRealize; continue; + } + TOpPattern pt = op_pattern.get(inode.source->op(), kOpaque); + + if (pt <= kBroadcast) { + // Check if we can fuse to the master. + int chosen_master = -1; + bool ewise = inode.source->num_outputs() == 1; + bool mark_as_injective = false; + for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] == FuseRule::kUknown) { + TOpPattern ipt = pattern_vec[e.node_id]; + if (ipt != kElemWise) ewise = false; + if (ipt <= kBroadcast) { + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + } else if (ipt == kInjective) { + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + mark_as_injective = true; + } else if (ipt == kOutEWiseFusable && + chosen_master == -1 && + shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) { + chosen_master = master_vec[e.node_id]; + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + } else { + fuse_vec[e.node_id] = FuseRule::kRealize; + } + } + if (ewise) { + if (shape_vec[idx.entry_id(nid, 0)] != shape_vec[idx.entry_id(e)]) { + ewise = false; + } + } + } + master_vec[nid] = chosen_master; + if (chosen_master != -1) { + pt = kOutEWiseFusable; + } else if (mark_as_injective) { + pt = kInjective; + } else { + pt = ewise ? kElemWise : kBroadcast; + } + } else if (pt == kInjective || pt == kCommReduce) { + // Fuse to the comm reduce or injective + for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] == FuseRule::kUknown) { + TOpPattern ipt = pattern_vec[e.node_id]; + if (ipt <= kInjective) { + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + } else { + fuse_vec[e.node_id] = FuseRule::kRealize; + } + } + } + if (pt == kCommReduce) { + master_vec[nid] = nid; + } + } else { + // Realize + master_vec[nid] = nid; + for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] == FuseRule::kUknown) { + fuse_vec[e.node_id] = FuseRule::kRealize; + if (master_vec[e.node_id] == -1) { + master_vec[e.node_id] = e.node_id; + } + } + } + } + + pattern_vec[nid] = pt; + if (ref_count[nid] > 1 || opt_level < 1) { + fuse_vec[nid] = FuseRule::kRealize; + if (master_vec[nid] == -1) { + master_vec[nid] = nid; + } + } + } + + // Point to the group root id of each node. + GroupVec group_vec(idx.num_nodes(), -1); + std::vector > node_ids_per_group(idx.num_nodes()); + for (uint32_t i = idx.num_nodes(); i != 0; --i) { + uint32_t nid = i - 1; + const auto& inode = idx[nid]; + bool is_root = false; + if (group_vec[nid] == -1) { + group_vec[nid] = nid; + node_ids_per_group[nid].push_back(nid); + is_root = true; + } + + // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group. + bool parent_out_ewise = false; + bool parent_injective = false; + for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue; + TOpPattern pt = pattern_vec[e.node_id]; + if (pt == kOutEWiseFusable) { + parent_out_ewise = true; + } else if (pt == kInjective) { + parent_injective = true; + } + } + // Change the master node from out_ewise_fusable op to itself + if (parent_injective && parent_out_ewise) { + master_vec[nid] = nid; + if (!is_root) { + // Children nodes in the same group might be pointing to a master node in a different group. 
+ for (uint32_t j : node_ids_per_group[group_vec[nid]]) { + master_vec[j] = nid; + } + } + } + + // Propagate the group id. + for (const auto& e : inode.inputs) { + TOpPattern pt = pattern_vec[e.node_id]; + if (parent_out_ewise && parent_injective) { + if (pt == kOutEWiseFusable) { + continue; // Do not fuse out_ewise_fusable op + } else if (pt == kInjective) { + master_vec[e.node_id] = nid; + } + } + if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) { + CHECK(group_vec[e.node_id] == -1|| + group_vec[e.node_id] == group_vec[nid]); + group_vec[e.node_id] = group_vec[nid]; + node_ids_per_group[group_vec[nid]].push_back(e.node_id); + } + } + } + + /* + Above algorithm will not fuse a node whose output is fed to more than one + child node. This is because in general, it does not make sense to fuse multiple + children branches with their parent, as in the following example. + + conv2d + / | \ + / | \ + op op op + | | | + | | | + + However, when all children branches meet at a certain node, there is a possibility for + further operator fusion. For example, all nodes in the following subgraph can be fused + into a single node, if three 'in-between' nodes and the bottom node are all element wise + operation. + + conv2d + / | \ + / | \ + op op op + \ | / + \ | / + elemwise add + | + + This pattern is not uncommon. For example, it arises when conv2d op is followed by exponential + linear unit. If bias add and batch normalization are also present, they can be fused as well. + + In fact, above fusion algorithm already fuses three in-between nodes and the element wise + add node in the figure above. The following code fuses the conv2d node with the already + fused children nodes. The following patterns are supported. + + * Any number of child nodes from the top node + * The path from the top node to bottom node can contain any number of element wise ops. + + The only restriction is that in-between nodes cannot have more than one child. + + The overview of the algorithm below is as follows: + + 1. Check if all children nodes are fused into a single op by the existing fusion algorithm + 2. Fuse the parent node to children nodes, and update its group id to be the children's group id + 3. If the parent node originally belongs to another group (for example, conv + batch norm), + propagate the new group id to a grand parent and upward + */ + if (opt_level >= 1) { + std::vector > children_group_ids(idx.num_nodes()); + for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + CHECK_NE(group_vec[nid], -1); + if (inode.inputs.size() != 1) continue; + const uint32_t parent_nid = inode.inputs[0].node_id; + // if parent node has more than one child, record each child's group id. 
+ if (ref_count[parent_nid] > 1) children_group_ids[parent_nid].push_back(group_vec[nid]); + } + + std::vector new_group_id(idx.num_nodes(), -1); + for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) { + if (new_group_id[group_vec[nid]] != -1) { + // propagate new group id from child + group_vec[nid] = new_group_id[group_vec[nid]]; + } + TOpPattern pt = op_pattern.get(idx[nid].source->op(), kOpaque); + if (pt == kOpaque) continue; + const auto& group_ids = children_group_ids[nid]; + if (group_ids.size() <= 1) continue; + const uint32_t child_group_id = group_ids[0]; + const auto& children_node_ids = node_ids_per_group[child_group_id]; + + auto is_same_group_id = [child_group_id](uint32_t id) { + return id == child_group_id; + }; + auto is_fusible_pattern = [&idx](uint32_t child_nid) { + TOpPattern child_pt = op_pattern.get(idx[child_nid].source->op(), kOpaque); + return child_pt <= kBroadcast; + }; + // fuse this node with children if + // all children belong to the same group and + // all nodes in the group are element wise or broadcast op. + const bool can_be_fused = std::all_of(group_ids.begin(), group_ids.end(), is_same_group_id) && + std::all_of(children_node_ids.begin(), children_node_ids.end(), is_fusible_pattern); + + if (can_be_fused) { + new_group_id[group_vec[nid]] = child_group_id; + group_vec[nid] = child_group_id; + for (uint32_t nid2 : node_ids_per_group[child_group_id]) { + pattern_vec[nid2] = pattern_vec[nid]; + master_vec[nid2] = master_vec[nid]; + } + } + } + } + + g.attrs["group_root"] = std::make_shared(std::move(group_vec)); + g.attrs["group_master"] = std::make_shared(std::move(master_vec)); + g.attrs["pattern"] = std::make_shared(std::move(pattern_vec)); + return g; +} + +NNVM_REGISTER_PASS(GraphFindFusibleGroups) +.set_body(GraphFindFusibleGroups) +.depend_graph_attr("shape") +.depend_graph_attr("dtype"); + +// Fuse the partitioned graph into segments. +// Create a new graph with fused nodes. +// Also inherit attribute shape, dltype from the previous graph. +nnvm::Graph GraphFuse(nnvm::Graph g) { + CHECK(g.HasAttr("group_root") && g.HasAttr("pattern")) + << "GraphFindFusibleGroups pass hasn't been applied yet."; + + const IndexedGraph& idx = g.indexed_graph(); + // Get attributes from the graph + const ShapeVector& shape_vec = g.GetAttr("shape"); + const DTypeVector& dtype_vec = g.GetAttr("dtype"); + const GroupVec& group_vec = g.GetAttr("group_root"); + const PatternVec& pattern_vec = g.GetAttr("pattern"); + + // Specially handle assign op. + const nnvm::Op* assign_op = nnvm::Op::Get("_assign"); + + FuseEntryVec fuse_entries(idx.num_nodes()); + // Setup inputs and placeholder. 
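+  // Every entry that crosses a group boundary becomes a placeholder
+  // input of the fused function; purely elementwise groups flatten
+  // such inputs to 1-D (see flatten_data below).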
+ for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + CHECK_GE(group_vec[nid], 0); + int root_id = group_vec[nid]; + FuseEntry& fe = fuse_entries[root_id]; + fe.flatten_data = (pattern_vec[root_id] == kElemWise || + inode.source->op() == assign_op); + for (const auto& e : inode.inputs) { + if (group_vec[e.node_id] != root_id && fe.imap.count(e) == 0) { + Array shape; + if (fe.flatten_data) { + // Elementwise support flatten + int64_t prod = 1; + for (int64_t x : shape_vec[idx.entry_id(e)]) { + prod *= x; + } + CHECK_LE(prod, static_cast(std::numeric_limits::max())); + shape.push_back(make_const(Int(32), prod)); + } else { + for (int64_t x : shape_vec[idx.entry_id(e)]) { + CHECK_LE(x, static_cast(std::numeric_limits::max())); + shape.push_back(make_const(Int(32), x)); + } + } + std::ostringstream os_name; + os_name << "input" << fe.imap.size(); + Tensor data = placeholder( + shape, TVMType2Type(GetDLType(dtype_vec[idx.entry_id(e)])), + os_name.str()); + NodeEntry garg = Symbol::CreateVariable(os_name.str()).outputs[0]; + fe.imap[e] = garg; + fe.reverse_imap[garg.node.get()] = e; + fe.input_info[garg.node.get()] = std::move(data); + } + } + } + + // Setup the Subgraph + std::vector subgraph_vec(idx.num_node_entries()); + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + int root_id = group_vec[nid]; + FuseEntry& fe = fuse_entries[root_id]; + // Create a subgraph node. + NodePtr gnode = Node::Create(); + gnode->attrs = inode.source->attrs; + // Set input entries for the subgraph node. + for (const auto& e : inode.inputs) { + if (group_vec[e.node_id] != root_id) { + auto it = fe.imap.find(e); + CHECK(it != fe.imap.end()); + gnode->inputs.push_back(it->second); + } else { + const NodeEntry& ne = subgraph_vec[idx.entry_id(e)]; + CHECK(!idx[e.node_id].source->is_variable()); + CHECK(ne.node != nullptr); + gnode->inputs.push_back(ne); + } + } + // Schedule on the root node and use the master's schedule + if (static_cast(nid) != root_id) { + for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { + uint32_t eid = idx.entry_id(nid, index); + subgraph_vec[eid] = NodeEntry{gnode, index, 0}; + } + } else { + for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { + fe.subgraph.outputs.push_back(NodeEntry{gnode, index, 0}); + } + } + } + g.attrs["fused_entry"] = std::make_shared(std::move(fuse_entries)); + return g; +} + +NNVM_REGISTER_PASS(GraphFuse) + .set_body(GraphFuse) + .set_change_graph(true) + .provide_graph_attr("fused_entry") + .depend_graph_attr("shape") + .depend_graph_attr("dtype") + .depend_graph_attr("group_root") + .depend_graph_attr("group_master"); + +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/graph_fuse.h b/nnvm/src/compiler/graph_fuse.h index bde9a486669b..e5e51189dd94 100644 --- a/nnvm/src/compiler/graph_fuse.h +++ b/nnvm/src/compiler/graph_fuse.h @@ -48,7 +48,7 @@ enum class FuseRule { * \return corresponding DLDataType */ inline DLDataType GetDLType(int type_flag) { - return GetTVMType(type_flag); + return tvm::Type2TVMType(GetTVMType(type_flag)); } struct INodeEntryHash { diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc new file mode 100644 index 000000000000..236a27375225 --- /dev/null +++ b/nnvm/src/compiler/graph_hash.cc @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_hash.cc
+ * \brief Graph hashing and deep comparison of two graph structures.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "node_attr.h"
+#include "graph_hash.h"
+
+namespace nnvm {
+namespace compiler {
+
+using namespace tvm;
+using tvm::ir::IntImm;
+
+size_t HashPlaceHolder(const Tensor& t) {
+  size_t key = t->shape.size();
+  key = dmlc::HashCombine(key, (t->dtype.code() << 8) | t->dtype.bits());
+  for (Expr s : t->shape) {
+    if (const IntImm* op = s.as<IntImm>()) {
+      key = dmlc::HashCombine(key, op->value);
+    }
+  }
+  return key;
+}
+
+bool PlaceHolderEqual(const Tensor& a, const Tensor& b) {
+  if (a->shape.size() != b->shape.size()) return false;
+  if (a->dtype != b->dtype) return false;
+  for (size_t i = 0; i < a->shape.size(); ++i) {
+    const IntImm* a_value = a->shape[i].as<IntImm>();
+    const IntImm* b_value = b->shape[i].as<IntImm>();
+    if (a_value && b_value == nullptr) return false;
+    if (b_value && a_value == nullptr) return false;
+    if (a_value == nullptr && b_value == nullptr) {
+      continue;
+    }
+    if (a_value->value != b_value->value) return false;
+  }
+  return true;
+}
+
+size_t GraphKeyHash::Hash(const GraphKey& gkey) {
+  if (gkey->cache_hash_key_ != 0) return gkey->cache_hash_key_;
+  size_t key = dmlc::HashCombine(GraphHash(gkey->graph), gkey->target);
+  key = dmlc::HashCombine(key, gkey->inputs.size());
+  for (size_t i = 0; i < gkey->inputs.size(); ++i) {
+    key = dmlc::HashCombine(key, HashPlaceHolder(gkey->inputs[i]));
+  }
+  if (key == 0) key = 1;
+  gkey->cache_hash_key_ = key;
+  return key;
+}
+
+bool GraphKeyEqual::Equal(const GraphKey& a,
+                          const GraphKey& b) {
+  if (a->target != b->target) return false;
+  if (a->inputs.size() != b->inputs.size()) return false;
+  for (size_t i = 0; i < a->inputs.size(); ++i) {
+    if (!PlaceHolderEqual(a->inputs[i], b->inputs[i])) return false;
+  }
+  if (GraphDeepCompare(a->graph, b->graph, false).length() != 0) return false;
+  return true;
+}
+
+GraphKey GraphKeyNode::make(Graph graph,
+                            tvm::Array<Tensor> inputs,
+                            std::string target) {
+  auto n = tvm::make_node<GraphKeyNode>();
+  n->graph = std::move(graph);
+  n->inputs = inputs;
+  n->target = std::move(target);
+  return GraphKey(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<GraphKeyNode>([](const ObjectRef& ref, IRPrinter* p) {
+    auto* op = static_cast<const GraphKeyNode*>(ref.get());
+    p->stream << "GraphKeyNode(" << op << ")";
+});
+
+
+// Run graph hash
+size_t GraphHash(const Graph& graph) {
+  const IndexedGraph& idx = graph.indexed_graph();
+  size_t key = 0;
+  // Combine a linearized sequence of ops in subgraph
+  key = dmlc::HashCombine(key, idx.num_nodes());
+  std::hash<std::string> str_hash;
+  std::vector<size_t> hash_temp;
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const IndexedGraph::Node& inode = idx[nid];
+    // Use name instead of op address so it is deterministic across runs
+    if (inode.source->is_variable()) continue;
+    key = dmlc::HashCombine(key, inode.source->op()->name);
+    hash_temp.clear();
+    for (const auto& kv : GetAttrDict(inode.source->attrs)) {
+      hash_temp.push_back(dmlc::HashCombine(str_hash(kv.first), kv.second));
+    }
+    // Sort to make the hash deterministic,
+    // since unordered_map iteration order is not.
+    std::sort(hash_temp.begin(), hash_temp.end());
+    for (size_t value : hash_temp) {
+      key = dmlc::HashCombine(key, value);
+    }
+  }
+  return key;
+}
+
+// Deep-compare two graph structures, ignoring graph attributes.
+// Returns a non-empty error message if the graphs mismatch.
+// The comparator does not match names of intermediate nodes.
+// Variable attributes are compared only when compare_variable_attr is set.
+std::string GraphDeepCompare(const Graph& a,
+                             const Graph& b,
+                             bool compare_variable_attr) {
+  const IndexedGraph& idxa = a.indexed_graph();
+  const IndexedGraph& idxb = b.indexed_graph();
+  std::ostringstream err;
+  if (idxa.num_nodes() != idxb.num_nodes()) {
+    err << "Number of nodes mismatch (" << idxa.num_nodes() << " vs. " << idxb.num_nodes() << ")";
+    return err.str();
+  }
+  if (idxa.num_node_entries() != idxb.num_node_entries()) {
+    err << "Number of node entries mismatch";
+    return err.str();
+  }
+  if (idxa.outputs().size() != idxb.outputs().size()) {
+    err << "Number of outputs mismatch";
+    return err.str();
+  }
+  for (size_t i = 0; i < idxa.outputs().size(); ++i) {
+    if (idxa.outputs()[i].node_id != idxb.outputs()[i].node_id ||
+        idxa.outputs()[i].index != idxb.outputs()[i].index) {
+      err << "Output entry mismatch";
+      return err.str();
+    }
+  }
+  if (idxa.input_nodes().size() != idxb.input_nodes().size()) {
+    err << "Number of inputs mismatch";
+    return err.str();
+  }
+
+  for (uint32_t nid = 0; nid < idxa.num_nodes(); ++nid) {
+    const IndexedGraph::Node& anode = idxa[nid];
+    const IndexedGraph::Node& bnode = idxb[nid];
+    if (anode.source->op() != bnode.source->op()) {
+      err << "Node mismatch ";
+      return err.str();
+    }
+    if (anode.source->is_variable()) {
+      CHECK(bnode.source->is_variable());
+      if (!compare_variable_attr) continue;
+    }
+    AttrDict adict = GetAttrDict(anode.source->attrs);
+    AttrDict bdict = GetAttrDict(bnode.source->attrs);
+
+    auto fmatch = [&err, &anode](const AttrDict& adict, const AttrDict& bdict) {
+      for (const auto& kv : adict) {
+        auto it = bdict.find(kv.first);
+        if (it != bdict.end()) {
+          if (it->second != kv.second) {
+            err << "Node attr mismatch, op=" << anode.source->attrs.name
+                << " attr_key=" << kv.first << " " << it->second
+                << " vs. " << kv.second;
+            return false;
+          }
+        } else {
+          err << "One attr_key=" << kv.first << " is missing in another "
+              << "op=" << anode.source->attrs.name;
+          return false;
+        }
+      }
+      return true;
+    };
+    if (!fmatch(adict, bdict)) return err.str();
+    if (adict.size() != bdict.size()) {
+      CHECK(!fmatch(bdict, adict));
+      return err.str();
+    }
+    if (anode.inputs.size() != bnode.inputs.size()) {
+      err << "Node input mismatch, op=" << anode.source->attrs.name;
+      return err.str();
+    }
+    if (anode.control_deps.size() != bnode.control_deps.size()) {
+      err << "Node control_deps mismatch, op=" << anode.source->attrs.name;
+      return err.str();
+    }
+    for (size_t i = 0; i < anode.inputs.size(); ++i) {
+      const IndexedGraph::NodeEntry& ae = anode.inputs[i];
+      const IndexedGraph::NodeEntry& be = bnode.inputs[i];
+      if (ae.node_id != be.node_id ||
+          ae.index != be.index ||
+          ae.version != be.version) {
+        err << "Node input entry mismatch, op=" << anode.source->attrs.name;
+        return err.str();
+      }
+    }
+    for (size_t i = 0; i < anode.control_deps.size(); ++i) {
+      if (anode.control_deps[i] != bnode.control_deps[i]) {
+        err << "Node control_dep mismatch, op=" << anode.source->attrs.name;
+        return err.str();
+      }
+    }
+  }
+  return "";
+}
+
+TVM_REGISTER_GLOBAL("nnvm.graph.DeepCompare")
+.set_body_typed(GraphDeepCompare);
+}  // namespace compiler
+}  // namespace nnvm
diff --git a/nnvm/src/compiler/graph_hash.h b/nnvm/src/compiler/graph_hash.h
new file mode 100644
index 000000000000..42c069b280c9
--- /dev/null
+++ b/nnvm/src/compiler/graph_hash.h
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file graph_hash.h
+ * \brief The graph hashing function.
+ */
+#ifndef NNVM_COMPILER_GRAPH_HASH_H_
+#define NNVM_COMPILER_GRAPH_HASH_H_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace nnvm {
+namespace compiler {
+
+class GraphKey;
+
+/*! \brief Key to a graph compiler cache */
+struct GraphKeyNode : public tvm::Node {
+  /*! \brief The graph structure */
+  Graph graph;
+  /*! \brief The inputs to the function */
+  tvm::Array<Tensor> inputs;
+  /*! \brief The target */
+  std::string target;
+  // Cached internal hash key, invisible to the user.
+  // The graph hash key is ensured always not to be 0
+  mutable size_t cache_hash_key_{0};
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("inputs", &inputs);
+    v->Visit("target", &target);
+  }
+
+  static GraphKey make(Graph graph,
+                       tvm::Array<Tensor> inputs,
+                       std::string target);
+  static constexpr const char* _type_key = "GraphKey";
+  TVM_DECLARE_NODE_TYPE_INFO(GraphKeyNode, tvm::Node);
+};
+
+TVM_DEFINE_NODE_REF(GraphKey, GraphKeyNode);
+
+/*!
\brief Hashing function for graph key */ +struct GraphKeyHash { + size_t operator()(const GraphKey& gkey) const { + return Hash(gkey); + } + static size_t Hash(const GraphKey& gkey); +}; + +/*! \brief function for graph key */ +struct GraphKeyEqual { + bool operator()(const GraphKey& a, + const GraphKey& b) const { + return Equal(a, b); + } + static bool Equal(const GraphKey& a, const GraphKey& b); +}; + +/*! + * \brief Create a hash code for a given graph. + * \return The hash code of the graph. + */ +size_t GraphHash(const Graph& graph); + +/*! + * \brief Compare two graphs + * return empty string if they are equal + * otherwise return error message + * \param a The first graph. + * \param b The second graph. + * \return empty string if they are equal, otherwise return error message. + */ +std::string GraphDeepCompare(const Graph& a, + const Graph& b, + bool compare_variable_attr); +} // namespace compiler +} // namespace nnvm + +#endif // NNVM_COMPILER_GRAPH_HASH_H_ diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc new file mode 100644 index 000000000000..a4b398cd41ea --- /dev/null +++ b/nnvm/src/compiler/graph_runtime.cc @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_runtime.cc + * \brief Interface code with TVM graph runtime. 
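+ *
+ *  This file registers the tvm_op operator that fused subgraphs are lowered
+ *  to, together with the _save_param_dict / _load_param_dict helpers that
+ *  (de)serialize named NDArray parameters in the kTVMNDArrayListMagic
+ *  binary format.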
+*/ +#include +#include + +#include +#include "graph_runtime.h" + +namespace nnvm { +namespace compiler { + +using tvm::Object; +using tvm::ObjectPtr; +using tvm::runtime::TVMArgs; +using tvm::runtime::TVMRetValue; +using tvm::runtime::PackedFunc; + +DMLC_REGISTER_PARAMETER(TVMOpParam); + +// parser +inline void TVMOpParamParser(nnvm::NodeAttrs* attrs) { + TVMOpParam param; + param.Init(attrs->dict); + attrs->parsed = std::move(param); +} + +NNVM_REGISTER_OP(tvm_op) +.set_attr_parser(TVMOpParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const TVMOpParam& param = nnvm::get(attrs.parsed); + return param.num_inputs; + }) +.set_num_outputs([](const NodeAttrs& attrs) { + const TVMOpParam& param = nnvm::get(attrs.parsed); + return param.num_outputs; + }); + + +TVM_REGISTER_GLOBAL("nnvm.compiler._save_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + CHECK_EQ(args.size() % 2, 0u); + size_t num_params = args.size() / 2; + std::vector names; + names.reserve(num_params); + std::vector arrays; + arrays.reserve(num_params); + for (size_t i = 0; i < num_params * 2; i += 2) { + names.emplace_back(args[i].operator std::string()); + arrays.emplace_back(args[i + 1].operator DLTensor*()); + } + std::string bytes; + dmlc::MemoryStringStream strm(&bytes); + dmlc::Stream* fo = &strm; + uint64_t header = kTVMNDArrayListMagic, reserved = 0; + fo->Write(header); + fo->Write(reserved); + fo->Write(names); + { + uint64_t sz = static_cast(arrays.size()); + fo->Write(sz); + for (size_t i = 0; i < sz; ++i) { + tvm::runtime::SaveDLTensor(fo, arrays[i]); + } + } + TVMByteArray arr; + arr.data = bytes.c_str(); + arr.size = bytes.length(); + *rv = arr; + }); + + +TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + std::string bytes = args[0]; + std::vector names; + dmlc::MemoryStringStream memstrm(&bytes); + dmlc::Stream* strm = &memstrm; + uint64_t header, reserved; + CHECK(strm->Read(&header)) + << "Invalid parameters file format"; + CHECK(header == kTVMNDArrayListMagic) + << "Invalid parameters file format"; + CHECK(strm->Read(&reserved)) + << "Invalid parameters file format"; + CHECK(strm->Read(&names)) + << "Invalid parameters file format"; + uint64_t sz; + strm->Read(&sz, sizeof(sz)); + size_t size = static_cast(sz); + CHECK(size == names.size()) + << "Invalid parameters file format"; + tvm::Array ret; + for (size_t i = 0; i < size; ++i) { + tvm::runtime::NDArray temp; + temp.Load(strm); + auto n = tvm::make_node(); + n->name = std::move(names[i]); + n->array = temp; + ret.push_back(NDArrayWrapper(n)); + } + *rv = ret; + }); + +TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode); +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/graph_runtime.h b/nnvm/src/compiler/graph_runtime.h new file mode 100644 index 000000000000..252a6b243c3d --- /dev/null +++ b/nnvm/src/compiler/graph_runtime.h @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_runtime.h + * \brief Interface code with TVM graph runtime. +*/ +#ifndef NNVM_COMPILER_GRAPH_RUNTIME_H_ +#define NNVM_COMPILER_GRAPH_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace nnvm { +namespace compiler { + +/*! \brief Magic number for NDArray list file */ +constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; + +struct TVMOpParam : public dmlc::Parameter { + std::string func_name; + uint32_t num_inputs; + uint32_t num_outputs; + uint32_t flatten_data; + + DMLC_DECLARE_PARAMETER(TVMOpParam) { + DMLC_DECLARE_FIELD(func_name); + DMLC_DECLARE_FIELD(num_inputs).set_default(1); + DMLC_DECLARE_FIELD(num_outputs).set_default(1); + DMLC_DECLARE_FIELD(flatten_data).set_default(0); + } +}; + + +/*! + * \brief wrapper node container for exchange. + */ +struct NDArrayWrapperNode : public ::tvm::Node { + std::string name; + tvm::runtime::NDArray array; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("name", &name); + v->Visit("array", &array); + } + + static constexpr const char* _type_key = "NDArrayWrapper"; + TVM_DECLARE_NODE_TYPE_INFO(NDArrayWrapperNode, tvm::Node); +}; + +TVM_DEFINE_NODE_REF(NDArrayWrapper, NDArrayWrapperNode); + +} // namespace compiler +} // namespace nnvm + +#endif // NNVM_COMPILER_GRAPH_RUNTIME_H_ diff --git a/nnvm/src/compiler/graph_transform.h b/nnvm/src/compiler/graph_transform.h new file mode 100644 index 000000000000..4b183bf2dd6c --- /dev/null +++ b/nnvm/src/compiler/graph_transform.h @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_transform.h + * \brief A mutator class that does local pattern matching and mutates a node. +*/ +#ifndef NNVM_COMPILER_GRAPH_TRANSFORM_H_ +#define NNVM_COMPILER_GRAPH_TRANSFORM_H_ + +#include +#include +#include +#include + +namespace nnvm { +namespace compiler { + +/*! + * \brief Transform the graph to build a new Graph, in post DFS order. + * + * Automatically copies node when some of its children or control_deps changed. + * This function won't be called in Variable. + * + * \param graph The original graph + * + * \param ftransform Function of (int nid, const NodePtr& node, std::vector* out) -> bool + * + * If empty vector is returned, it means original entries should be kept. + * + * \tparam FTransform The transformation function. 
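+ *
+ * A minimal usage sketch (the op name "my_copy" is hypothetical and for
+ * illustration only):
+ *
+ * \code
+ * // Bypass every "my_copy" node, assuming the op has exactly one input
+ * // and one output; all other nodes are kept unchanged.
+ * Graph BypassMyCopy(Graph src) {
+ *   auto ftransform = [](uint32_t nid, const NodePtr& n,
+ *                        std::vector<NodeEntry>* ret) {
+ *     if (n->is_variable() || n->op()->name != "my_copy") return false;
+ *     *ret = {n->inputs[0]};  // forward the single input entry
+ *     return true;
+ *   };
+ *   return GraphTransform(src, ftransform);
+ * }
+ * \endcode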
+ */ +template +Graph GraphTransform(Graph graph, FTransform ftransform) { + const IndexedGraph& idx = graph.indexed_graph(); + // new nodes + std::vector new_entry_map(idx.num_node_entries()); + std::vector updated(idx.num_node_entries(), false); + + // setup inputs and placeholder. + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + bool need_copy = false; + for (const IndexedGraph::NodeEntry& e : inode.inputs) { + if (updated[idx.entry_id(e)]) { + need_copy = true; break; + } + } + if (!need_copy) { + for (const uint32_t cid : inode.control_deps) { + const auto& cnode = idx[cid]; + for (uint32_t i = 0 ; i < cnode.source->num_outputs(); ++i) { + if (updated[idx.entry_id(cid, i)]) { + need_copy = true; + } + } + if (need_copy) break; + } + } + + if (!need_copy) { + std::vector ret; + if (ftransform(nid, inode.weak_ref.lock(), &ret)) { + CHECK_EQ(ret.size(), static_cast(inode.source->num_outputs())); + for (uint32_t i = 0 ; i < inode.source->num_outputs(); ++i) { + updated[idx.entry_id(nid, i)] = true; + new_entry_map[idx.entry_id(nid, i)] = ret[i]; + } + } + } else { + NodePtr node = Node::Create(); + node->attrs = inode.source->attrs; + for (size_t i = 0; i < inode.inputs.size(); ++i) { + const IndexedGraph::NodeEntry& e = inode.inputs[i]; + if (updated[idx.entry_id(e)]) { + node->inputs.push_back(new_entry_map[idx.entry_id(e)]); + } else { + node->inputs.push_back(inode.source->inputs[i]); + } + } + for (size_t i = 0; i < inode.control_deps.size(); ++i) { + const uint32_t cid = inode.control_deps[i]; + const auto& cnode = idx[cid]; + CHECK_NE(cnode.source->num_outputs(), 0U); + NodePtr selected_ptr; + for (uint32_t j = 0 ; j < cnode.source->num_outputs(); ++j) { + NodePtr cptr = updated[idx.entry_id(cid, j)] ? + new_entry_map[idx.entry_id(cid, j)].node : inode.source->control_deps[i]; + if (selected_ptr == nullptr) { + selected_ptr = std::move(cptr); + } else { + CHECK(selected_ptr.get() == cptr.get()) + << "Control dependency node changed to more than one node"; + } + } + node->control_deps.push_back(selected_ptr); + } + std::vector ret; + if (ftransform(nid, node, &ret)) { + CHECK_EQ(ret.size(), static_cast(inode.source->num_outputs())); + for (uint32_t i = 0 ; i < inode.source->num_outputs(); ++i) { + updated[idx.entry_id(nid, i)] = true; + new_entry_map[idx.entry_id(nid, i)] = ret[i]; + } + } else { + for (uint32_t i = 0 ; i < inode.source->num_outputs(); ++i) { + updated[idx.entry_id(nid, i)] = true; + new_entry_map[idx.entry_id(nid, i)] = NodeEntry{node, i, 0}; + } + } + } + } + Graph ret; + for (size_t i = 0; i < idx.outputs().size(); ++i) { + const IndexedGraph::NodeEntry& e = idx.outputs()[i]; + if (updated[idx.entry_id(e)]) { + ret.outputs.push_back(new_entry_map[idx.entry_id(e)]); + } else { + ret.outputs.push_back(graph.outputs[i]); + } + } + return ret; +} + +} // namespace compiler +} // namespace nnvm + +#endif // NNVM_COMPILER_GRAPH_TRANSFORM_H_ diff --git a/nnvm/src/compiler/node_attr.h b/nnvm/src/compiler/node_attr.h new file mode 100644 index 000000000000..cd11981bffec --- /dev/null +++ b/nnvm/src/compiler/node_attr.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file node_attr.h
+ * \brief Utility to access node attributes
+ */
+#ifndef NNVM_COMPILER_NODE_ATTR_H_
+#define NNVM_COMPILER_NODE_ATTR_H_
+
+#include
+#include
+#include
+#include
+
+namespace nnvm {
+namespace compiler {
+
+using AttrDict = std::unordered_map<std::string, std::string>;
+/*!
+ * \brief Get canonicalized attr dict from node
+ * \param attrs The node attrs
+ * \return The attribute dict
+ */
+inline AttrDict GetAttrDict(const NodeAttrs& attrs) {
+  static auto& fgetdict = nnvm::Op::GetAttr<FGetAttrDict>("FGetAttrDict");
+  if (fgetdict.count(attrs.op)) {
+    return fgetdict[attrs.op](attrs);
+  } else {
+    return attrs.dict;
+  }
+}
+
+}  // namespace compiler
+}  // namespace nnvm
+#endif  // NNVM_COMPILER_NODE_ATTR_H_
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
new file mode 100644
index 000000000000..5680af1b2550
--- /dev/null
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file packed_func_ext.cc
+ * \brief Registration of extension types.
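+ *
+ *  Exposes nnvm::Graph, nnvm::Symbol and AttrDict as TVM extension types,
+ *  and registers the FFI hooks used by the Python frontend
+ *  (_register_compute, _register_schedule, _register_pattern,
+ *  _register_alter_op_layout, and the attribute-dict accessors).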
+ */ +#include +#include +#include +#include +#include +#include +#include "node_attr.h" +#include "compile_engine.h" + +namespace tvm { +namespace runtime { + +TVM_REGISTER_EXT_TYPE(nnvm::Graph); +TVM_REGISTER_EXT_TYPE(nnvm::Symbol); +TVM_REGISTER_EXT_TYPE(nnvm::compiler::AttrDict); + +} // namespace runtime +} // namespace tvm + +namespace nnvm { +DMLC_JSON_ENABLE_ANY(int, int); +} // namespace nnvm + +namespace nnvm { +namespace compiler { + +using tvm::Tensor; +using tvm::Array; +using tvm::Node; +using tvm::runtime::TVMArgs; +using tvm::runtime::TVMRetValue; + +TVM_REGISTER_GLOBAL("nnvm.compiler._dict_get") +.set_body([](TVMArgs args, TVMRetValue *rv) { + const AttrDict& dict = args[0].AsExtension(); + std::string key = args[1]; + auto it = dict.find(key); + if (it != dict.end()) { + *rv = it->second; + } else { + *rv = nullptr; + } + }); + +TVM_REGISTER_GLOBAL("nnvm.compiler._dict_size") +.set_body([](TVMArgs args, TVMRetValue *rv) { + const AttrDict& dict = args[0].AsExtension(); + *rv = static_cast(dict.size()); + }); + +TVM_REGISTER_GLOBAL("nnvm.compiler._dict_keys") +.set_body([](TVMArgs args, TVMRetValue *rv) { + const AttrDict& dict = args[0].AsExtension(); + tvm::Array keys; + for (const auto& kv : dict) { + keys.push_back(kv.first); + } + *rv = keys; + }); + +TVM_REGISTER_GLOBAL("nnvm.compiler._register_alter_op_layout") +.set_body([](TVMArgs args, TVMRetValue *rv) { + // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown + PackedFunc* f = new PackedFunc(args[1].operator PackedFunc()); + Op& op = ::dmlc::Registry::Get()->__REGISTER_OR_GET__(args[0]); + auto fpack = [f](const NodeAttrs& attrs, + const Symbol& inputs, + const Array& tinfos, + Symbol* ret_symbol) { + TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, tinfos); + if (ret.type_code() == TVMTypeCode::kNull) { + return false; + } + CHECK_EQ(ret.type_code(), tvm::runtime::extension_type_info::code) + << " expected " << "Symbol (code = " << tvm::runtime::extension_type_info::code + << ") but get code = " << ret.type_code(); + *ret_symbol = *(static_cast(ret.value().v_handle)); + return true; + }; + op.set_attr("FTVMAlterOpLayout", fpack, args[2]); +}); + +// custom version of TVM compute +TVM_REGISTER_GLOBAL("nnvm._register_compute") +.set_body([](TVMArgs args, TVMRetValue *rv) { + // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown + PackedFunc* f = new PackedFunc(args[1].operator PackedFunc()); + Op& op = ::dmlc::Registry::Get()->__REGISTER_OR_GET__(args[0]); + auto fcompute = [f](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) + -> Array { + TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, out_info); + if (ret.IsObjectRef()) { + return {ret.operator Tensor()}; + } else { + return ret; + } + }; + op.set_attr("FTVMCompute", fcompute, args[2]); + }); + +TVM_REGISTER_GLOBAL("nnvm._register_schedule") +.set_body([](TVMArgs args, TVMRetValue *rv) { + // Intentionally copy and not de-allocate it, to avoid free pyobject during shutdown + PackedFunc* f = new PackedFunc(args[1].operator PackedFunc()); + Op& op = ::dmlc::Registry::Get()->__REGISTER_OR_GET__(args[0]); + auto fschedule = [f](const NodeAttrs& attrs, + const Array& outs, + const std::string& target) { + return (*f)(GetAttrDict(attrs), outs, target).operator Schedule(); + }; + op.set_attr("FTVMSchedule", fschedule, args[2]); + }); + +TVM_REGISTER_GLOBAL("nnvm._register_pattern") +.set_body([](TVMArgs args, TVMRetValue *rv) { + Op& op = 
::dmlc::Registry::Get()->__REGISTER_OR_GET__(args[0]); + op.set_attr("TOpPattern", args[1].operator int(), args[2]); + }); + +TVM_REGISTER_GLOBAL("nnvm.graph._move_module") +.set_body([](TVMArgs args, TVMRetValue *rv) { + const nnvm::Graph& g = args[0].AsExtension(); + *rv = const_cast(&g)-> + MoveCopyAttr(args[1]); + }); + +TVM_REGISTER_GLOBAL("nnvm.graph._move_graph") +.set_body([](TVMArgs args, TVMRetValue *rv) { + const nnvm::Graph& g = args[0].AsExtension(); + std::string key = args[1]; + if (g.attrs.count(key)) { + *rv = const_cast(&g)-> + MoveCopyAttr(key); + } else { + *rv = nullptr; + } + }); +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/pattern_util.h b/nnvm/src/compiler/pattern_util.h new file mode 100644 index 000000000000..d3f9725caefa --- /dev/null +++ b/nnvm/src/compiler/pattern_util.h @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file pattern_util.h + * \brief Utilities for doing various pattern matching in graph. +*/ +#ifndef NNVM_COMPILER_PATTERN_UTIL_H_ +#define NNVM_COMPILER_PATTERN_UTIL_H_ + +#include +#include +#include +#include +#include + +namespace nnvm { +namespace compiler { + +/*! + * \brief find axis in oshape, such that: + * bias_shape = [1,1, ... oshape[axis], 1,1,] + * + * This is used to detect bias or scaling factor on channel dimension. + * \param oshape The output shape + * \param bias_shape The shape of bias or scaling factor. + * \return Pair of matched axis in o shape and bias_shape if found. + */ +inline std::pair MatchBroadcast1DAxis( + const TShape& oshape, const TShape& bias_shape) { + dim_t axis_dim = bias_shape.ndim(); + for (dim_t i = bias_shape.ndim(); i != 0; --i, --axis_dim) { + if (bias_shape[i - 1] != 1) break; + } + // everything is 1 + if (axis_dim == 0) { + return {oshape.ndim() - bias_shape.ndim(), 0}; + } + axis_dim = axis_dim - 1; + // The bias shape is not 1D + for (dim_t i = 0; i < axis_dim; ++i) { + if (bias_shape[i] != 1) return {-1, -1}; + } + int axis = static_cast( + oshape.ndim() - bias_shape.ndim() + axis_dim); + if (oshape[axis] != bias_shape[axis_dim]) return {-1, -1}; + return {axis, axis_dim}; +} + +/*! + * \brief Expand bias dimension to match needed axis. + * + * \param bias The bias NodeEntry + * \param out_dim output dimension. + * \param bias_dim The current bias dimension. + * \param axis The axis we want to match on. 
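+ *
+ * Illustrative example (values assumed for this sketch): for an NCHW output
+ * with out_dim = 4, a 1-D channel bias (bias_dim = 1) and axis = 1,
+ * num_pad_axis = 4 - 1 - 1 = 2, so expand_dims(axis=1, num_newaxis=2)
+ * reshapes the bias from [C] to [C, 1, 1], which broadcasts against
+ * [N, C, H, W].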
+ */ +inline NodeEntry +ExpandBiasToMatchAxis(NodeEntry bias, + int out_dim, + int bias_dim, + int axis) { + if (bias_dim != 1) { + bias = MakeNode("squeeze", bias.node->attrs.name + "_sqz", {bias}); + } + int num_pad_axis = out_dim - axis - 1; + if (num_pad_axis > 0) { + std::unordered_map kwargs{ + {"axis", "1"}, + {"num_newaxis", std::to_string(num_pad_axis)}}; + return MakeNode("expand_dims", bias.node->attrs.name + "_expand", + {bias}, kwargs); + + } else { + return bias; + } +} + +/*! + * \brief Get the reference count of each node. + * \param idx The IndexedGraph + * \return ref_count vector of length number nodes. + */ +inline std::vector +GetNodeRefCounts(const IndexedGraph& idx) { + std::vector ref_count(idx.num_nodes(), 0); + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + for (const auto& e : inode.inputs) { + ++ref_count[e.node_id]; + } + } + for (const auto& e : idx.outputs()) { + // this line will realize all the outputs + ref_count[e.node_id] += 1; + } + return ref_count; +} +} // namespace compiler +} // namespace nnvm +#endif // NNVM_COMPILER_PATTERN_UTIL_H_ diff --git a/nnvm/src/compiler/simplify_inference.cc b/nnvm/src/compiler/simplify_inference.cc new file mode 100644 index 000000000000..0e33a2260986 --- /dev/null +++ b/nnvm/src/compiler/simplify_inference.cc @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file simplify_inference.cc + * \author Ziheng Jiang +*/ +#include +#include +#include +#include +#include +#include +#include "graph_transform.h" +#include "pattern_util.h" + +namespace nnvm { +namespace compiler { + +std::vector +BatchNormToInferUnpack(const nnvm::NodeAttrs& attrs, + nnvm::NodeEntry data, + nnvm::NodeEntry gamma, + nnvm::NodeEntry beta, + nnvm::NodeEntry moving_mean, + nnvm::NodeEntry moving_var, + TShape dshape, + TShape bshape) { + CHECK_NE(dshape.ndim(), 0); + CHECK(attrs.op); + static const Op* bn_op = Op::Get("batch_norm"); + CHECK(attrs.op == bn_op); + const auto& param = nnvm::get(attrs.parsed); + std::string bn_name = attrs.name; + + // transform batch_norm(data) to scale * data + shift + NodeEntry var_add_eps = MakeNode( + "__add_scalar__", bn_name + "_add_eps", + {moving_var}, {{"scalar", std::to_string(param.epsilon)}}); + + NodeEntry sqrt = MakeNode( + "sqrt", bn_name + "_sqrt", {var_add_eps}); + + NodeEntry scale = MakeNode( + "__rdiv_scalar__", bn_name + "_div", + {sqrt}, {{"scalar", "1"}}); + + if (param.scale) { + scale = MakeNode( + "elemwise_mul", bn_name + "_gamma_mul_div", + {scale, gamma}); + } + + NodeEntry neg_mean = MakeNode( + "negative", bn_name + "_neg_mean", {moving_mean}); + + NodeEntry shift = MakeNode( + "elemwise_mul", bn_name + "_neg_mean_mul_a", + {neg_mean, scale}); + + if (param.center) { + shift = MakeNode( + "elemwise_add", bn_name + "_add_beta", {shift, beta}); + } + int axis = param.axis; + scale = ExpandBiasToMatchAxis(scale, dshape.ndim()-bshape.ndim()+1, 1, axis); + shift = ExpandBiasToMatchAxis(shift, dshape.ndim()-bshape.ndim()+1, 1, axis); + + NodeEntry out = MakeNode("broadcast_mul", bn_name + "_a_mul_data", + {data, scale}); + out = MakeNode("broadcast_add", bn_name + "_out", + {out, shift}); + // It is invalid to ref the other values of BN after inference transform. + NodeEntry undef = MakeNode("__undef__", "undef", {}); + return {out, undef, undef}; +} + +Graph SimplifyInference(nnvm::Graph src) { + // Get attributes from the graph + const IndexedGraph& idx = src.indexed_graph(); + const ShapeVector& shape_vec = src.GetAttr("shape"); + auto transform = [&](uint32_t nid, const NodePtr& n, std::vector* ret) { + if (n->is_variable()) return false; + static const Op* bn_op = Op::Get("batch_norm"); + static const Op* dropout_op = Op::Get("dropout"); + if (n->op() == bn_op) { + *ret = BatchNormToInferUnpack( + n->attrs, + n->inputs[0], + n->inputs[1], + n->inputs[2], + n->inputs[3], + n->inputs[4], + shape_vec[idx.entry_id(nid, 0)], + shape_vec[idx.entry_id(nid, 1)]); + return true; + } else if (n->op() == dropout_op) { + NodeEntry undef = MakeNode("__undef__", "undef", {}); + *ret = {n->inputs[0], undef}; + return true; + } else { + return false; + } + }; + return GraphTransform(src, transform); +} + +NNVM_REGISTER_PASS(SimplifyInference) +.set_body(SimplifyInference) +.set_change_graph(true); + +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc index abd18eda5edd..f59a3006cf4c 100644 --- a/nnvm/src/pass/plan_memory.cc +++ b/nnvm/src/pass/plan_memory.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,13 +25,14 @@ #include #include #include +#include #include #include "graph_algorithm.h" namespace nnvm { namespace pass { namespace { - + using namespace nnvm::top; // Return bytes of data flag. static int GetDTypeSize(int type_flag) { switch (type_flag) { @@ -39,7 +40,6 @@ static int GetDTypeSize(int type_flag) { case kInt8: return 1; case kFloat16: - case kBfloat16: case kInt16: case kUint16: return 2; diff --git a/nnvm/src/top/elemwise_op_common.h b/nnvm/src/top/elemwise_op_common.h new file mode 100644 index 000000000000..1864850eb436 --- /dev/null +++ b/nnvm/src/top/elemwise_op_common.h @@ -0,0 +1,369 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file elemwise_op_common.h + * \brief Common operator utilities + */ +#ifndef NNVM_TOP_ELEMWISE_OP_COMMON_H_ +#define NNVM_TOP_ELEMWISE_OP_COMMON_H_ + +#include +#include +#include +#include +#include +#include +#include "op_common.h" + +namespace nnvm { +namespace top { + +template +inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs, + const AttrType& none) { + AttrType dattr = none; + size_t in_size = in_attrs->size(); + size_t out_size = out_attrs->size(); + if (n_in != -1) + in_size = static_cast(n_in); + if (n_out != -1) + out_size = static_cast(n_out); + + auto deduce = [&](std::vector *vec, size_t size, const char *name) { + for (size_t i = 0; i < size; ++i) { + CHECK(assign(&dattr, (*vec)[i])) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " << "expected " << attr_string(dattr) + << ", got " << attr_string((*vec)[i]); + } + }; + deduce(in_attrs, in_size, "input"); + if (reverse_infer) deduce(out_attrs, out_size, "output"); + + auto write = [&](std::vector *vec, size_t size, const char *name) { + for (size_t i = 0; i < size; ++i) { + CHECK(assign(&(*vec)[i], dattr)) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " << "expected " << attr_string(dattr) + << ", got " << attr_string((*vec)[i]); + } + }; + write(in_attrs, in_size, "input"); + write(out_attrs, out_size, "output"); + + if (is_none(dattr)) return false; + return true; +} + +template +inline bool ElemwiseShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + if (n_in != -1) { + CHECK_EQ(in_attrs->size(), static_cast(n_in)) << " in operator " << attrs.name; + } + if (n_out != -1) { + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << 
attrs.name; + } + return ElemwiseAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +template +inline bool ElemwiseType(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + if (n_in != -1) { + CHECK_EQ(in_attrs->size(), static_cast(n_in)) << " in operator " << attrs.name; + } + if (n_out != -1) { + CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; + } + return ElemwiseAttr( + attrs, in_attrs, out_attrs, -1); +} + +inline bool ElementWiseReduceShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +inline bool ElementWiseReduceType(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(out_attrs->size(), 1); + return ElemwiseAttr( + attrs, in_attrs, out_attrs, -1); +} + +template +inline bool ElemwiseFixedLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts, + const std::function& finfer) { + const size_t in_size = (n_in == -1) ? in_layouts->size() : static_cast(n_in); + const size_t out_size = (n_out == -1) ? out_layouts->size() : static_cast(n_out); + + auto deduce = [&](Layout *target, const std::vector *vec, + size_t size, const char *name) { + for (size_t i = 0; i < size; ++i) { + if (vec->at(i).defined()) { + if (!target->defined()) { + *target = vec->at(i); + } + CHECK_EQ(*target, vec->at(i)) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " << "expected " << *target + << ", got " << vec->at(i); + } + } + }; + + Layout in, last_in, out; + deduce(&in, in_layouts, in_size, "input"); + deduce(&last_in, last_in_layouts, in_size, "input (last infer pass)"); + deduce(&out, out_layouts, out_size, "output"); + + if (!last_in.defined()) { + last_in = in; + } else { + // else we copy in_layout produced by last infer pass to in_layout, + // and let LayoutTransform pass + // to insert an layout_transform node to fix the input layout. + in = last_in; + } + + out = finfer(in); + + auto write = [](std::vector *vec, Layout& value, size_t size) { + for (size_t i = 0; i < size; ++i) { + vec->at(i) = value; + } + }; + if (in.defined()) write(in_layouts, in, in_size); + if (out.defined()) write(out_layouts, out, out_size); + + return true; +} + +/*! \brief Fix the input layout as the previous inferred (if any) and copy to output */ +template +inline bool ElemwiseFixedLayoutCopyToOut(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + return ElemwiseFixedLayout( + attrs, in_layouts, last_in_layouts, out_layouts, [](const Layout& in) { + return in; + }); +} + +/*! \brief Fix the input layout as the previous inferred (if any) and do not define output */ +template +inline bool ElemwiseFixedLayoutUnknownOut(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + return ElemwiseFixedLayout( + attrs, in_layouts, last_in_layouts, out_layouts, [](const Layout& in) { + return Layout::Undef(); + }); +} + +/*! \brief take arbitrary input layout and copy to output */ +template +inline bool ElemwiseArbitraryLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + const size_t in_size = (n_in == -1) ? in_layouts->size() : static_cast(n_in); + const size_t out_size = (n_out == -1) ? 
out_layouts->size() : static_cast(n_out); + + Layout in; + for (size_t i = 0; i < in_size; ++i) { + if (!in.defined()) in = in_layouts->at(i); + CHECK_EQ(in, in_layouts->at(i)) + << "Incompatible attr in node " << attrs.name << " at " << i + << "-th input: expected " << in + << ", got " << in_layouts->at(i); + } + + if (in.defined()) { + for (size_t i = 0; i < out_size; ++i) { + out_layouts->at(i) = in; + } + } + + return true; +} + +/*! + * \brief try to convert right layout to left layout if they are different. + * if the converting fails, it will use the last inferred layouts. + */ +inline bool ElemwiseBinaryKeepLeftLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + CHECK_EQ(in_layouts->size(), 2U); + CHECK_EQ(last_in_layouts->size(), 2U); + CHECK_EQ(out_layouts->size(), 1U); + + const Layout& lhs_last = (*last_in_layouts)[0]; + const Layout& rhs_last = (*last_in_layouts)[1]; + CHECK((lhs_last.defined() && rhs_last.defined()) || + (!lhs_last.defined() && !rhs_last.defined())); + + const Layout& lhs = (*in_layouts)[0]; + const Layout& rhs = (*in_layouts)[1]; + + if (!lhs.defined() && !rhs.defined()) { + CHECK(!lhs_last.defined() && !rhs_last.defined()) + << "Lost input layouts in node " << attrs.name + << ": last inferred lhs=" << lhs_last << ", rhs=" << rhs_last; + return true; + } else if (!lhs.defined()) { + CHECK(!lhs_last.defined() && !rhs_last.defined()); + in_layouts->at(0) = rhs; + out_layouts->at(0) = rhs; + return true; + } else if (!rhs.defined()) { + CHECK(!lhs_last.defined() && !rhs_last.defined()); + in_layouts->at(1) = lhs; + out_layouts->at(0) = lhs; + return true; + } + + if (lhs == rhs) { + // for same layout, we can always do binary calculation + // and pass the layout to next layer + out_layouts->at(0) = lhs; + return true; + } + + if (rhs.convertible(lhs)) { + in_layouts->at(1) = lhs; + out_layouts->at(0) = lhs; + } else { + CHECK(lhs_last.defined() && rhs_last.defined()) + << "Incompatible input layouts in node " << attrs.name + << ". 
lhs: " << lhs << ", rhs: " << rhs; + CHECK(lhs_last == rhs_last); + in_layouts->at(0) = lhs_last; + in_layouts->at(1) = rhs_last; + out_layouts->at(0) = lhs_last; + } + + return true; +} + +#define NNVM_REGISTER_ELEMWISE_UNARY_OP(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", ElemwiseType<1, 1>) \ + .set_attr("FCorrectLayout", \ + ElemwiseArbitraryLayout<1, 1>) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .add_argument("data", "Tensor", "The input tensor.") + + +#define NNVM_REGISTER_INIT_OP(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(0) \ + .set_num_outputs(1) + + +#define NNVM_REGISTER_INIT_LIKE_OP(name) \ + NNVM_REGISTER_ELEMWISE_UNARY_OP(name) \ + .set_attr("FGradient", MakeZeroGradNodes) \ + .add_argument("data", "Symbol", "The input") + + +#define NNVM_REGISTER_ELEMWISE_BINARY_OP(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(2) \ + .set_num_outputs(1) \ + .set_attr("FInferShape", ElemwiseShape<2, 1>) \ + .set_attr("FInferType", ElemwiseType<2, 1>) \ + .set_attr("FCorrectLayout", \ + ElemwiseBinaryKeepLeftLayout) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs) { \ + return std::vector >{{0, 0}, {1, 0}}; \ + }) \ + .add_argument("lhs", "Tensor", "first input") \ + .add_argument("rhs", "Tensor", "second input") + + +#define NNVM_REGISTER_ELEMWISE_REDUCE_OP(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs([](const NodeAttrs& attrs) { \ + return static_cast( \ + dmlc::get(attrs.parsed).num_args); \ + }) \ + .set_attr_parser(ParamParser) \ + .set_attr("FGetAttrDict", \ + ParamGetAttrDict) \ + .set_attr("FInferShape", \ + ElementWiseReduceShape) \ + .set_attr("FCorrectLayout", \ + ElemwiseFixedLayoutCopyToOut<-1, 1>) \ + .set_attr("FInferType", ElementWiseReduceType) \ + .add_argument("args", "Symbol[]", "Positional input arguments") + + +#define NNVM_REGISTER_INDICATOR_OP(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_outputs(1) \ + .set_attr( \ + "FInferType", [](const NodeAttrs& attrs, \ + std::vector* in_attrs, \ + std::vector* out_attrs) { \ + CHECK_EQ(out_attrs->size(), 1U); \ + NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, \ + static_cast(kFloat32)); \ + return true; \ + }) \ + .set_attr("FCorrectLayout", \ + ElemwiseFixedLayoutUnknownOut<1, 1>) \ + .set_attr( \ + "FGradient", [](const NodePtr& n, \ + const std::vector& ograds) { \ + return MakeZeroGradNodes(n, ograds); \ + }) + + +} // namespace top +} // namespace nnvm +#endif // NNVM_TOP_ELEMWISE_OP_COMMON_H_ diff --git a/nnvm/src/top/image/resize.cc b/nnvm/src/top/image/resize.cc new file mode 100644 index 000000000000..a50b4ac961ea --- /dev/null +++ b/nnvm/src/top/image/resize.cc @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file resize.cc + * \brief Property def of resize operators. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../nn/nn_common.h" +#include "../op_common.h" +#include "../elemwise_op_common.h" +#include "topi/elemwise.h" +#include "topi/transform.h" +#include "topi/image/resize.h" +#include "resize.h" + +namespace nnvm { +namespace top { +using tvm::Expr; +using tvm::Array; +using tvm::Tensor; +using nnvm::compiler::FTVMCompute; + +DMLC_REGISTER_PARAMETER(ResizeParam); + +inline bool ResizeInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + const ResizeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + CHECK_EQ(out_shape->size(), 1U); + TShape dshape = (*in_shape)[0]; + if (dshape.ndim() == 0) return false; + dshape = ConvertLayout(dshape, param.layout, kNCHW); + + TShape oshape = dshape; + oshape[2] = param.size[0]; + oshape[3] = param.size[1]; + + oshape = ConvertLayout(oshape, kNCHW, param.layout); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + + return true; +} + +inline bool ResizeLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + const ResizeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_layouts->size(), 1U); + CHECK_EQ(out_layouts->size(), 1U); + const Layout layout(param.layout); + NNVM_ASSIGN_LAYOUT(*in_layouts, 0, layout); + NNVM_ASSIGN_LAYOUT(*out_layouts, 0, layout); + return true; +} + +NNVM_REGISTER_OP(resize) +.describe(R"(Perform resize to input array with nearest neighbour or bilinear interpolation. + +- **data**: data is 4D array of shape + (batch_size, channels, in_height, in_width) for NCHW + (batch_size, in_height, in_width, channels) for NHWC + +- **out**: Output is 4D array of shape + for layout NCHW + (batch_size, channels, size[0], size[1]) + + for layout NHWC + (batch_size, size[0], size[1], channels) + +)" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_arguments(ResizeParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", ResizeInferShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", ResizeLayout) +.set_num_outputs(1) +.set_num_inputs(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ResizeParam& param = nnvm::get(attrs.parsed); + Array oshape; + if (param.layout == "NCHW") { + oshape.push_back(out_info[0]->shape[2]); + oshape.push_back(out_info[0]->shape[3]); + } else { + oshape.push_back(out_info[0]->shape[1]); + oshape.push_back(out_info[0]->shape[2]); + } + + return Array{ topi::image::resize(inputs[0], oshape, param.layout, + param.align_corners, param.method)}; +}) +.set_support_level(2); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/image/resize.h b/nnvm/src/top/image/resize.h new file mode 100644 index 000000000000..8c894140fabc --- /dev/null +++ b/nnvm/src/top/image/resize.h @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file resize.h + */ +#ifndef NNVM_TOP_IMAGE_RESIZE_H_ +#define NNVM_TOP_IMAGE_RESIZE_H_ + +#include +#include +#include +#include +#include + +namespace nnvm { +namespace top { + +struct ResizeParam : public dmlc::Parameter { + TShape size; + std::string layout; + std::string method; + bool align_corners; + + DMLC_DECLARE_PARAMETER(ResizeParam) { + DMLC_DECLARE_FIELD(size) + .describe("Output size"); + DMLC_DECLARE_FIELD(layout) + .set_default("NCHW") + .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Resize is applied on the 'H' and" + "'W' dimensions."); + DMLC_DECLARE_FIELD(method) + .set_default("BILINEAR") + .describe("Specify the mode to use for scaling." + "NEAREST_NEIGHBOR - Nearest Neighbor" + "BILINEAR - Bilinear Interpolation"); + DMLC_DECLARE_FIELD(align_corners) + .set_default(false) + .describe("Should be true to preserve the values at the corner pixels"); + } +}; + +} // namespace top +} // namespace nnvm +#endif // NNVM_TOP_IMAGE_RESIZE_H_ diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc new file mode 100644 index 000000000000..5c3b2d35991d --- /dev/null +++ b/nnvm/src/top/nn/convolution.cc @@ -0,0 +1,660 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file convolution.cc + * \brief Convolution operators + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nn_common.h" +#include "../op_common.h" +#include "../elemwise_op_common.h" +#include "topi/nn.h" + + +using tvm::Tensor; +using tvm::Array; +using nnvm::compiler::FTVMCompute; + +namespace nnvm { +namespace top { + +// conv2d +DMLC_REGISTER_PARAMETER(Conv2DParam); + +inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + + const Conv2DParam& param = nnvm::get(attrs.parsed); + + const Layout in_layout(param.layout); + const Layout kernel_layout(param.kernel_layout); + CHECK(in_layout.convertible(kNCHW)) + << "Conv only support input layouts that are convertible from NCHW." + << " But got " << in_layout; + CHECK(kernel_layout.convertible(kOIHW)) + << "Conv only support kernel layouts that are convertible from OIHW." + << " But got "<< kernel_layout; + + Layout out_layout(param.out_layout); + if (!out_layout.defined()) out_layout = in_layout; + CHECK(out_layout.convertible(kNCHW)) + << "Conv only support output layouts that are convertible from NCHW." + << " But got " << out_layout; + + if (param.use_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + + TShape dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + dshape = ConvertLayout(dshape, in_layout, kNCHW); + + CHECK_EQ(dshape.ndim(), 4U) << "Input data should be 4D"; + CHECK_EQ(param.kernel_size.ndim(), 2U); + CHECK_EQ(param.strides.ndim(), 2U) + << "incorrect stride size: " << param.strides; + CHECK_EQ(param.dilation.ndim(), 2U) + << "incorrect dilate size: " << param.dilation; + CHECK_EQ(dshape[1] % param.groups, 0U) + << "input channels must divide group size"; + CHECK_EQ(param.channels % param.groups, 0U) + << "output channels must divide group size"; + + TShape wshape({param.channels, + dshape[1] / param.groups, + param.kernel_size[0], + param.kernel_size[1]}); + + wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + + if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) { + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape); + } + if (param.use_bias) { + static const Layout default_bias_layout("C"); + TShape bias_shape({param.channels}); + auto oc_block = out_layout.subsizeof('C'); + if (oc_block > 0) { + size_t split_axis = (out_layout.indexof('C') < out_layout.indexof('c')) ? 1 : 0; + bias_shape = ConvertLayout(bias_shape, default_bias_layout, + default_bias_layout.split('C', split_axis, oc_block)); + } + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kBias, bias_shape); + } + // dilation + dim_t dilated_ksize_y = 1 + (param.kernel_size[0] - 1) * param.dilation[0]; + dim_t dilated_ksize_x = 1 + (param.kernel_size[1] - 1) * param.dilation[1]; + TShape oshape({dshape[0], param.channels, 0, 0}); + if (dshape[2] != 0) { + oshape[2] = (dshape[2] + param.padding[0] * 2 - dilated_ksize_y) / param.strides[0] + 1; + } + if (dshape[3] != 0) { + oshape[3] = (dshape[3] + param.padding[1] * 2 - dilated_ksize_x) / param.strides[1] + 1; + } + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, ConvertLayout(oshape, kNCHW, out_layout)); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. 
+ // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0], out_layout, kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param.strides[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param.padding[0]; + } + if (oshape[3] && param.strides[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param.padding[1]; + } + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kData, + ConvertLayout(dshape, kNCHW, in_layout)); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, dshape[2] + 2 * param.padding[0]) + << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, dshape[3] + 2 * param.padding[1]) + << "kernel size exceed input"; + } + return true; +} + +template +inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + + const Param& param = nnvm::get(attrs.parsed); + + const Layout in_layout(param.layout); + const Layout kernel_layout(param.kernel_layout); + CHECK(in_layout.convertible(kNCHW)) + << "Conv only support input layouts that are convertible from NCHW." + << " But got " << in_layout; + CHECK(kernel_layout.convertible(kOIHW)) + << "Conv only support kernel layouts that are convertible from OIHW." + << " But got "<< kernel_layout; + + Layout out_layout(param.out_layout); + if (!out_layout.defined()) out_layout = in_layout; + CHECK(out_layout.convertible(kNCHW)) + << "Conv only support output layouts that are convertible from NCHW." + << " But got " << out_layout; + + if (param.use_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + + TShape dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + dshape = ConvertLayout(dshape, in_layout, kNCHW); + + CHECK_EQ(dshape.ndim(), 4U) << "Input data should be 4D"; + CHECK_EQ(param.kernel_size.ndim(), 2U); + CHECK_EQ(param.strides.ndim(), 2U) + << "incorrect stride size: " << param.strides; + CHECK_EQ(param.dilation.ndim(), 2U) + << "incorrect dilate size: " << param.dilation; + CHECK_EQ(dshape[1] % param.groups, 0U) + << "input channels must divide group size"; + CHECK_EQ(param.channels % param.groups, 0U) + << "output channels must divide group size"; + + // NOTE: Do not check weight shape here! + // Different backend requires different layout to compute + // the batch gemm stage in winograd efficiently, but we want to + // make this NNVM symbol work for all backends. + // So we accept all weight shapes, and assume the TOPI developers + // can handle this correctly in alter_op_layout. + + if (param.use_bias) { + static const Layout default_bias_layout("C"); + TShape bias_shape({param.channels}); + auto oc_block = out_layout.subsizeof('C'); + if (oc_block > 0) { + size_t split_axis = (out_layout.indexof('C') < out_layout.indexof('c')) ? 
1 : 0; + bias_shape = ConvertLayout(bias_shape, default_bias_layout, + default_bias_layout.split('C', split_axis, oc_block)); + } + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kBias, bias_shape); + } + // dilation + dim_t dilated_ksize_y = 1 + (param.kernel_size[0] - 1) * param.dilation[0]; + dim_t dilated_ksize_x = 1 + (param.kernel_size[1] - 1) * param.dilation[1]; + TShape oshape({dshape[0], param.channels, 0, 0}); + if (dshape[2] != 0) { + oshape[2] = (dshape[2] + param.padding[0] * 2 - dilated_ksize_y) / param.strides[0] + 1; + } + if (dshape[3] != 0) { + oshape[3] = (dshape[3] + param.padding[1] * 2 - dilated_ksize_x) / param.strides[1] + 1; + } + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, ConvertLayout(oshape, kNCHW, out_layout)); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0], out_layout, kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param.strides[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param.padding[0]; + } + if (oshape[3] && param.strides[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param.padding[1]; + } + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, WinogradConv2DParam::kData, + ConvertLayout(dshape, kNCHW, in_layout)); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, dshape[2] + 2 * param.padding[0]) + << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, dshape[3] + 2 * param.padding[1]) + << "kernel size exceed input"; + } + return true; +} + +template +inline bool Conv2DInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_type, + std::vector* out_type) { + const PARAM& param = nnvm::get(attrs.parsed); + if (param.use_bias) { + CHECK_EQ(in_type->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_type->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_type->size(), 1U); + if (param.out_dtype != -1) { + CHECK(!type_is_none((*in_type)[0])); + for (size_t i = 1; i < in_type->size(); ++i) { + NNVM_ASSIGN_INPUT_TYPE(attrs, *in_type, i, (*in_type)[0]); + } + NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_type, 0, param.out_dtype); + } else { + ElemwiseType<-1, 1>(attrs, in_type, out_type); + } + return true; +} + + +template +inline bool Conv2DCorrectLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + const PARAM& param = nnvm::get(attrs.parsed); + + const Layout in_layout(param.layout); + Layout out_layout(param.out_layout); + if (!out_layout.defined()) out_layout = in_layout; + + const Layout kernel_layout(param.kernel_layout); + if (param.use_bias) { + CHECK_EQ(ilayouts->size(), 3U) << "Input:[data, weight, bias]"; + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, in_layout); + NNVM_ASSIGN_LAYOUT(*ilayouts, 1, kernel_layout); + // automatically decide bias layout + Layout bias_layout("C"); + auto oc_block = out_layout.subsizeof('C'); + if (oc_block > 0) { + size_t split_axis = (out_layout.indexof('C') < out_layout.indexof('c')) ? 
1 : 0; + bias_layout = bias_layout.split('C', split_axis, oc_block); + } + NNVM_ASSIGN_LAYOUT(*ilayouts, 2, bias_layout); + } else { + CHECK_EQ(ilayouts->size(), 2U) << "Input:[data, weight]"; + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, in_layout); + NNVM_ASSIGN_LAYOUT(*ilayouts, 1, kernel_layout); + } + + CHECK_EQ(olayouts->size(), 1U); + NNVM_ASSIGN_LAYOUT(*olayouts, 0, out_layout); + + return true; +} + +NNVM_REGISTER_OP(conv2d) +.describe(R"code(2D convolution layer (e.g. spatial convolution over images). + +This layer creates a convolution kernel that is convolved +with the layer input to produce a tensor of +outputs. If `use_bias` is True, +a bias vector is created and added to the outputs. + +- **data**: This depends on the `layout` parameter. Input is 4D array of shape + (batch_size, in_channels, height, width) if `layout` is `NCHW`. +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +- **bias**: (channels,) +- **out**: This depends on the `layout` parameter. Output is 4D array of shape + (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. + +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_argument("weight", "4D Tensor", "Weight matrix.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", Conv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(2) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + return MakeGradNode("_conv2d_grad", n, + {ograds[0], n->inputs[Conv2DParam::kData], + n->inputs[Conv2DParam::kWeight]}, + n->attrs.dict); +}); + +NNVM_REGISTER_OP(_contrib_conv2d_NCHWc) +.describe(R"code(2D convolution layer (e.g. spatial convolution over images). +)code" NNVM_ADD_FILELINE) +.add_argument("data", "5D Tensor", "Packed input data.") +.add_argument("weight", "6D Tensor", "Packed weight matrix.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", Conv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(2); + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform) +.describe(R"code(Weight transformation of winograd fast convolution algorithm. +Separate this into another nnvm symbol in order to enable Precompute Pass to compute the +weight transformation in advance. 
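+
+The output is a 4D tensor of shape
+(tile_size + kernel_size[0] - 1, tile_size + kernel_size[1] - 1, channels, in_channels).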
+
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
+)code" NNVM_ADD_FILELINE)
+.add_argument("weight", "4D Tensor", "Weight tensor.")
+.add_arguments(WinogradWeightTransformParam::__FIELDS__())
+.set_attr_parser(ParamParser<WinogradWeightTransformParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradWeightTransformParam>)
+.set_attr<FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
+                                         std::vector<TShape> *in_shape,
+                                         std::vector<TShape> *out_shape) {
+      const auto& param = nnvm::get<WinogradWeightTransformParam>(attrs.parsed);
+      const TShape &wshape = (*in_shape)[0];
+
+      CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor";
+
+      TShape oshape({param.tile_size + wshape[2] - 1,
+                     param.tile_size + wshape[3] - 1,
+                     wshape[0],
+                     wshape[1]});
+      NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
+      return true;
+  })
+.set_attr<FCorrectLayout>("FCorrectLayout", [](const NodeAttrs& attrs,
+                                               std::vector<Layout> *ilayouts,
+                                               const std::vector<Layout> *last_ilayouts,
+                                               std::vector<Layout> *olayouts) {
+  Layout layout("OIHW");
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
+  return true;
+})
+.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_num_outputs(1)
+.set_num_inputs(1)
+.set_support_level(5);
+
+DMLC_REGISTER_PARAMETER(WinogradWeightTransformParam);
+
+NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform)
+.describe(R"code(Compute conv2d with winograd algorithm.
+
+- **data**: Input is 4D array of shape (batch_size, in_channels, height, width)
+- **weight**: Any shape
+              We do not check shape for this input tensor.
+
+- **bias**: (channels,)
+- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "4D Tensor", "Input data.")
+.add_argument("weight", "Tensor", "Transformed weight tensor.")
+.add_argument("bias", "1D Tensor", "Bias parameter.")
+.add_arguments(WinogradConv2DParam::__FIELDS__())
+.set_attr_parser(ParamParser<WinogradConv2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<WinogradConv2DParam>)
+.set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<WinogradConv2DParam>)
+.set_attr<FInferShape>("FInferShape", WinogradConv2DInferShape<WinogradConv2DParam>)
+.set_attr<FInferType>("FInferType", Conv2DInferType<WinogradConv2DParam>)
+.set_attr<FCorrectLayout>("FCorrectLayout", Conv2DCorrectLayout<WinogradConv2DParam>)
+.set_num_outputs(1)
+.set_num_inputs(UseBiasNumInputs<WinogradConv2DParam>)
+.set_support_level(5);
+
+DMLC_REGISTER_PARAMETER(WinogradConv2DParam);
+
+
+inline bool Conv2DWinogradNNPACKWTInferType(const nnvm::NodeAttrs& attrs,
+                                            std::vector<int>* in_type,
+                                            std::vector<int>* out_type) {
+  const WinogradNNPACKWeightTransformParam& param =
+      nnvm::get<WinogradNNPACKWeightTransformParam>(attrs.parsed);
+
+  CHECK_EQ(in_type->size(), 1U) << "Input:[weight]";
+  CHECK_EQ(out_type->size(), 1U);
+
+  if (param.out_dtype != -1) {
+    NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_type, 0, param.out_dtype);
+  } else {
+    ElemwiseType<1, 1>(attrs, in_type, out_type);
+  }
+  return true;
+}
+
+NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_weight_transform)
+.describe(R"code(Weight transformation of winograd fast convolution algorithm.
+Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
+weight transformation in advance.
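+
+The output is a 4D tensor of shape (channels, in_channels, 8, 8),
+the fixed 8x8 tile size used by NNPACK.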
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +)code" NNVM_ADD_FILELINE) +.add_argument("weight", "4D Tensor", "Weight tensor.") +.add_arguments(WinogradNNPACKWeightTransformParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const TShape &wshape = (*in_shape)[0]; + CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor"; + TShape oshape({wshape[0], wshape[1], 8, 8}); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + return true; +}) +.set_attr("FCorrectLayout", [](const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + Layout layout("OIHW"); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout); + NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout); + return true; +}) +.set_attr("FInferType", Conv2DWinogradNNPACKWTInferType) +.set_num_outputs(1) +.set_num_inputs(1) +.set_support_level(5); + +DMLC_REGISTER_PARAMETER(WinogradNNPACKWeightTransformParam); + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_without_weight_transform) +.describe(R"code(Compute conv2d with winograd nnpack. +- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) +- **weight**: Any shape + We do not check shape for this input tensor. +- **bias**: (channels,) +- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_argument("weight", "4D Tensor", "Transformed weight tensor.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", WinogradConv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(5); + + +NNVM_REGISTER_OP(_conv2d_grad) + .describe(R"code(2D convolution grad. 
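+
+Takes the output gradient, the input data and the weight of the forward
+conv2d and produces the gradients w.r.t. data, weight and, when `use_bias`
+is set, bias.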
+ +)code" NNVM_ADD_FILELINE) +.add_argument("ograd", "4D Tensor", "Output grad.") +.add_argument("data", "4D Tensor", "Input data of conv2d.") +.add_argument("weight", "4D Tensor", "Input weight.") +.set_num_inputs(3) +.set_num_outputs(UseBiasNumInputs) +.set_attr("FListOutputNames", UseBiasListInputNames) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr( + "FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const Conv2DParam& param = nnvm::get(attrs.parsed); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kData, in_attrs->at(1)); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kWeight, in_attrs->at(2)); + if (param.use_bias) { + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, Conv2DParam::kBias, TShape({param.channels})); + } + return true; +}) +.set_attr("FInferType", ElemwiseType<3, -1>) +.set_attr("TIsBackward", true); + + +DMLC_REGISTER_PARAMETER(Conv2DTransposeParam); + +inline bool Conv2DTransposeInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + static const Layout kOIHW("OIHW"); + const Conv2DTransposeParam& param = nnvm::get(attrs.parsed); + const Layout layout(param.layout); + const Layout kernel_layout(param.kernel_layout); + if (param.use_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + + const TShape& dshape = (*in_shape)[Conv2DTransposeParam::kData]; + if (dshape.ndim() == 0) return false; + TShape dshape_nchw = ConvertLayout(dshape, layout, kNCHW); + + CHECK_EQ(dshape_nchw[1] % param.groups, 0U) + << "input num_filter must divide group size"; + CHECK_EQ(param.channels % param.groups, 0U) + << "output num_filter must divide group size"; + CHECK_EQ(param.kernel_size.ndim(), 2U) + << "incorrect kernel size: " << param.kernel_size; + CHECK_EQ(param.strides.ndim(), 2U) + << "incorrect stride size: " << param.strides; + CHECK_EQ(param.dilation.ndim(), 2U) + << "incorrect dilate size: " << param.dilation; + + TShape wshape({dshape_nchw[1], + param.channels / param.groups, + param.kernel_size[0], + param.kernel_size[1]}); + wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DTransposeParam::kWeight, wshape); + + if (param.use_bias) { + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, + Conv2DTransposeParam::kBias, + TShape({param.channels})); + } + // dilation + dim_t dilated_ksize_y = 1 + (param.kernel_size[0] - 1) * param.dilation[0]; + dim_t dilated_ksize_x = 1 + (param.kernel_size[1] - 1) * param.dilation[1]; + // output shape. 
+  TShape oshape({dshape_nchw[0], param.channels, 0, 0});
+  oshape[2] = (param.strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y -
+               2 * param.padding[0] + param.output_padding[0]);
+
+  oshape[3] = (param.strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x -
+               2 * param.padding[1] + param.output_padding[1]);
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0,
+                           ConvertLayout(oshape, kNCHW, layout));
+  return true;
+}
+
+inline bool Conv2DTransposeCorrectLayout(const NodeAttrs& attrs,
+                                         std::vector<Layout> *ilayouts,
+                                         const std::vector<Layout> *last_ilayouts,
+                                         std::vector<Layout> *olayouts) {
+  const Conv2DTransposeParam& param = nnvm::get<Conv2DTransposeParam>(attrs.parsed);
+
+  const Layout in_layout(param.layout);
+
+  const Layout kernel_layout(param.kernel_layout);
+  if (param.use_bias) {
+    CHECK_EQ(ilayouts->size(), 3U) << "Input:[data, weight, bias]";
+    NNVM_ASSIGN_LAYOUT(*ilayouts, 0, in_layout);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, 1, kernel_layout);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, 2, Layout("C"));
+  } else {
+    CHECK_EQ(ilayouts->size(), 2U) << "Input:[data, weight]";
+    NNVM_ASSIGN_LAYOUT(*ilayouts, 0, in_layout);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, 1, kernel_layout);
+  }
+
+  CHECK_EQ(olayouts->size(), 1U);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, in_layout);
+
+  return true;
+}
+
+NNVM_REGISTER_OP(conv2d_transpose)
+.describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution).
+
+The need for transposed convolutions generally arises
+from the desire to use a transformation going in the opposite direction
+of a normal convolution, i.e., from something that has the shape of the
+output of some convolution to something that has the shape of its input
+while maintaining a connectivity pattern that is compatible with
+said convolution.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, in_channels, height, width) if `layout` is `NCHW`.
+- **weight**: (in_channels, channels, kernel_size[0], kernel_size[1])
+- **bias**: (channels,)
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+ + out_height and out_width are calculated as:: + out_height = (height-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0] + out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1] + +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_argument("weight", "4D Tensor", "Weight matrix.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DTransposeParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", Conv2DTransposeInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DTransposeCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(2); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc index 91b568187026..ec3643bef306 100644 --- a/nnvm/src/top/nn/nn.cc +++ b/nnvm/src/top/nn/nn.cc @@ -683,11 +683,11 @@ NNVM_REGISTER_OP(pad) << "Illegal pad_width"; Array pad_before; for (size_t i = 0; i < pad_width.ndim(); ++i) { - pad_before.push_back(tvm::make_const(tvm::DataType::Int(32), pad_width[i][0])); + pad_before.push_back(tvm::make_const(tvm::Int(32), pad_width[i][0])); } Array pad_after; for (size_t i = 0; i < pad_width.ndim(); ++i) { - pad_after.push_back(tvm::make_const(tvm::DataType::Int(32), pad_width[i][1])); + pad_after.push_back(tvm::make_const(tvm::Int(32), pad_width[i][1])); } return Array{ topi::pad(inputs[0], pad_before, pad_after, tvm::make_const(inputs[0]->dtype, param.pad_value)) }; diff --git a/nnvm/src/top/nn/nn_common.h b/nnvm/src/top/nn/nn_common.h new file mode 100644 index 000000000000..d7ce420b6d94 --- /dev/null +++ b/nnvm/src/top/nn/nn_common.h @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file nn_common.h + * \brief Common utilities for nn ops. + */ +#ifndef NNVM_TOP_NN_NN_COMMON_H_ +#define NNVM_TOP_NN_NN_COMMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnvm { +namespace top { + +template +inline uint32_t UseBiasNumInputs(const NodeAttrs& attrs) { + const ParamType& param = get(attrs.parsed); + return param.use_bias ? 3 : 2; +} + +template +inline std::vector UseBiasListInputNames(const NodeAttrs& attrs) { + const ParamType& param = nnvm::get(attrs.parsed); + if (param.use_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } +} + +/*! 
+ * \brief Convert shape in src_layout to shape in dst_layout + * \param src original shape + * \param src_layout layout of original shape + * \param dst_layout target layout + * \return shape in target layout + */ +inline TShape ConvertLayout(TShape src, const Layout& src_layout, const Layout& dst_layout) { + if (src_layout == dst_layout) { + return src; + } else if (!src_layout.defined()) { + LOG(FATAL) << "cannot convert undefined layout to " << dst_layout; + } else if (!dst_layout.defined()) { + LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout"; + } + + CHECK(src_layout.convertible(dst_layout)) << "cannot convert from " + << src_layout << " to " << dst_layout; + + TShape dst(dst_layout.ndim()); + for (size_t i = 0; i < src_layout.ndim(); ++i) { + Layout::LayoutDim src_dim = src_layout[i]; + if (Layout::is_superdim(src_dim)) { + int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_dim)); + int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_dim)); + int src_minor_pos = src_layout.indexof(Layout::to_subdim(src_dim)); + int src_factor = src_layout.subsizeof(src_dim); + int dst_factor = dst_layout.subsizeof(src_dim); + + uint32_t src_dim_size = src[i]; + if (src_minor_pos >= 0) { + CHECK_EQ(src_factor, src[src_minor_pos]) << "src shape " << src + << " does not agree with layout " << src_layout; + src_dim_size *= src_factor; + } + + dst[dst_major_pos] = src_dim_size; + if (dst_minor_pos >= 0) { + CHECK_GT(dst_factor, 0); + CHECK_LE(dst_factor, src_dim_size) << "Converting " << src + << " from " << src_layout + << " to " << dst_layout + << ": cannot split dimension size of " + << src_dim_size << " by " << dst_factor; + dst[dst_major_pos] /= dst_factor; + dst[dst_minor_pos] = dst_factor; + } + } + } + return dst; +} + +} // namespace top +} // namespace nnvm + +#endif // NNVM_TOP_NN_NN_COMMON_H_ diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc new file mode 100644 index 000000000000..11ca637d3b06 --- /dev/null +++ b/nnvm/src/top/nn/pooling.cc @@ -0,0 +1,435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +/*! + * \file pooling.cc + * \brief Property def of pooling operators. 
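+ *
+ * Defines max_pool2d, avg_pool2d, global_max_pool2d and global_avg_pool2d.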
+ */
+#include <nnvm/op.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <nnvm/compiler/util.h>
+#include <nnvm/top/nn.h>
+#include "nn_common.h"
+#include "../op_common.h"
+#include "../elemwise_op_common.h"
+#include "topi/nn/pooling.h"
+
+namespace nnvm {
+namespace top {
+using namespace tvm;
+using namespace nnvm::compiler;
+
+DMLC_REGISTER_PARAMETER(MaxPool2DParam);
+
+template <typename T>
+inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs,
+                             std::vector<TShape>* in_shape,
+                             std::vector<TShape>* out_shape) {
+  const T& param = nnvm::get<T>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), 1U);
+  CHECK_EQ(out_shape->size(), 1U);
+
+  TShape dshape = (*in_shape)[0];
+  if (dshape.ndim() == 0) return false;
+
+  CHECK_GE(dshape.ndim(), 2U)
+    << "Pool2D only support input >= 2-D: input must have height and width";
+
+  Layout layout(param.layout);
+  CHECK(layout.contains('H') && layout.contains('W') &&
+        !layout.contains('h') && !layout.contains('w'))
+    << "Invalid layout " << layout
+    << ". Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.indexof('H');
+  const auto widx = layout.indexof('W');
+
+  dim_t pad_h, pad_w;
+  if (param.padding.ndim() == 1) {
+    pad_h = param.padding[0] * 2;
+    pad_w = param.padding[0] * 2;
+  } else if (param.padding.ndim() == 2) {
+    // (top, left)
+    pad_h = param.padding[0] * 2;
+    pad_w = param.padding[1] * 2;
+  } else if (param.padding.ndim() == 4) {
+    // (top, left, bottom, right)
+    pad_h = param.padding[0] + param.padding[2];
+    pad_w = param.padding[1] + param.padding[3];
+  } else {
+    return false;
+  }
+
+  TShape oshape = dshape;
+  CHECK(param.pool_size[0] <= dshape[hidx] + pad_h)
+    << "pool size (" << param.pool_size[0] << ") exceeds input (" << dshape[hidx]
+    << " padded to " << (dshape[hidx] + pad_h) << ")";
+  CHECK(param.pool_size[1] <= dshape[widx] + pad_w)
+    << "pool size (" << param.pool_size[1] << ") exceeds input (" << dshape[widx]
+    << " padded to " << (dshape[widx] + pad_w) << ")";
+
+  if (!param.ceil_mode) {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0]) /
+                    param.strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1]) /
+                    param.strides[1]) + 1;
+  } else {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] +
+                     param.strides[0] - 1) / param.strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] +
+                     param.strides[1] - 1) / param.strides[1]) + 1;
+  }
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
+  return true;
+}
+
+template <typename T>
+inline bool Pool2DCorrectLayout(const NodeAttrs& attrs,
+                                std::vector<Layout> *ilayouts,
+                                const std::vector<Layout> *last_ilayouts,
+                                std::vector<Layout> *olayouts) {
+  const T &param = nnvm::get<T>(attrs.parsed);
+  CHECK_EQ(ilayouts->size(), 1);
+  CHECK_EQ(last_ilayouts->size(), 1);
+  CHECK_EQ(olayouts->size(), 1);
+
+  Layout input = (*ilayouts)[0];
+  const Layout layout(param.layout);
+
+  if (input.defined()) {
+    CHECK(input.convertible(layout)) << "Invalid input layout " << input;
+    if (input.indexof('W') != layout.indexof('W') ||
+        input.indexof('H') != layout.indexof('H') ||
+        input.contains('w') || input.contains('h')) {
+      // as long as the index doesn't change for width and height
+      // pool2d can keep the input layout.
+      input = layout;
+    }
+  } else {
+    input = layout;
+  }
+
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, input);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, input);
+
+  return true;
+}
+
+NNVM_REGISTER_OP(max_pool2d)
+.describe(R"code(Max pooling operation for two dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+           out_height and out_width are calculated as::
+
+               out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1
+               out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1
+
+           where padding will be an expanded array based on number of values passed as::
+               one int : all sides same padding used.
+               two int : bottom, right use same as top and left.
+               four int: padding width in the order of (top, left, bottom, right).
+
+           When `ceil_mode` is `True`, ceil will be used instead of floor in this
+           equation.
+
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "4D Tensor", "Input data.")
+.add_arguments(MaxPool2DParam::__FIELDS__())
+.set_attr_parser(ParamParser<MaxPool2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<MaxPool2DParam>)
+.set_num_outputs(1)
+.set_num_inputs(1)
+.set_attr<FInferShape>("FInferShape", Pool2DInferShape<MaxPool2DParam>)
+.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FCorrectLayout>("FCorrectLayout", Pool2DCorrectLayout<MaxPool2DParam>)
+.set_attr<FTVMCompute>("FTVMCompute", [](const NodeAttrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Array<Tensor>& out_info) {
+  const MaxPool2DParam& param = nnvm::get<MaxPool2DParam>(attrs.parsed);
+  auto pool_size = ShapeToArray(param.pool_size);
+  auto strides = ShapeToArray(param.strides);
+  auto padding = ShapeToArray(param.padding);
+  auto ceil_mode = param.ceil_mode;
+
+  Layout layout(param.layout);
+  CHECK(layout.convertible(Layout("NCHW")))
+    << "max_pool2d currently only supports layouts that are convertible from NCHW";
+  CHECK_EQ(layout.indexof('h'), -1) << "max_pool2d does not support input split on height";
+  CHECK_EQ(layout.indexof('w'), -1) << "max_pool2d does not support input split on width";
+
+  CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+    << "Pool2D only support 4-D input (e.g., NCHW)"
+    << " or 5-D input (last dimension is a split of channel)";
+
+  if (param.padding.ndim() == 1) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+  } else if (param.padding.ndim() == 2) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[1]);
+  }
+
+  return Array<Tensor>{
+    topi::nn::pool(inputs[0], pool_size, strides, padding,
+                   topi::nn::kMaxPool, ceil_mode, layout.name())};
+})
+.set_attr<FGradient>(
+  "FGradient", [](const NodePtr& n,
+                  const std::vector<NodeEntry>& ograds) {
+    return MakeGradNode("_max_pool2d_grad", n,
+                        {ograds[0], n->inputs[0], NodeEntry{n, 0, 0}},
+                        n->attrs.dict);
+})
+.set_support_level(2);
+
+NNVM_REGISTER_OP(_max_pool2d_grad)
+  .describe(R"code(Max pooling 2D grad.
+
+)code" NNVM_ADD_FILELINE)
+.add_argument("ograd", "4D Tensor", "Output grad.")
+.add_argument("input", "4D Tensor", "Input data of max_pool2d grad.")
+.add_argument("output", "4D Tensor", "Output data of max_pool2d grad.")
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<MaxPool2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<MaxPool2DParam>)
+.set_attr<FInferShape>("FInferShape", AssignOutputAttr<TShape, 1, 0>)
+.set_attr<FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<TIsBackward>("TIsBackward", true);
+
+DMLC_REGISTER_PARAMETER(AvgPool2DParam);
+
+NNVM_REGISTER_OP(avg_pool2d)
+.describe(R"code(Average pooling operation for two dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+ out_height and out_width are calculated as:: + + out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1 + out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1 + + where padding will be an expanded array based on number of values passed as:: + one int : all sides same padding used. + two int : bottom, right use same as top and left. + four int: padding width in the order of (top, left, bottom, right). + + When `ceil_mode` is `True`, ceil will be used instead of floor in this + equation. + +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_arguments(AvgPool2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", Pool2DInferShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", Pool2DCorrectLayout) +.set_attr("FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const AvgPool2DParam& param = nnvm::get(attrs.parsed); + auto pool_size = ShapeToArray(param.pool_size); + auto strides = ShapeToArray(param.strides); + auto padding = ShapeToArray(param.padding); + auto ceil_mode = param.ceil_mode; + auto count_include_pad = param.count_include_pad; + + Layout layout(param.layout); + CHECK(layout.convertible(Layout("NCHW"))) + << "avg_pool2d currently only supports layouts that are convertible from NCHW"; + CHECK_EQ(layout.indexof('h'), -1) << "avg_pool2d does not support input split on height"; + CHECK_EQ(layout.indexof('w'), -1) << "avg_pool2d does not support input split on width"; + + CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + << "Pool2D only support 4-D input (e.g., NCHW)" + << " or 5-D input (last dimension is a split of channel)"; + + if (param.padding.ndim() == 1) { + padding.push_back(padding[0]); + padding.push_back(padding[0]); + padding.push_back(padding[0]); + } else if (param.padding.ndim() == 2) { + padding.push_back(padding[0]); + padding.push_back(padding[1]); + } + + return Array{ + topi::nn::pool(inputs[0], pool_size, strides, padding, + topi::nn::kAvgPool, ceil_mode, layout.name(), count_include_pad)}; +}) +.set_num_outputs(1) +.set_num_inputs(1) +.set_support_level(2); + + +DMLC_REGISTER_PARAMETER(GlobalPool2DParam); + +inline bool GlobalPool2DInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + const GlobalPool2DParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + CHECK_EQ(out_shape->size(), 1U); + + TShape dshape = (*in_shape)[0]; + if (dshape.ndim() == 0) return false; + + CHECK_GE(dshape.ndim(), 2U) + << "Pool2D only support input >= 2-D: input must have height and width"; + + Layout layout(param.layout); + CHECK(layout.contains('H') && layout.contains('W') && + !layout.contains('h') && !layout.contains('w')) + << "Invalid layout " << layout + << ". 
Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.indexof('H');
+  const auto widx = layout.indexof('W');
+
+  TShape oshape = dshape;
+  oshape[hidx] = oshape[widx] = 1;
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
+  return true;
+}
+
+inline bool GlobalPool2DCorrectLayout(const NodeAttrs& attrs,
+                                      std::vector<Layout> *ilayouts,
+                                      const std::vector<Layout> *last_ilayouts,
+                                      std::vector<Layout> *olayouts) {
+  const GlobalPool2DParam &param = nnvm::get<GlobalPool2DParam>(attrs.parsed);
+  CHECK_EQ(ilayouts->size(), 1);
+  CHECK_EQ(last_ilayouts->size(), 1);
+  CHECK_EQ(olayouts->size(), 1);
+
+  Layout input = (*ilayouts)[0];
+  const Layout layout(param.layout);
+
+  if (input.defined()) {
+    CHECK(input.convertible(layout)) << "Invalid input layout " << input;
+    if (input.indexof('W') != layout.indexof('W') ||
+        input.indexof('H') != layout.indexof('H') ||
+        input.contains('w') || input.contains('h')) {
+      // as long as the index doesn't change for width and height
+      // pool2d can keep the input layout.
+      input = layout;
+    }
+  } else {
+    input = layout;
+  }
+
+  NNVM_ASSIGN_LAYOUT(*ilayouts, 0, input);
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, input);
+
+  return true;
+}
+
+NNVM_REGISTER_OP(global_max_pool2d)
+.describe(R"code(Global max pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1) if `layout` is `NCHW`.
+
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "4D Tensor", "Input data.")
+.add_arguments(GlobalPool2DParam::__FIELDS__())
+.set_attr_parser(ParamParser<GlobalPool2DParam>)
+.set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<GlobalPool2DParam>)
+.set_attr<FInferShape>("FInferShape", GlobalPool2DInferShape)
+.set_attr<FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FCorrectLayout>("FCorrectLayout", GlobalPool2DCorrectLayout)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    const GlobalPool2DParam& param = nnvm::get<GlobalPool2DParam>(attrs.parsed);
+    Layout layout(param.layout);
+    CHECK(layout.convertible(Layout("NCHW")))
+      << "global_max_pool2d currently only supports layouts that are convertible from NCHW";
+    CHECK_EQ(layout.indexof('h'), -1)
+      << "global_max_pool2d does not support input split on height";
+    CHECK_EQ(layout.indexof('w'), -1)
+      << "global_max_pool2d does not support input split on width";
+
+    CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+      << "Pool2D only support 4-D input (e.g., NCHW)"
+      << " or 5-D input (last dimension is a split of channel)";
+
+    return Array<Tensor>{
+      topi::nn::global_pool(inputs[0], topi::nn::kMaxPool, layout.name()) };
+})
+.set_num_outputs(1)
+.set_num_inputs(1)
+.set_support_level(2);
+
+
+NNVM_REGISTER_OP(global_avg_pool2d)
+.describe(R"code(Global average pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1) if `layout` is `NCHW`.
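+
+           For example, a (1, 16, 7, 7) NCHW input produces a (1, 16, 1, 1) output.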
+ +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_arguments(GlobalPool2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", GlobalPool2DInferShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", GlobalPool2DCorrectLayout) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const GlobalPool2DParam& param = nnvm::get(attrs.parsed); + Layout layout(param.layout); + CHECK(layout.convertible(Layout("NCHW"))) + << "global_avg_pool2d currently only supports layouts that are convertible from NCHW"; + CHECK_EQ(layout.indexof('h'), -1) + << "global_avg_pool2d does not support input split on height"; + CHECK_EQ(layout.indexof('w'), -1) + << "global_avg_pool2d does not support input split on width"; + + CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) + << "Pool2D only support 4-D input (e.g., NCHW)" + << " or 5-D input (last dimension is a split of channel)"; + + return Array{ + topi::nn::global_pool(inputs[0], topi::nn::kAvgPool, layout.name()) }; +}) +.set_num_outputs(1) +.set_num_inputs(1) +.set_support_level(2); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/nn/upsampling.cc b/nnvm/src/top/nn/upsampling.cc new file mode 100644 index 000000000000..68583ae616f2 --- /dev/null +++ b/nnvm/src/top/nn/upsampling.cc @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file upsampling.cc + * \brief Property def of upsampling operators. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "nn_common.h" +#include "../op_common.h" +#include "../elemwise_op_common.h" +#include "topi/elemwise.h" +#include "topi/transform.h" +#include "topi/nn/upsampling.h" + +namespace nnvm { +namespace top { +using tvm::Expr; +using tvm::Array; +using tvm::Tensor; +using nnvm::compiler::FTVMCompute; + +DMLC_REGISTER_PARAMETER(UpSamplingParam); + +inline bool UpSamplingInferShape(const nnvm::NodeAttrs& attrs, + std::vector* in_shape, + std::vector* out_shape) { + static const Layout kNCHW("NCHW"); + const UpSamplingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + CHECK_EQ(out_shape->size(), 1U); + TShape dshape = (*in_shape)[0]; + if (dshape.ndim() == 0) return false; + + dshape = ConvertLayout(dshape, param.layout, kNCHW); + TShape oshape = dshape; + oshape[2] = oshape[2] * param.scale; + oshape[3] = oshape[3] * param.scale; + oshape = ConvertLayout(oshape, kNCHW, param.layout); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + + return true; +} + +inline bool UpsamplingLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_layouts->size(), 1U); + CHECK_EQ(out_layouts->size(), 1U); + const Layout layout(param.layout); + NNVM_ASSIGN_LAYOUT(*in_layouts, 0, layout); + NNVM_ASSIGN_LAYOUT(*out_layouts, 0, layout); + return true; +} + +NNVM_REGISTER_OP(upsampling) +.describe(R"(Perform upsampling to input array with nearest neighbour or bilinear interpolation. + +- **data**: data is 4D array of shape + (batch_size, channels, in_height, in_width) for NCHW + (batch_size, in_height, in_width, channels) for NHWC + +- **out**: Output is 4D array of shape + for layout NCHW + (batch_size, channels, in_height*scale, in_width*scale) + + for layout NHWC + (batch_size, in_height*scale, in_width*scale, channels) + +)" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_arguments(UpSamplingParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", UpSamplingInferShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", UpsamplingLayout) +.set_num_outputs(1) +.set_num_inputs(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + Array oshape; + if (param.layout == "NCHW") { + oshape.push_back(out_info[0]->shape[2]); + oshape.push_back(out_info[0]->shape[3]); + } else { + oshape.push_back(out_info[0]->shape[1]); + oshape.push_back(out_info[0]->shape[2]); + } + + return Array{ topi::nn::upsampling(inputs[0], oshape, param.layout, param.method)}; +}) +.set_support_level(2); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/op_common.h b/nnvm/src/top/op_common.h new file mode 100644 index 000000000000..7213e1c9c116 --- /dev/null +++ b/nnvm/src/top/op_common.h @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file op_common.h
+ * \brief Common operator utilities
+ */
+#ifndef NNVM_TOP_OP_COMMON_H_
+#define NNVM_TOP_OP_COMMON_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <nnvm/op.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+namespace nnvm {
+namespace top {
+/*!
+ * \brief Parse keyword arguments as PType arguments and save to parsed
+ * \tparam PType the parameter type.
+ * \param attrs The attributes.
+ */
+template<typename PType>
+inline void ParamParser(nnvm::NodeAttrs* attrs) {
+  PType param;
+  try {
+    param.Init(attrs->dict);
+  } catch (const dmlc::ParamError& e) {
+    std::ostringstream os;
+    os << e.what();
+    os << ", in operator " << attrs->op->name << "("
+       << "name=\"" << attrs->name << "\"";
+    for (const auto& k : attrs->dict) {
+      os << ", " << k.first << "=\"" << k.second << "\"";
+    }
+    os << ")";
+    throw dmlc::ParamError(os.str());
+  }
+  attrs->parsed = std::move(param);
+}
+
+/*!
+ * \brief Build the attribute dict from the parsed PType parameters.
+ * \tparam PType the parameter type.
+ * \param attrs The attributes.
+ */
+template<typename PType>
+inline std::unordered_map<std::string, std::string>
+ParamGetAttrDict(const nnvm::NodeAttrs& attrs) {
+  std::unordered_map<std::string, std::string> dict = attrs.dict;
+  nnvm::get<PType>(attrs.parsed).UpdateDict(&dict);
+  return dict;
+}
+
+/*! \brief check if shape is empty or contains unknown (0) dim. */
+inline bool shape_is_none(const TShape& x) {
+  return x.ndim() == 0 || x.Size() == 0;
+}
+
+/*! \brief check if type is none (-1) */
+inline bool type_is_none(const int& x) {
+  return x == -1;
+}
+
+/*! \brief check if shape is scalar({1}). */
+inline bool shape_is_scalar(const TShape& x) {
+  return x.ndim() == 1 && x.Size() == 1;
+}
+
+/*! \brief get string representation of shape */
+inline std::string shape_string(const TShape& x) {
+  std::ostringstream os;
+  os << x;
+  return os.str();
+}
+
+/*! \brief get string representation of type */
+inline std::string type_string(const int& x) {
+  return std::to_string(x);
+}
+
+/*!
+ * \brief Assign x to y. Checks for compatibility when y is not empty.
+ *  Allow missing dim in both x and y (as 0).
+ * \param y target shape.
+ * \param x source shape.
+ * \return whether x and y are compatible.
+ */
+inline bool shape_assign(TShape *y, const TShape& x) {
+  if (y->ndim() == 0) {
+    *y = x;
+    return true;
+  } else if (y->ndim() != x.ndim()) {
+    return x.ndim() == 0;
+  } else {
+    for (size_t i = 0; i < y->ndim(); ++i) {
+      if ((*y)[i] == 0) {
+        (*y)[i] = x[i];
+      } else if ((*y)[i] != x[i] && x[i] != 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
+
+/*!
+ * \brief Assign x to y. Checks for compatibility when y is not -1.
+ * \param y target type.
+ * \param x source type.
+ * \return whether x and y are compatible.
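+ *
+ * For example, if *y is -1 (unknown), *y simply takes the value of x;
+ * if *y is already a known type and x is a different known type, the
+ * call returns false to signal an inference conflict.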
+ */
+inline bool type_assign(int *y, const int& x) {
+  if (*y == -1) {
+    *y = x;
+    return true;
+  } else if (*y != x && x != -1) {
+    return false;
+  }
+  return true;
+}
+
+template<typename AttrType>
+inline std::string attr_assign_error_msg(const NodeAttrs& attrs,
+                                         int index, bool is_input,
+                                         const AttrType& expected,
+                                         const AttrType& actual,
+                                         const char* attr_name) {
+  static const auto& flist_inputs = Op::GetAttr<FListInputNames>("FListInputNames");
+  static const auto& flist_outputs = Op::GetAttr<FListOutputNames>("FListOutputNames");
+  const auto& flist = is_input ? flist_inputs : flist_outputs;
+  std::string name;
+  if (flist.count(attrs.op)) {
+    name = flist[attrs.op](attrs)[index];
+  } else {
+    name = (is_input ? "data" : "output") + std::to_string(index);
+  }
+  std::ostringstream msg;
+  msg << "Operator " << attrs.op->name << "(";
+  for (const auto& kv : attrs.dict) msg << kv.first << "=" << kv.second << ", ";
+  msg << "name=" << attrs.name << ") expects " << name << "\'s " << attr_name
+      << " to be " << expected << ", but got " << actual << ".";
+  return msg.str();
+}
+
+/*!
+ * \brief macro assign shape to input if input is unknown, otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param inputs the shape array to store the result
+ * \param index the index in the array
+ * \param shape the inferred shape
+ */
+#define NNVM_ASSIGN_INPUT_SHAPE(attrs, inputs, index, shape)           \
+  {                                                                    \
+    if (!shape_assign(&(inputs)[index], TShape(shape))) {              \
+      LOG(FATAL) << attr_assign_error_msg(attrs, index, true, shape,   \
+                                          (inputs)[index], "shape");   \
+    }                                                                  \
+  }
+
+/*!
+ * \brief macro assign shape to out if out is unknown, otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param outputs the shape array to store the result
+ * \param index the index in the array
+ * \param shape the inferred shape
+ */
+#define NNVM_ASSIGN_OUTPUT_SHAPE(attrs, outputs, index, shape)         \
+  {                                                                    \
+    if (!shape_assign(&(outputs)[index], TShape(shape))) {             \
+      LOG(FATAL) << attr_assign_error_msg(attrs, index, false, shape,  \
+                                          (outputs)[index], "shape");  \
+    }                                                                  \
+  }
+
+/*!
+ * \brief macro assign type to input if input is unknown (-1), otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param inputs the type array to store the result
+ * \param index the index in the array
+ * \param type the inferred type
+ */
+#define NNVM_ASSIGN_INPUT_TYPE(attrs, inputs, index, type)             \
+  {                                                                    \
+    if (!type_assign(&(inputs)[index], type)) {                        \
+      LOG(FATAL) << attr_assign_error_msg(attrs, index, true, type,    \
+                                          (inputs)[index], "type");    \
+    }                                                                  \
+  }
+
+/*!
+ * \brief macro assign type to out if out is unknown (-1), otherwise check consistency
+ *  Use macro so we can see the error file more clearly
+ * \param outputs the type array to store the result
+ * \param index the index in the array
+ * \param type the inferred type
+ */
+#define NNVM_ASSIGN_OUTPUT_TYPE(attrs, outputs, index, type)           \
+  {                                                                    \
+    if (!type_assign(&(outputs)[index], type)) {                       \
+      LOG(FATAL) << attr_assign_error_msg(attrs, index, false, type,   \
+                                          (outputs)[index], "type");   \
+    }                                                                  \
+  }
+
+#define NNVM_ASSIGN_LAYOUT(outputs, index, layout)                     \
+  {                                                                    \
+    if (layout.defined()) {                                            \
+      (outputs)[index] = layout;                                       \
+    }                                                                  \
+  }
+
+/*!
+ * \brief macro assign rhs shape to lhs
+ *  Use macro so we can see the error file more clearly
+ * \param lhs lhs shape
+ * \param rhs rhs shape
+ */
+#define SHAPE_ASSIGN(lhs, rhs)                                        \
+  if ((lhs).ndim() == 0) (lhs) = (rhs);                               \
+  else                                                                \
+    CHECK_EQ(lhs, rhs) << "shape inference inconsistent";             \
+
+/*!
+ * \brief macro assign rhs type to lhs + * Use macro so we can see the error file more clearly + * \param lhs lhs type + * \param rhs rhs type + */ +#define DTYPE_ASSIGN(lhs, rhs) \ + if ((lhs) == -1) (lhs) = (rhs); \ + else \ + CHECK_EQ(lhs, rhs) << "type inference inconsistent"; \ + +// simply return the shape as same +inline bool SameShape(const NodeAttrs& attrs, + std::vector *ishape, + std::vector *oshape) { + if (ishape->size() == 0 || (*ishape)[0].ndim() == 0) return false; + for (TShape& pshape : *oshape) { + pshape = (*ishape)[0]; + } + for (TShape& pshape : *ishape) { + pshape = (*ishape)[0]; + } + return true; +} + +// return shape from node attrs +template +inline bool ZeroShape(const NodeAttrs& attrs, + std::vector *ishape, + std::vector *oshape) { + const TShape& ts = dmlc::get(attrs.parsed).shape; + if (ts.ndim() != 0) { + SHAPE_ASSIGN(oshape->at(0), ts); + return true; + } else { + return false; + } +} + +// do not infer layout +inline bool ZeroLayout(const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + return true; +} + +// simply assign output shape or type from input +template +inline bool AssignOutputAttr(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_LT(in_index, in_attrs->size()); + CHECK_LT(out_index, out_attrs->size()); + const TShape &dshape = in_attrs->at(in_index); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, out_index, dshape); + return true; +} + +// return type from node attrs +template +inline bool ZeroType(const NodeAttrs& attrs, + std::vector *iattr, + std::vector *oattr) { + int dtype = dmlc::get(attrs.parsed).dtype; + DTYPE_ASSIGN(oattr->at(0), dtype); + return true; +} + +// Make zero grad node +inline std::vector MakeZeroGradNodes( + const NodePtr& n, + const std::vector& ograds) { + std::vector ret; + for (uint32_t i = 0; i < n->num_inputs(); ++i) { + std::ostringstream os; + ret.push_back(MakeNode("zeros_like", n->attrs.name + "_zero_grad", + {n->inputs[i]})); + } + return ret; +} + +// Helper to make gradient node +inline std::vector MakeGradNode( + const char* op_name, + const NodePtr& n, + std::vector inputs, + std::unordered_map attr = {{}}) { + NodePtr p = Node::Create(); + p->attrs.op = nnvm::Op::Get(op_name); + p->attrs.name = n->attrs.name + "_grad"; + p->inputs = std::move(inputs); + p->attrs.dict = std::move(attr); + if (p->attrs.op->attr_parser) { + p->attrs.op->attr_parser(&p->attrs); + } + std::vector ret; + for (uint32_t i = 0; i < p->num_outputs(); ++i) { + ret.emplace_back(NodeEntry{p, i, 0}); + } + return ret; +} + + +} // namespace top +} // namespace nnvm + +#endif // NNVM_TOP_OP_COMMON_H_ diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc new file mode 100644 index 000000000000..7a79db041755 --- /dev/null +++ b/nnvm/src/top/tensor/elemwise.cc @@ -0,0 +1,998 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file elemwise.cc
+ * \brief Elementwise operators
+ */
+#include <nnvm/op.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/compiler/op_attr_types.h>
+#include <nnvm/compiler/util.h>
+#include <nnvm/top/tensor.h>
+#include <cmath>
+#include "../op_common.h"
+#include "../elemwise_op_common.h"
+#include "topi/broadcast.h"
+#include "topi/elemwise.h"
+#include "topi/tags.h"
+#include "../../compiler/compile_engine.h"
+
+namespace nnvm {
+namespace top {
+
+using namespace tvm;
+using namespace nnvm::compiler;
+
+// undefined op
+NNVM_REGISTER_ELEMWISE_UNARY_OP(__undef__)
+.describe(R"code(undefined op.
+
+Used to produce an invalid node during optimization.
+
+)code" NNVM_ADD_FILELINE)
+.set_num_outputs(1)
+.set_num_inputs(0);
+
+// floor
+NNVM_REGISTER_ELEMWISE_UNARY_OP(floor)
+.describe(R"code(Take the floor of the input array, computed element-wise.
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::floor(inputs[0]) };
+});
+
+// ceil
+NNVM_REGISTER_ELEMWISE_UNARY_OP(ceil)
+.describe(R"code(Take the ceiling of the input array, computed element-wise.
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::ceil(inputs[0]) };
+});
+
+// trunc
+NNVM_REGISTER_ELEMWISE_UNARY_OP(trunc)
+.describe(R"code(Take the truncated value of the input, element-wise.
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::trunc(inputs[0]) };
+});
+
+// round
+NNVM_REGISTER_ELEMWISE_UNARY_OP(round)
+.describe(R"code(Round elements of the input to the nearest integer.
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::round(inputs[0]) };
+});
+
+// abs
+NNVM_REGISTER_ELEMWISE_UNARY_OP(abs)
+.describe(R"code(Take the absolute value of elements of the input.
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::abs(inputs[0]) };
+});
+
+// sigmoid
+NNVM_REGISTER_ELEMWISE_UNARY_OP(sigmoid)
+.describe(R"code(Computes sigmoid.
+
+..
math:: + Y = 1 / (1 + exp(-X)) + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::sigmoid(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // y = 1 / (1 + exp(-n0)) + // grad_0 = grad_y * y * (1 - y) + NodeEntry sub0 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_0", + {ograds[0], NodeEntry{n, 0, 0}}); + NodeEntry sub1 = MakeNode("__rsub_scalar__", n->attrs.name + "_grad_sub_1", + {NodeEntry{n, 0, 0}}, {{"scalar", "1"}}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad_0", + {sub0, sub1}) + }; +}); + +// tanh +NNVM_REGISTER_ELEMWISE_UNARY_OP(tanh) +.describe(R"code(Computes hyperbolic tangent. + +.. math:: + Y = sinh(X) / cosh(X) + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::tanh(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // y = sinh(n0) / cosh(n0) + // grad_0 = grad_y * (1 - y^2) + NodeEntry sub0 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_0", + {NodeEntry{n, 0, 0}, NodeEntry{n, 0, 0}}); + NodeEntry sub1 = MakeNode("__rsub_scalar__", n->attrs.name + "_grad_sub_1", + {sub0}, {{"scalar", "1"}}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad_0", + {ograds[0], sub1}) + }; +}); + +// exp +NNVM_REGISTER_ELEMWISE_UNARY_OP(exp) +.describe(R"code(Returns the exp input array, computed element-wise. + +.. math:: + exp(x) + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::exp(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // y = exp(n0) + // grad_0 = grad_y * y + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad_0", + {ograds[0], NodeEntry{n, 0, 0}}) + }; +}); + +// log +NNVM_REGISTER_ELEMWISE_UNARY_OP(log) +.describe(R"code(Returns the log input array, computed element-wise. + +.. math:: + log(x) + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::log(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // y = log(n0) + // grad_0 = grad_y / n0 + return std::vector{ + MakeNode("elemwise_div", n->attrs.name + "_grad_0", + {ograds[0], n->inputs[0]}) + }; +}); + +// sqrt +NNVM_REGISTER_ELEMWISE_UNARY_OP(sqrt) +.describe(R"code(Returns the sqrt input array, computed element-wise. + +.. 
math:: + \sqrt(x) + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::sqrt(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // y = sqrt(n0) + // grad_0 = grad_y / (2 * y) + NodeEntry sub0 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_0", + {NodeEntry{n, 0, 0}}, {{"scalar", "2"}}); + return std::vector{ + MakeNode("elemwise_div", n->attrs.name + "_grad_0", + {ograds[0], sub0}) + }; +}); + +// binary ops + +NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_add) +.describe(R"code(Element-wise add + +)code") +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::add(inputs[0], inputs[1]) }; + }) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 + n1 + // grad_0 = grad_y + // grad_1 = grad_y + return std::vector{ MakeNode("copy", n->attrs.name + "_grad_0", + {ograds[0]}), + MakeNode("copy", n->attrs.name + "_grad_0", + {ograds[0]}) }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_sub) +.describe(R"code(Element-wise substraction + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::subtract(inputs[0], inputs[1]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 - n1 + // grad_0 = grad_y + // grad_1 = - grad_y + return std::vector{ + ograds[0], + MakeNode("negative", n->attrs.name + "_grad_1", {ograds[0]}), + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_mul) +.describe(R"code(Element-wise multiplication + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::multiply(inputs[0], inputs[1]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 * n1 + // grad_0 = grad_y * n1 + // grad_1 = grad_y * n0 + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad_0", + {ograds[0], n->inputs[1]}), + MakeNode("elemwise_mul", n->attrs.name + "_grad_1", + {ograds[0], n->inputs[0]}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_div) +.describe(R"code(Element-wise division + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::divide(inputs[0], inputs[1]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 / n1 + // grad_0 = grad_y / n1 + // grad_1 = - grad_y * n0 / n1^2 + NodeEntry sub0 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_0", + {ograds[0], n->inputs[0]}); + NodeEntry sub1 = MakeNode("negative", n->attrs.name + "_grad_sub_1", + {sub0}); + NodeEntry sub2 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_2", + {n->inputs[1], n->inputs[1]}); + return std::vector{ + MakeNode("elemwise_div", n->attrs.name + "_grad_0", + {ograds[0], n->inputs[1]}), + MakeNode("elemwise_div", n->attrs.name + "_grad_1", + {sub1, sub2}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_mod) + .describe(R"code(Element-wise modulo + +)code" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const 
Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::mod(inputs[0], inputs[1]) };
+});
+
+NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_pow)
+  .describe(R"code(Element-wise power
+
+)code" NNVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::power(inputs[0], inputs[1]) };
+});
+
+// logical
+NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_and)
+.describe(R"code(Element-wise logical AND
+
+)code")
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_and(inputs[0], inputs[1]) };
+});
+
+NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_or)
+.describe(R"code(Element-wise logical OR
+
+)code")
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_or(inputs[0], inputs[1]) };
+});
+
+// negative
+NNVM_REGISTER_ELEMWISE_UNARY_OP(negative)
+.describe(R"code(Element-wise numeric negative
+
+)code" NNVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::negative(inputs[0]) };
+})
+.set_attr<FGradient>(
+  "FGradient", [](const NodePtr& n,
+                  const std::vector<NodeEntry>& ograds){
+    // y = - n0
+    // grad_0 = - grad_y
+    return std::vector<NodeEntry>{
+      MakeNode("negative", n->attrs.name + "_grad_0", {ograds[0]}),
+    };
+});
+
+// logical NOT
+NNVM_REGISTER_ELEMWISE_UNARY_OP(logical_not)
+.describe(R"code(Element-wise logical NOT
+
+)code" NNVM_ADD_FILELINE)
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_not(inputs[0]) };
+});
+
+// copy
+NNVM_REGISTER_ELEMWISE_UNARY_OP(copy)
+.describe(R"code(Copy tensor to another one.
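The FGradient registrations above encode textbook derivative rules as graph rewrites: each gradient is built out of other registered ops (elemwise_mul, negative, and friends). As an editor's aside, here is a minimal standalone sketch (plain C++, independent of nnvm; all names are local to the example) that checks the elemwise_div rules grad_0 = grad_y / n1 and grad_1 = -grad_y * n0 / n1^2 against central finite differences:

```cpp
// Standalone check of the elemwise_div gradient rules used above.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const double a = 1.7, b = -2.3, gy = 0.9;  // arbitrary point and out-grad
  const double eps = 1e-6;

  // Analytic gradients, as encoded by the FGradient registration.
  const double ga = gy / b;
  const double gb = -gy * a / (b * b);

  // Central finite differences of y = a / b, scaled by gy (chain rule).
  const double fd_a = gy * ((a + eps) / b - (a - eps) / b) / (2 * eps);
  const double fd_b = gy * (a / (b + eps) - a / (b - eps)) / (2 * eps);

  assert(std::fabs(ga - fd_a) < 1e-5);
  assert(std::fabs(gb - fd_b) < 1e-5);
  std::printf("grad_0 = %.6f (fd %.6f), grad_1 = %.6f (fd %.6f)\n",
              ga, fd_a, gb, fd_b);
  return 0;
}
```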
+ +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::identity(inputs[0]) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = copy(n0) + // grad_0 = grad_y + return std::vector{ MakeNode("copy", n->attrs.name + "_grad_0", + {ograds[0]}) }; +}); + +DMLC_REGISTER_PARAMETER(InitOpParam); +DMLC_REGISTER_PARAMETER(InitOpWithScalarParam); +DMLC_REGISTER_PARAMETER(FillValueParam); + +// full +NNVM_REGISTER_INIT_OP(full) +.describe(R"code(Fill array with scalar value + +)code" NNVM_ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_attr( + "FGetAttrDict", ParamGetAttrDict) +.add_arguments(InitOpWithScalarParam::__FIELDS__()) +.set_attr("FInferShape", ZeroShape) +.set_attr("FInferType", ZeroType) +.set_attr("FCorrectLayout", ZeroLayout) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const InitOpWithScalarParam& param = nnvm::get(attrs.parsed); + Array shape = ShapeToArray(param.shape); + Type dtype = GetTVMType(param.dtype); + Expr fill_value = tvm::make_const(dtype, param.fill_value); + return Array{ topi::full(shape, dtype, fill_value) }; +}) +.set_support_level(4); + +NNVM_REGISTER_INIT_OP(zeros) +.describe(R"code(Fill target with zeros + +)code" NNVM_ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_attr( + "FGetAttrDict", ParamGetAttrDict) +.add_arguments(InitOpParam::__FIELDS__()) +.set_attr("FInferShape", ZeroShape) +.set_attr("FInferType", ZeroType) +.set_attr("FCorrectLayout", ZeroLayout) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const InitOpParam& param = nnvm::get(attrs.parsed); + Array shape = ShapeToArray(param.shape); + Type dtype = GetTVMType(param.dtype); + Expr fill_value = tvm::make_const(dtype, 0); + return Array{ topi::full(shape, dtype, fill_value) }; +}) +.set_support_level(4); + +NNVM_REGISTER_INIT_OP(ones) +.describe(R"code(Fill target with ones + +)code" NNVM_ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_attr( + "FGetAttrDict", ParamGetAttrDict) +.add_arguments(InitOpParam::__FIELDS__()) +.set_attr("FInferShape", ZeroShape) +.set_attr("FInferType", ZeroType) +.set_attr("FCorrectLayout", ZeroLayout) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const InitOpParam& param = nnvm::get(attrs.parsed); + Array shape = ShapeToArray(param.shape); + Type dtype = GetTVMType(param.dtype); + Expr fill_value = tvm::make_const(dtype, 1); + return Array{ topi::full(shape, dtype, fill_value) }; +}) +.set_support_level(4); + +// full_like +NNVM_REGISTER_INIT_LIKE_OP(full_like) +.describe(R"code(Return an scalar value array with the same shape and type +as the input array + +)code" NNVM_ADD_FILELINE) +.add_arguments(FillValueParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const FillValueParam& param = nnvm::get(attrs.parsed); + const Expr fill_value = tvm::make_const(out_info[0]->dtype, param.fill_value); + return Array { topi::full_like(inputs[0], fill_value) }; +}) +.set_support_level(4); + +NNVM_REGISTER_INIT_LIKE_OP(zeros_like) +.describe(R"code(Return an array of zeros with the same shape and type +as the input array. 
+ +)code") +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array { topi::full_like(inputs[0], + tvm::make_const(out_info[0]->dtype, 0)) }; +}) +.set_support_level(4); + +NNVM_REGISTER_INIT_LIKE_OP(ones_like) +.describe(R"code(Return an array of ones with the same shape and type +as the input array. + +)code") +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array { topi::full_like(inputs[0], + tvm::make_const(out_info[0]->dtype, 1)) }; +}) +.set_support_level(4); + +// unary scalar op +DMLC_REGISTER_PARAMETER(ScalarParam); + +#define NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(op) \ + NNVM_REGISTER_ELEMWISE_UNARY_OP(op) \ + .add_arguments(ScalarParam::__FIELDS__()) \ + .set_attr_parser(ParamParser) \ + .set_attr("FGetAttrDict", ParamGetAttrDict) + +inline Tensor binary_scalar_op(const NodeAttrs& attrs, + const Tensor& x, + std::function f) { + const ScalarParam& param = nnvm::get(attrs.parsed); + auto scalar_val = static_cast(param.scalar); + return compute(x->shape, [&](const Array& i) { + auto scalar_const = make_const(x->dtype, scalar_val); + return f(x(i), scalar_const); + }, "tensor", topi::kElementWise); +} + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__add_scalar__) +.describe(R"code(Tensor add scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x + y; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + return std::vector{ MakeNode("copy", n->attrs.name + "_grad_0", + {ograds[0]}) }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__sub_scalar__) +.describe(R"code(Tensor substract scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x - y; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + return std::vector{ograds[0]}; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rsub_scalar__) +.describe(R"code(scalar substract Tensor + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return y - x; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + return std::vector{ + MakeNode("negative", n->attrs.name + "_grad_0", {ograds[0]}) + }; +}); + + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__lshift_scalar__) +.describe(R"code(Tensor left shift by scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ScalarParam& param = nnvm::get(attrs.parsed); + int scalar_val = static_cast(param.scalar); + return Array{ + topi::left_shift(inputs[0], + make_const(inputs[0]->dtype, scalar_val))}; + }); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rshift_scalar__) +.describe(R"code(Tensor right shift by scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ScalarParam& param = nnvm::get(attrs.parsed); + 
int scalar_val = static_cast(param.scalar); + return Array{ + topi::right_shift(inputs[0], + make_const(inputs[0]->dtype, scalar_val))}; + }); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__mul_scalar__) +.describe(R"code(Tensor multiplies scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x * y; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 * scalar + // grad_0 = grad_y * scalar + return std::vector{ + MakeNode("__mul_scalar__", n->attrs.name + "_grad_0", + {ograds[0]}, {{"scalar", n->attrs.dict["scalar"]}}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__div_scalar__) +.describe(R"code(Tensor divides scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return x / y; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0 / scalar + // grad_0 = grad_y / scalar + return std::vector{ + MakeNode("__div_scalar__", n->attrs.name + "_grad_0", + {ograds[0]}, {{"scalar", n->attrs.dict["scalar"]}}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rdiv_scalar__) +.describe(R"code(scalar divides Tensor + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return y / x; }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = scalar / n0 + // grad_0 = - grad_y * scalar / n0^2 + NodeEntry sub0 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_0", + {ograds[0]}, + {{"scalar", n->attrs.dict["scalar"]}}); + NodeEntry sub1 = MakeNode("negative", n->attrs.name + "_grad_sub_1", + {sub0}); + NodeEntry sub2 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_2", + {n->inputs[0], n->inputs[0]}); + return std::vector{ + MakeNode("elemwise_div", n->attrs.name + "_grad_0", + {sub1, sub2}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__pow_scalar__) +.describe(R"code(Tensor power scalar + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + [](Expr x, Expr y) { return tvm::pow(x, y); }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = n0^scalar + // grad_0 = grad_y * scalar * n0^(scalar - 1) + double scalar = std::stod(n->attrs.dict["scalar"]); + NodeEntry sub0 = MakeNode("__pow_scalar__", n->attrs.name + "_grad_sub_0", + {n->inputs[0]}, + {{"scalar", std::to_string(scalar - 1)}}); + NodeEntry sub1 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_1", + {ograds[0]}, + {{"scalar", std::to_string(scalar)}}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad_0", + {sub0, sub1}) + }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_SCALAR(__rpow_scalar__) +.describe(R"code(scalar power Tensor + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ binary_scalar_op(attrs, inputs[0], + 
[](Expr x, Expr y) { return tvm::pow(y, x); }) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = scalar^n0 + // grad_0 = grad_y * scalar^n0 * log(scalar) + double num = std::stod(n->attrs.dict["scalar"]); + NodeEntry sub0 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_0", + {NodeEntry{n, 0, 0}}, + {{"scalar", std::to_string(std::log(num))}}); + return std::vector{ + MakeNode("__mul_symbol__", n->attrs.name + "_grad_0", + {ograds[0], sub0}) + }; +}); + +DMLC_REGISTER_PARAMETER(ElementWiseReduceParam); + +NNVM_REGISTER_ELEMWISE_REDUCE_OP(elemwise_sum) +.describe(R"code(Adds all input arguments element-wise. + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ElementWiseReduceParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.num_args, inputs.size()) << """Compute definition of elemwise sum"""; + return Array{ topi::elemwise_sum(inputs) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + CHECK_EQ(ograds.size(), 1); + std::vector ret; + for (size_t i = 0; i < n->inputs.size(); i++) { + ret.push_back(MakeNode("copy", n->attrs.name + "_grad_0", {ograds[0]})); + } + return ret; + }) +.set_support_level(4); + +NNVM_REGISTER_ELEMWISE_UNARY_OP(block_grad) +.describe(R"code(Blocks gradient computation for input. + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FInplaceIdentity", [](const NodeAttrs& attrs){ + return std::vector{true}; +}) +.set_attr("FGradient", MakeZeroGradNodes) +.set_support_level(4); + +DMLC_REGISTER_PARAMETER(IndicatorParam); + +// indicator function +NNVM_REGISTER_INDICATOR_OP(greater) +.describe(R"code(Greater function that returns a mask tensor +with 1.0 if (left > right), otherwise 0.0 element-wise. + +)code" NNVM_ADD_FILELINE) +.add_argument("lhs", "Tensor", "First input") +.add_argument("rhs", "Tensor", "Second input") +.set_num_inputs(2) +.set_attr("FInferShape", ElemwiseShape<2, 1>) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::cast(topi::greater(inputs[0], inputs[1]), out_info[0]->dtype) }; +}) +.set_support_level(4); + + +NNVM_REGISTER_INDICATOR_OP(less) + .describe(R"code(Less function that returns a mask tensor +with 1.0 if (left < right), otherwise 0.0 element-wise. + +)code" NNVM_ADD_FILELINE) +.add_argument("lhs", "Tensor", "First input") +.add_argument("rhs", "Tensor", "Second input") +.set_num_inputs(2) +.set_attr("FInferShape", ElemwiseShape<2, 1>) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::cast(topi::less(inputs[0], inputs[1]), out_info[0]->dtype) }; +}) +.set_support_level(4); + +NNVM_REGISTER_INDICATOR_OP(_max_mask) + .describe(R"code(Function that returns a mask tensor +with 1.0 if the value is maximum over given axes, otherwise 0.0 element-wise. + +)code" NNVM_ADD_FILELINE) +.add_argument("data", "Tensor", "Input") +.set_num_inputs(1) +.add_arguments(IndicatorParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_support_level(1); + +NNVM_REGISTER_INDICATOR_OP(_min_mask) + .describe(R"code(Function that returns a mask tensor +with 1.0 if the value is minimum over given axes, otherwise 0.0 element-wise. 
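For __rpow_scalar__ the gradient above uses d(s^x)/dx = s^x * log(s), i.e. grad_0 = grad_y * y * log(scalar). A standalone numeric check of that identity (plain C++, no nnvm dependency):

```cpp
// Standalone check of the __rpow_scalar__ gradient rule used above.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const double s = 3.0, x = 1.25, gy = 0.7;
  const double y = std::pow(s, x);
  const double analytic = gy * y * std::log(s);

  const double eps = 1e-6;  // central finite difference
  const double fd = gy * (std::pow(s, x + eps) - std::pow(s, x - eps)) / (2 * eps);

  assert(std::fabs(analytic - fd) < 1e-4);
  std::printf("analytic %.6f vs finite-difference %.6f\n", analytic, fd);
  return 0;
}
```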
+ +)code" NNVM_ADD_FILELINE) +.add_argument("data", "Tensor", "Input") +.set_num_inputs(1) +.add_arguments(IndicatorParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_support_level(1); + + +DMLC_REGISTER_PARAMETER(ClipParam); + +NNVM_REGISTER_OP(clip) +.describe(R"doc(Clips (limits) the values in an array. +Given an interval, values outside the interval are clipped to the interval edges. +Clipping ``x`` between `a_min` and `a_x` would be:: + clip(x, a_min, a_max) = max(min(x, a_max), a_min)) +Example:: + x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + clip(x,1,8) = [ 1., 1., 2., 3., 4., 5., 6., 7., 8., 8.] +)doc" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ClipParam params = get(attrs.parsed); + return Array{ + topi::clip(inputs[0], tvm::make_const(tvm::Float(32), params.a_min), + tvm::make_const(tvm::Float(32), params.a_max)) }; + }) +.add_argument("data", "NDArray-or-Symbol", "Input array.") +.add_arguments(ClipParam::__FIELDS__()) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + // y = clip(x, a_min, a_max) + // min_mask = greater_equal(x, a_min*ones_like(x)) + // => ones_like(x) - less(x, a_min) + // max_mask = less_equal(x, a_max*ones_like(x)) + // => ones_like(x) - greater(x, a_max) + // grad_x = min_mask * max_mask * grad_y + CHECK_EQ(ograds.size(), 1); + + NodeEntry sub0 = MakeNode("ones_like", n->attrs.name + "_grad_sub_0", + {n->inputs[0]}); + // min_mask + NodeEntry sub1 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_1", + {sub0}, {{"scalar", n->attrs.dict["a_min"]}}); + NodeEntry sub2 = MakeNode("less", n->attrs.name + "_grad_sub_2", + {n->inputs[0], sub1}); + NodeEntry sub3 = MakeNode("elemwise_sub", n->attrs.name + "_grad_sub_3", + {sub0, sub2}); + + // max_mask + NodeEntry sub4 = MakeNode("__mul_scalar__", n->attrs.name + "_grad_sub_4", + {sub0}, {{"scalar", n->attrs.dict["a_max"]}}); + NodeEntry sub5 = MakeNode("greater", n->attrs.name + "_grad_sub_5", + {n->inputs[0], sub4}); + NodeEntry sub6 = MakeNode("elemwise_sub", n->attrs.name + "_grad_sub_6", + {sub0, sub5}); + + // min_mask * max_mask + NodeEntry sub7 = MakeNode("elemwise_mul", n->attrs.name + "_grad_sub_7", + {sub3, sub6}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad", + {sub7, ograds[0]}) + }; + }) +.set_support_level(4); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/tensor/matrix_op.cc b/nnvm/src/top/tensor/matrix_op.cc new file mode 100644 index 000000000000..b1810f40de20 --- /dev/null +++ b/nnvm/src/top/tensor/matrix_op.cc @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file matrix_op.cc + * \brief Matrix operators + */ +#include +#include +#include +#include +#include +#include +#include "../op_common.h" +#include "../elemwise_op_common.h" + +namespace nnvm { +namespace top { + +using namespace nnvm::compiler; + +DMLC_REGISTER_PARAMETER(MatMulParam); + +inline bool DotShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const MatMulParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + TShape lshape = (*in_attrs)[0]; + TShape rshape = (*in_attrs)[1]; + + if (lshape.ndim() == 1) lshape = TShape{1, lshape[0]}; + if (rshape.ndim() == 1) rshape = TShape{1, rshape[0]}; + + if (param.transpose_a) std::reverse(lshape.begin(), lshape.end()); + if (param.transpose_b) std::reverse(rshape.begin(), rshape.end()); + + CHECK_EQ(lshape[lshape.ndim() - 1], rshape[0]) + << "dot shape inconsistent: " << lshape << " X " << rshape; + + TShape oshape(lshape.ndim() + rshape.ndim() - 2); + for (uint32_t i = 0; i < lshape.ndim() - 1; i++) oshape[i] = lshape[i]; + for (uint32_t i = 1; i < rshape.ndim(); i++) oshape[i + lshape.ndim() - 2] = rshape[i]; + + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape); + return true; +} + +inline bool DotCorrectLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + const MatMulParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(ilayouts->size(), 2U); + CHECK_EQ(olayouts->size(), 1U); + const Layout& lhs = last_ilayouts->at(0).defined() ? last_ilayouts->at(0) + : ilayouts->at(0); + const Layout& rhs = last_ilayouts->at(1).defined() ? last_ilayouts->at(1) + : ilayouts->at(1); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, lhs); + NNVM_ASSIGN_LAYOUT(*ilayouts, 1, rhs); + + if (lhs.ndim() > 1 && rhs.ndim() > 1) { + // concat lhs and rhs layout + const Layout& lhs_out = param.transpose_a ? lhs.reverse() : lhs; + const Layout& rhs_out = param.transpose_b ? rhs.reverse() : rhs; + Layout out = lhs_out.sublayout(0, lhs_out.ndim()-1) + + rhs_out.sublayout(1, rhs_out.ndim()-1); + NNVM_ASSIGN_LAYOUT(*olayouts, 0, out); + } + return true; +} + +NNVM_REGISTER_OP(matmul) +.describe(R"doc(Matrix multiplication of two arrays. + +``dot``'s behavior depends on the input array dimensions: + +- 1-D arrays: inner product of vectors +- 2-D arrays: matrix multiplication +- N-D arrays: a sum product over the last axis of the first input and the first + axis of the second input + + For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`, the + result array will have shape `(n,m,r,s)`. 
It is computed by:: + + dot(x,y) = sum(x[i,j,:]*y[:,a,b]) + +)doc" NNVM_ADD_FILELINE) +.set_support_level(1) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.add_arguments(MatMulParam::__FIELDS__()) +.add_argument("lhs", "NDArray-or-Symbol", "The first input") +.add_argument("rhs", "NDArray-or-Symbol", "The second input") +.set_attr("FInferShape", DotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FCorrectLayout", DotCorrectLayout) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const MatMulParam& param = nnvm::get(attrs.parsed); + return Array{ + topi::matmul(inputs[0], inputs[1], param.transpose_a, param.transpose_b) + }; + }) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + // z = x dot y + // xshape (n,m,k), yshape (k,r,s) + const MatMulParam& param = nnvm::get(n->attrs.parsed); + bool Ta = param.transpose_a; + bool Tb = param.transpose_b; + // Ta = false, Tb = false + // grad_x = grad_z dot y.T + // grad_y = x.T dot grad_z + if (!Ta && !Tb) { + return std::vector{ + MakeNode("matmul", n->attrs.name + "_grad_0", + {ograds[0], n->inputs[1]}, + {{"transpose_a", "false"}, + {"transpose_b", "true"}}), + MakeNode("matmul", n->attrs.name + "_grad_1", + {n->inputs[0], ograds[0]}, + {{"transpose_a", "true"}, + {"transpose_b", "false"}}) + }; + } else if (Ta && !Tb) { + // Ta = true, Tb = false + // grad_x = y dot grad_z.T + // grad_y = x dot grad_z + return std::vector{ + MakeNode("matmul", n->attrs.name + "_grad_0", + {n->inputs[1], ograds[0]}, + {{"transpose_a", "false"}, + {"transpose_b", "true"}}), + MakeNode("matmul", n->attrs.name + "_grad_1", + {n->inputs[0], ograds[0]}, + {{"transpose_a", "false"}, + {"transpose_b", "false"}}) + }; + } else if (!Ta && Tb) { + // Ta = false, Tb = true + // grad_x = grad_z dot y + // grad_y = grad_z.T dot x + return std::vector{ + MakeNode("matmul", n->attrs.name + "_grad_0", + {ograds[0], n->inputs[1]}, + {{"transpose_a", "false"}, + {"transpose_b", "false"}}), + MakeNode("matmul", n->attrs.name + "_grad_1", + {ograds[0], n->inputs[0]}, + {{"transpose_a", "true"}, + {"transpose_b", "false"}}) + }; + } else { + // Ta = true, Tb = true + // grad_x = y.T dot grad_z.T + // grad_y = grad_z.T dot x.T + return std::vector{ + MakeNode("matmul", n->attrs.name + "_grad_0", + {n->inputs[1], ograds[0]}, + {{"transpose_a", "true"}, + {"transpose_b", "true"}}), + MakeNode("matmul", n->attrs.name + "_grad_1", + {ograds[0], n->inputs[0]}, + {{"transpose_a", "true"}, + {"transpose_b", "true"}}) + }; + } +}); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc new file mode 100644 index 000000000000..dd8e23cf6fe9 --- /dev/null +++ b/nnvm/src/top/tensor/reduce.cc @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file reduce.cc + * \brief reduce operator. + */ +#include +#include +#include +#include +#include +#include +#include +#include "../op_common.h" +#include "../elemwise_op_common.h" +#include "topi/detail/constant_utils.h" +#include "topi/elemwise.h" +#include "topi/reduction.h" +#include "topi/transform.h" + +namespace nnvm { +namespace top { +using namespace tvm; +using namespace nnvm::compiler; + + +// reduce +DMLC_REGISTER_PARAMETER(ReduceParam); + +inline TShape GetReduceAxes(const uint32_t indim, + const TShape& axis, + bool exclude) { + if (axis.ndim() == 0) { + TShape r_axes(indim); + std::iota(r_axes.begin(), r_axes.end(), 0); + return r_axes; + } + + CHECK_LT(axis[axis.ndim() - 1], indim) + << "Reduction axis " << axis[axis.ndim() - 1] + << " exceeds input dimensions " << indim; + + TShape in_axis = axis; + for (auto& i : in_axis) { + i = i < 0 ? i + indim : i; + CHECK_GE(i, 0) << "axis out of bounds in reduce operator"; + CHECK_LT(i, indim) << "axis out of bounds in reduce operator"; + } + std::sort(in_axis.begin(), in_axis.end()); + if (!exclude) return in_axis; + TShape r_axis(indim - in_axis.ndim()); + for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) { + if (j < in_axis.ndim() && i == in_axis[j]) { + ++j; + continue; + } + r_axis[k++] = i; + } + return r_axis; +} + +inline TShape ReduceShapeImpl(const TShape& ishape, + const TShape& axis, + bool keepdims, + bool exclude) { + uint32_t indim = ishape.ndim(); + TShape r_axes = GetReduceAxes(indim, axis, exclude); + if (!r_axes.ndim()) return ishape; + if (r_axes.ndim() == indim) + return TShape(keepdims ? 
indim : 1); + + CHECK(r_axes.ndim() < indim); + if (keepdims) { + TShape oshape(ishape); + for (unsigned i = 0, j = 0; i < indim; ++i) { + if (j >= r_axes.ndim() || i != r_axes[j]) continue; + oshape[i] = 1; + ++j; + } + return oshape; + } + + TShape oshape(indim - r_axes.ndim()); + for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) { + if (j < r_axes.ndim() && i == r_axes[j]) { + ++j; + continue; + } + oshape[k++] = ishape[i]; + } + return oshape; +} + +inline bool ReduceShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + if ((*in_attrs)[0].ndim() == 0) return false; + const ReduceParam& param = nnvm::get(attrs.parsed); + NNVM_ASSIGN_OUTPUT_SHAPE( + attrs, *out_attrs, 0, + ReduceShapeImpl((*in_attrs)[0], param.axis, + param.keepdims, param.exclude)); + return true; +} + +inline bool CollapseShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + if ((*in_attrs)[0].ndim() == 1) return false; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, (*in_attrs)[1]); + return true; +} + +template +inline void AxesParamParser(nnvm::NodeAttrs* attrs) { + PType param; + param.Init(attrs->dict); + std::sort(¶m.axis[0], ¶m.axis[param.axis.ndim()]); + attrs->parsed = std::move(param); +} + +#define NNVM_REGISTER_BASE_REDUCE_OP(op) \ + NNVM_REGISTER_OP(op) \ + .add_arguments(ReduceParam::__FIELDS__()) \ + .set_attr_parser(AxesParamParser) \ + .set_attr("FGetAttrDict", ParamGetAttrDict) \ + .set_num_outputs(1) + +#define NNVM_REGISTER_REDUCE_OP(op) \ + NNVM_REGISTER_BASE_REDUCE_OP(op) \ + .add_argument("data", "Tensor", "The input") \ + .set_attr("FInferShape", ReduceShape) \ + .set_attr("FInferType", ElemwiseType<1, 1>) \ + .set_attr("FCorrectLayout", \ + ElemwiseFixedLayoutUnknownOut<1, 1>) \ + .set_num_inputs(1) + +NNVM_REGISTER_REDUCE_OP(sum) +.describe(R"code(Computes the sum of array elements over given axes. + +Example:: + + data = [[[1,2],[2,3],[1,3]], + [[1,4],[4,3],[5,2]], + [[7,1],[7,2],[7,3]]] + + sum(data, axis=1) + [[ 4. 8.] + [ 10. 9.] + [ 21. 6.]] + + sum(data, axis=[1,2]) + [ 12. 19. 27.] + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; + auto axis = ShapeToIntArray(r_axes); + return Array{ + topi::sum(inputs[0], axis, param.keepdims, true) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + const ReduceParam& param = nnvm::get(n->attrs.parsed); + bool exclude = param.exclude; + TShape p_axis = param.axis; + if (!param.exclude && param.axis.ndim() == 0) { + exclude = true; + p_axis = TShape(); + } + std::ostringstream axis; axis << p_axis; + return std::vector{ + MakeNode("expand_like", n->attrs.name + "_grad", + {ograds[0], n->inputs[0]}, + {{"axis", axis.str()}, + {"exclude", std::to_string(exclude)}}) + }; +}); + +NNVM_REGISTER_REDUCE_OP(max) +.describe(R"code(Computes the max of array elements over given axes. 
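ReduceShapeImpl above drops the reduced axes from the input shape, or pins them to 1 when keepdims is set; an empty axis attribute means reducing over every axis. A simplified standalone sketch of that shape rule (plain C++, not part of the patch; the exclude mode, which reduces over all axes *not* listed, is omitted for brevity):

```cpp
// Standalone sketch mirroring the reduce output-shape rule above.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

std::vector<int> ReduceOutShape(const std::vector<int>& ishape,
                                std::vector<int> axes, bool keepdims) {
  if (axes.empty()) {  // empty axis list means "reduce over everything"
    axes.resize(ishape.size());
    std::iota(axes.begin(), axes.end(), 0);
  }
  std::sort(axes.begin(), axes.end());
  std::vector<int> oshape;
  for (int i = 0, j = 0; i < static_cast<int>(ishape.size()); ++i) {
    if (j < static_cast<int>(axes.size()) && i == axes[j]) {
      ++j;
      if (keepdims) oshape.push_back(1);  // reduced axis kept as size 1
    } else {
      oshape.push_back(ishape[i]);
    }
  }
  if (oshape.empty()) oshape.push_back(1);  // full reduction yields a scalar
  return oshape;
}

int main() {
  // sum over axis 1 of a (3, 4, 5) tensor: (3, 5), or (3, 1, 5) with keepdims.
  for (bool keep : {false, true}) {
    std::vector<int> o = ReduceOutShape({3, 4, 5}, {1}, keep);
    for (int d : o) std::printf("%d ", d);
    std::printf("\n");
  }
  return 0;
}
```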
+ +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + auto axis = ShapeToIntArray(r_axes); + return Array{ + topi::max(inputs[0], axis, param.keepdims, true) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + const ReduceParam& param = nnvm::get(n->attrs.parsed); + std::ostringstream axis; axis << param.axis; + NodeEntry sub0 = MakeNode("expand_like", n->attrs.name + "_grad_sub0", + {ograds[0], n->inputs[0]}, + {{"axis", axis.str()}, + {"exclude", std::to_string(param.exclude)}}); + NodeEntry sub1 = MakeNode("_max_mask", n->attrs.name + "_grad_sub1", + {ograds[0]}, + {{"axis", axis.str()}, + {"exclude", std::to_string(param.exclude)}}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad", {sub0, sub1}) + }; +}); + +NNVM_REGISTER_REDUCE_OP(min) +.describe(R"code(Computes the min of array elements over given axes. + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + auto axis = ShapeToIntArray(r_axes); + return Array{ + topi::min(inputs[0], axis, param.keepdims, true) }; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + const ReduceParam& param = nnvm::get(n->attrs.parsed); + std::ostringstream axis; axis << param.axis; + NodeEntry sub0 = MakeNode("expand_like", n->attrs.name + "_grad_sub0", + {ograds[0], n->inputs[0]}, + {{"axis", axis.str()}, + {"exclude", std::to_string(param.exclude)}}); + NodeEntry sub1 = MakeNode("_min_mask", n->attrs.name + "_grad_sub1", + {ograds[0]}, + {{"axis", axis.str()}, + {"exclude", std::to_string(param.exclude)}}); + return std::vector{ + MakeNode("elemwise_mul", n->attrs.name + "_grad", {sub0, sub1}) + }; +}); + +NNVM_REGISTER_BASE_REDUCE_OP(collapse_sum) +.add_argument("data", "Tensor", "The input") +.add_argument("as", "Tensor", "The reference") +.set_attr("FInferShape", CollapseShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<2, 1>) +.set_num_inputs(2) +.describe(R"code(Reduces lhs to the shape of rhs via sum)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::collapse_sum(inputs[0], inputs[1]->shape) }; +}); + +inline bool InferFixedType(const NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const ReduceParam& param = nnvm::get(attrs.parsed); + NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, param.dtype); + return true; +} + +NNVM_REGISTER_BASE_REDUCE_OP(argmax) +.describe(R"code(Creates an operation that finds the indices of the maximum +values over a given axis. 
+ +)code" NNVM_ADD_FILELINE) +.add_argument("data", "Tensor", "The input") +.set_attr("FInferShape", ReduceShape) +.set_attr("FInferType", InferFixedType) +.set_attr("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>) +.set_num_inputs(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + auto axis = ShapeToIntArray(r_axes); + Tensor out = topi::argmax(inputs[0], axis, param.keepdims, true); + if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); + return Array{out}; +}); + +NNVM_REGISTER_BASE_REDUCE_OP(argmin) +.describe(R"code(Creates an operation that finds the indices of the minimum +values over a given axis. + +)code" NNVM_ADD_FILELINE) +.add_argument("data", "Tensor", "The input") +.set_attr("FInferShape", ReduceShape) +.set_attr("FInferType", InferFixedType) +.set_attr("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>) +.set_num_inputs(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + auto axis = ShapeToIntArray(r_axes); + Tensor out = topi::argmin(inputs[0], axis, param.keepdims, true); + if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); + return Array{out}; +}); + +NNVM_REGISTER_REDUCE_OP(mean) + .describe(R"code(Computes the mean of array elements over given axes. + +Example:: + + data = [[[1,2],[2,3],[1,3]], + [[1,4],[4,3],[5,2]], + [[7,1],[7,2],[7,3]]] + + mean(data) + [3.22] + + mean(data, axis=[1,2]) + [ 2. 3.16666667 4.5] + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; + auto axis = ShapeToIntArray(r_axes); + + Expr count = make_const(inputs[0]->dtype, 1); + for (auto& i : r_axes) { + count *= cast(inputs[0]->dtype, inputs[0]->shape[i]); + } + + return Array{ + topi::divide(topi::sum(inputs[0], axis, param.keepdims, true), count) }; +}); + +NNVM_REGISTER_REDUCE_OP(prod) + .describe(R"code(Computes the products of array elements over given axes. + +Example:: + + data = [[[1,2],[2,3],[1,3]], + [[1,4],[4,3],[5,2]], + [[7,1],[7,2],[7,3]]] + + mean(data, axis=1) + [35562240] + + mean(data, axis=[1,2]) + [ 36 480 2058] + +)code" NNVM_ADD_FILELINE) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + const ReduceParam& param = nnvm::get(attrs.parsed); + TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), + param.axis, param.exclude); + if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; + auto axis = ShapeToIntArray(r_axes); + return Array{ + topi::prod(inputs[0], axis, param.keepdims, true) }; +}); + + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/tensor/state_op.cc b/nnvm/src/top/tensor/state_op.cc new file mode 100644 index 000000000000..23c7158aecd3 --- /dev/null +++ b/nnvm/src/top/tensor/state_op.cc @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file state_op.cc + * \brief Experimental operators + * Currently we only support assign + */ +#include +#include +#include +#include +#include +#include +#include "../op_common.h" +#include "../elemwise_op_common.h" + +namespace nnvm { +namespace top { + +using namespace tvm; +using namespace nnvm::compiler; + +NNVM_REGISTER_OP(_assign) +.describe(R"doc(Assign rhs to the lhs. + +lhs must be a Variable. +This is an experimental operator. + +)doc" NNVM_ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr( + "FMutateInputs", [](const NodeAttrs& attrs) { + return std::vector{0}; +}) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + // This implementation is needed for the special + // logic handling assign in the compiler + // It simply copies the result of rhs the output + // The later decoration in compiler will change + // the memory assignment of assign to tie + // the lhs to the output. + return Array{ topi::identity(inputs[1]) }; +}) +.set_attr("FInferShape", SameShape) +.set_attr( + "FCorrectLayout", [](const NodeAttrs& attrs, + std::vector *in_layouts, + const std::vector *last_in_layouts, + std::vector *out_layouts) { + NNVM_ASSIGN_LAYOUT(*in_layouts, 1, (*in_layouts)[0]); + NNVM_ASSIGN_LAYOUT(*out_layouts, 0, (*in_layouts)[0]); + return true; +}) +.set_attr( + "FInplaceOption", [](const NodeAttrs& attrs) { + return std::vector >{{1, 0}}; +}) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds){ + return std::vector{ + MakeNode("zeros_like", n->attrs.name + "_zero_grad", + {n->inputs[0]}), + ograds[0] + }; +}); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 5d0bf5c4d56e..efe24faae18d 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -477,7 +477,7 @@ NNVM_REGISTER_OP(cast) const Array& inputs, const Array& out_info) { const CastParam& param = nnvm::get(attrs.parsed); - DataType dtype = GetTVMType(param.dtype); + Type dtype = GetTVMType(param.dtype); return Array{ topi::cast(inputs[0], dtype) }; }) .set_support_level(1); @@ -1266,8 +1266,8 @@ NNVM_REGISTER_OP(slice_like) Array target_shape = inputs[1]->shape; Array begin_idx, end_idx, strides; for (size_t i = 0; i < src_shape.size(); ++i) { - begin_idx.push_back(make_const(tvm::DataType::Int(32), 0)); - strides.push_back(make_const(tvm::DataType::Int(32), 1)); + begin_idx.push_back(make_const(tvm::Int(32), 0)); + strides.push_back(make_const(tvm::Int(32), 1)); } end_idx = Array(src_shape); if (param.axis.ndim() == 0) { diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc new file mode 100644 index 000000000000..ec97408284e5 --- /dev/null +++ b/nnvm/src/top/vision/nms.cc @@ -0,0 +1,107 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file nms.cc + * \brief Property def of SSD non-maximum suppression operator. + */ + +#include +#include +#include +#include +#include +#include +#include "../op_common.h" +#include "../elemwise_op_common.h" + +namespace nnvm { +namespace top { +using compiler::FTVMCompute; +using tvm::Tensor; +using tvm::Array; + +DMLC_REGISTER_PARAMETER(NonMaximumSuppressionParam); + +bool NMSShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const NonMaximumSuppressionParam& param = + nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 2U) << "Inputs: [data, valid_count]"; + TShape dshape = in_attrs->at(0); + TShape vshape = in_attrs->at(1); + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3-D."; + CHECK_EQ(vshape.ndim(), 1U) << "Input valid count should be 1-D."; + CHECK_EQ(dshape[2], 6U) << "Data input should have shape " + "(batch_size, num_anchors, 6)."; + CHECK_EQ(dshape[0], vshape[0]) << "batch_size mismatch."; + out_attrs->clear(); + if (param.return_indices) { + TShape oshape = TShape(2); + oshape[0] = dshape[0]; + oshape[1] = dshape[1]; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape); + } else { + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape); + } + return true; +} + +inline bool NMSInferType(const NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + DTYPE_ASSIGN(out_attrs->at(0), in_attrs->at(0)); + return true; +} + +inline bool NMSInferLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + static const Layout kNCHW("NCHW"); + CHECK_EQ(ilayouts->size(), 2U); + CHECK_EQ(olayouts->size(), 1U); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, kNCHW); + NNVM_ASSIGN_LAYOUT(*ilayouts, 1, kNCHW); + return true; +} + +NNVM_REGISTER_OP(non_max_suppression) + .describe(R"doc("Non-maximum suppression." 
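NMSShape above fixes the output shape from the data shape (batch_size, num_anchors, 6): with return_indices set the output is (batch_size, num_anchors), otherwise it keeps the shape of the data input. A standalone sketch of that rule (plain C++, not part of the patch):

```cpp
// Standalone sketch of the non_max_suppression output-shape rule above.
#include <cstdio>
#include <vector>

std::vector<int> NmsOutShape(const std::vector<int>& dshape,
                             bool return_indices) {
  if (return_indices) return {dshape[0], dshape[1]};  // (batch, num_anchors)
  return dshape;                                      // same as data input
}

int main() {
  std::vector<int> dshape = {4, 100, 6};  // (batch_size, num_anchors, 6)
  for (bool ri : {true, false}) {
    std::vector<int> o = NmsOutShape(dshape, ri);
    for (int d : o) std::printf("%d ", d);
    std::printf("\n");  // "4 100" then "4 100 6"
  }
  return 0;
}
```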
+)doc" NNVM_ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", + ParamGetAttrDict) +.add_arguments(NonMaximumSuppressionParam::__FIELDS__()) +.add_argument("data", "Tensor", "Input data.") +.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"data", "valid_count"}; +}) +.set_attr("FInferShape", NMSShape) +.set_attr("FInferType", NMSInferType) +.set_attr("FCorrectLayout", NMSInferLayout) +.set_support_level(4); + +} // namespace top +} // namespace nnvm + diff --git a/nnvm/src/top/vision/ssd/mutibox_op.cc b/nnvm/src/top/vision/ssd/mutibox_op.cc new file mode 100644 index 000000000000..47f2f82a8664 --- /dev/null +++ b/nnvm/src/top/vision/ssd/mutibox_op.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file multibox_op.cc + * \brief Property def of SSD multibox related operators. + */ + +#include +#include +#include +#include +#include +#include +#include "../../op_common.h" +#include "../../elemwise_op_common.h" + +namespace nnvm { +namespace top { +using compiler::FTVMCompute; +using tvm::Tensor; +using tvm::Array; + +DMLC_REGISTER_PARAMETER(MultiBoxPriorParam); + +bool MultiBoxPriorShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const MultiBoxPriorParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U) << "Inputs: [data]" << in_attrs->size(); + TShape dshape = in_attrs->at(0); + CHECK_GE(dshape.ndim(), 4U) << "Input data should be 4D: " + "[batch, channel, height, width]"; + int in_height = dshape[2]; + CHECK_GT(in_height, 0) << "Input height should > 0"; + int in_width = dshape[3]; + CHECK_GT(in_width, 0) << "Input width should > 0"; + // since input sizes are same in each batch, we could share MultiBoxPrior + TShape oshape = TShape(3); + int num_sizes = param.sizes.ndim(); + int num_ratios = param.ratios.ndim(); + oshape[0] = 1; + oshape[1] = in_height * in_width * (num_sizes + num_ratios - 1); + oshape[2] = 4; + CHECK_EQ(param.steps.ndim(), 2) << "Step ndim must be 2: (step_y, step_x)"; + CHECK_GE(param.steps[0] * param.steps[1], 0) << "Must specify both " + "step_y and step_x"; + out_attrs->clear(); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape); + return true; +} + +inline bool MultiBoxPriorLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + static const Layout kNCHW("NCHW"); + CHECK_EQ(ilayouts->size(), 1U); + CHECK_EQ(olayouts->size(), 1U); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, kNCHW); + return true; +} + +NNVM_REGISTER_OP(multibox_prior) + .describe(R"doc("Generate prior(anchor) boxes from data, sizes and 
ratios." +)doc" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.add_arguments(MultiBoxPriorParam::__FIELDS__()) +.add_argument("data", "Tensor", "Input data") +.set_attr("FInferShape", MultiBoxPriorShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCorrectLayout", MultiBoxPriorLayout) +.set_attr( + "FGradient", [](const NodePtr& n, + const std::vector& ograds) { + return std::vector{ + MakeNode("zeros_like", n->attrs.name + "_zero_grad", + {n->inputs[0]}), + ograds[0] + }; +}) +.set_support_level(4); + +DMLC_REGISTER_PARAMETER(MultiBoxTransformLocParam); + +bool MultiBoxTransformLocShape(const NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3U) << "Inputs: [cls_prob, loc_pred, anchor]"; + TShape cshape = in_attrs->at(0); + TShape lshape = in_attrs->at(1); + TShape ashape = in_attrs->at(2); + CHECK_EQ(cshape.ndim(), 3U) << "Class probability should be 3-D."; + CHECK_EQ(lshape.ndim(), 2U) << "Location prediction should be 2-D."; + CHECK_EQ(ashape.ndim(), 3U) << "Anchor should be 3-D."; + CHECK_EQ(cshape[2], ashape[1]) << "Number of anchors mismatch."; + CHECK_EQ(cshape[2] * 4, lshape[1]) << "# anchors mismatch with # loc."; + CHECK_GT(ashape[1], 0U) << "Number of anchors must > 0."; + CHECK_EQ(ashape[2], 4U); + TShape oshape0 = TShape(3); + oshape0[0] = cshape[0]; + oshape0[1] = ashape[1]; + oshape0[2] = 6; // [id, prob, xmin, ymin, xmax, ymax] + TShape oshape1 = TShape(1); + oshape1[0] = cshape[0]; + out_attrs->clear(); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape0); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 1, oshape1); + return true; +} + +inline bool MultiBoxTransformLocLayout(const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + CHECK_EQ(ilayouts->size(), 3U); + CHECK_EQ(last_ilayouts->size(), 3U); + CHECK_EQ(olayouts->size(), 2U); + for (size_t i = 0; i < last_ilayouts->size(); ++i) { + const Layout& last_layout = last_ilayouts->at(i); + if (last_layout.defined()) { + NNVM_ASSIGN_LAYOUT(*ilayouts, i, last_layout); + } + } + return true; +} + +inline bool MultiBoxTransformLocInferType(const NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + DTYPE_ASSIGN(out_attrs->at(0), in_attrs->at(0)); + DTYPE_ASSIGN(out_attrs->at(1), 4U); + return true; +} + +NNVM_REGISTER_OP(multibox_transform_loc) + .describe(R"doc("Location transformation for multibox detection." 
+)doc" NNVM_ADD_FILELINE) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", + ParamGetAttrDict) +.add_arguments(MultiBoxTransformLocParam::__FIELDS__()) +.add_argument("cls_prob", "Tensor", "Class probabilities.") +.add_argument("loc_pred", "Tensor", "Location regression predictions.") +.add_argument("anchor", "Tensor", "Multibox prior anchor boxes") +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"cls_prob", "loc_pred", "anchor"}; +}) +.set_attr("FInferShape", MultiBoxTransformLocShape) +.set_attr("FInferType", MultiBoxTransformLocInferType) +.set_attr("FCorrectLayout", MultiBoxTransformLocLayout) +.set_support_level(4); + +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/vision/yolo/reorg.cc b/nnvm/src/top/vision/yolo/reorg.cc new file mode 100644 index 000000000000..c16d46ff4652 --- /dev/null +++ b/nnvm/src/top/vision/yolo/reorg.cc @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file reorg.cc + */ +#include +#include +#include +#include +#include "../../op_common.h" +#include "../../elemwise_op_common.h" +#include "reorg.h" + +namespace nnvm { +namespace top { + +// reorg +DMLC_REGISTER_PARAMETER(ReorgParam); + +inline bool ReorgInferShape(const nnvm::NodeAttrs &attrs, + std::vector *in_shape, + std::vector *out_shape) { + const ReorgParam ¶m = nnvm::get(attrs.parsed); + TShape dshape = in_shape->at(0); + if (dshape.ndim() == 0) + return false; + NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, 0, dshape); + CHECK_EQ(dshape.ndim(), 4) << "Input data should be 4D"; + CHECK_GT(param.stride, 0U) << "Stride value cannot be 0"; + TShape oshape({dshape[0], 0, 0, 0}); + oshape[1] = dshape[1] * param.stride * param.stride; + oshape[2] = dshape[2] / param.stride; + oshape[3] = dshape[3] / param.stride; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + return true; +} + +NNVM_REGISTER_OP(yolo_reorg) +.describe(R"(Perform reorg operation on input array based on the stride value. +- **data**: Input is 4D array of shape (batch_size, channels, in_height, in_width). +- **out**: Output is 4D array of shape (batch_size, channels/(stride*stride), in_height*stride, in_width*stride). 
+)" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_support_level(5) +.add_argument("data", "Tensor", "Data input to reorganize") +.set_attr_parser(ParamParser) +.add_arguments(ReorgParam::__FIELDS__()) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferType", ElemwiseType<-1, 1>) +.set_attr("FInferShape", ReorgInferShape); +} // namespace top +} // namespace nnvm diff --git a/nnvm/src/top/vision/yolo/reorg.h b/nnvm/src/top/vision/yolo/reorg.h new file mode 100644 index 000000000000..53549df3634a --- /dev/null +++ b/nnvm/src/top/vision/yolo/reorg.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file reorg.h + */ +#ifndef NNVM_TOP_VISION_YOLO_REORG_H_ +#define NNVM_TOP_VISION_YOLO_REORG_H_ + +#include +#include +#include +#include +#include + +namespace nnvm { +namespace top { + +template +inline bool ReorgAttr(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs, + const AttrType &none) { + AttrType dattr = none; + size_t in_size = in_attrs->size(); + size_t out_size = out_attrs->size(); + if (n_in != -1) { + in_size = static_cast(n_in); + } + if (n_out != -1) { + out_size = static_cast(n_out); + } + + auto deduce = [&](std::vector *vec, size_t size, const char *name) { + for (size_t i = 0; i < size; ++i) { + if (i == 0) { + CHECK(assign(&dattr, (*vec)[i])) + << "Incompatible attr in node " << attrs.name << " at " << i + << "-th " << name << ": " + << "expected " << attr_string(dattr) << ", got " + << attr_string((*vec)[i]); + } + } + }; + deduce(in_attrs, in_size, "input"); + + auto write = [&](std::vector *vec, size_t size, const char *name) { + for (size_t i = 0; i < size; ++i) { + CHECK(assign(&(*vec)[i], dattr)) + << "Incompatible attr in node " << attrs.name << " at " << i << "-th " + << name << ": " + << "expected " << attr_string(dattr) << ", got " + << attr_string((*vec)[i]); + } + }; + write(out_attrs, out_size, "output"); + + if (is_none(dattr)) { + return false; + } + return true; +} + +template +inline bool ReorgShape(const NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + if (n_in != -1) { + CHECK_EQ(in_attrs->size(), static_cast(n_in)) + << " in operator " << attrs.name; + } + if (n_out != -1) { + CHECK_EQ(out_attrs->size(), static_cast(n_out)) + << " in operator " << attrs.name; + } + return ReorgAttr( + attrs, in_attrs, out_attrs, TShape()); +} + +template +inline bool ReorgType(const NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + if (n_in != -1) { + CHECK_EQ(in_attrs->size(), static_cast(n_in)) + << " in operator " << attrs.name; + } + if (n_out != -1) { + CHECK_EQ(out_attrs->size(), static_cast(n_out)) + << " in operator " << attrs.name; + } + return ReorgAttr( + attrs, in_attrs, 
+template <int n_in, int n_out>
+inline bool ReorgType(const NodeAttrs &attrs,
+                      std::vector<int> *in_attrs,
+                      std::vector<int> *out_attrs) {
+  if (n_in != -1) {
+    CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_in))
+        << " in operator " << attrs.name;
+  }
+  if (n_out != -1) {
+    CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out))
+        << " in operator " << attrs.name;
+  }
+  return ReorgAttr<int, type_is_none, type_assign, true, type_string>(
+      attrs, in_attrs, out_attrs, -1);
+}
+
+struct ReorgParam : public dmlc::Parameter<ReorgParam> {
+  int stride;
+
+  DMLC_DECLARE_PARAMETER(ReorgParam) {
+    DMLC_DECLARE_FIELD(stride).set_default(1).describe("Stride value");
+  }
+};
+}  // namespace top
+}  // namespace nnvm
+#endif  // NNVM_TOP_VISION_YOLO_REORG_H_
diff --git a/nnvm/tests/python/compiler/test_alter_op_layout.py b/nnvm/tests/python/compiler/test_alter_op_layout.py
new file mode 100644
index 000000000000..aad634f03843
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_alter_op_layout.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Unittest cases for AlterOpLayout pass"""
+from nnvm import symbol as sym
+from nnvm.compiler import graph_attr
+from nnvm.top import registry as reg
+import nnvm.graph as graph
+
+def get_layouts(g):
+    ldict = {}
+    vlayout = g.json_attr("layout")
+    entry_ptr = g.index.entry_ptr
+    for i, n in enumerate(g.index.nodes):
+        begin, end = entry_ptr[i], entry_ptr[i + 1]
+        ldict[n["name"]] = vlayout[begin:end]
+    return ldict
+
+
+def test_alter_conv2d_layout():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    conv = sym.conv2d(data, name="conv", channels=16,
+                      kernel_size=(3,3), padding=(1,1),
+                      use_bias=False, layout="NCHW")
+    # split here
+    convs = sym.split(conv, indices_or_sections=2)
+    relus = [sym.relu(x, name="relu") for x in convs]
+    relu = sym.concatenate(*relus)
+    flatten = sym.flatten(relu, name="flatten")
+    softmax = sym.softmax(flatten, name="softmax")
+    g = graph.create(softmax)
+
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    layouts_origin = get_layouts(g)
+
+    @reg.register_alter_op_layout("conv2d", level=100)
+    def alter_conv2d_layout(attrs, inputs, tinfos):
+        new_attrs = {k : attrs[k] for k in attrs.keys()}
+        new_attrs["layout"] = "NCHW16c"
+        new_attrs["kernel_layout"] = "NCHW16c"
+        new_attrs["name"] = "conv_alter"
+        return sym.conv2d(inputs[0], inputs[1], **new_attrs)
+
+    g = g.apply("AlterOpLayout")
+    layouts = get_layouts(g)
+
+    # check copy layouts
+    for node in ["data", "relu", "flatten", "softmax", "conv_weight"]:
+        assert layouts[node] == layouts_origin[node]
+    assert layouts["conv_alter"] == layouts_origin["conv"]
+
+
+def test_consecutive_alter_layout():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    pool1 = sym.global_avg_pool2d(data, name="global_avg_pool2d_1", layout="NCHW")
+    pool2 = sym.global_avg_pool2d(pool1, name="global_avg_pool2d_2", layout="NCHW")
+    relu = sym.relu(pool2, name="relu")
+
+    g = graph.create(relu)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
+
+    @reg.register_alter_op_layout("global_avg_pool2d",
level=100) + def alter_global_avg_pool2d_layout(attrs, inputs, tinfos): + new_attrs = {k : attrs[k] for k in attrs.keys()} + new_attrs["layout"] = "NCHW16c" + return sym.global_avg_pool2d(inputs[0], **new_attrs) + + g = g.apply("AlterOpLayout") + + # pool1 get replaced - output layout of pool1 is not recorded + # pool2 get replaced - input layout of pool2 is not recorded + # thus the second entry must be undefined - it can neither recover from pool1's output, + # nor from pool2's input. + assert g.json_attr("layout") == ['NCHW', '__undef__', 'NCHW', 'NCHW'] + + +def test_alter_func_return_none(): + data = sym.Variable("data", shape=(1, 32, 512, 512)) + pool1 = sym.global_max_pool2d(data, name="pool1", layout="NCHW") + pool2 = sym.global_max_pool2d(pool1, name="pool2", layout="NCHW") + relu = sym.relu(pool2, name="relu") + + g = graph.create(relu) + g = g.apply("CorrectLayout") + g = graph_attr.set_dtype_inputs(g, "float32") + g = g.apply(["InferShape", "InferType"]) + assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW'] + + @reg.register_alter_op_layout("global_max_pool2d", level=100) + def alter_global_max_pool2d_layout(attrs, inputs, tinfos): + return None + + g = g.apply("AlterOpLayout") + + # alter func return none, nothing get replaced, + # the layouts should remain the same + assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW'] + + +if __name__ == "__main__": + test_alter_conv2d_layout() + test_consecutive_alter_layout() + test_alter_func_return_none() diff --git a/nnvm/tests/python/compiler/test_autotvm_task_extraction.py b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py new file mode 100644 index 000000000000..1ecbf053f923 --- /dev/null +++ b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
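+# Note on the assertions in this file: extract_from_graph is expected to
+# deduplicate identical workloads, so the task counts below are counts of
+# unique (symbol, shape, dtype) combinations rather than of operator nodes.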
+"""Test task extraction for autotvm""" + +import nnvm.testing +import nnvm.compiler +from tvm import autotvm + +def get_network(name, batch_size): + """Get the symbol definition and random weight of a network""" + input_shape = (batch_size, 3, 224, 224) + output_shape = (batch_size, 1000) + + if name == 'resnet-18': + net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size) + elif name == 'mobilenet': + net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size) + elif name == 'squeezenet v1.1': + net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1') + elif name == 'vgg-16': + net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size) + elif name == 'dcgan': + net, params = nnvm.testing.dcgan.get_workload(batch_size=batch_size) + input_shape = (batch_size, 100) + else: + raise ValueError("Unsupported network: " + name) + + return net, params, input_shape, output_shape + +def test_task_extraction(): + target = 'llvm' + dtype = 'float32' + + net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d,)) + assert len(tasks) == 12 + + net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.dense,)) + assert len(tasks) == 1 + + net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d, nnvm.sym.dense)) + assert len(tasks) == 13 + + net, params, input_shape, out_shape = get_network('mobilenet', batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d, nnvm.sym.dense)) + assert len(tasks) == 20 + + net, params, input_shape, out_shape = get_network('dcgan', batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d_transpose,)) + assert len(tasks) == 4 + +if __name__ == '__main__': + test_task_extraction() diff --git a/nnvm/tests/python/compiler/test_build.py b/nnvm/tests/python/compiler/test_build.py new file mode 100644 index 000000000000..a2a5ac659c8f --- /dev/null +++ b/nnvm/tests/python/compiler/test_build.py @@ -0,0 +1,176 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import numpy as np + +import tvm +from tvm.contrib import graph_runtime +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.compiler.build_module import _run_graph, precompute_prune + +def test_compile(): + x = sym.Variable("x") + y = sym.Variable("y") + z = sym.exp(y + x) + shape = (10, 128) + dtype = tvm.float32 + shape_dict = {"x": shape, "y": shape} + def verify(graph, lib): + m = graph_runtime.create(graph, lib, tvm.cpu(0)) + # get member functions + set_input, run, get_output = m["set_input"], m["run"], m["get_output"] + na = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + nb = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + # set inputs + set_input("x", na) + set_input("y", nb) + # execute + run() + # get outputs + out = tvm.nd.empty(shape, dtype) + get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy())) + + graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict) + assert graph.index.num_nodes == 3 + verify(graph, lib) + + with nnvm.compiler.build_config(opt_level=0): + graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict) + # print(graph.ir()) + assert graph.index.num_nodes == 4 + verify(graph, lib) + +def test_run(): + x = sym.Variable("x") + y = sym.Variable("y") + z = sym.exp(y + x) + shape = (10, 10) + dtype = tvm.float32 + nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + res = _run_graph(z, {"x": nx, "y": ny}) + tvm.testing.assert_allclose( + res[0].asnumpy(), np.exp(nx.asnumpy() + ny.asnumpy())) + + +def test_precompute_prune(): + x = sym.Variable("x") + 1 + a = sym.Variable("a") + y = sym.Variable("y") + z = y + x + a + shape = (10, 10) + dtype = tvm.float32 + nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + na = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + params = {"x": nx, "a": na} + graph, lib, params = nnvm.compiler.build( + z, "llvm", shape={"y": ny.shape}, params=params) + assert graph.index.num_nodes == 4 + m = graph_runtime.create(graph, lib, tvm.cpu(0)) + params["y"] = ny + res = tvm.nd.empty(shape) + m["load_params"](nnvm.compiler.save_param_dict(params)) + m.run() + out = m.get_output(0, out=res) + tvm.testing.assert_allclose( + res.asnumpy(), nx.asnumpy() + 1 + ny.asnumpy() + na.asnumpy()) + + +def test_dtypes(): + x = sym.Variable("x") + y = sym.relu(x) + dshape = (1, 3, 32, 32) + oshape = dshape + for dtype in ['float32', 'float64', 'int32', 'int16', 'int8', 'int64']: + graph, lib, _ = nnvm.compiler.build(y, 'llvm', {"x": dshape}, dtype=dtype) + m = graph_runtime.create(graph, lib, tvm.cpu()) + if 'float' in dtype: + data = np.random.uniform(size=dshape).astype(dtype) + elif 'int' in dtype: + data = np.random.randint(-127, 127, dshape).astype(dtype) + m.run(x=data) + data = (data > 0) * data + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5) + +def test_ndarray_output(): + x = sym.Variable("x") + y = sym.Variable("y") + z = x + y + shape = (10, 10) + dtype = tvm.float32 + nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + params = {"x": nx, "ny": ny} + graph, lib, params = nnvm.compiler.build( + z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params) + m = graph_runtime.create(graph, lib, tvm.cpu(0)) + m.set_input("x", nx) + 
m.set_input("y", ny) + m.run() + out = m.get_output(0) + tvm.testing.assert_allclose( + out.asnumpy(), nx.asnumpy() + ny.asnumpy()) + +def test_ndarray_input(): + x = sym.Variable("x") + y = sym.Variable("y") + z = x + y + shape = (10, 10) + dtype = tvm.float32 + nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + params = {"x": nx, "ny": ny} + graph, lib, params = nnvm.compiler.build( + z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params) + m = graph_runtime.create(graph, lib, tvm.cpu(0)) + m.set_input("x", nx) + m.set_input("y", ny) + in_x = tvm.nd.empty(shape, dtype) + in_y = tvm.nd.empty(shape, dtype) + m.get_input("x", in_x) + m.get_input("y", in_y) + tvm.testing.assert_allclose(nx.asnumpy(), in_x.asnumpy()) + tvm.testing.assert_allclose(ny.asnumpy(), in_y.asnumpy()) + in_nx = m.get_input("x") + in_ny = m.get_input("y") + tvm.testing.assert_allclose(nx.asnumpy(), in_nx.asnumpy()) + tvm.testing.assert_allclose(ny.asnumpy(), in_ny.asnumpy()) + +def test_num_outputs(): + x = sym.Variable('x') + z = sym.split(x, indices_or_sections=5, axis=1) + shape = (10, 10) + dtype = tvm.float32 + nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype)) + params = {"x": nx} + graph, lib, params = nnvm.compiler.build( + z, "llvm", shape={"x": nx.shape}, params=params) + m = graph_runtime.create(graph, lib, tvm.cpu(0)) + assert m.get_num_outputs() == 5 + +if __name__ == "__main__": + test_precompute_prune() + test_compile() + test_run() + test_dtypes() + test_ndarray_output() + test_ndarray_input() + test_num_outputs() diff --git a/nnvm/tests/python/compiler/test_fold_axis.py b/nnvm/tests/python/compiler/test_fold_axis.py new file mode 100644 index 000000000000..2bceb652162a --- /dev/null +++ b/nnvm/tests/python/compiler/test_fold_axis.py @@ -0,0 +1,174 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Unittest cases for fold_axis""" +import tvm +import nnvm +import nnvm.testing.resnet +import numpy as np +from nnvm import symbol as sym +from nnvm.compiler import graph_util, graph_attr + +def test_fold_axis_conv(): + # Before simplify + def before(x, conv_weight, conv_bias, in_scale, out_scale, channels): + x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2) + y = sym.conv2d(x, conv_weight, conv_bias, + channels=channels, + kernel_size=(3, 3), + padding=(1, 1), + name="conv") + y = sym.relu(y) + y = y * sym.expand_dims(out_scale, axis=1, num_newaxis=2) + return y + + def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels): + conv_weight = conv_weight * sym.expand_dims(out_scale, axis=1, num_newaxis=3) + conv_weight = conv_weight * sym.expand_dims(in_scale, axis=1, num_newaxis=2) + conv_bias = conv_bias * out_scale + y = sym.conv2d(x, + conv_weight, + conv_bias, + channels=channels, + kernel_size=(3, 3), + padding=(1, 1), + name="conv") + y = sym.relu(y) + return y + + def check(shape, channels): + x = sym.Variable("x") + 1 + weight = sym.Variable("weight") + bias = sym.Variable("bias") + in_scale = sym.Variable("in_scale") + out_scale = sym.Variable("out_scale") + y1 = before(x, weight, bias, in_scale, out_scale, channels) + y2 = expected(x, weight, bias, in_scale, out_scale, channels) + ishape = {"x": shape, "out_scale": (channels,), "in_scale": (shape[1],)} + g1 = nnvm.graph.create(y1) + g2 = nnvm.graph.create(y2) + graph_attr.set_shape_inputs(g1, ishape) + g1 = g1.apply("InferShape").apply("FoldScaleAxis") + # assert graph equals as expected + graph_util.check_graph_equal(g1, g2) + + check((2, 4, 10, 10), 2) + +def test_fold_axis_depthwise_conv(): + # Before simplify + def before(x, conv_weight, conv_bias, in_scale, out_scale, channels): + x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2) + y = sym.conv2d(x, conv_weight, conv_bias, + channels=channels, + kernel_size=(3, 3), + padding=(1, 1), + groups=54, + name="depthiwise_conv") + y = sym.relu(y) + y = y * sym.expand_dims(out_scale, axis=1, num_newaxis=2) + return y + + def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels): + conv_weight = conv_weight * sym.expand_dims(out_scale, axis=1, num_newaxis=3) + conv_weight = conv_weight * sym.expand_dims(in_scale, axis=1, num_newaxis=3) + conv_bias = conv_bias * out_scale + y = sym.conv2d(x, + conv_weight, + conv_bias, + channels=channels, + kernel_size=(3, 3), + padding=(1, 1), + groups=54, + name="depthiwise_conv") + y = sym.relu(y) + return y + + def check(shape, channels): + x = sym.Variable("x") + 1 + weight = sym.Variable("weight") + bias = sym.Variable("bias") + in_scale = sym.Variable("in_scale") + out_scale = sym.Variable("out_scale") + y1 = before(x, weight, bias, in_scale, out_scale, channels) + y2 = expected(x, weight, bias, in_scale, out_scale, channels) + ishape = {"x": shape, "out_scale": (channels,), "in_scale": (shape[1],)} + g1 = nnvm.graph.create(y1) + g2 = nnvm.graph.create(y2) + graph_attr.set_shape_inputs(g1, ishape) + g1 = g1.apply("InferShape").apply("FoldScaleAxis") + # assert graph equals as expected + graph_util.check_graph_equal(g1, g2) + + check((1, 54, 63, 127), 54) + +def test_fold_fail(): + # Before simplify + def before(x, scale, channels): + y = sym.conv2d(x, + channels=channels, + kernel_size=(3, 3), + padding=(1, 1), + name="conv") + y = y * sym.expand_dims(scale, axis=1, num_newaxis=1) + return y + + def check(shape, channels): + x = sym.Variable("x") + bias = sym.Variable("bias") + scale = 
sym.Variable("scale") + y1 = before(x, scale, channels) + ishape = {"x": shape, "scale": (channels,), "bias": (channels,)} + g1 = nnvm.graph.create(y1) + graph_attr.set_shape_inputs(g1, ishape) + g2 = g1.apply("InferShape").apply("FoldScaleAxis") + # assert graph equals as expected + graph_util.check_graph_equal(g1, g2) + + check((2, 10, 10, 10), 10) + + +def test_fold_resnet(): + batch_size = 1 + num_classes = 1000 + image_shape = (3, 224, 224) + data_shape = (batch_size,) +image_shape + net, params = nnvm.testing.resnet.get_workload( + batch_size=1, image_shape=image_shape) + ishape = {"data" : data_shape} + graph = nnvm.graph.create(net) + data = np.random.uniform(size=data_shape).astype("float32") + # Initial pass do shape type inference + shape, _ = graph_util.infer_shape(graph, **ishape) + ishape.update(zip(graph.index.input_names, shape)) + + def run_prune(graph, params, opt_level): + # Apply optimization + with nnvm.compiler.build_config(opt_level=0): + graph = nnvm.compiler.optimize(graph, ishape) + graph, params = nnvm.compiler.build_module.precompute_prune(graph, params) + params["data"] = data + return nnvm.compiler.build_module._run_graph(graph, params) + + x = run_prune(graph, params, 0) + y = run_prune(graph, params, 3) + tvm.testing.assert_allclose(y[0].asnumpy(), x[0].asnumpy()) + + +if __name__ == "__main__": + test_fold_resnet() + test_fold_axis_conv() + test_fold_fail() + test_fold_axis_depthwise_conv() diff --git a/nnvm/tests/python/compiler/test_graph_pass.py b/nnvm/tests/python/compiler/test_graph_pass.py new file mode 100644 index 000000000000..d65a2be9abf8 --- /dev/null +++ b/nnvm/tests/python/compiler/test_graph_pass.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unittest cases for graph pass""" +import nnvm +import nnvm.compiler +from nnvm import symbol as sym +from nnvm.compiler import graph_util, graph_attr + +def test_infer_attr(): + x = sym.Variable("x") + y = x * 2 + g = nnvm.graph.create(y) + ishape, oshape = graph_util.infer_shape(g, x=(10,20)) + assert tuple(oshape[0]) == (10, 20) + + itype, otype = graph_util.infer_dtype(g, x="float32") + assert otype[0] == "float32" + +if __name__ == "__main__": + test_infer_attr() diff --git a/nnvm/tests/python/compiler/test_nhwc_layout.py b/nnvm/tests/python/compiler/test_nhwc_layout.py new file mode 100644 index 000000000000..e3747daf8563 --- /dev/null +++ b/nnvm/tests/python/compiler/test_nhwc_layout.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import tvm +from tvm.contrib import graph_runtime as runtime +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list + +def get_sym(layout, kernel_layout, channels): + data = sym.Variable(name="data") + data = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1), + layout=layout, kernel_layout=kernel_layout, use_bias=True) + data = sym.max_pool2d(data=data, pool_size=(2, 2), strides=(2, 2), layout=layout) + data = sym.upsampling(data=data, scale=2, layout=layout) + softmax_axis = 1 + if layout == "NHWC": + softmax_axis = 3 + data = sym.softmax(data=data, axis=softmax_axis) + return data + + +def build_and_run(sym, params, data, out_shape): + ctx = tvm.cpu(0) + graph, lib, params = nnvm.compiler.build(sym, "llvm", shape={"data":data.shape}, params=params) + module = runtime.create(graph, lib, ctx) + module.set_input(**params) + module.set_input("data", data) + module.run() + out = module.get_output(0, tvm.nd.empty(out_shape)) + return out.asnumpy() + + +def test_nhwc(): + data_shape = (1, 3, 224, 224) + out_channel = 8 + nchw_sym = get_sym("NCHW", "OIHW", out_channel) + nhwc_sym = get_sym("NHWC", "HWIO", out_channel) + conv_weight = np.random.uniform(-1, 1, (out_channel, 3, 3, 3)).astype(np.float32) + conv_bias = np.random.uniform(-1, 1, (out_channel)).astype(np.float32) + nchw_params = { + "conv2d0_weight" : tvm.nd.array(conv_weight, ctx=tvm.cpu(0)), + "conv2d0_bias" : tvm.nd.array(conv_bias, ctx=tvm.cpu(0)) + } + nhwc_params = { + "conv2d1_weight" : tvm.nd.array(conv_weight.transpose(2, 3, 1, 0), ctx=tvm.cpu(0)), + "conv2d1_bias" : tvm.nd.array(conv_bias, ctx=tvm.cpu(0)) + } + + data = np.random.uniform(-1, 1, data_shape).astype(np.float32) + oshape = (1, out_channel, 224, 224) + oshape_nhwc = (1, 224, 224, out_channel) + nchw_output = build_and_run(nchw_sym, nchw_params, data, oshape) + nhwc_output = build_and_run(nhwc_sym, nhwc_params, data.transpose(0, 2, 3, 1), oshape_nhwc) + tvm.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2), rtol=1e-5, atol=1e-5) + + +if __name__ == "__main__": + test_nhwc() diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py new file mode 100644 index 000000000000..bc0caeecf58c --- /dev/null +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -0,0 +1,248 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import nnvm +import numpy as np +import tvm +import topi.testing +from tvm.contrib import graph_runtime +from nnvm import symbol as sym +from nnvm.compiler import graph_util, graph_attr +from nnvm.testing import ctx_list, utils + +def test_ewise_injective(): + x = sym.Variable("x") + y = x * 2 + y = sym.flatten(y) + 1 + dshape = (10, 2, 3) + shape_dict = {"x": dshape} + dtype = "float32" + target = "llvm" + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + assert graph.index.num_nodes == 2 + m = graph_runtime.create(graph, lib, ctx) + x_np = np.random.uniform(size=dshape).astype(dtype) + m.run(x=x_np) + out = m.get_output(0, tvm.nd.empty((10, 6))) + tvm.testing.assert_allclose( + out.asnumpy(), x_np.reshape(out.shape) * 2 + 1, + atol=1e-5, rtol=1e-5) + + +def test_conv_ewise_injective(): + x = sym.Variable("x") + y = sym.conv2d(x, channels=32, kernel_size=(3, 3), groups=32, + name="y", padding=(1,1)) + y = sym.flatten(y + 1) + 1 + dtype = "float32" + dshape = (1, 32, 18, 18) + kshape = (32, 1, 3, 3) + oshape = (1, 32* 18 * 18) + shape_dict = {"x": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + # print(graph.ir(join_entry_attrs=["shape"])) + assert graph.index.num_nodes == 5 + # set input + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype)) + m.run(x=data, y_weight=kernel, y_bias=bias) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.depthwise_conv2d_python_nchw( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + 1 + c_np = c_np.reshape(c_np.shape[0], np.prod(c_np.shape[1:])) + 1 + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_injective_reduce_injective(): + x = sym.Variable("x") + x = sym.flatten(x) + 1 + y = sym.sum(x, axis=1) + dtype = "float32" + dshape = (32, 1, 18, 18) + shape_dict = {"x": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + assert graph.index.num_nodes == 2 + data = np.random.uniform(size=dshape).astype(dtype) + m.run(x=data) + c_np = np.sum(data.reshape(32, 18 * 18) + 1, axis=1) + # get output + out = m.get_output(0, tvm.nd.empty(c_np.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_injective_conv2d(): + channels = 16 + data = sym.Variable(name="data") + pool = sym.global_avg_pool2d(data=data) + weight = sym.reshape(pool, shape=[1, channels, 1, 1]) + residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1), + layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv") + net = weight * data + residual + size = 56 + dtype="float32" + dshape = (1, channels, size, size) + kshape = (channels, channels, 3, 3) + oshape = dshape + shape_dict = {"data": dshape} + + for 
target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, global_avg_pool, conv weight, conv op, fused elemwise add + assert graph.index.num_nodes == 5 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + residual = topi.testing.conv2d_nchw_python( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + weight = np.mean(data.asnumpy(), axis=(2, 3)) + c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_concatenate_conv2d(): + ch = 3 + size = 8 + data = sym.Variable(name="data") + concat = sym.concatenate(data, data, axis=1) + conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv") + net = sym.elemwise_add(concat, conv) + + dtype="float32" + dshape = (1, ch, size, size) + kshape = (ch*2, ch*2, 1, 1) + oshape = (1, ch*2, size, size) + shape_dict = {"data": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, conv weight, conv op, concat + assert graph.index.num_nodes == 4 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + + concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1) + conv = topi.testing.conv2d_nchw_python( + concat, kernel.asnumpy(), (1,1), 'SAME') + ref = concat + conv + tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5) + + +def test_residual_block_layout_transform(): + ch = 16 + size = 32 + data = sym.Variable(name="data") + conv1 = sym.conv2d(data=data, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv1") + layout_transform1 = sym.__layout_transform__(data=conv1, src_layout="NCHW", dst_layout="NCHW8c") + layout_transform2 = sym.__layout_transform__(data=layout_transform1, src_layout="NCHW8c", dst_layout="NCHW") + conv2 = sym.conv2d(data=conv1, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv2") + elemwise_sum = sym.elemwise_add(layout_transform2, conv2) + out = sym.relu(elemwise_sum) + + dtype="float32" + dshape = (1, ch, size, size) + kshape = (ch, ch, 3, 3) + oshape = (1, ch, size, size) + shape_dict = {"data": dshape} + + target = "llvm" # only test on llvm since it involves NCHW8c layout + ctx = tvm.context(target, 0) + graph, lib, _ = nnvm.compiler.build(out, target, shape_dict) + # data, conv1 weight, conv1, layout transform + elemwise add + relu, conv2 weight, conv2 op + assert graph.index.num_nodes == 6 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel1 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + kernel2 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv1_weight=kernel1, conv2_weight=kernel2) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + + conv1 = topi.testing.conv2d_nchw_python( + data.asnumpy(), kernel1.asnumpy(), (1,1), 'SAME') + conv2 = topi.testing.conv2d_nchw_python( + conv1, kernel2.asnumpy(), (1,1), 'SAME') + ref = np.maximum(conv1 + conv2, 0) + 
tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5) + + +def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): + with nnvm.compiler.build_config(opt_level=opt_level): + graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) + module = graph_runtime.create(graph, lib, ctx) + module.set_input(**params) + module.set_input("data", data) + module.run() + out = module.get_output(0, tvm.nd.empty(out_shape)) + return out.asnumpy(), graph + + +def test_fuse_conv2d_elu(): + def elu(data): + return -0.5 * sym.relu(1 - sym.exp(data)) + sym.relu(data) + + def get_sym(out_channel): + data = sym.Variable(name="data") + data = sym.conv2d(data=data, kernel_size=(3,3), channels=out_channel, padding=(1, 1), + layout="NCHW", kernel_layout="OIHW", use_bias=True) + data = sym.batch_norm(data) + data = elu(data) + return data + + in_channel = 8 + out_channel = 16 + size = 64 + dshape = (1, in_channel, size, size) + oshape = (1, out_channel, size, size) + data = np.random.uniform(-1, 1, dshape).astype(np.float32) + + for target, ctx in ctx_list(): + sym1 = get_sym(out_channel) + sym2 = get_sym(out_channel) + _, params1 = utils.create_workload(sym1, 1, dshape[1:], seed=0) + _, params2 = utils.create_workload(sym2, 1, dshape[1:], seed=0) + output1, g1 = build_and_run(sym1, params1, data, oshape, target, ctx, opt_level=2) + output2, g2 = build_and_run(sym2, params2, data, oshape, target, ctx, opt_level=0) + tvm.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5) + # data, conv weight, bias, batch norm gamma, batch norm beta, conv op + assert g1.index.num_nodes == 6 + +if __name__ == "__main__": + test_injective_reduce_injective() + test_ewise_injective() + test_conv_ewise_injective() + test_fuse_conv2d_elu() + test_injective_conv2d() + test_concatenate_conv2d() + test_residual_block_layout_transform() diff --git a/nnvm/tests/python/compiler/test_optimizer.py b/nnvm/tests/python/compiler/test_optimizer.py new file mode 100644 index 000000000000..86a9b71b46dc --- /dev/null +++ b/nnvm/tests/python/compiler/test_optimizer.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
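+# The update rules re-implemented by hand in the tests below: SGD with
+# weight decay,
+#   w <- w - lr * (clip(rescale * grad) + wd * w)
+# and Adam with bias correction,
+#   m <- b1 * m + (1 - b1) * g;  v <- b2 * v + (1 - b2) * g^2
+#   w <- w - lr * sqrt(1 - b2^t) / (1 - b1^t) * (m / (sqrt(v) + eps) + wd * w)
+# with a FactorScheduler halving the learning rate after every step.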
+import numpy as np +import tvm +import nnvm +import nnvm.compiler.optimizer as optimizer +import nnvm.compiler.lr_scheduler as lr_scheduler + +from nnvm.testing.config import ctx_list +from tvm.contrib import graph_runtime + + +def helper(symbol, inputs, params, update_func, run_times, target, ctx, dtype="float32"): + ishapes = {} + np_inputs = {} + params_dict = {} + for (name, shape, s) in inputs: + ishapes.update({name: shape}) + np_inputs.update({name: np.random.uniform(size=shape).astype(dtype)}) + for (name, shape, s) in params: + np_inputs.update({name: np.random.uniform(size=shape).astype(dtype)}) + params_dict.update({name: np_inputs[name]}) + + graph, lib, rt_params = nnvm.compiler.build(symbol, target, shape=ishapes) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**np_inputs) + m.set_input(**rt_params) + for _ in range(run_times): + m.run() + y_np = update_func(**np_inputs) + out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5) + + +def test_sgd(): + for target, ctx in ctx_list(): + data = nnvm.sym.Variable("data") + weight = nnvm.sym.Variable("weight") + out = nnvm.sym.elemwise_mul(data, weight ** 2) + + dshape = (1, 2, 3) + wshape = dshape + + base_lr = 0.1 + lr_factor = 0.5 + rescale_grad = 0.2 + wd = 0.1 + clip_gradient = 0.25 + + scheduler = lr_scheduler.FactorScheduler(base_lr=base_lr, step=1, factor=lr_factor) + opt = optimizer.SGD(learning_rate=base_lr, lr_scheduler=scheduler, + rescale_grad=rescale_grad, clip_gradient=clip_gradient, + wd=wd) + opt_sym = opt.minimize(out, var=weight) + + inputs = [("data", dshape, data)] + params = [("weight", wshape, weight)] + + def update_func(data, weight): + gradient_0 = data * 2 * weight * rescale_grad + gradient_0 = np.clip(gradient_0, -clip_gradient, clip_gradient) + weight_0 = weight - base_lr * lr_factor * (gradient_0 + wd * weight) + gradient_1 = data * 2 * weight_0 * rescale_grad + gradient_1 = np.clip(gradient_1, -clip_gradient, clip_gradient) + weight_1 = weight_0 - base_lr * (lr_factor ** 2) * (gradient_1 + wd * weight_0) + return weight_1 + + helper(opt_sym, inputs, params, update_func, 2, target, ctx) + + + +def test_adam(): + for target, ctx in ctx_list(): + data = nnvm.sym.Variable("data") + weight = nnvm.sym.Variable("weight") + out = nnvm.sym.elemwise_mul(data, weight ** 2) + + dshape = (1, 2, 3) + wshape = dshape + + base_lr = 0.1 + beta1 = 0.9 + beta2 = 0.999 + epsilon = 1e-8 + lr_factor = 0.5 + rescale_grad = 0.2 + wd = 0.1 + clip_gradient = 0.25 + + scheduler = lr_scheduler.FactorScheduler(base_lr=base_lr, step=1, factor=lr_factor) + opt = optimizer.Adam(learning_rate=base_lr, beta1=beta1, beta2=beta2, epsilon=epsilon, + lr_scheduler=scheduler, rescale_grad=rescale_grad, + clip_gradient=clip_gradient, wd=wd) + opt_sym = opt.minimize(out, var=weight) + + inputs = [("data", dshape, data)] + params = [("weight", wshape, weight)] + + def update_func(data, weight): + rate_0 = np.sqrt(1 - beta2) / (1 - beta1) + lr_0 = base_lr * lr_factor * rate_0 + gradient_0 = data * 2 * weight * rescale_grad + gradient_0 = np.clip(gradient_0, -clip_gradient, clip_gradient) + m_0 = (1 - beta1) * gradient_0 + v_0 = (1 - beta2) * (gradient_0 ** 2) + weight_0 = weight - lr_0 * (m_0 / (np.sqrt(v_0) + epsilon) + wd * weight) + rate_1 = np.sqrt(1 - beta2 ** 2) / (1 - beta1 ** 2) + lr_1 = base_lr * (lr_factor ** 2) * rate_1 + gradient_1 = data * 2 * weight_0 * rescale_grad + gradient_1 = np.clip(gradient_1, -clip_gradient, clip_gradient) + m_1 = beta1 * 
m_0 + (1 - beta1) * gradient_1 + v_1 = beta2 * v_0 + (1 - beta2) * (gradient_1 ** 2) + weight_1 = weight_0 - lr_1 * (m_1 / (np.sqrt(v_1) + epsilon) + wd * weight_0) + return weight_1 + + helper(opt_sym, inputs, params, update_func, 2, target, ctx) + +if __name__ == "__main__": + test_sgd() + test_adam() diff --git a/nnvm/tests/python/compiler/test_param_dict.py b/nnvm/tests/python/compiler/test_param_dict.py new file mode 100644 index 000000000000..b30f8f99082c --- /dev/null +++ b/nnvm/tests/python/compiler/test_param_dict.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import os +import numpy as np +import nnvm.compiler +import tvm +import json +import base64 +from tvm._ffi.base import py_str +from tvm import rpc +from tvm.contrib import util, graph_runtime + + +def test_save_load(): + x = np.random.uniform(size=(10, 2)).astype("float32") + y = np.random.uniform(size=(1, 2, 3)).astype("float32") + x[:] = 1 + y[:] = 1 + params = {"x": x, "y": y} + param_bytes = nnvm.compiler.save_param_dict(params) + assert isinstance(param_bytes, bytearray) + param2 = nnvm.compiler.load_param_dict(param_bytes) + assert len(param2) == 2 + np.testing.assert_equal(param2["x"].asnumpy(), x) + np.testing.assert_equal(param2["y"].asnumpy(), y) + + +def test_ndarray_reflection(): + x = np.random.uniform(size=(10, 2)).astype("float32") + xx = tvm.nd.array(x) + xnode = tvm.make.node("NDArrayWrapper", name="xx", array=xx) + xnode2 = tvm.make.node("NDArrayWrapper", name="x2", array=xx) + assert xnode.array.same_as(xx) + json_str = tvm.save_json([xnode, xnode2]) + json_dict = json.loads(json_str) + b64_str = json_dict["b64ndarrays"][0] + decoded = py_str(base64.b64encode(base64.b64decode(b64_str))) + assert b64_str == decoded + xlist = tvm.load_json(json_str) + np.testing.assert_equal(xlist[0].array.asnumpy(), xx.asnumpy()) + assert xlist[1].array == xlist[0].array + + +def test_bigendian_rpc_param(): + """Test big endian rpc when there is a PowerPC RPC server available""" + host = os.environ.get("TVM_POWERPC_TEST_HOST", None) + port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090) + if host is None: + return + + def verify_nnvm(remote, target, shape, dtype): + x = nnvm.sym.Variable("x") + y = x + 1 + graph, lib, _ = nnvm.compiler.build( + y, target, + shape={"x": shape}, + dtype={"x": dtype}) + + temp = util.tempdir() + path_dso = temp.relpath("dev_lib.o") + lib.save(path_dso) + remote.upload(path_dso) + lib = remote.load_module("dev_lib.o") + a = np.random.randint(0, 256, size=shape).astype(dtype) + a[:] = 1 + params = {"x" : a} + ctx = remote.cpu(0) + m = graph_runtime.create(graph, lib, ctx) + # uses save param_dict + m.load_params(nnvm.compiler.save_param_dict(params)) + m.run() + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) + 
tvm.testing.assert_allclose(a + 1, out.asnumpy()) + + print("Test RPC connection to PowerPC...") + remote = rpc.connect(host, port) + target = "llvm -mtriple=powerpc-linux-gnu" + for dtype in ["float32", "float64", "int32", "int8"]: + verify_nnvm(remote, target, (10,), dtype) + + + +if __name__ == "__main__": + test_ndarray_reflection() + test_save_load() + test_bigendian_rpc_param() diff --git a/nnvm/tests/python/compiler/test_rpc_exec.py b/nnvm/tests/python/compiler/test_rpc_exec.py new file mode 100644 index 000000000000..1584f7c589a4 --- /dev/null +++ b/nnvm/tests/python/compiler/test_rpc_exec.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm import rpc +from tvm.contrib import util, graph_runtime +import nnvm.symbol as sym +import nnvm.compiler +import numpy as np +import time + +def test_rpc_executor(): + host = "localhost" + port = 9021 + server = rpc.Server(host, port, use_popen=True) + time.sleep(1) + x = sym.Variable("x") + y = sym.Variable("y") + z = sym.exp(y + x) + shape = (10, 128) + dtype = tvm.float32 + shape_dict = {"x": shape, "y": shape} + tmp = util.tempdir() + lib_name = tmp.relpath("net.o") + + graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict) + # save module + lib.save(lib_name) + remote = rpc.connect(host, port) + remote.upload(lib_name) + ctx = remote.cpu(0) + # load remote + rlib = remote.load_module("net.o") + + # Create remotemodule + m = graph_runtime.create(graph, rlib, remote.cpu(0)) + # get member functions + set_input, run, get_output = m["set_input"], m["run"], m["get_output"] + na = tvm.nd.array(np.ones(shape).astype(dtype), ctx) + nb = tvm.nd.array(np.ones(shape).astype(dtype), ctx) + # set inputs + set_input("x", na) + set_input("y", nb) + # execute + run() + # get outputs + out = tvm.nd.empty(shape, dtype, ctx) + get_output(0, out) + tvm.testing.assert_allclose( + out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy())) + server.terminate() + +if __name__ == "__main__": + test_rpc_executor() diff --git a/nnvm/tests/python/compiler/test_simplify_inference.py b/nnvm/tests/python/compiler/test_simplify_inference.py new file mode 100644 index 000000000000..2f520bd6c125 --- /dev/null +++ b/nnvm/tests/python/compiler/test_simplify_inference.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unittest cases for simplify batch_norm""" +import nnvm +from nnvm import symbol as sym +from nnvm.compiler import graph_util, graph_attr + +def test_simplify_batchnorm(): + def simple_bn(x, gamma, beta, moving_mean, moving_var, + axis=1, epsilon=1e-5, shape=None): + # expect = (x - moving_mean) / sym.sqrt(moving_var + eps) * gamma + beta + scale = sym.elemwise_mul(1 / sym.sqrt(moving_var + epsilon), gamma) + shift = sym.elemwise_add( + sym.elemwise_mul(sym.negative(moving_mean), scale), beta) + # for 2D + num_newaxis=len(shape) - axis - 1 + if num_newaxis: + scale = sym.expand_dims(scale, axis=1, num_newaxis=num_newaxis) + shift = sym.expand_dims(shift, axis=1, num_newaxis=num_newaxis) + return x * scale + shift + + + # Before simplify + def check(dim, axis, nstep): + eps = 0.01 + x = sym.Variable("x") + 1 + beta = sym.Variable("beta") + gamma = sym.Variable("gamma") + moving_var = sym.Variable("moving_var") + moving_mean = sym.Variable("moving_mean") + y1, y2 = x, sym.Variable("xx") + 1 + ishape = {"x": tuple(10 for i in range(dim))} + for i in range(nstep): + y1 = sym.batch_norm( + y1 + 1, gamma, beta, moving_mean, moving_var, epsilon=eps, axis=axis) + y1 = sym.dropout(y1) + y2 = simple_bn(y2 + 1, gamma, beta, moving_mean, moving_var, + epsilon=eps, axis=axis, shape=ishape["x"]) + g = nnvm.graph.create(y1) + g2 = nnvm.graph.create(y2) + graph_attr.set_shape_inputs(g, ishape) + g1 = g.apply("InferShape").apply("SimplifyInference") + # assert graph equals as expected + graph_util.check_graph_equal(g1, g2) + + check(2, 1, 1) + check(4, 0, 3) + check(4, 1, 2) + +if __name__ == "__main__": + test_simplify_batchnorm() diff --git a/nnvm/tests/python/compiler/test_to_relay.py b/nnvm/tests/python/compiler/test_to_relay.py new file mode 100644 index 000000000000..dac14a8c1f22 --- /dev/null +++ b/nnvm/tests/python/compiler/test_to_relay.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
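+# check_model runs the same network twice, once through the legacy
+# nnvm.compiler/graph_runtime path and once converted with to_relay.to_relay
+# and evaluated by the Relay graph executor, then asserts both outputs match.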
+import nnvm
+from nnvm import testing
+from nnvm import to_relay
+import tvm
+from tvm.relay import transform
+from tvm.relay import create_executor
+from tvm.contrib import graph_runtime
+import numpy as np
+
+def check_model(sym, shapes, dtypes, params):
+    net = nnvm.graph.create(sym)
+    graph_json, mod, params = nnvm.compiler.build(
+        net,
+        'llvm',
+        shape=shapes,
+        dtype=dtypes,
+        params=params)
+    nnvm_rts = graph_runtime.create(graph_json, mod, tvm.cpu(0))
+    inputs = {}
+    for name in shapes:
+        np_array = np.random.rand(*shapes[name]).astype('float32')
+        inputs[name] = tvm.nd.array(np_array)
+
+    nnvm_rts.set_input(**params)
+    nnvm_rts.run(**inputs)
+    nnvm_out = nnvm_rts.get_output(0)
+    relay_model, params = to_relay.to_relay(net, shapes, dtypes, params)
+    mod = tvm.relay.Module.from_expr(relay_model)
+    mod = transform.InferType()(mod)
+    relay_rts = create_executor(kind='graph', mod=mod, ctx=tvm.cpu(0), target='llvm')
+    inputs.update(params)
+    relay_out = relay_rts.evaluate()(*list(inputs.values()))
+    np.testing.assert_allclose(nnvm_out.asnumpy(), relay_out.asnumpy())
+
+# def test_mlp():
+#     mlp, params = testing.mlp.get_workload(1)
+#     shapes = { "data": (10, 3, 224, 224) }
+#     dtypes = { "data": 'float32' }
+#     check_model(mlp, shapes, dtypes, params)
+
+if __name__ == "__main__":
+    # test_mlp is disabled above; keep the script entry point a no-op so the
+    # file still runs without a NameError.
+    pass
diff --git a/nnvm/tests/python/compiler/test_top_assign.py b/nnvm/tests/python/compiler/test_top_assign.py
new file mode 100644
index 000000000000..dae0506edc36
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_top_assign.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
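+# _assign makes the graph stateful: every m.run() first advances w to w + 1
+# and then stores w + 1 into w2, so w2 reads back as data + 2 after one run
+# and data + 3 after two, which is exactly what the checks below assert.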
+import numpy as np + +import tvm +from tvm.contrib import graph_runtime + +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list + + +def test_update(): + w = sym.Variable("w") + w2 = sym.Variable("w2") + w = sym._assign(w, w + 1) + w2 = sym._assign(w2, w + 1) + + dshape = (5, 3, 18, 18) + shape_dict = {"w": dshape, "w2":dshape} + dtype = "float32" + + def check(target, ctx): + graph, lib, _ = nnvm.compiler.build(w2, target, shape_dict) + + m = graph_runtime.create(graph, lib, ctx) + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.set_input("w", data) + m.run() + out = m.get_input("w2", tvm.nd.empty(dshape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 2, rtol=1e-5) + + m.run() + out = m.get_input("w2", tvm.nd.empty(dshape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 3, rtol=1e-5) + + for target, ctx in ctx_list(): + check(target, ctx) + + +if __name__ == "__main__": + test_update() diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py new file mode 100644 index 000000000000..ae6266cdde54 --- /dev/null +++ b/nnvm/tests/python/compiler/test_top_level1.py @@ -0,0 +1,605 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
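+"""Unittest cases for level-1 operators and the check_function helper."""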
+import numpy as np +import tvm +from tvm.contrib import graph_runtime +import topi.testing +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list +from nnvm.testing.check_computation import check_function + +def test_check_function(): + # test the testing function + + x = sym.Variable("x") + y = sym.Variable("y") + + # different styles of returning gradients from the backward function + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: [head_grads, 2*head_grads], + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: (head_grads, 2*head_grads), + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads}, + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: {'y': 2*head_grads}, + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: [2*head_grads], + grad_input_vars=[y], + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: 2*head_grads, + grad_input_vars=[y], + shape={'x': (1, 2), y: (1, 2)}, dtype='float32') + check_function(x + 2*y, lambda x, y: x + 2*y, + lambda x, y, head_grads: 2*head_grads, + grad_input_vars=[y], + shape={'x': (1, 2), y: (1, 2)}, dtype='float64') + + # test just numerical gradients + # different styles of shape and dtype passing + check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, + numerical_grads=True) + check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype='float32', + numerical_grads=True) + check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype={x: 'float32', 'y': 'float32'}, + numerical_grads=True) + check_function(x + 2*y, shape=(1, 2), dtype='float32', + numerical_grads=True) + + # specifying variable attributes on variable creation + # (in this case type codes must be used) + x = sym.Variable("x", dtype=0, shape=(1, 2)) + check_function(x + 2*y, shape={y: (1, 2)}, dtype={'y': 'float32'}, numerical_grads=True) + y = sym.Variable("y", dtype=0, shape=(1, 2)) + + # shape overriding + def _fwd1(x, y): + assert x.shape == (1, 1) + assert y.shape == (1, 2) + return x + 2*y + check_function(x + 2*y, _fwd1, shape={x: (1, 1)}) + + # in_range + def _fwd2(x, y): + assert x.shape == (100,) + assert (x <= 0.9).all() + assert (x >= 0.8).all() + return x + 2*y + check_function(x + 2*y, _fwd2, shape=(100,), in_range=(0.8, 0.9), numerical_grads=False) + check_function(x + 2*y, _fwd2, shape=(100,), in_range={'x': (0.8, 0.9)}, numerical_grads=False) + check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0], + in_range={'head_grads_0': (1.0, 1.0)}) + # explicit passing of values + check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0], + values={'head_grads_0': np.full((1, 2), 1.0)}) + + # check that the function reports errors + def _check_function_must_fail(*args, **kwargs): + error = AssertionError + if 'error' in kwargs: + error = kwargs['error'] + del kwargs['error'] + try: + check_function(*args, quiet=True, **kwargs) + except error: + pass + else: + raise AssertionError("check_function didn't raise an exception") + + _check_function_must_fail(x + 2*y, error=ValueError) + _check_function_must_fail(x + 2*y, lambda x, y: x + y) + _check_function_must_fail(x + 2*y, backward=lambda x, y, 
head_grads: [1.0, 2.0])
+    _check_function_must_fail(sym.block_grad(x + 2*y), numerical_grads=True)
+    _check_function_must_fail(x*x, numerical_grads=True,
+                              numerical_grads_params={'atol': 0.0, 'rtol': 0.0})
+    _check_function_must_fail(sym.log(-x*x), numerical_grads=True, error=ValueError)
+
+    # different styles of returning results from the forward function
+    check_function(x + 2*y, lambda x, y: [x + 2*y], numerical_grads=False)
+    _check_function_must_fail(x + 2*y, lambda x, y: [x + 2*y, x], numerical_grads=False,
+                              error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: [], numerical_grads=False,
+                              error=ValueError)
+
+    # multiple outputs
+    z = sym.Group([2*x + y, x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y])
+    check_function(z, lambda x, y: (2*x + y, x + 2*y))
+    check_function(z, backward=lambda x, y, head_grads: [2*head_grads[0] + head_grads[1],
+                                                         head_grads[0] + 2*head_grads[1]])
+    _check_function_must_fail(z, backward=lambda x, y, head_grads: [2*head_grads[0],
+                                                                    2*head_grads[1]])
+    check_function(z, backward=lambda x, y, head_grads: [head_grads[1], 2*head_grads[1]],
+                   in_range={'head_grads_0': (0, 0)})
+    check_function(z, numerical_grads=True)
+
+    z = sym.Group([sym.block_grad(2*x + y), x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y], numerical_grads=False)
+    _check_function_must_fail(z, lambda x, y: [2*x + y, x + 2*y])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, sym.block_grad(x + 2*y)])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, x + 2*y, x, y, sym.sum(x)])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y, x, y, np.sum(x)])
+
+    # passing additional parameters to forward and backward
+    def _fwd3(x, p):
+        assert p == 'v'
+        return x + 1
+    def _bwd3(x, p, head_grads):
+        assert p == 'v'
+        return head_grads
+    check_function(x + 1, _fwd3, _bwd3, additional_params={'p': 'v'})
+
+    # implicitly created variables and shape/dtype inference for inputs
+    x = sym.Variable("x", shape=(2, 3), dtype=0)
+    b = sym.Variable("b")
+    y = sym.dense(data=x, bias=b, units=4)
+    # Don't check gradients on cuda because it doesn't yet support ewise after reduce
+    check_function(y, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, shape={'x': (3, 4)}, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, dtype={'x': 'float64'}, exclude_targets={'cuda'}, numerical_grads=True)
+
+    x = sym.Variable("x")
+    b = sym.Variable("b")
+    w = sym.Variable("w")
+    y = sym.dense(data=x, bias=b, weight=w, units=4)
+    def _fwd_dense(x, w, b):
+        return np.dot(x, w.T) + b
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'x': 'float32'}, numerical_grads=False)
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'w': 'float64'}, numerical_grads=False)
+    _check_function_must_fail(y, _fwd_dense, shape={'x': (1,2)},
+                              dtype={'w': 'float64', 'b': 'float32'},
+                              numerical_grads=False,
+                              error=nnvm._base.NNVMError)
+    # fails because no shape is given
+    _check_function_must_fail(y, _fwd_dense, numerical_grads=False, error=ValueError)
+    # ok because type is float32 by default
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, numerical_grads=False)
+
+def test_relu():
+    x = sym.Variable("x")
+    y = sym.relu(sym.leaky_relu(x, alpha=0.3) - 0.2)
+
+    def forward(x):
+        x = (x < 0) * x * 0.3 + (x > 0) * x - 0.2
+        return (x > 0) * x
+
+    def backward(head_grads, x):
+        sub = (x < 0) * x * 0.3 + (x > 0) * x - 0.2
+        return [(sub > 0).astype("float") * \
+            ((x > 0).astype("float") + 0.3 * (x <
0).astype("float")) * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + +def test_prelu_nchw(): + x = sym.Variable("x") + a = sym.Variable("a") + y = sym.prelu(data=x, alpha=a) + + def forward(x, a): + return (x < 0) * (x * a.reshape(3, 1, 1)) + (x>=0) * x + + shape = {'x': (1, 3, 32, 32), 'a': (3,)} + check_function(y, forward, shape=shape) + +def test_prelu_nhwc(): + x = sym.Variable("x") + a = sym.Variable("a") + y = sym.prelu(data=x, alpha=a, axis=3) + + def forward(x, a): + return (x < 0) * (x * a.reshape(1, 1, 3)) + (x>=0) * x + + shape = {'x': (1, 32, 32, 3), 'a': (3,)} + check_function(y, forward, shape=shape) + +def test_sym_scalar_pow(): + scalar = 3 + x = sym.Variable("x") + y = x**scalar + + def forward(x): + return x**scalar + + def backward(head_grads, x): + return [scalar * x**(scalar - 1) * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + + +def test_scalar_sym_pow(): + scalar = 3 + x = sym.Variable("x") + y = scalar**x + + def forward(x): + return scalar**x + + def backward(head_grads, x): + return [np.log(scalar) * scalar**x * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + + +def test_exp(): + x = sym.Variable("x") + y = sym.exp(x) + + def forward(x): + return np.exp(x) + + def backward(head_grads, x): + return [np.exp(x) * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + + +def test_log(): + x = sym.Variable("x") + y = sym.log(x) + + def forward(x): + return np.log(x) + + def backward(head_grads, x): + return [1. / x * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, in_range=(0.002, 2.0), shape=shape) + + +def test_tanh(): + x = sym.Variable("x") + y = sym.tanh(x) + + def forward(x): + return np.sinh(x) / np.cosh(x) + + def backward(head_grads, x): + y_np = forward(x) + return [(1 - y_np**2) * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + + +def test_sigmoid(): + x = sym.Variable("x") + y = sym.sigmoid(x) + + def forward(x): + return 1.0 / (1.0 + np.exp(-x)) + + def backward(head_grads, x): + y_np = forward(x) + return [y_np *(1 - y_np) * head_grads] + + shape = {'x': (1, 3, 32, 32)} + check_function(y, forward, backward, shape=shape) + + +def test_softmax(): + x = sym.Variable("x") + y = sym.softmax(x) + + def forward(x): + return topi.testing.softmax_python(x) + + def backward(head_grads, x): + y = topi.testing.softmax_python(x) + grad = y * (head_grads - np.sum(y * head_grads, axis=1, keepdims=True)) + return [grad] + + check_function(y, forward, backward, + shape={'x': (10, 1000)}, numerical_grads=False) + check_function(y, forward, backward, + shape={'x': (2, 10)}) + + +def test_log_softmax(): + x = sym.Variable("x") + y = sym.log_softmax(x) + + def forward(x): + return topi.testing.log_softmax_python(x) + + def backward(head_grads, x): + y = topi.testing.log_softmax_python(x) + grad = head_grads - np.exp(y) * np.sum(head_grads, axis=1, keepdims=True) + return [grad] + + check_function(y, forward, backward, + shape={'x': (10, 1000)}, numerical_grads=False) + check_function(y, forward, backward, + shape={'x': (2, 10)}) + + +def test_dense(): + x = sym.Variable("x", shape=(10, 100)) + w = sym.Variable("dense_weight", shape=(3, 100)) + b = sym.Variable("dense_bias", shape=(3,)) + y = sym.dense(x, w, b, use_bias=True, units=3, name="dense") + y = sym.flatten(y) + + def forward(x, 
dense_weight, dense_bias):
+        return np.dot(x, dense_weight.T) + dense_bias
+    shape = {
+        'x': (10, 100),
+        'w': (3, 100),
+        'b': (3,)
+    }
+    # Don't check gradients on cuda because it doesn't yet support ewise after reduce
+    check_function(y, forward, shape=shape,
+                   exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, forward, shape=shape,
+                   only_targets={'cuda'}, numerical_grads=False)
+
+
+def test_batchnorm():
+    x = sym.Variable("x")
+    beta = sym.Variable("beta")
+    gamma = sym.Variable("gamma")
+    moving_var = sym.Variable("moving_var")
+    moving_mean = sym.Variable("moving_mean")
+    eps = 1e-5
+    y = sym.batch_norm(
+        x, gamma, beta, moving_mean, moving_var, epsilon=eps)
+
+    def forward(x, gamma, beta, moving_mean, moving_var):
+        return (x - moving_mean) / np.sqrt(moving_var + eps) * gamma + beta
+
+    shape = {
+        'x': (10, 20),
+        'gamma': (20,),
+        'beta': (20,),
+        'moving_mean': (20,),
+        'moving_var': (20,)
+    }
+
+    check_function(y, forward, in_range=(0.001, 1.0), shape=shape)
+
+
+def verify_concatenate(ishape, axis):
+    x = [sym.Variable("x%d" % i, shape=ishape[i]) for i in range(len(ishape))]
+    y = sym.concatenate(*x, axis=axis) + 1
+
+    def forward(**kwargs):
+        return np.concatenate(list(kwargs.values()), axis=axis) + 1
+
+    check_function(y, forward)
+
+
+def test_concatenate():
+    verify_concatenate([(2, 3, 4), (1, 3, 4)], axis=0)
+    verify_concatenate([(2, 4), (2, 7)], axis=1)
+
+
+def verify_split(ishape, indices_or_sections, axis):
+    x = sym.Variable("x", shape=ishape)
+    y = sym.split(x, indices_or_sections=indices_or_sections, axis=axis)
+
+    def forward(x):
+        return np.split(x, indices_or_sections, axis=axis)
+
+    check_function(y, forward)
+
+
+def test_split():
+    verify_split((2, 3), 2, axis=0)
+    verify_split((5, 3), [3], axis=0)
+    verify_split((5, 9, 3), [3, 4], axis=1)
+
+def verify_strided_slice(ishape, begin, end, strideinp=None):
+    stride = strideinp if strideinp else [1, 1, 1]
+    x = sym.Variable("x", shape=ishape)
+    if strideinp:
+        y = sym.strided_slice(x, begin = begin, end = end, stride = stride) + 1
+    else:
+        y = sym.strided_slice(x, begin = begin, end = end) + 1
+
+    for i in range(len(begin), 3):
+        begin.append(0)
+    for i in range(len(end), 3):
+        end.append(ishape[i])
+
+    def test_forward(x):
+        return x[begin[0]:end[0]:stride[0],
+                 begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
+
+    check_function(y, test_forward)
+
+def test_strided_slice():
+    verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
+    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1])
+    verify_strided_slice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1])
+    verify_strided_slice((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2])
+    verify_strided_slice((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1])
+    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3])
+    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 1000, 3])
+    verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4])
+    verify_strided_slice((3, 4, 3), [1, 1], [4, 4, 3])
+
+def verify_take(src_shape, indices_src, axis=None):
+    src_dtype = "float32"
+    indices_dtype = "int32"
+    indices_src = np.array(indices_src, dtype=indices_dtype)
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
+    y = sym.take(a, indices, axis=axis)
+
+    def forward(a, indices):
+        return np.take(a, indices=indices, axis=axis)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src,
'indices': indices_src}) + +def test_take(): + verify_take((4,), [1]) + verify_take((4,), [[0,1,2,3]]) + verify_take((3,3,3), [[11,25]]) + verify_take((4,), [[0,1],[2,3]]) + verify_take((4,), [1], 0) + verify_take((2,2), [[[1,0],[0,1]]], 0) + verify_take((2,2), [[[1,0],[0,1]]], 1) + verify_take((4,3,5,6), [[2,1,0,0]], -2) + + +def verify_squeeze(shape, axis): + x = sym.Variable("x") + if axis is not None: + y = sym.squeeze(x, axis=axis) + else: + y = sym.squeeze(x) + y = y + 1 + + def forward(x): + return np.squeeze(x, axis=axis) + 1 + + def backward(head_grads, x): + return [np.reshape(head_grads, x.shape)] + + check_function(y, forward, backward, shape=shape) + + +def test_squeeze(): + verify_squeeze((1, 3, 2, 5), None) + verify_squeeze((1, 3, 1), axis=0) + verify_squeeze((1, 3, 2, 5, 1), axis=-1) + + +def test_pad(): + x = sym.Variable("x") + y = sym.pad(x, pad_width=((0, 0), (0, 0), (0, 1), (2, 3)), pad_value=1.) + + def forward(x): + return np.pad(x, + pad_width=((0, 0), (0, 0), (0, 1), (2, 3)), + mode='constant', constant_values=1.) + + shape = {'x': (1, 3, 28, 28)} + check_function(y, forward, shape=shape) + +def verify_lrn(ishape, size, axis, bias, alpha, beta): + x = sym.Variable("x", shape=ishape) + y = sym.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta) + + def forward1(x): + return topi.testing.lrn_python(x, size, axis, bias, alpha, beta) + + check_function(y, forward1) + + def forward2(x): + y = forward1(x) + return (y > 0)*y + + #Checking LRN op followed by elementwise op relu + check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)}) + +def verify_l2_normalize(ishape, eps, axis): + x = sym.Variable("x", shape=ishape) + y = sym.l2_normalize(x, eps=eps, axis=axis) + + def forward1(x): + return topi.testing.l2_normalize_python(x, eps, axis) + + check_function(y, forward1) + + def forward2(x): + y = forward1(x) + return (y > 0)*y + + #Checking L2 normalization op followed by elementwise op relu + check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)}) + +def test_lrn(): + verify_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5) + verify_lrn((1, 3, 20, 20), 3, 1, 2.0, 1.0, 0.75) + +def test_l2_normalize(): + verify_l2_normalize((1, 3, 20, 20), 0.001, (1,)) + verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2)) + +def verify_gather_nd(src_shape, indices_src): + src_dtype = "float32" + indices_dtype = "int32" + indices_src = np.array(indices_src, dtype=indices_dtype) + a = sym.Variable("a", shape=src_shape) + indices = sym.Variable("indices", shape=indices_src.shape) + y = sym.gather_nd(a, indices) + + def forward(a, indices): + return topi.testing.gather_nd_python(a, indices) + + a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape) + + check_function(y, forward, + dtype={'a': src_dtype, 'indices': indices_dtype}, + values={'a': a_src, 'indices': indices_src}) + +def test_gather_nd(): + verify_gather_nd((4,), [[1]]) + verify_gather_nd((4,), [[1, 3, 2]]) + verify_gather_nd((2, 3), [[1]]) + verify_gather_nd((2, 3), [[1], [0]]) + verify_gather_nd((2, 3), [[1, 0], [0, 2]]) + verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]]) + verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]]) + verify_gather_nd((2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]], + [[3, 1], [0, 2]]]) + verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]]) + verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]]) + +if __name__ == "__main__": + test_check_function() + test_split() + test_concatenate() + test_log_softmax() + test_batchnorm() + test_dense() + test_relu() + 
test_prelu_nchw() + test_prelu_nhwc() + test_sym_scalar_pow() + test_scalar_sym_pow() + test_exp() + test_log() + test_tanh() + test_sigmoid() + test_softmax() + test_squeeze() + test_pad() + test_take() + test_lrn() + test_l2_normalize() + test_strided_slice() + test_gather_nd() diff --git a/nnvm/tests/python/compiler/test_top_level2.py b/nnvm/tests/python/compiler/test_top_level2.py new file mode 100644 index 000000000000..b558428f0144 --- /dev/null +++ b/nnvm/tests/python/compiler/test_top_level2.py @@ -0,0 +1,362 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np + +import tvm +from tvm.contrib import graph_runtime +import topi +import topi.testing +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list + + +def test_conv2d(): + def run_test_conv2d(sym, dtype, dshape, kshape, oshape, shape_dict, padding): + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(sym, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype)) + m.run(x=data, y_weight=kernel, y_bias=bias) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.conv2d_nchw_python( + data.asnumpy(), kernel.asnumpy(), 1, padding) + c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + x = sym.Variable("x") + y = sym.conv2d(x, channels=10, kernel_size=(3,3), + name="y", padding=(1,1)) + dtype = "float32" + dshape = (1, 3, 18, 18) + kshape = (10, 3, 3, 3) + oshape = (1, 10, 18, 18) + shape_dict = {"x": dshape} + run_test_conv2d(y, dtype, dshape, kshape, oshape, shape_dict, (1,1)) + + x = sym.Variable("x") + y = sym.conv2d(x, channels=10, kernel_size=(1,3), + name="y", padding=(0,1)) + dtype = "float32" + dshape = (1, 3, 224, 224) + kshape = (10, 3, 1, 3) + oshape = (1, 10, 224, 224) + shape_dict = {"x": dshape} + run_test_conv2d(y, dtype, dshape, kshape, oshape, shape_dict, (0,1)) + + +def test_mixed_precision(): + x = sym.Variable("x") + dtype = "int8" + out_dtype="int32" + y = sym.conv2d(x, + channels=10, + kernel_size=(3,3), + name="y", + padding=(1,1), + use_bias=False, + out_dtype="int32") + dshape = (1, 3, 18, 18) + kshape = (10, 3, 3, 3) + oshape = (1, 10, 18, 18) + shape_dict = {"x": dshape} + dtype_dict = {"x": dtype} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict, dtype_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(-127, 127, size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(-127, 127, size=kshape).astype(dtype)) + 
m.run(x=data, y_weight=kernel) + out = m.get_output(0, tvm.nd.empty(oshape, out_dtype)) + c_np = topi.testing.conv2d_nchw_python( + data.asnumpy().astype(out_dtype), + kernel.asnumpy().astype(out_dtype), 1, 1) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_dilated_conv2d(): + dilation = 3 + x = sym.Variable("x") + y = sym.conv2d(x, channels=10, kernel_size=(3, 3), dilation=(dilation, dilation), + name="y", padding=(1, 1)) + dtype = "float32" + dshape = (1, 3, 18, 18) + kshape = (10, 3, 3, 3) + oshape = (1, 10, 14, 14) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype)) + kernel_np = np.random.uniform(size=kshape).astype(dtype) + kernel = tvm.nd.array(kernel_np) + dkernel_np = topi.testing.dilate_python(kernel_np, (1, 1, dilation, dilation)) + m.run(x=data, y_weight=kernel, y_bias=bias) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.conv2d_nchw_python( + data.asnumpy(), dkernel_np, 1, 1) + c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_grouped_conv2d_nchw(): + x = sym.Variable("x") + y = sym.conv2d(x, channels=32, kernel_size=(3,3), groups=32, + name="y", padding=(1,1)) + dtype = "float32" + dshape = (1, 32, 18, 18) + kshape = (32, 1, 3, 3) + oshape = (1, 32, 18, 18) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[0]).astype(dtype)) + m.run(x=data, y_weight=kernel, y_bias=bias) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.depthwise_conv2d_python_nchw( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + +def test_grouped_conv2d_nhwc(): + x = sym.Variable("x") + y = sym.conv2d(x, channels=32, kernel_size=(3,3), groups=32, + name="y", padding=(1,1), layout="NHWC", kernel_layout ='HWOI') + dtype = "float32" + dshape = (1, 18, 18, 32) + kshape = (3, 3, 32, 1) + oshape = (1, 18, 18, 32) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[2]).astype(dtype)) + m.run(x=data, y_weight=kernel, y_bias=bias) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.depthwise_conv2d_python_nhwc( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + c_np = c_np + bias.asnumpy().reshape(1, 1, kshape[2]) + tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + +def test_conv2d_transpose(): + x = sym.Variable("x") + y = sym.conv2d_transpose(x, channels=10, kernel_size=(3,3), strides=(2,2), + name="y", padding=(1,1), output_padding=(2,2)) + dtype = "float32" + dshape = (1, 3, 18, 18) + kshape = (3, 10, 3, 3) + oshape = (1, 10, 37, 37) + shape_dict = 
{"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + bias = tvm.nd.array(np.random.uniform(size=kshape[1]).astype(dtype)) + m.run(x=data, y_weight=kernel, y_bias=bias) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + c_np = topi.testing.conv2d_transpose_nchw_python( + data.asnumpy(), kernel.asnumpy(), 2, 1) + c_np = c_np + bias.asnumpy().reshape(kshape[1], 1, 1) + d_np = np.zeros(shape=oshape) + d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np + tvm.testing.assert_allclose(out.asnumpy(), d_np, rtol=1e-5) + + +def test_max_pool2d(): + x = sym.Variable("x") + y = sym.max_pool2d(x, pool_size=(2,2), strides=(2,2), + padding=(0,0), name="y", ceil_mode=True) + dtype = "float32" + dshape = (1, 3, 28, 28) + oshape = (1, 3, 14, 14) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = np.max(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5)) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + + +def test_avg_pool2d(): + x = sym.Variable("x") + y = sym.avg_pool2d(x, pool_size=(2,2), strides=(2,2), padding=(0,0), name="y") + dtype = "float32" + dshape = (1, 3, 28, 28) + oshape = (1, 3, 14, 14) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = np.mean(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5)) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + + +def test_avg_pool2d_no_count_pad(): + kh, kw = (4, 4) + sh, sw = (2, 2) + ph, pw = (2, 2) + + x = sym.Variable("x") + y = sym.avg_pool2d(x, pool_size=(kh, kw), strides=(sw, sw), padding=(ph, pw), + name="y", count_include_pad=False) + dtype = "float32" + n = 1 + (ic, ih, iw) = (3, 28, 28) + (oc, oh, ow) = (3, 15, 15) + + a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype) + pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype) + no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw))) + pad_np[np.ix_(*no_zero)] = a_np + b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype) + + for i in range(oh): + for j in range(ow): + pad_count = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] > 0, axis=(2,3)) + b_np[:,:,i,j] = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw], + axis=(2,3)) / np.maximum(pad_count, 1) + b_np = np.maximum(b_np, 0.0) + shape_dict = {"x": (n, ic, ih, iw)} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(a_np) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty((n, oc, oh, ow), dtype)) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + + +def test_global_max_pool2d(): + x = sym.Variable("x") + y = sym.global_max_pool2d(x, name="y") + dtype = "float32" + dshape = (1, 1024, 7, 7) + oshape = (1, 1024, 1, 1) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = 
nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = np.max(data.asnumpy(), axis=(2,3), keepdims=True) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + + +def test_global_avg_pool2d(): + x = sym.Variable("x") + y = sym.global_avg_pool2d(x, name="y") + dtype = "float32" + dshape = (1, 1024, 7, 7) + oshape = (1, 1024, 1, 1) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = np.mean(data.asnumpy(), axis=(2,3), keepdims=True) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + + +def test_upsampling_nearest_neighbor(): + x = sym.Variable("x") + scale = 2 + y = sym.upsampling(x, scale=scale, name="y") + dtype = "float32" + dshape = (1, 16, 32, 32) + oshape = (1, 16, 32*scale, 32*scale) + shape_dict = {"x": dshape} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict) + m = graph_runtime.create(graph, lib, ctx) + a_np = np.random.uniform(size=dshape).astype(dtype) + data = tvm.nd.array(a_np) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = topi.testing.upsampling_python(a_np, (scale, scale), "NCHW") + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5) + +def test_upsampling_bilinear(): + x = sym.Variable("x") + scale = 2 + y = sym.upsampling(x, scale=scale, method="BILINEAR", name="y", layout="NCHW") + dtype = "float32" + dshape = (1, 4, 32, 32) + oshape = (1, 4, 32*scale, 32*scale) + shape_dict = {"x": dshape} + dtype_dict = {"x": dtype} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict, dtype_dict) + m = graph_runtime.create(graph, lib, ctx) + a_np = np.random.uniform(size=dshape).astype(dtype) + data = tvm.nd.array(a_np) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = topi.testing.bilinear_resize_python(a_np, (32*scale, 32*scale), "NCHW", align_corners=False) + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5) + +def test_resize_bilinear(): + x = sym.Variable("x") + y = sym.resize(x, size=(60, 60), method="BILINEAR", name="y", layout="NHWC", align_corners=True) + dtype = "float32" + dshape = (1, 32, 32, 4) + oshape = (1, 60, 60, 4) + shape_dict = {"x": dshape} + dtype_dict = {"x": dtype} + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, shape_dict, dtype_dict) + m = graph_runtime.create(graph, lib, ctx) + a_np = np.random.uniform(size=dshape).astype(dtype) + data = tvm.nd.array(a_np) + m.run(x=data) + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + b_np = topi.testing.bilinear_resize_python(a_np, (60, 60), "NHWC") + tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5) + +if __name__ == "__main__": + test_mixed_precision() + test_conv2d() + test_dilated_conv2d() + test_grouped_conv2d_nchw() + test_grouped_conv2d_nhwc() + test_conv2d_transpose() + test_max_pool2d() + test_avg_pool2d() + test_avg_pool2d_no_count_pad() + test_global_max_pool2d() + test_global_avg_pool2d() + test_upsampling_nearest_neighbor() + test_upsampling_bilinear() + test_resize_bilinear() diff --git 
a/nnvm/tests/python/compiler/test_top_level3.py b/nnvm/tests/python/compiler/test_top_level3.py new file mode 100644 index 000000000000..c60f0450b30a --- /dev/null +++ b/nnvm/tests/python/compiler/test_top_level3.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import tvm +from tvm.contrib import graph_runtime +import topi.testing +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list +from nnvm.testing.check_computation import check_function + +def check_map(symfunc, np_func, np_backward=None, dtype="float32", rnd_min=-1, rnd_max=1): + x = sym.Variable("x") + y = symfunc(x) + shape = {'x': (1, 3, 32, 32)} + check_function(y, lambda x: np_func(x), np_backward, + dtype=dtype, shape=shape, in_range=(rnd_min, rnd_max)) + + +def test_floor(): + check_map(sym.floor, np.floor) + +def test_ceil(): + check_map(sym.ceil, np.ceil) + +def test_trunc(): + check_map(sym.trunc, np.trunc) + +def test_round(): + check_map(sym.round, np.round) + +def test_abs(): + check_map(sym.abs, np.abs) + check_map(sym.abs, np.abs, dtype = "int32") + check_map(sym.abs, np.abs, dtype = "int8") + +def test_shift(): + n = 3 + for dtype in ["int32", "int8"]: + check_map(lambda x : x >> n, lambda x: x >> n, dtype=dtype, rnd_min=-100, rnd_max=100) + check_map(lambda x : x << n, lambda x: x << n, dtype=dtype, rnd_min=-100, rnd_max=100) + +if __name__ == "__main__": + test_shift() + test_floor() + test_ceil() + test_round() + test_abs() + test_trunc() diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py new file mode 100644 index 000000000000..691163974470 --- /dev/null +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -0,0 +1,746 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
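+"""Unittest cases for level-4 operators: transpose, reduce, broadcast and vision ops."""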
+import math
+import numpy as np
+import tvm
+from tvm.contrib import graph_runtime
+import topi
+import nnvm.symbol as sym
+import nnvm.compiler
+from nnvm.testing.config import ctx_list
+from nnvm.testing.check_computation import check_function
+
+def verify_transpose(dshape, axes):
+    x = sym.Variable("x")
+    if axes:
+        y = sym.transpose(x, axes=axes)
+    else:
+        y = sym.transpose(x)
+    y = y + 1
+    dtype = "float32"
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
+        m = graph_runtime.create(graph, lib, ctx)
+        # set input
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        m.run(x=data)
+        out_np = np.transpose(data.asnumpy(), axes=axes) + 1
+        out = m.get_output(0, tvm.nd.empty(out_np.shape))
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float32', **kwargs):
+    """ Verify reduce operations by comparing the result with `result` """
+    x = sym.Variable("x")
+    y = fsym(x + 0, **kwargs)
+    for target, ctx in ctx_list():
+        # TODO(yuruofei): remove when cuda reduce schedule is done
+        if target == 'cuda' and fsym == sym.mean:
+            continue
+        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
+        m = graph_runtime.create(graph, lib, ctx)
+        # set input
+        m.run(x=data)
+        # oshape set to None means do not test the shape-correctness
+        oshape = result.shape if isinstance(result, np.ndarray) else ((1,) if oshape is None else oshape)
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype=otype))
+        if isinstance(result, np.ndarray):
+            np.testing.assert_equal(out.asnumpy().shape, result.shape)
+            tvm.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+        else:
+            tvm_out = out.asnumpy()
+            assert abs(result - tvm_out) <= (1e-5 + 1e-5 * abs(tvm_out))
+
+def verify_reduce(dshape, fnp, fsym, oshape=None, otype='float32', **kwargs):
+    """ Verify reduce operations by generating data at random and calling numpy
+    version as reference """
+    data = np.random.uniform(size=dshape).astype(otype)
+    result = fnp(data + 0, **kwargs)
+    verify_reduce_explicit(dshape, data, result, fsym, oshape=oshape, otype=otype, **kwargs)
+
+def verify_collapse(dshape, target_shape, fnp):
+    x = sym.Variable("x", shape=dshape)
+    t = sym.Variable("t", shape=target_shape)
+    y = sym.collapse_sum(x, t)
+    dtype = "float32"
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(y, target,
+                                            {"x": dshape, "t": target_shape})
+        m = graph_runtime.create(graph, lib, ctx)
+        data = np.random.uniform(size=dshape).astype(dtype)
+        m.run(x=data)
+        out = m.get_output(0, tvm.nd.empty(target_shape))
+        out_np = fnp(data)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+
+def test_transpose():
+    verify_transpose((2, 3, 4), (0, 2, 1))
+    verify_transpose((2, 3, 4), None)
+
+
+def test_reduce():
+
+    def _with_keepdims(func):
+        """ Wrapper around numpy's argmax/argmin with `keepdims` argument supported """
+        def wrapper(data, axis=None, keepdims=False):
+            if not keepdims:
+                return func(data, axis=axis)
+            else:
+                if axis is not None:
+                    out_shape = list(data.shape)
+                    out_shape[axis] = 1
+                else:
+                    out_shape = [1 for _ in range(len(data.shape))]
+                return func(data, axis=axis).reshape(out_shape)
+        return wrapper
+
+    verify_reduce((2, 3, 4), np.max, sym.max, axis=1, keepdims=True)
+    verify_reduce((4, 4, 3), np.min, sym.min, keepdims=True)
+    verify_reduce((4, 4, 3), np.sum, sym.sum, axis=(0, 2))
+    verify_reduce((4, 4, 3), np.sum, sym.sum)
+
verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=False) + verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=False) + verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=True) + verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=True) + verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=True) + verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=False) + verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1, 2), keepdims=True) + + data = np.array([[[1,2],[3,4]],[[3,44],[5,6]]], dtype=np.float32) + verify_reduce_explicit([2,2,2], data, np.array([[1,1],[1,0]]), sym.argmax, otype='int32', axis=[0,2], exclude=True) + verify_reduce_explicit([2,2,2], data, np.array([[0,0],[0,1]]), sym.argmin, otype='int32', axis=[0,2], exclude=True) + shape = [4, 4, 3] + for axis in [None, 0, 1, 2]: + for keepdims in [True,False]: + kwargs = { 'keepdims':keepdims } + if axis is None: + # FIXME: NNVM doesn't support setting `axis=None` explicitly. + kwargs.update({'oshape': [1,1,1] if keepdims else [1] }) + else: + kwargs.update({'axis': axis}) + kwargs.update({'oshape': shape[:axis]+[1]+shape[axis+1:] if keepdims else shape[:axis]+shape[axis+1:]}) + + verify_reduce(shape, _with_keepdims(np.argmax), sym.argmax, otype='int32', **kwargs) + verify_reduce(shape, _with_keepdims(np.argmin), sym.argmin, otype='int32', **kwargs) + + +def test_collapse(): + verify_collapse((2, 3, 4), (1,), lambda x: x.sum()) + verify_collapse((2, 3, 4), (1, 1, 1), lambda x: x.sum(keepdims=True)) + verify_collapse((2, 3, 4), (1, 1), lambda x: x.sum().reshape(1, 1)) + verify_collapse((2, 3, 4), (1, 4), lambda x: x.reshape(-1, 4).sum(0, keepdims=True)) + verify_collapse((2, 3, 4), (3, 4), lambda x: x.sum(0)) + verify_collapse((2, 3, 4), (1, 3, 4), lambda x: x.sum(0, keepdims=True)) + verify_collapse((2, 3, 4), (1, 1, 4), lambda x: x.sum((0, 1), keepdims=True)) + verify_collapse((2, 3, 4), (2, 1, 4), lambda x: x.sum(1, keepdims=True)) + verify_collapse((2, 3, 4), (2, 1, 1), lambda x: x.sum((1, 2), keepdims=True)) + verify_collapse((2, 3, 4), (2, 3, 1), lambda x: x.sum(2, keepdims=True)) + verify_collapse((2, 3, 4), (2, 3, 4), lambda x: x) + + +def verify_flip(ishape, axis): + x = sym.Variable("x") + y = sym.flip(x, axis=axis) + 1 + dtype = "float32" + x_np = np.random.uniform(size=ishape).astype(dtype) + res = np.flip(x_np, axis) + 1 + + for target, ctx in ctx_list(): + # set input + graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape}) + m = graph_runtime.create(graph, lib, ctx) + m.run(x=x_np) + out = m.get_output(0, tvm.nd.empty(res.shape)) + tvm.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5) + + +def test_flip(): + verify_flip((3, 4, 3), 1) + verify_flip((3, 4, 3), 0) + verify_flip((3, 4, 3), 2) + verify_flip((3, 4, 3), -1) + verify_flip((3, 4, 3), -3) + verify_flip((3, 4, 3), -2) + + +def verify_reshape(dshape, oshape): + x = sym.Variable("x") + y = sym.reshape(x, shape=oshape) + y = y + 1 + dtype = "float32" + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape}) + m = graph_runtime.create(graph, lib, ctx) + # set input + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + m.run(x=data) + out_np = data.asnumpy().reshape(oshape) + 1 + out = m.get_output(0, tvm.nd.empty(out_np.shape)) + tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5) + + +def test_reshape(): + verify_reshape((2, 3, 4), (-1, 2, 1)) + verify_reshape((2, 3, 4), (8, 3)) 
+ verify_reshape((4, 7), (2, 7, 2)) + + +def test_clip(): + x = sym.Variable("x") + a_min=0.2 + a_max=0.75 + y = sym.clip(x, a_min=a_min, a_max=a_max) + + def forward(x): + return np.clip(x, a_min=a_min, a_max=a_max) + + def backward(head_grads, x): + mask1 = np.greater_equal(x, a_min).astype("float") + mask2 = np.less_equal(x, a_max).astype("float") + return [head_grads * mask1 * mask2] + + shape = {'x': (3, 4, 5)} + check_function(y, forward, backward, shape=shape) + + +def test_broadcast(): + a = sym.Variable("a") + b = sym.Variable("b") + shape = {'a': (3, 4, 5), 'b': (1, 5)} + + def _collapse(g): + return g.reshape(-1, shape['b'][-1]).sum(0, keepdims=True) + + y = sym.broadcast_add(a, b) + def _backward_add(head_grads, a, b): + da = head_grads + db = _collapse(head_grads) + return da, db + check_function(y, lambda a, b: a + b, _backward_add, shape=shape) + + y = sym.broadcast_sub(a, b) + def _backward_sub(head_grads, a, b): + da = head_grads + db = -_collapse(head_grads) + return da, db + check_function(y, lambda a, b: a - b, _backward_sub, shape=shape) + + y = sym.broadcast_mul(a, b) + def _backward_mul(head_grads, a, b): + da = head_grads * b + db = _collapse(head_grads * a) + return da, db + check_function(y, lambda a, b: a * b, _backward_mul, shape=shape) + + y = sym.broadcast_div(a, b) + def _backward_div(head_grads, a, b): + da = head_grads / b + db = _collapse(- head_grads * a / b**2) + return da, db + # We avoid computing numerical derivatives too close to zero here + check_function(y, lambda a, b: a / b, _backward_div, shape=shape, numerical_grads=False) + check_function(y, lambda a, b: a / b, _backward_div, shape=shape, + in_range={'b': (0.1, 20)}) + + y = sym.broadcast_mod(a, b) + check_function(y, + lambda a, b: np.mod(a, b), + in_range={'a': (0.001, 100), 'b': (1, 100)}, dtype='int32', shape=shape) + + y = sym.broadcast_max(a, b) + check_function(y, lambda a, b: np.maximum(a, b), shape=shape) + + y = sym.broadcast_min(a, b) + check_function(y, lambda a, b: np.minimum(a, b), shape=shape) + + y = sym.broadcast_pow(a, b) + check_function(y, + lambda a, b: np.power(a, b), + in_range={'a': (0.001, 100), 'b': (0.001, 2)}, shape=shape) + + y = sym.broadcast_left_shift(a, b) + check_function(y, lambda a, b: a << b, dtype='int32', shape=shape) + + y = sym.broadcast_right_shift(a, b) + check_function(y, lambda a, b: a >> b, dtype='int32', shape=shape) + + y = sym.broadcast_greater(a, b) + check_function(y, lambda a, b: np.greater(a, b), shape=shape) + + y = sym.broadcast_less(a, b) + check_function(y, lambda a, b: np.less(a, b), shape=shape) + + y = sym.broadcast_equal(a, b) + check_function(y, lambda a, b: np.equal(a, b), + in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape) + + y = sym.broadcast_not_equal(a, b) + check_function(y, lambda a, b: np.not_equal(a, b), + in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape) + + y = sym.broadcast_greater_equal(a, b) + check_function(y, lambda a, b: np.greater_equal(a, b), + in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape) + + y = sym.broadcast_less_equal(a, b) + check_function(y, lambda a, b: np.less_equal(a, b), + in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape) + +def test_greater(): + l = sym.Variable("l") + r = sym.Variable("r") + y = sym.greater(l, r) + + def forward(l, r): + return np.greater(l, r).astype("float32") + + def backward(head_grads, l, r): + return {'l': np.zeros_like(l)} + + shape = {'l': (3, 4, 5), 'r': (3, 4, 5)} + check_function(y, forward, backward, 
shape=shape) + + +def test_less(): + l = sym.Variable("l") + r = sym.Variable("r") + y = sym.less(l, r) + + def forward(l, r): + return np.less(l, r).astype("float32") + + def backward(head_grads, l, r): + return {'l': np.zeros_like(l)} + + shape = {'l': (3, 4, 5), 'r': (3, 4, 5)} + check_function(y, forward, backward, shape=shape) + + +def test_reshape_like(): + x = sym.Variable("x") + y = sym.Variable("y") + z = sym.reshape_like(x, y) + + def forward(x, y): + return np.reshape(x, y.shape) + + def backward(head_grads, x, y): + return [np.reshape(head_grads, x.shape), + np.zeros_like(y)] + + shape = {'x': (3, 4, 5), 'y': (5, 4, 3)} + check_function(z, forward, backward, shape=shape) + + +def verify_expand_like(in_shape, out_shape, axis, exclude): + x = sym.Variable("x") + y = sym.Variable("y") + z = sym.expand_like(x, y, axis=axis, exclude=exclude) + + def forward(x, y): + odim = len(out_shape) + + if len(x.shape) == len(y.shape): + return np.broadcast_to(x, y.shape) + + if x.shape == (1,) and len(y.shape) == odim: + x = np.reshape(x, ()) + + real_axis = [i if i >= 0 else i + odim for i in axis] + real_axis = sorted(real_axis) + if exclude: + real_axis = list(set(range(odim)) - set(real_axis)) + for i in real_axis: + x = np.expand_dims(x, i).astype(x.dtype) + for i in real_axis: + x = np.concatenate([x]*out_shape[i], axis=i).astype(x.dtype) + + return x + + def backward(head_grads, x, y): + odim = len(out_shape) + + keepdims = len(x.shape) == len(y.shape) + + if x.shape == (1,) and len(y.shape) == odim: + x = np.reshape(x, ()) + + real_axis = [i if i >= 0 else i + odim for i in axis] + real_axis = sorted(real_axis) + if exclude: + real_axis = list(set(range(odim)) - set(real_axis)) + return [np.sum(head_grads, axis=tuple(real_axis), keepdims=keepdims), + np.zeros_like(y)] + + + shape = {'x': in_shape, 'y': out_shape} + check_function(z, forward, backward, shape=shape) + + +def test_expand_like(): + verify_expand_like((3,), (3, 2), [1], False) + verify_expand_like((2,), (2, 3), [1], False) + verify_expand_like((3, 4), (3, 5, 4), [1], False) + verify_expand_like((5, 7), (5, 6, 7, 8), [0, 2], True) + verify_expand_like((2, 3), (2, 3), [], False) + verify_expand_like((1,), (2, 3), [0, 1], False) + verify_expand_like((1, 1), (2, 3), [0, 1], False) + verify_expand_like((2, 1), (2, 3), [1], False) + verify_expand_like((1, 3), (2, 3), [0], False) + + +def verify_elemwise_sum(num_args): + s = [sym.Variable("input" + str(i)) for i in range(num_args)] + y = sym.elemwise_sum(*s, num_args=num_args) + + def forward(**inputs): + return np.sum(np.array(list(inputs.values())), axis=0) + + def backward(head_grads, **inputs): + return [head_grads] * num_args + + shape = {s[i]: (3, 4, 5) for i in range(num_args)} + check_function(y, forward, backward, shape=shape) + + +def test_elemwise_sum(): + verify_elemwise_sum(1) + verify_elemwise_sum(5) + verify_elemwise_sum(7) + + +def test_block_grad(): + x = sym.Variable("x") + y = sym.block_grad(x) + + def forward(x): + return x + + def backward(head_grads, x): + return [np.zeros_like(head_grads)] + + + shape = {'x': (3, 4, 5)} + # Numerical grad checking would fail for this function + check_function(y, forward, backward, shape=shape, numerical_grads=False) + + +def test_full(): + shape = (3, 4, 5) + value = 7 + dtype = "float32" + for target, ctx in ctx_list(): + data = sym.Variable("data", dtype=dtype) + # full_like + s = sym.full_like(data=data, fill_value=value, name="s") + graph, lib, _ = nnvm.compiler.build(s, target, {"data": shape}) + m = 
graph_runtime.create(graph, lib, ctx) + m.run(data=np.random.uniform(size=shape).astype(dtype)) + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=value, dtype=dtype), + atol=1e-5, rtol=1e-5) + # ones_like + s = sym.ones_like(data=data, fill_value=value, name="s") + graph, lib, _ = nnvm.compiler.build(s, target, {"data": shape}) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=np.random.uniform(size=shape).astype(dtype)) + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=1, dtype=dtype), + atol=1e-5, rtol=1e-5) + # zeros_like + s = sym.zeros_like(data=data, fill_value=value, name="s") + graph, lib, _ = nnvm.compiler.build(s, target, {"data": shape}) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=np.random.uniform(size=shape).astype(dtype)) + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=0, dtype=dtype), + atol=1e-5, rtol=1e-5) + # full + s = sym.full(shape=shape, dtype=dtype, fill_value=value, name="s") + graph, lib, _ = nnvm.compiler.build(s, target) + m = graph_runtime.create(graph, lib, ctx) + m.run() + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=value, dtype=dtype), + atol=1e-5, rtol=1e-5) + # ones + s = sym.ones(shape=shape, dtype=dtype, name="s") + graph, lib, _ = nnvm.compiler.build(s, target) + m = graph_runtime.create(graph, lib, ctx) + m.run() + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=1, dtype=dtype), + atol=1e-5, rtol=1e-5) + # zeros + s = sym.zeros(shape=shape, dtype=dtype, name="s") + graph, lib, _ = nnvm.compiler.build(s, target) + m = graph_runtime.create(graph, lib, ctx) + m.run() + out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype)) + tvm.testing.assert_allclose( + out.asnumpy(), + np.full(shape, fill_value=0, dtype=dtype), + atol=1e-5, rtol=1e-5) + +def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), + offsets=(0.5, 0.5), clip=False): + data = sym.Variable("data") + out = sym.multibox_prior(data=data, sizes=sizes, ratios=ratios, steps=steps, + offsets=offsets, clip=clip) + + in_height = dshape[2] + in_width = dshape[3] + num_sizes = len(sizes) + num_ratios = len(ratios) + size_ratio_concat = sizes + ratios + steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + offset_h = offsets[0] + offset_w = offsets[1] + + oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4) + dtype = "float32" + np_out = np.zeros(oshape).astype(dtype) + + for i in range(in_height): + center_h = (i + offset_h) * steps_h + for j in range(in_width): + center_w = (j + offset_w) * steps_w + for k in range(num_sizes + num_ratios - 1): + w = size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else \ + size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0 + h = size_ratio_concat[k] / 2.0 if k < num_sizes else \ + size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0 + count = i * in_width * (num_sizes + num_ratios - 1) + j * (num_sizes + num_ratios - 1) + k + np_out[0][count][0] = center_w - w + np_out[0][count][1] = center_h - h + np_out[0][count][2] = center_w + w + np_out[0][count][3] = center_h + h + 
if clip: + np_out = np.clip(np_out, 0, 1) + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input("data", np.random.uniform(size=dshape).astype(dtype)) + m.run() + tvm_out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype)) + tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) + +def test_multibox_prior(): + verify_multibox_prior((1, 3, 50, 50)) + verify_multibox_prior((1, 3, 224, 224), sizes=(0.5, 0.25, 0.1), ratios=(1, 2, 0.5)) + verify_multibox_prior((1, 32, 32, 32), sizes=(0.5, 0.25), ratios=(1, 2), steps=(2, 2), clip=True) + +def test_multibox_transform_loc(): + batch_size = 1 + num_anchors = 3 + num_classes = 3 + cls_prob = sym.Variable("cls_prob") + loc_preds = sym.Variable("loc_preds") + anchors = sym.Variable("anchors") + transform_loc_data, valid_count = sym.multibox_transform_loc(cls_prob=cls_prob, loc_pred=loc_preds, + anchor=anchors) + out = sym.non_max_suppression(data=transform_loc_data, valid_count=valid_count, return_indices=False) + + # Manually create test case + np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) + np_loc_preds = np.array([[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4, -0.8]]) + np_anchors = np.array([[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2], [1.2, 1.2, 1.5, 1.5]]]) + + expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108], + [0, 0.44999999, 1, 1, 1, 1], + [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]]) + + dtype = "float32" + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(out, target, {"cls_prob": (batch_size, num_anchors, num_classes), + "loc_preds": (batch_size, num_anchors * 4), + "anchors": (1, num_anchors, 4)}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)}) + m.run() + tvm_out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) + tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) + +def test_non_max_suppression(): + dshape = (1, 5, 6) + data = sym.Variable("data") + valid_count = sym.Variable("valid_count", dtype="int32") + iou_threshold = 0.7 + force_suppress = True + top_k = 2 + out = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False, + iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) + + np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], + [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], + [1, 0.5, 100, 60, 70, 110]]]).astype("float32") + np_valid_count = np.array([4]).astype("int32") + np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1]]]) + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(out, target, {"data": dshape, "valid_count": (dshape[0],)}, + dtype={"data": "float32", "valid_count": "int32"}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"data": np_data, "valid_count": np_valid_count}) + m.run() + tvm_out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32")) + tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) + +def np_slice_like(np_data, np_shape_like, axis=[]): + begin_idx = [0 for _ in np_data.shape] + end_idx = list(np_data.shape) + if len(axis) > 0: + for i in axis: + if i < 0: + i = 
len(np_data.shape) + i + end_idx[i] = np_shape_like.shape[i] + else: + for i in range(len(np_data.shape)): + if i < len(np_shape_like.shape): + end_idx[i] = np_shape_like.shape[i] + slice_idx = [] + for b, e in zip(begin_idx, end_idx): + slice_idx.append(slice(b, e)) + np_result = np_data[slice_idx] + return np_result + +def verify_slice_like(np_data, np_shape_like, axis=[]): + dtype = "float32" + np_data = np_data.astype(dtype) + np_shape_like = np_shape_like.astype(dtype) + np_result = np_slice_like(np_data, np_shape_like, axis) + data1 = sym.Variable("data1") + data2 = sym.Variable("data2") + net = sym.slice_like(data=data1, slice_like=data2, axis=axis) + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, {"data1": np_data.shape, + "data2": np_shape_like.shape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"data1": np_data, "data2": np_shape_like}) + m.run() + out = m.get_output(0, tvm.nd.empty(np_result.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5) + +def test_slice_like(): + np_data = np.random.uniform(size=(3, 4, 5)) + np_shape_like = np.random.uniform(size=(1, 2, 3)) + verify_slice_like(np_data, np_shape_like) + np_data = np.random.uniform(size=(3, 4, 5)) + np_shape_like = np.random.uniform(size=(1, 2)) + verify_slice_like(np_data, np_shape_like) + np_data = np.random.uniform(size=(3, 4, 5)) + np_shape_like = np.random.uniform(size=(1, 2, 3)) + axis = (1, 2) + verify_slice_like(np_data, np_shape_like, axis) + np_data = np.random.uniform(size=(3, 4, 5)) + np_shape_like = np.random.uniform(size=(1, 2, 3)) + axis = (-1, -3) + verify_slice_like(np_data, np_shape_like, axis) + np_data = np.random.uniform(size=(1, 3, 224, 224)) + np_shape_like = np.random.uniform(size=(1, 3, 112, 112)) + axis = (2, 3) + verify_slice_like(np_data, np_shape_like, axis) + +def verify_where(condition, x, y): + dtype = "float32" + if len(condition.shape) == 1: + np_out = np.array([xv if c else yv for (c,xv,yv) in zip(condition,x,y)]) + else: + np_out = np.where(condition, x, y) + cond_var = sym.Variable("condition") + x_var = sym.Variable("x") + y_var = sym.Variable("y") + net = sym.where(cond_var, x_var, y_var) + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, {"condition": condition.shape, + "x": x.shape, "y": y.shape}) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**{"condition": condition, "x": x, "y": y}) + m.run() + out = m.get_output(0, tvm.nd.empty(x.shape, dtype)) + tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5) + +def test_where(): + shape = (13, 8, 224, 224, 6) + condition = np.random.uniform(low=-1, high=1, size=shape).astype("float32") + x = np.random.uniform(size=shape).astype("float32") + y = np.random.uniform(size=shape).astype("float32") + verify_where(condition, x, y) + condition = np.random.uniform(low=-1, high=1, size=(shape[0],)).astype("float32") + x = np.random.uniform(size=shape).astype("float32") + y = np.random.uniform(size=shape).astype("float32") + verify_where(condition, x, y) + +def test_argmax(): + dshape = (204800, 2) + oshape = (1, 320, 640) + + dtype = "float32" + x = sym.Variable("x", shape=dshape, dtype=dtype) + x = sym.reshape(x, shape=(1, 320, 640, 2)) + x = sym.transpose(x, axes=(0, 3, 1, 2)) + y = sym.argmax(x, axis=1) + target_str = "llvm" + target = tvm.target.create(target_str) + ctx = tvm.context(target_str, 0) + with nnvm.compiler.build_config(opt_level=2): + graph, lib, _ = nnvm.compiler.build(y, 
target, {"x": dshape}) + m = graph_runtime.create(graph, lib, ctx) + data = np.random.uniform(size=dshape).astype(dtype) + m.run(x=data) + np_reshape = np.reshape(data, (1, 320, 640, 2)) + np_transpose = np.transpose(np_reshape, axes=(0, 3, 1, 2)) + np_argmax = np.argmax(np_transpose, axis=1) + out = m.get_output(0) + np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5) + +if __name__ == "__main__": + test_reshape() + test_broadcast() + test_reduce() + test_collapse() + test_transpose() + test_clip() + test_greater() + test_less() + test_reshape_like() + test_expand_like() + test_elemwise_sum() + test_block_grad() + test_full() + test_flip() + test_multibox_prior() + test_multibox_transform_loc() + test_non_max_suppression() + test_slice_like() + test_where() + test_argmax() + print(nnvm.compiler.engine.dump()) diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py new file mode 100644 index 000000000000..2dc1f08f6ec9 --- /dev/null +++ b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Store for caffe2 examples and common models.""" +from __future__ import absolute_import as _abs +import os +import importlib + +models = [ + 'squeezenet', + 'resnet50', + 'vgg19', +] + +# skip download if model exist +for model in models: + try: + locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model) + except ImportError: + os.system("python -m caffe2.python.models.download -i -f " + model) + locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model) diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py new file mode 100644 index 000000000000..2de2d1075494 --- /dev/null +++ b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
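The caffe2 model_zoo __init__ above retries the import after shelling out to the model downloader. A minimal equivalent sketch (hypothetical helper; subprocess.check_call is used instead of os.system so a failed download raises rather than being silently ignored):

    import importlib
    import subprocess

    def load_caffe2_model(name):
        """Import caffe2.python.models.<name>, downloading it on first use."""
        module = 'caffe2.python.models.' + name
        try:
            return importlib.import_module(module)
        except ImportError:
            subprocess.check_call(
                ['python', '-m', 'caffe2.python.models.download', '-i', '-f', name])
            return importlib.import_module(module)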
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size." (2016).
+"""
+
+from nnvm import symbol as sym
+from nnvm.testing.utils import create_workload
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE: assumes NCHW layout here
+    net = sym.concatenate(left, right, axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                     padding=(padding, padding))
+    net = sym.relu(net)
+    return net
+
+# Net
+def get_symbol(num_classes, version, **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        SqueezeNet version; only "1.1" is currently supported
+    """
+    assert version == '1.1', ("Unsupported SqueezeNet version {version}: "
+                              "1.1 expected".format(version=version))
+    net = sym.Variable("data")
+
+    net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2))
+    net = sym.relu(net)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 16, 64, 64)
+    net = _make_fire(net, 16, 64, 64)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 32, 128, 128)
+    net = _make_fire(net, 32, 128, 128)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 64, 256, 256)
+    net = _make_fire(net, 64, 256, 256)
+
+    net = sym.dropout(net, rate=0.5)
+    net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1))
+    net = sym.relu(net)
+    net = sym.global_avg_pool2d(net)
+    return sym.softmax(net, axis=1)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.1',
+                 image_shape=(3, 224, 224), dtype="float32", **kwargs):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        SqueezeNet version; only "1.1" is currently supported
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, version=version, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/tests/python/frontend/caffe2/test_forward.py b/nnvm/tests/python/frontend/caffe2/test_forward.py
new file mode 100644
index 000000000000..2a216314ba1a
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/test_forward.py
@@ -0,0 +1,108 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import nnvm +import tvm +from tvm.contrib import graph_runtime +from nnvm.testing.config import ctx_list +from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19 + +from caffe2.python import workspace + + +def get_tvm_output(model, + input_data, + target, + ctx, + output_shape, + output_dtype='float32'): + """ Generic function to execute and get tvm output""" + sym, params = nnvm.frontend.from_caffe2(model.init_net, model.predict_net) + + # supporting multiple inputs in caffe2 in a bit tricky, + # because the input names can appear at the beginning or end of model.predict_net.external_input + assert isinstance(input_data, np.ndarray) + + # here we use the first input blob to the first op to get the input name + input_names = model.predict_net.op[0].input[0] + shape_dict = {input_names: input_data.shape} + dtype_dict = {input_names: input_data.dtype} + + graph, lib, params = nnvm.compiler.build( + sym, target, shape=shape_dict, dtype=dtype_dict, params=params) + + m = graph_runtime.create(graph, lib, ctx) + + # set inputs + m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype))) + m.set_input(**params) + + # execute + m.run() + + # get outputs + if isinstance(output_shape, list) and isinstance(output_dtype, list): + tvm_output_list = [] + for i, s in enumerate(output_shape): + tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i])) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list + else: + tvm_output = m.get_output(0, tvm.nd.empty((output_shape), + output_dtype)) + return tvm_output.asnumpy() + + +def get_caffe2_output(model, x, dtype='float32'): + workspace.RunNetOnce(model.init_net) + + input_blob = model.predict_net.op[0].input[0] + workspace.FeedBlob(input_blob, x.astype(dtype)) + workspace.RunNetOnce(model.predict_net) + + output_blob = model.predict_net.external_output[0] + c2_output = workspace.FetchBlob(output_blob) + return c2_output + + +def verify_caffe2_forward_impl(model, data_shape, out_shape): + dtype = 'float32' + data = np.random.uniform(size=data_shape).astype(dtype) + c2_out = get_caffe2_output(model, data, dtype) + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype) + tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5) + + +def test_squeezenet1_1(): + verify_caffe2_forward_impl(c2_squeezenet, (1, 3, 224, 224), + (1, 1000, 1, 1)) + + +def test_resnet50(): + verify_caffe2_forward_impl(c2_resnet50, (1, 3, 224, 224), + (1, 1000)) + + +def test_vgg19(): + verify_caffe2_forward_impl(c2_vgg19, (1, 3, 224, 224), (1, 1000)) + + +if __name__ == '__main__': + test_squeezenet1_1() + test_resnet50() + test_vgg19() diff --git a/nnvm/tests/python/frontend/caffe2/test_graph.py b/nnvm/tests/python/frontend/caffe2/test_graph.py new file mode 100644 index 000000000000..c8203815e6d0 --- /dev/null +++ b/nnvm/tests/python/frontend/caffe2/test_graph.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test graph equality of caffe2 models.""" +import nnvm +from nnvm.compiler import graph_util, graph_attr +from model_zoo import c2_squeezenet, squeezenet + +def compare_graph(init, predict, nnvm_sym, ishape): + caffe2_sym, params = nnvm.frontend.from_caffe2(init, predict) + g1 = nnvm.graph.create(caffe2_sym) + g2 = nnvm.graph.create(nnvm_sym) + input_name = predict.external_input[0] + ishapes = {input_name: ishape} + graph_attr.set_shape_inputs(g1, ishapes) + graph_attr.set_shape_inputs(g2, ishapes) + g1 = g1.apply("InferShape").apply("SimplifyInference") + g2 = g2.apply("InferShape").apply("SimplifyInference") + graph_util.check_graph_equal(g1, g2) + +def test_squeeze_net(): + symbol, params = squeezenet.get_workload(version='1.1') + compare_graph(c2_squeezenet.init_net, c2_squeezenet.predict_net, symbol, ishape=(1, 3, 224, 224)) + + +if __name__ == '__main__': + test_squeeze_net() diff --git a/nnvm/tests/python/frontend/coreml/model_zoo/.gitignore b/nnvm/tests/python/frontend/coreml/model_zoo/.gitignore new file mode 100644 index 000000000000..4242a1b2e2e0 --- /dev/null +++ b/nnvm/tests/python/frontend/coreml/model_zoo/.gitignore @@ -0,0 +1,3 @@ +*.mlmodel +*.jpg +*.png diff --git a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py new file mode 100644 index 000000000000..2dbaf2b10483 --- /dev/null +++ b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
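The get_cat_image helper below resizes to 224x224 and converts PIL's HWC layout into the NCHW batch the frontends expect. The same conversion in isolation (a sketch; works for any HWC image array):

    import numpy as np

    def hwc_to_nchw(img_hwc):
        """(H, W, C) -> (1, C, H, W) batch of one."""
        chw = np.transpose(np.asarray(img_hwc), (2, 0, 1))
        return chw[np.newaxis, :]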
+ +import os +from PIL import Image +import numpy as np +from tvm.contrib.download import download_testdata + +def get_mobilenet(): + url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' + dst = 'mobilenet.mlmodel' + real_dst = download_testdata(url, dst, module='coreml') + return real_dst + +def get_resnet50(): + url = 'https://docs-assets.developer.apple.com/coreml/models/Resnet50.mlmodel' + dst = 'resnet50.mlmodel' + real_dst = download_testdata(url, dst, module='coreml') + return real_dst + +def get_cat_image(): + url = 'https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png' + dst = 'cat.png' + real_dst = download_testdata(url, dst, module='data') + img = Image.open(real_dst).resize((224, 224)) + img = np.transpose(img, (2, 0, 1))[np.newaxis, :] + return np.asarray(img) diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py new file mode 100644 index 000000000000..7a9f294f4359 --- /dev/null +++ b/nnvm/tests/python/frontend/coreml/test_forward.py @@ -0,0 +1,370 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
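The model-zoo helpers above lean on download_testdata, which caches each file under a per-module test-data directory and returns the local path, so repeated runs skip the network. Minimal usage sketch (the URL is the same MobileNet asset used above; the printed path is an assumption about the default cache location):

    from tvm.contrib.download import download_testdata

    url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'
    path = download_testdata(url, 'mobilenet.mlmodel', module='coreml')
    print(path)  # e.g. ~/.tvm_test_data/coreml/mobilenet.mlmodel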
+import numpy as np + +from coremltools.models.neural_network import NeuralNetworkBuilder +from coremltools.models import datatypes + +import tvm +from tvm.contrib import graph_runtime +import topi +import topi.testing +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list +from nnvm import frontend +import coremltools as cm +import model_zoo + +def get_tvm_output(symbol, x, params, target, ctx, + out_shape=(1, 1000), input_name='image', dtype='float32'): + shape_dict = {input_name : x.shape} + with nnvm.compiler.build_config(opt_level=2): + graph, lib, params = nnvm.compiler.build(symbol, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input(input_name, tvm.nd.array(x.astype(dtype))) + m.set_input(**params) + m.run() + # get outputs + out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) + return out.asnumpy() + +def run_model_checkonly(model_file, model_name=''): + model = cm.models.MLModel(model_file) + sym, params = nnvm.frontend.from_coreml(model) + x = model_zoo.get_cat_image() + for target, ctx in ctx_list(): + tvm_output = get_tvm_output(sym, x, params, target, ctx) + print(target, ctx, model_name, 'prediction id: ', np.argmax(tvm_output.flat)) + +def test_mobilenet_checkonly(): + model_file = model_zoo.get_mobilenet() + run_model_checkonly(model_file, 'mobilenet') + +def test_resnet50_checkonly(): + model_file = model_zoo.get_resnet50() + run_model_checkonly(model_file, 'resnet50') + +def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'): + """ Generic function to compile on nnvm and execute on tvm """ + + sym, params = nnvm.frontend.from_coreml(graph_def) + target = 'llvm' + if isinstance(input_data, list): + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_name): + shape_dict[e] = input_data[i].shape + dtype_dict[e] = input_data[i].dtype + else: + shape_dict = {input_name: input_data.shape} + dtype_dict = {input_name: input_data.dtype} + + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, + dtype=dtype_dict, params=params) + + ctx = tvm.cpu(0) + from tvm.contrib import graph_runtime + m = graph_runtime.create(graph, lib, ctx) + # set inputs + if isinstance(input_data, list): + for i, e in enumerate(input_name): + m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) + else: + m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype))) + + m.set_input(**params) + # execute + m.run() + # get outputs + if isinstance(output_shape, list) and isinstance(output_dtype, list): + tvm_output_list = [] + for i, s in enumerate(output_shape): + tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i])) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list + else: + tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype)) + return tvm_output.asnumpy() + +def verify_AddLayerParams(input_dim, alpha=2): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.add(a_np1, a_np2) + alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Add', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='ADD') + model = cm.models.MLModel(builder.spec) + for target, ctx in 
ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_AddLayerParams(): + verify_AddLayerParams((1, 2, 2), 0) + verify_AddLayerParams((1, 2, 2), 1) + verify_AddLayerParams((1, 3, 3), 2) + +def verify_MultiplyLayerParams(input_dim, alpha): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.multiply(a_np1, a_np2) * alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Mul', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='MULTIPLY') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_MultiplyLayerParams(): + verify_MultiplyLayerParams((1, 2, 2), 0) + verify_MultiplyLayerParams((1, 2, 2), 1) + verify_MultiplyLayerParams((1, 3, 3), 2) + +def verify_ConcatLayerParams(input1_dim, input2_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input1_dim).astype(dtype) + a_np2 = np.random.uniform(size=input2_dim).astype(dtype) + + b_np = np.concatenate((a_np1, a_np2), axis=1) + inputs = [('input1', datatypes.Array(*input1_dim)), + ('input2', datatypes.Array(*input2_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Concate', + input_names=['input1', 'input2'], + output_name='output', + mode='CONCAT') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_ConcatLayerParams(): + verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2)) + verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4)) + +def verify_UpsampleLayerParams(input_dim, scale, mode): + dtype = "float32" + + a_np = np.full(input_dim, 1, dtype=dtype) + if mode == 'NN': + b_np = topi.testing.upsampling_python(a_np, (scale, scale)) + else: + new_h = input_dim[2] * scale + new_w = input_dim[3] * scale + b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW') + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_upsample(name='Upsample', + scaling_factor_h=scale, + scaling_factor_w=scale, + mode=mode, + input_name='input', + output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_UpsampleLayerParams(): + verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN') + verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR') + +def verify_l2_normalize(input_dim, eps): + dtype = "float32" + + a_np = np.random.uniform(size=input_dim).astype(dtype) + b_np = topi.testing.l2_normalize_python(a_np, eps, 1) + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + 
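# (Reference math assumed here: topi.testing.l2_normalize_python with axis=1
# should reduce to the NumPy expression
#   a_np / np.sqrt(np.maximum(np.sum(a_np ** 2, axis=1, keepdims=True), eps))
# i.e. L2 normalization over channels, with eps as a lower bound on the
# squared norm.)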
builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_l2_normalize(): + verify_l2_normalize((1, 3, 20, 20), 0.001) + +def verify_lrn(input_dim, size, bias, alpha, beta): + dtype = "float32" + axis=1 + a_np = np.random.uniform(size=input_dim).astype(dtype) + b_np = topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta) + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_lrn(name='LRN', + input_name='input', + output_name='output', + alpha=alpha, + beta=beta, + k=bias, + local_size=size) + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_lrn(): + verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5) + +def verify_average(input_dim1, input_dim2, axis=0): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim1).astype(dtype) + a_np2 = np.random.uniform(size=input_dim2).astype(dtype) + + b_np = np.mean((a_np1, a_np2), axis=axis) + + inputs = [('input1', datatypes.Array(*input_dim1)), + ('input2', datatypes.Array(*input_dim2))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='MEAN', + input_names=['input1', 'input2'], + output_name='output', + mode='AVE') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_average(): + verify_average((1, 3, 20, 20), (1, 3, 20, 20)) + verify_average((3, 20, 20), (1, 3, 20, 20)) + verify_average((20, 20), (1, 3, 20, 20)) + +def verify_max(input_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.max((a_np1, a_np2, a_np3), axis=0) + + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim)), + ('input3', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Max', + input_names=['input1', 'input2', 'input3'], + output_name='output', + mode='MAX') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2, a_np3], + ['input1', 'input2', 'input3'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_max(): + verify_max((1, 3, 20, 20)) + verify_max((20, 20)) + +def verify_min(input_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.min((a_np1, a_np2, a_np3), axis=0) + + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim)), + ('input3', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + 
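# (The np.min reference above is elementwise over the stacked inputs, so it
# is equivalent to the pairwise form the CoreML MIN mode is expected to
# compute:
#   np.minimum(a_np1, np.minimum(a_np2, a_np3))
# )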
builder.add_elementwise(name='Min', + input_names=['input1', 'input2', 'input3'], + output_name='output', + mode='MIN') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2, a_np3], + ['input1', 'input2', 'input3'], + b_np.shape, + dtype) + tvm.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_min(): + verify_min((1, 3, 20, 20)) + verify_min((20, 20)) + +if __name__ == '__main__': + test_mobilenet_checkonly() + test_resnet50_checkonly() + test_forward_AddLayerParams() + test_forward_ConcatLayerParams() + test_forward_MultiplyLayerParams() + test_forward_UpsampleLayerParams() + test_forward_l2_normalize() + test_forward_lrn() + test_forward_average() + test_forward_max() + test_forward_min() diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py new file mode 100644 index 000000000000..4e62ff2e1f33 --- /dev/null +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -0,0 +1,525 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Compile Darknet Models +===================== +This article is a test script to test darknet models with NNVM. +All the required models and libraries will be downloaded from the internet +by the script. 
+""" +import numpy as np +import tvm +from tvm.contrib import graph_runtime +from tvm.contrib.download import download_testdata +download_testdata.__test__ = False +from nnvm import frontend +from tvm.relay.testing.darknet import LAYERTYPE +from tvm.relay.testing.darknet import __darknetffi__ +import nnvm.compiler + +DARKNET_LIB = 'libdarknet2.0.so' +DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \ + + DARKNET_LIB + '?raw=true' +LIB = __darknetffi__.dlopen(download_testdata(DARKNETLIB_URL, DARKNET_LIB, module='darknet')) + +DARKNET_TEST_IMAGE_NAME = 'dog.jpg' +DARKNET_TEST_IMAGE_URL = 'https://github.com/siju-samuel/darknet/blob/master/data/' + DARKNET_TEST_IMAGE_NAME +'?raw=true' +DARKNET_TEST_IMAGE_PATH = download_testdata(DARKNET_TEST_IMAGE_URL, DARKNET_TEST_IMAGE_NAME, module='data') + +def _read_memory_buffer(shape, data, dtype='float32'): + length = 1 + for x in shape: + length *= x + data_np = np.zeros(length, dtype=dtype) + for i in range(length): + data_np[i] = data[i] + return data_np.reshape(shape) + +def _get_tvm_output(net, data, build_dtype='float32'): + '''Compute TVM output''' + dtype = 'float32' + sym, params = frontend.darknet.from_darknet(net, dtype) + + target = 'llvm' + shape_dict = {'data': data.shape} + graph, library, params = nnvm.compiler.build(sym, target, shape_dict, + build_dtype, params=params) + # Execute on TVM + ctx = tvm.cpu(0) + m = graph_runtime.create(graph, library, ctx) + # set inputs + m.set_input('data', tvm.nd.array(data.astype(dtype))) + m.set_input(**params) + m.run() + # get outputs + tvm_out = [] + for i in range(m.get_num_outputs()): + tvm_out.append(m.get_output(i).asnumpy()) + return tvm_out + +def _load_net(cfg_url, cfg_name, weights_url, weights_name): + cfg_path = download_testdata(cfg_url, cfg_name, module='darknet') + weights_path = download_testdata(weights_url, weights_name, module='darknet') + net = LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) + return net + +def verify_darknet_frontend(net, build_dtype='float32'): + '''Test network with given input image on both darknet and tvm''' + def get_darknet_output(net, img): + LIB.network_predict_image(net, img) + out = [] + for i in range(net.n): + layer = net.layers[i] + if layer.type == LAYERTYPE.REGION: + attributes = np.array([layer.n, layer.out_c, layer.out_h, + layer.out_w, layer.classes, + layer.coords, layer.background], + dtype=np.int32) + out.insert(0, attributes) + out.insert(0, _read_memory_buffer((layer.n*2, ), layer.biases)) + layer_outshape = (layer.batch, layer.out_c, + layer.out_h, layer.out_w) + out.insert(0, _read_memory_buffer(layer_outshape, layer.output)) + elif layer.type == LAYERTYPE.YOLO: + attributes = np.array([layer.n, layer.out_c, layer.out_h, + layer.out_w, layer.classes, + layer.total], + dtype=np.int32) + out.insert(0, attributes) + out.insert(0, _read_memory_buffer((layer.total*2, ), layer.biases)) + out.insert(0, _read_memory_buffer((layer.n, ), layer.mask, dtype='int32')) + layer_outshape = (layer.batch, layer.out_c, + layer.out_h, layer.out_w) + out.insert(0, _read_memory_buffer(layer_outshape, layer.output)) + elif i == net.n-1: + if layer.type == LAYERTYPE.CONNECTED: + darknet_outshape = (layer.batch, layer.out_c) + elif layer.type in [LAYERTYPE.SOFTMAX]: + darknet_outshape = (layer.batch, layer.outputs) + else: + darknet_outshape = (layer.batch, layer.out_c, + layer.out_h, layer.out_w) + out.insert(0, _read_memory_buffer(darknet_outshape, layer.output)) + return out + + dtype = 'float32' + + 
img = LIB.letterbox_image(LIB.load_image_color(DARKNET_TEST_IMAGE_PATH.encode('utf-8'), 0, 0), net.w, net.h) + darknet_output = get_darknet_output(net, img) + batch_size = 1 + data = np.empty([batch_size, img.c, img.h, img.w], dtype) + i = 0 + for c in range(img.c): + for h in range(img.h): + for k in range(img.w): + data[0][c][h][k] = img.data[i] + i = i + 1 + + tvm_out = _get_tvm_output(net, data, build_dtype) + for tvm_outs, darknet_out in zip(tvm_out, darknet_output): + tvm.testing.assert_allclose(darknet_out, tvm_outs, rtol=1e-3, atol=1e-3) + +def verify_rnn_forward(net): + '''Test network with given input data on both darknet and tvm''' + def get_darknet_network_predict(net, data): + return LIB.network_predict(net, data) + from cffi import FFI + ffi = FFI() + np_arr = np.zeros([1, net.inputs], dtype='float32') + np_arr[0, 84] = 1 + cffi_arr = ffi.cast('float*', np_arr.ctypes.data) + tvm_out = _get_tvm_output(net, np_arr)[0] + darknet_output = get_darknet_network_predict(net, cffi_arr) + darknet_out = np.zeros(net.outputs, dtype='float32') + for i in range(net.outputs): + darknet_out[i] = darknet_output[i] + last_layer = net.layers[net.n-1] + darknet_outshape = (last_layer.batch, last_layer.outputs) + darknet_out = darknet_out.reshape(darknet_outshape) + tvm.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4) + +def test_forward_extraction(): + '''test extraction model''' + model_name = 'extraction' + cfg_name = model_name + '.cfg' + weights_name = model_name + '.weights' + cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' + weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_alexnet(): + '''test alexnet model''' + model_name = 'alexnet' + cfg_name = model_name + '.cfg' + weights_name = model_name + '.weights' + cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' + weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_resnet50(): + '''test resnet50 model''' + model_name = 'resnet50' + cfg_name = model_name + '.cfg' + weights_name = model_name + '.weights' + cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' + weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_yolov2(): + '''test yolov2 model''' + model_name = 'yolov2' + cfg_name = model_name + '.cfg' + weights_name = model_name + '.weights' + cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' + weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) + build_dtype = {} + verify_darknet_frontend(net, build_dtype) + LIB.free_network(net) + +def test_forward_yolov3(): + '''test yolov3 model''' + model_name = 'yolov3' + cfg_name = model_name + '.cfg' + weights_name = model_name + '.weights' + cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' + weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' + net = _load_net(cfg_url, cfg_name, 
weights_url, weights_name) + build_dtype = {} + verify_darknet_frontend(net, build_dtype) + LIB.free_network(net) + +def test_forward_convolutional(): + '''test convolutional layer''' + net = LIB.make_network(1) + layer = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0) + net.layers[0] = layer + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_dense(): + '''test fully connected layer''' + net = LIB.make_network(1) + layer = LIB.make_connected_layer(1, 75, 20, 1, 0, 0) + net.layers[0] = layer + net.w = net.h = 5 + LIB.resize_network(net, 5, 5) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_dense_batchnorm(): + '''test fully connected layer with batchnorm''' + net = LIB.make_network(1) + layer = LIB.make_connected_layer(1, 12, 2, 1, 1, 0) + for i in range(5): + layer.rolling_mean[i] = np.random.rand(1) + layer.rolling_variance[i] = np.random.rand(1) + layer.scales[i] = np.random.rand(1) + net.layers[0] = layer + net.w = net.h = 2 + LIB.resize_network(net, 2, 2) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_maxpooling(): + '''test maxpooling layer''' + net = LIB.make_network(1) + layer = LIB.make_maxpool_layer(1, 224, 224, 3, 2, 2, 0) + net.layers[0] = layer + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_avgpooling(): + '''test avgerage pooling layer''' + net = LIB.make_network(1) + layer = LIB.make_avgpool_layer(1, 224, 224, 3) + net.layers[0] = layer + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_batch_norm(): + '''test batch normalization layer''' + net = LIB.make_network(1) + layer = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 1, 0, 0, 0) + for i in range(32): + layer.rolling_mean[i] = np.random.rand(1) + layer.rolling_variance[i] = np.random.rand(1) + net.layers[0] = layer + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_shortcut(): + '''test shortcut layer''' + net = LIB.make_network(3) + layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0) + layer_2 = LIB.make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0) + layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32) + layer_3.activation = 1 + layer_3.alpha = 1 + layer_3.beta = 1 + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.layers[2] = layer_3 + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_reorg(): + '''test reorg layer''' + net = LIB.make_network(2) + layer_1 = LIB.make_convolutional_layer(1, 222, 222, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0) + layer_2 = LIB.make_reorg_layer(1, 110, 110, 32, 2, 0, 0, 0) + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.w = net.h = 222 + LIB.resize_network(net, 222, 222) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_region(): + '''test region layer''' + net = LIB.make_network(2) + layer_1 = LIB.make_convolutional_layer(1, 19, 19, 3, 425, 1, 1, 1, 0, 1, 0, 0, 0, 0) + layer_2 = LIB.make_region_layer(1, 19, 19, 5, 80, 4) + layer_2.softmax = 1 + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.w = net.h = 19 + LIB.resize_network(net, 19, 19) + build_dtype = {} + 
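# (All the single-layer tests in this file share one skeleton; a hypothetical
# helper capturing it, assuming the same LIB handle:
#   def run_single_layer(layer, size, build_dtype='float32'):
#       net = LIB.make_network(1)
#       net.layers[0] = layer
#       net.w = net.h = size
#       LIB.resize_network(net, size, size)
#       verify_darknet_frontend(net, build_dtype)
#       LIB.free_network(net)
# build_dtype={} is passed for the region and yolo tests, presumably so the
# build infers dtypes per output, since those layers also emit int32
# attribute arrays alongside the float outputs.)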
verify_darknet_frontend(net, build_dtype) + LIB.free_network(net) + +def test_forward_yolo_op(): + '''test yolo layer''' + net = LIB.make_network(2) + layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0) + layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 9, __darknetffi__.NULL, 2) + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + build_dtype = {} + verify_darknet_frontend(net, build_dtype) + LIB.free_network(net) + +def test_forward_upsample(): + '''test upsample layer''' + net = LIB.make_network(1) + layer = LIB.make_upsample_layer(1, 19, 19, 3, 3) + layer.scale = 1 + net.layers[0] = layer + net.w = net.h = 19 + LIB.resize_network(net, 19, 19) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_l2normalize(): + '''test l2 normalization layer''' + net = LIB.make_network(1) + layer = LIB.make_l2norm_layer(1, 224*224*3) + layer.c = layer.out_c = 3 + layer.h = layer.out_h = 224 + layer.w = layer.out_w = 224 + net.layers[0] = layer + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_elu(): + '''test elu activation layer''' + net = LIB.make_network(1) + layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 32, 1, 3, 2, 0, 1, 0, 0, 0, 0) + layer_1.activation = 8 + net.layers[0] = layer_1 + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_softmax(): + '''test softmax layer''' + net = LIB.make_network(1) + layer_1 = LIB.make_softmax_layer(1, 75, 1) + layer_1.temperature = 1 + net.layers[0] = layer_1 + net.w = net.h = 5 + LIB.resize_network(net, net.w, net.h) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_softmax_temperature(): + '''test softmax layer''' + net = LIB.make_network(1) + layer_1 = LIB.make_softmax_layer(1, 75, 1) + layer_1.temperature = 0.8 + net.layers[0] = layer_1 + net.w = net.h = 5 + LIB.resize_network(net, net.w, net.h) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_rnn(): + '''test RNN layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + activation = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_rnn_layer(batch, inputs, outputs, steps, activation, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + verify_rnn_forward(net) + LIB.free_network(net) + +def _test_forward_crnn(): + '''test CRNN layer''' + net = LIB.make_network(1) + batch = 1 + c = 3 + h = 224 + w = 224 + hidden_filters = c + output_filters = c + steps = 1 + activation = 0 + batch_normalize = 0 + inputs = 256 + outputs = 256 + layer_1 = LIB.make_crnn_layer(batch, h, w, c, hidden_filters, output_filters, + steps, activation, batch_normalize) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = output_filters * h * w + net.w = w + net.h = h + LIB.resize_network(net, net.w, net.h) + verify_darknet_frontend(net) + LIB.free_network(net) + +def test_forward_lstm(): + '''test LSTM layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + 
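# (As in the RNN and GRU tests, net.w = net.h = 0 because recurrent layers
# have no spatial extent; verify_rnn_forward then probes both runtimes with
# a single one-hot vector:
#   np_arr = np.zeros([1, net.inputs], dtype='float32'); np_arr[0, 84] = 1
# where index 84 is arbitrary; any single active input exercises the cell.)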
verify_rnn_forward(net) + LIB.free_network(net) + +def test_forward_gru(): + '''test GRU layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + verify_rnn_forward(net) + LIB.free_network(net) + +def test_forward_activation_logistic(): + '''test logistic activation layer''' + net = LIB.make_network(1) + batch = 1 + h = 224 + w = 224 + c = 3 + n = 32 + groups = 1 + size = 3 + stride = 2 + padding = 0 + activation = 0 + batch_normalize = 0 + binary = 0 + xnor = 0 + adam = 0 + layer_1 = LIB.make_convolutional_layer(batch, h, w, c, n, groups, size, stride, padding, + activation, batch_normalize, binary, xnor, adam) + net.layers[0] = layer_1 + net.w = w + net.h = h + LIB.resize_network(net, net.w, net.h) + verify_darknet_frontend(net) + LIB.free_network(net) + +if __name__ == '__main__': + test_forward_resnet50() + test_forward_alexnet() + test_forward_extraction() + test_forward_yolov2() + test_forward_yolov3() + test_forward_convolutional() + test_forward_maxpooling() + test_forward_avgpooling() + test_forward_batch_norm() + test_forward_shortcut() + test_forward_dense() + test_forward_dense_batchnorm() + test_forward_softmax() + test_forward_softmax_temperature() + test_forward_rnn() + test_forward_reorg() + test_forward_region() + test_forward_yolo_op() + test_forward_upsample() + test_forward_l2normalize() + test_forward_elu() + test_forward_rnn() +# FIXME: Skip CRNN test since it causes segfault in libdarknet2.0.so +# _test_forward_crnn() + test_forward_lstm() + test_forward_gru() + test_forward_activation_logistic() diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py new file mode 100644 index 000000000000..78e4204e8250 --- /dev/null +++ b/nnvm/tests/python/frontend/keras/test_forward.py @@ -0,0 +1,354 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import nnvm +import tvm +from tvm.contrib import graph_runtime +from nnvm.testing.config import ctx_list +import keras + +# prevent keras from using up all gpu memory +import tensorflow as tf +from keras.backend.tensorflow_backend import set_session +config = tf.ConfigProto() +config.gpu_options.per_process_gpu_memory_fraction = 0.5 +set_session(tf.Session(config=config)) + + +def verify_keras_frontend(keras_model, need_transpose=True): + # Keras frontend currently supports tensorflow backend only. 
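# (Keras tensors are NHWC by default while the NNVM build here is NCHW, hence
# the need_transpose flag and the channel helpers defined below; for a 4-D
# array they reduce to:
#   to_channels_first: x.transpose(0, 3, 1, 2)
#   to_channels_last:  x.transpose(0, 2, 3, 1)
# the generic versions below handle any rank.)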
+ assert(keras.backend.backend() == 'tensorflow') + + in_shapes = [] + for layer in keras_model._input_layers: + in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape)) + + def get_keras_output(xs, dtype='float32'): + return keras_model.predict(xs) + + def get_tvm_output(xs, target, ctx, dtype='float32'): + sym, params = nnvm.frontend.from_keras(keras_model) + shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)} + with nnvm.compiler.build_config(opt_level=2): + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + for name, x in zip(keras_model.input_names, xs): + m.set_input(name, tvm.nd.array(x.astype(dtype))) + m.set_input(**params) + m.run() + + return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())] + + def to_channels_first(arr): + return arr.transpose([0, -1] + list(range(1, arr.ndim - 1))) + + def to_channels_last(arr): + return arr.transpose([0] + list(range(2, arr.ndim)) + [1]) + + xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes] + keras_out = get_keras_output(xs) + + keras_out = keras_out if isinstance(keras_out, list) else [keras_out] + for target, ctx in ctx_list(): + tvm_out = get_tvm_output([to_channels_first(x) for x in xs] if need_transpose else xs, target, ctx) + for kout, tout in zip(keras_out, tvm_out): + if need_transpose: + tout = to_channels_last(tout) + tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5) + +def test_forward_elemwise_add(): + r = [] + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(data) + r.append(x) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(x) + r.append(x) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(x) + # add two symbols + y = keras.layers.add([keras.layers.add([x, r[0]]), r[1]]) + y = keras.layers.GlobalAveragePooling2D()(y) + keras_model = keras.models.Model(data, y) + verify_keras_frontend(keras_model) + # add three symbols + y = keras.layers.add([x, r[0], r[1]]) + y = keras.layers.GlobalAveragePooling2D()(y) + keras_model = keras.models.Model(data, y) + verify_keras_frontend(keras_model) + + +def _test_forward_dense(): + data = keras.layers.Input(shape=(32,32,1)) + x = keras.layers.Flatten()(data) + x = keras.layers.Dropout(0.5)(x) + x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(x) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + +def _test_forward_dense_with_3d_inp(): + data = keras.layers.Input(shape=(1, 20)) + x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + +def test_forward_dense(): + _test_forward_dense() + _test_forward_dense_with_3d_inp() + +def test_forward_pool(): + data = keras.layers.Input(shape=(32,32,1)) + # maxpool + x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + # avgpool + y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data) + keras_model = keras.models.Model(data, y) + verify_keras_frontend(keras_model) + + +def test_forward_conv(): + data = keras.layers.Input(shape=(32,32,3)) + conv_funcs = [keras.layers.Conv2D(filters=10, kernel_size=(3,3), + strides=(2,2), padding='same'), + keras.layers.Conv2D(filters=10, kernel_size=(3,3), + 
dilation_rate=(2,2), padding='same'), + keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same'), + keras.layers.Conv2DTranspose(filters=10, kernel_size=(3,3), padding='valid'), + keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')] + for conv_func in conv_funcs: + x = conv_func(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + + +def test_forward_upsample(): + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.UpSampling2D(size=(3,3))(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + + +def test_forward_reshape(): + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Reshape(target_shape=(32,32,3))(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + + +def test_forward_crop(): + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data) + x = keras.layers.Cropping2D(cropping=(1, 1))(x) + x = keras.layers.Cropping2D(cropping=1)(x) + x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x) + x = keras.layers.Cropping2D(cropping=(1, 0))(x) + x = keras.layers.Cropping2D(cropping=0)(x) + x = keras.layers.Add()([x, x]) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + + +def test_forward_vgg16(): + keras_model = keras.applications.vgg16.VGG16(include_top=True, weights='imagenet', + input_shape=(224,224,3), classes=1000) + verify_keras_frontend(keras_model) + + +def test_forward_xception(): + keras_model = keras.applications.xception.Xception(include_top=True, weights='imagenet', + input_shape=(299,299,3), classes=1000) + verify_keras_frontend(keras_model) + + +def test_forward_resnet50(): + keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet', + input_shape=(224,224,3), classes=1000) + verify_keras_frontend(keras_model) + + +def test_forward_mobilenet(): + keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights='imagenet', + input_shape=(224,224,3), classes=1000) + verify_keras_frontend(keras_model) + + +def test_forward_activations(): + data = keras.layers.Input(shape=(32,32,3)) + weights = np.random.rand(1, 32, 32, 3) + act_funcs = [keras.layers.Activation('softmax'), + keras.layers.Activation('softplus'), + keras.layers.ReLU(), + keras.layers.ReLU(max_value=6.), + keras.layers.LeakyReLU(alpha=0.3), + keras.layers.PReLU(weights=weights, alpha_initializer="zero"), + keras.layers.ELU(alpha=0.5), + keras.layers.Activation('selu'), + keras.layers.ThresholdedReLU(theta=0.5), + keras.layers.Activation('softsign'), + keras.layers.Activation('hard_sigmoid'), + keras.layers.Activation('sigmoid'), + keras.layers.Activation('tanh'), + keras.layers.Activation('linear')] + for act_func in act_funcs: + x = act_func(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) + + +def test_forward_multi_inputs(): + data1 = keras.layers.Input(shape=(32,32,3)) + data2 = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(data1) + y = keras.layers.Conv2D(8, (3, 3), padding="same")(data2) + z = keras.layers.add([x, y]) + z = keras.layers.GlobalAveragePooling2D()(z) + keras_model = keras.models.Model([data1, data2], z) + verify_keras_frontend(keras_model) + + +def test_forward_multi_outputs(): + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(data) + x = keras.layers.GlobalAveragePooling2D()(x) + y = 
keras.layers.Conv2D(8, (3, 3), padding="same")(data) + y = keras.layers.GlobalAveragePooling2D()(y) + keras_model = keras.models.Model(data, [x, y]) + verify_keras_frontend(keras_model) + + +def test_forward_reuse_layers(): + # reuse conv2d + data = keras.layers.Input(shape=(32,32,3)) + conv2d = keras.layers.Conv2D(8, (3, 3), padding="same") + x = conv2d(data) + y = conv2d(data) + z = keras.layers.add([x, y]) + z = keras.layers.GlobalAveragePooling2D()(z) + keras_model = keras.models.Model(data, z) + verify_keras_frontend(keras_model) + + # reuse add + data = keras.layers.Input(shape=(32,32,3)) + x = keras.layers.Conv2D(8, (3, 3), padding="same")(data) + add = keras.layers.Add() + x = add([x, x]) + x = add([x, x]) + z = keras.layers.GlobalAveragePooling2D()(x) + keras_model = keras.models.Model(data, z) + verify_keras_frontend(keras_model) + +def _test_LSTM(time_steps, inputs, hidden, return_state=True): + data = keras.layers.Input(shape=(time_steps, inputs)) + lstm_out = keras.layers.LSTM(hidden, + return_state=return_state, + recurrent_activation='sigmoid', + activation='tanh') + x = lstm_out(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + +def _test_LSTM_MultiLayer(inputs, hidden): + inputs = keras.layers.Input(shape=(1, inputs)) + layer = keras.layers.LSTM(hidden, return_state=True, return_sequences=True, + recurrent_activation='sigmoid', + activation='tanh') + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = keras.layers.LSTM(hidden, recurrent_activation='sigmoid', + activation='tanh')(output, initial_state=state) + keras_model = keras.models.Model(inputs, output) + verify_keras_frontend(keras_model, need_transpose=False) + + +def test_forward_LSTM(): + _test_LSTM(1, 8, 8, return_state=True) + _test_LSTM(1, 4, 4, return_state=False) + _test_LSTM(20, 16, 256, return_state=False) + _test_LSTM_MultiLayer(4, 4) + +def _test_RNN(inputs, units): + data = keras.layers.Input(shape=(1, inputs)) + rnn_out = keras.layers.SimpleRNN(units, return_state=True, + activation='tanh') + x = rnn_out(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + +def _test_RNN_MultiLayer(inputs, units): + inputs = keras.layers.Input(shape=(1, inputs)) + layer = keras.layers.SimpleRNN(units, return_state=True, return_sequences=True, + activation='tanh') + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = keras.layers.SimpleRNN(units, activation='tanh')(output, initial_state=state) + keras_model = keras.models.Model(inputs, output) + verify_keras_frontend(keras_model, need_transpose=False) + +def test_forward_RNN(): + _test_RNN(2, 4) + _test_RNN(4, 3) + _test_RNN_MultiLayer(4, 12) + +def _test_GRU(inputs, units): + data = keras.layers.Input(shape=(1, inputs)) + gru_out = keras.layers.GRU(units, + return_state=True, + recurrent_activation='sigmoid', + activation='tanh') + x = gru_out(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + +def _test_GRU_MultiLayer(inputs, units): + inputs = keras.layers.Input(shape=(1, inputs)) + layer = keras.layers.GRU(units, + return_state=True, + return_sequences=True, + recurrent_activation='sigmoid', + activation='tanh') + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = keras.layers.GRU(units, recurrent_activation='sigmoid', + activation='tanh')(output, initial_state=state) + keras_model = keras.models.Model(inputs, output) 
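# (Each *_MultiLayer test chains state the same way: the first recurrent
# layer returns sequences plus its state, and the second consumes that state
# via initial_state. The pattern in isolation, for any of SimpleRNN/GRU/LSTM:
#   outputs = layer1(x)                          # [seq, state...]
#   out_seq, state = outputs[0], outputs[1:]
#   y = layer2(out_seq, initial_state=state)
# )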
+ verify_keras_frontend(keras_model, need_transpose=False) + +def test_forward_GRU(): + _test_GRU(2, 4) + _test_GRU(4, 3) + _test_GRU_MultiLayer(4, 4) + +if __name__ == '__main__': + test_forward_elemwise_add() + test_forward_activations() + test_forward_dense() + test_forward_pool() + test_forward_conv() + test_forward_upsample() + test_forward_reshape() + test_forward_crop() + test_forward_vgg16() + test_forward_xception() + test_forward_resnet50() + test_forward_mobilenet() + + test_forward_multi_inputs() + test_forward_multi_outputs() + test_forward_reuse_layers() + test_forward_LSTM() + test_forward_RNN() + test_forward_GRU() diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py new file mode 100644 index 000000000000..3922ba673f2f --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""MXNet and NNVM model zoo.""" +from __future__ import absolute_import +from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3 +import nnvm.testing + +_num_class = 1000 + +# mlp fc +mx_mlp = mlp.get_symbol(_num_class) +nnvm_mlp = nnvm.testing.mlp.get_workload(1, _num_class)[0] + +# resnet fc +mx_resnet = {} +nnvm_resnet = {} +for num_layer in [18, 34, 50, 101, 152, 200, 269]: + mx_resnet[num_layer] = resnet.get_symbol(_num_class, num_layer, '3,224,224') + nnvm_resnet[num_layer] = nnvm.testing.resnet.get_workload( + 1, _num_class, num_layers=num_layer)[0] + +# vgg fc +mx_vgg = {} +nnvm_vgg = {} +for num_layer in [11, 13, 16, 19]: + mx_vgg[num_layer] = vgg.get_symbol(_num_class, num_layer) + nnvm_vgg[num_layer] = nnvm.testing.vgg.get_workload( + 1, _num_class, num_layers=num_layer)[0] + +# squeezenet +mx_squeezenet = {} +nnvm_squeezenet = {} +for version in ['1.0', '1.1']: + mx_squeezenet[version] = squeezenet.get_symbol(version=version) + nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0] + +# inception +mx_inception_v3 = inception_v3.get_symbol() +nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0] + +# dqn +mx_dqn = dqn.get_symbol() +nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0] + +# dcgan generator +mx_dcgan = dcgan.get_symbol() +nnvm_dcgan = nnvm.testing.dcgan.get_workload(1)[0] diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py new file mode 100644 index 000000000000..e606b78e1597 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +The MXNet symbol of DCGAN generator + +Adopted from: +https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py + +Reference: +Radford, Alec, Luke Metz, and Soumith Chintala. +"Unsupervised representation learning with deep convolutional generative adversarial networks." +arXiv preprint arXiv:1511.06434 (2015). +""" + +import mxnet as mx + +def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)): + """a deconv layer that enlarges the feature map""" + target_shape = (oshape[-2], oshape[-1]) + pad_y = (kshape[0] - 1) // 2 + pad_x = (kshape[1] - 1) // 2 + adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0] + adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1] + + net = mx.sym.Deconvolution(data, + kernel=kshape, + stride=stride, + pad=(pad_y, pad_x), + adj=(adj_y, adj_x), + num_filter=oshape[0], + no_bias=True, + name=name) + return net + +def deconv2d_bn_relu(data, prefix, **kwargs): + """a block of deconv + batch norm + relu""" + eps = 1e-5 + 1e-12 + + net = deconv2d(data, name="%s_deconv" % prefix, **kwargs) + net = mx.sym.BatchNorm(net, eps=eps, name="%s_bn" % prefix) + net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu') + return net + +def get_symbol(oshape=(3, 64, 64), ngf=128, code=None): + """get symbol of dcgan generator""" + assert oshape[-1] == 64, "Only support 64x64 image" + assert oshape[-2] == 64, "Only support 64x64 image" + + code = mx.sym.Variable("data") if code is None else code + net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False) + net = mx.sym.Activation(net, act_type='relu') + # 4 x 4 + net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4)) + # 8 x 8 + net = deconv2d_bn_relu( + net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2") + # 16x16 + net = deconv2d_bn_relu( + net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3") + # 32x32 + net = deconv2d_bn_relu( + net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4") + # 64x64 + net = deconv2d( + net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv") + net = mx.sym.Activation(net, act_type='tanh') + return net diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/dqn.py b/nnvm/tests/python/frontend/mxnet/model_zoo/dqn.py new file mode 100644 index 000000000000..e661e18debcb --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/dqn.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+The MXNet symbol of the Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al.
+"Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+import mxnet as mx
+
+def get_symbol(num_action=18):
+    data = mx.sym.Variable(name='data')
+    net = mx.sym.Convolution(data, kernel=(8, 8), stride=(4, 4),
+                             num_filter=32, name='conv1')
+    net = mx.sym.Activation(net, act_type='relu', name='relu1')
+    net = mx.sym.Convolution(net, kernel=(4, 4), stride=(2, 2),
+                             num_filter=64, name='conv2')
+    net = mx.sym.Activation(net, act_type='relu', name='relu2')
+    net = mx.sym.Convolution(net, kernel=(3, 3), stride=(1, 1),
+                             num_filter=64, name='conv3')
+    net = mx.sym.Activation(net, act_type='relu', name='relu3')
+    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc4')
+    net = mx.sym.Activation(net, act_type='relu', name='relu4')
+    net = mx.sym.FullyConnected(net, num_hidden=num_action, name='fc5', flatten=False)
+
+    return net
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..8e8f36a3e644
--- /dev/null
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,186 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Inception V3, suitable for images of around 299 x 299 pixels
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+ +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +import mxnet as mx +import numpy as np + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) + bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix)) + act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) + return act + + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) + concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), 
name=('%s_tower_1' % name), suffix='_conv_4') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, + num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def get_symbol(num_classes=1000, **kwargs): + data = mx.sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, 
kernel=(3, 3), name="conv_4") + pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1") + + # # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool") + flatten = mx.sym.Flatten(data=pool, name="flatten") + fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False) + softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax') + return softmax diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/mlp.py b/nnvm/tests/python/frontend/mxnet/model_zoo/mlp.py new file mode 100644 index 000000000000..922b208749bf --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/mlp.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
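+
+# get_symbol below first tries mx.symbol.FullyConnected(..., flatten=False) and
+# falls back to the older MXNet API without the flatten argument when that
+# keyword is not available.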
+ +""" +a simple multilayer perceptron +""" +import mxnet as mx + +def get_symbol(num_classes=10, **kwargs): + data = mx.symbol.Variable('data') + data = mx.sym.Flatten(data=data) + try: + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128, flatten=False) + act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, flatten=False) + act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes, flatten=False) + mlp = mx.symbol.softmax(data = fc3, name = 'softmax') + except: + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.softmax(data = fc3, name = 'softmax') + return mlp diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py new file mode 100644 index 000000000000..3f9a870d31c0 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +''' +Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py +Original author Wei Wu + +Implemented the following paper: + +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
"Identity Mappings in Deep Residual Networks" +''' +import mxnet as mx +import numpy as np + +def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): + """Return ResNet Unit symbol for building ResNet + Parameters + ---------- + data : str + Input data + num_filter : int + Number of output channels + bnf : int + Bottle neck channels factor with regard to num_filter + stride : tuple + Stride used in convolution + dim_match : Boolean + True means channel number between input and output is the same, otherwise means differ + name : str + Base name of the operators + workspace : int + Workspace used in convolution operator + """ + if bottle_neck: + bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') + act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') + conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0), + no_bias=True, workspace=workspace, name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') + act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') + conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1), + no_bias=True, workspace=workspace, name=name + '_conv2') + bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') + act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3') + conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, + workspace=workspace, name=name + '_conv3') + if dim_match: + shortcut = data + else: + shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, + workspace=workspace, name=name+'_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv3 + shortcut + else: + bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') + act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') + conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), + no_bias=True, workspace=workspace, name=name + '_conv1') + bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') + act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') + conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), + no_bias=True, workspace=workspace, name=name + '_conv2') + if dim_match: + shortcut = data + else: + shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, + workspace=workspace, name=name+'_sc') + if memonger: + shortcut._set_attr(mirror_stage='True') + return conv2 + shortcut + +def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False): + """Return ResNet symbol of + Parameters + ---------- + units : list + Number of units in each stage + num_stages : int + Number of stage + filter_list : list + Channel size of each stage + num_classes : int + Ouput size of symbol + dataset : str + Dataset type, only cifar10 and imagenet supports + workspace : int + Workspace used in convolution operator + dtype : str + Precision (float32 or float16) + """ + 
num_unit = len(units) + assert(num_unit == num_stages) + data = mx.sym.Variable(name='data') + if dtype == 'float32': + # data = mx.sym.identity(data=data, name='id') + data = data + else: + if dtype == 'float16': + data = mx.sym.Cast(data=data, dtype=np.float16) + data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') + (nchannel, height, width) = image_shape + if height <= 32: # such as cifar10 + body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), + no_bias=True, name="conv0", workspace=workspace) + else: # often expected to be 224 such as imagenet + body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), + no_bias=True, name="conv0", workspace=workspace) + body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') + body = mx.sym.Activation(data=body, act_type='relu', name='relu0') + body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') + + for i in range(num_stages): + body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, + name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace, + memonger=memonger) + for j in range(units[i]-1): + body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), + bottle_neck=bottle_neck, workspace=workspace, memonger=memonger) + bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') + relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') + # Although kernel is not used here when global_pool=True, we should put one + pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') + flat = mx.sym.Flatten(data=pool1) + try: + fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', flatten=False) + except: + fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') + if dtype == 'float16': + fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) + return mx.sym.softmax(data=fc1, name='softmax') + +def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs): + """ + Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py + Original author Wei Wu + """ + image_shape = [int(l) for l in image_shape.split(',')] + (nchannel, height, width) = image_shape + if height <= 28: + num_stages = 3 + if (num_layers-2) % 9 == 0 and num_layers >= 164: + per_unit = [(num_layers-2)//9] + filter_list = [16, 64, 128, 256] + bottle_neck = True + elif (num_layers-2) % 6 == 0 and num_layers < 164: + per_unit = [(num_layers-2)//6] + filter_list = [16, 16, 32, 64] + bottle_neck = False + else: + raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) + units = per_unit * num_stages + else: + if num_layers >= 50: + filter_list = [64, 256, 512, 1024, 2048] + bottle_neck = True + else: + filter_list = [64, 64, 128, 256, 512] + bottle_neck = False + num_stages = 4 + if num_layers == 18: + units = [2, 2, 2, 2] + elif num_layers == 34: + units = [3, 4, 6, 3] + elif num_layers == 50: + units = [3, 4, 6, 3] + elif num_layers == 101: + units = [3, 4, 23, 3] + elif num_layers == 152: + units = [3, 8, 36, 3] + elif num_layers == 200: + units = [3, 24, 36, 3] + elif num_layers == 269: + units = [3, 30, 48, 8] + else: + raise ValueError("no experiments done on num_layers {}, you can 
do it yourself".format(num_layers)) + + return resnet(units = units, + num_stages = num_stages, + filter_list = filter_list, + num_classes = num_classes, + image_shape = image_shape, + bottle_neck = bottle_neck, + workspace = conv_workspace, + dtype = dtype) diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/mxnet/model_zoo/squeezenet.py new file mode 100644 index 000000000000..093da51a78a7 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/squeezenet.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Symbol of SqueezeNet + +Reference: +Iandola, Forrest N., et al. +"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016). +""" + +import mxnet as mx + +# Helpers +def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels): + net = _make_fire_conv(net, squeeze_channels, 1, 0) + + left = _make_fire_conv(net, expand1x1_channels, 1, 0) + right = _make_fire_conv(net, expand3x3_channels, 3, 1) + # NOTE : Assume NCHW layout here + net = mx.sym.concat(left, right, dim=1) + + return net + +def _make_fire_conv(net, channels, kernel_size, padding=0): + net = mx.sym.Convolution(net, num_filter=channels, kernel=(kernel_size, kernel_size), + pad=(padding, padding)) + net = mx.sym.Activation(net, act_type='relu') + return net + +# Net +def get_symbol(num_classes=1000, version='1.0', **kwargs): + """Get symbol of SqueezeNet + + Parameters + ---------- + num_classes: int + The number of classification results + + version : str, optional + "1.0" or "1.1" of SqueezeNet + """ + assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version)) + net = mx.sym.Variable("data") + if version == '1.0': + net = mx.sym.Convolution(net, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3)) + net = mx.sym.Activation(net, act_type='relu') + net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 32, 128, 128) + net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 32, 128, 128) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 64, 256, 256) + net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 64, 256, 256) + else: + net = mx.sym.Convolution(net, num_filter=64, kernel=(3, 3), stride=(2, 2), pad=(1, 1)) + net = mx.sym.Activation(net, act_type='relu') + net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 16, 64, 64) + net = 
mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 32, 128, 128) + net = _make_fire(net, 32, 128, 128) + net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2)) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 64, 256, 256) + net = _make_fire(net, 64, 256, 256) + net = mx.sym.Dropout(net, p=0.5) + net = mx.sym.Convolution(net, num_filter=num_classes, kernel=(1, 1)) + net = mx.sym.Activation(net, act_type='relu') + net = mx.sym.Pooling(data=net, global_pool=True, kernel=(13, 13), pool_type='avg') + net = mx.sym.flatten(net) + return mx.sym.softmax(net) diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/vgg.py b/nnvm/tests/python/frontend/mxnet/model_zoo/vgg.py new file mode 100644 index 000000000000..68215bb80aaa --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/vgg.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""References: + +Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for +large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). 
+""" + +import mxnet as mx +import numpy as np + +def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs): + for i, num in enumerate(layers): + for j in range(num): + internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1)) + if batch_norm: + internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) + internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1)) + internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1)) + return internel_layer + +def get_classifier(input_data, num_classes, **kwargs): + flatten = mx.sym.Flatten(data=input_data, name="flatten") + try: + fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6", flatten=False) + relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") + drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") + fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7", flatten=False) + relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") + drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") + fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8", flatten=False) + except: + fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6") + relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") + drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") + fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") + relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") + drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") + fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8") + return fc8 + +def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs): + """ + Parameters + ---------- + num_classes : int, default 1000 + Number of classification classes. + num_layers : int + Number of layers for the variant of densenet. Options are 11, 13, 16, 19. + batch_norm : bool, default False + Use batch normalization. + dtype: str, float32 or float16 + Data precision. + """ + vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]), + 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]), + 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), + 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])} + if num_layers not in vgg_spec: + raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers)) + layers, filters = vgg_spec[num_layers] + data = mx.sym.Variable(name="data") + if dtype == 'float16': + data = mx.sym.Cast(data=data, dtype=np.float16) + feature = get_feature(data, layers, filters, batch_norm) + classifier = get_classifier(feature, num_classes) + if dtype == 'float16': + classifier = mx.sym.Cast(data=classifier, dtype=np.float32) + symbol = mx.sym.softmax(data=classifier, name='softmax') + return symbol diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py new file mode 100644 index 000000000000..dd315c6f87b0 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -0,0 +1,333 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np + +import topi +import tvm +from tvm.contrib import graph_runtime +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing.config import ctx_list +from nnvm import frontend +import mxnet as mx +from mxnet import gluon +from mxnet.gluon.model_zoo import vision +import model_zoo + + +def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), out_shape=(1, 1000), + gluon_impl=False, name=None, dtype='float32'): + """Use name different from test to avoid pytest picking it up""" + if gluon_impl: + def get_gluon_output(name, x): + net = vision.get_model(name) + net.collect_params().initialize(mx.init.Xavier()) + net_sym = gluon.nn.SymbolBlock(outputs=net(mx.sym.var('data')), + inputs=mx.sym.var('data'), + params=net.collect_params()) + out = net_sym(mx.nd.array(x.astype(dtype))).asnumpy() + return out, net_sym + else: + def get_mxnet_output(symbol, x, dtype='float32'): + from collections import namedtuple + Batch = namedtuple('Batch', ['data']) + mod = mx.mod.Module(symbol, label_names=None) + mod.bind(data_shapes=[('data', x.shape)], for_training=False) + mod.init_params() + mod.forward(Batch([mx.nd.array(x.astype(dtype))])) + out = mod.get_outputs()[0].asnumpy() + args, auxs = mod.get_params() + return out, args, auxs + + def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'): + if gluon_impl: + new_sym, params = frontend.from_mxnet(symbol) + else: + new_sym, params = frontend.from_mxnet(symbol, args, auxs) + + dshape = x.shape + shape_dict = {'data': dshape} + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("data", tvm.nd.array(x.astype(dtype))) + m.set_input(**params) + m.run() + # get outputs + out = m.get_output(0, tvm.nd.empty(out_shape, dtype)) + return out.asnumpy() + + # random input + x = np.random.uniform(size=data_shape) + if gluon_impl: + gluon_out, gluon_sym = get_gluon_output(name, x) + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype) + tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5) + else: + mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype) + assert "data" not in args + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype) + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_mlp(): + mlp = model_zoo.mx_mlp + verify_mxnet_frontend_impl(mlp) + +def test_forward_vgg(): + for n in [11]: + mx_sym = model_zoo.mx_vgg[n] + verify_mxnet_frontend_impl(mx_sym) + +def test_forward_resnet(): + for n in [18]: + mx_sym = model_zoo.mx_resnet[n] + verify_mxnet_frontend_impl(mx_sym) + +def test_forward_elu(): + data = mx.sym.var('data') + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly + mx_sym = mx.sym.LeakyReLU(data, 
act_type='elu') + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) + +def test_forward_rrelu(): + data = mx.sym.var('data') + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly + mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7) + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) + +def test_forward_prelu(): + data = mx.sym.var('data') + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly + mx_sym = mx.sym.LeakyReLU(data, act_type='prelu') + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) + +def test_forward_softrelu(): + data = mx.sym.var('data') + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly + mx_sym = mx.sym.Activation(data, act_type='softrelu') + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) + +def test_forward_fc_flatten(): + # test flatten=True option in mxnet 0.11.1 + data = mx.sym.var('data') + try: + mx_sym = mx.sym.FullyConnected(data, num_hidden=100, flatten=True) + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100)) + mx_sym = mx.sym.FullyConnected(mx.sym.Flatten(data), num_hidden=100, flatten=False) + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100)) + except: + pass + +def test_forward_clip(): + data = mx.sym.var('data') + data = mx.sym.concat(data, -data, dim=1) # negative part explicitly + mx_sym = mx.sym.clip(data, a_min=0, a_max=1) + verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100)) + +def test_forward_split(): + data = mx.sym.var('data') + mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False) + verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1)) + +def test_forward_split_squeeze(): + data = mx.sym.var('data') + mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True) + verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1)) + +def test_forward_expand_dims(): + data = mx.sym.var('data') + mx_sym = mx.sym.expand_dims(data, axis=1) + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4)) + +def test_forward_pooling(): + data = mx.sym.var('data') + mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg') + verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8)) + + mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max') + verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8)) + +def test_forward_lrn(): + data = mx.sym.var('data') + mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5) + verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24)) + +def test_forward_ones(): + data = mx.sym.var('data') + ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') + mx_sym = mx.sym.elemwise_add(data, ones) + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) + +def test_forward_zeros(): + data = mx.sym.var('data') + zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32') + mx_sym = mx.sym.elemwise_add(data, zeros) + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) + +def test_forward_ones_like(): + data = mx.sym.var('data') + mx_sym = mx.sym.ones_like(data, dtype='float32') + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) + +def test_forward_zeros_like(): + data = mx.sym.var('data') + mx_sym = mx.sym.zeros_like(data, dtype='float32') + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) + +def test_forward_argmax(): + data = mx.sym.var('data') + mx_sym = mx.sym.argmax(data, axis=1) + verify_mxnet_frontend_impl(mx_sym, 
(5, 3), (5,)) + +def test_forward_argmin(): + data = mx.sym.var('data') + mx_sym = mx.sym.argmin(data, axis=0) + verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) + +def test_forward_where(): + cond = mx.sym.var('cond') + x = mx.sym.var('x') + y = mx.sym.var('y') + dshape = (2, 2) + dtype = 'float32' + mx_sym = mx.sym.where(cond, x, y) + np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype) + np_x = np.random.uniform(size=dshape).astype(dtype) + np_y = np.random.uniform(size=dshape).astype(dtype) + mx_cond = mx.nd.array(np_cond) + mx_x = mx.nd.array(np_x) + mx_y = mx.nd.array(np_y) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) + mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("cond", tvm.nd.array(np_cond)) + m.set_input("x", tvm.nd.array(np_x)) + m.set_input("y", tvm.nd.array(np_y)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_slice(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice(data, begin=(0, 1), end=(2, 4)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 3)) + mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) + +def test_forward_maximum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._maximum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd._internal._maximum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_minimum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._minimum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = 
mx.nd._internal._minimum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + + +if __name__ == '__main__': + test_forward_mlp() + test_forward_vgg() + test_forward_resnet() + test_forward_elu() + test_forward_rrelu() + test_forward_prelu() + test_forward_softrelu() + test_forward_fc_flatten() + test_forward_clip() + test_forward_split() + test_forward_split_squeeze() + test_forward_expand_dims() + test_forward_pooling() + test_forward_lrn() + test_forward_ones() + test_forward_zeros() + test_forward_ones_like() + test_forward_zeros_like() + test_forward_argmax() + test_forward_argmin() + test_forward_where() + test_forward_slice() + test_forward_maximum() + test_forward_minimum() diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py new file mode 100644 index 000000000000..1bbd0a97e8e1 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/test_graph.py @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
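+
+# These tests convert each MXNet symbol with nnvm.frontend.from_mxnet and check it
+# for structural equality against the hand-written NNVM graph in model_zoo, after
+# applying the InferShape and SimplifyInference passes to both graphs.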
+import mxnet as mx +import nnvm +from nnvm.compiler import graph_util, graph_attr +import model_zoo + +def compare_graph(sym1, sym2, ishape=(2, 3, 224, 224)): + g1 = nnvm.graph.create(sym1) + g2 = nnvm.graph.create(sym2) + graph_attr.set_shape_inputs(g1, {'data':ishape}) + graph_attr.set_shape_inputs(g2, {'data':ishape}) + g1 = g1.apply("InferShape").apply("SimplifyInference") + g2 = g2.apply("InferShape").apply("SimplifyInference") + graph_util.check_graph_equal(g1, g2) + +def test_mlp(): + mx_sym = model_zoo.mx_mlp + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_mlp + compare_graph(from_mx_sym, nnvm_sym) + +def test_vgg(): + for n in [11, 13, 16, 19]: + mx_sym = model_zoo.mx_vgg[n] + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_vgg[n] + compare_graph(from_mx_sym, nnvm_sym) + +def test_resnet(): + for n in [18, 34, 50, 101]: + mx_sym = model_zoo.mx_resnet[n] + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_resnet[n] + compare_graph(from_mx_sym, nnvm_sym) + +def test_squeezenet(): + for version in ['1.0', '1.1']: + mx_sym = model_zoo.mx_squeezenet[version] + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_squeezenet[version] + compare_graph(from_mx_sym, nnvm_sym) + +def test_inception_v3(): + mx_sym = model_zoo.mx_inception_v3 + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_inception_v3 + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299)) + +def test_dqn(): + mx_sym = model_zoo.mx_dqn + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_dqn + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84)) + +def test_dcgan(): + mx_sym = model_zoo.mx_dcgan + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_dcgan + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100)) + +def test_multi_outputs(): + def compose(F, **kwargs): + x = F.sym.Variable('x') + y = F.sym.Variable('y') + z = F.sym.split(x, **kwargs) + return F.sym.broadcast_sub(F.sym.broadcast_add(z[0], z[2]), y) + mx_sym = compose(mx, num_outputs=3, axis=1) + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = compose(nnvm, indices_or_sections=3, axis=1) + compare_graph(from_mx_sym, nnvm_sym) + +if __name__ == '__main__': + test_mlp() + test_vgg() + test_resnet() + test_multi_outputs() + test_dqn() + test_dcgan() + test_squeezenet() + test_inception_v3() diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py b/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py new file mode 100644 index 000000000000..f5eb604acfd7 --- /dev/null +++ b/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
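+
+# The download loop below fetches each ONNX file into the test-data cache and then
+# publishes its local path as a module-level name (e.g. super_resolution), which
+# test_forward.py imports directly from this package.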
+ +"""Store for onnx examples and common models.""" +from __future__ import absolute_import as _abs +import os +import logging +from .super_resolution import get_super_resolution +from tvm.contrib.download import download_testdata + + +URLS = { + 'super_resolution.onnx': 'https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/super_resolution_0.2.onnx', + 'squeezenet1_1.onnx': 'https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/squeezenet1_1_0.2.onnx', + 'lenet.onnx': 'https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/lenet_0.2.onnx', + 'resnet18_1_0.onnx': 'https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/b385b1b242dc89a35dd808235b885ed8a19aedc1/resnet18_1.0.onnx'} + +# download and add paths +for k, v in URLS.items(): + name = k.split('.')[0] + relpath = os.path.join('onnx', k) + abspath = download_testdata(v, relpath, module='onnx') + locals()[name] = abspath + +# symbol for graph comparison +super_resolution_sym = get_super_resolution() diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py new file mode 100644 index 000000000000..2de2d1075494 --- /dev/null +++ b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=unused-argument + +""" +Symbol of SqueezeNet + +Reference: +Iandola, Forrest N., et al. +"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016). 
+""" + +from nnvm import symbol as sym +from nnvm.testing.utils import create_workload + +# Helpers +def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels): + net = _make_fire_conv(net, squeeze_channels, 1, 0) + + left = _make_fire_conv(net, expand1x1_channels, 1, 0) + right = _make_fire_conv(net, expand3x3_channels, 3, 1) + # NOTE : Assume NCHW layout here + net = sym.concatenate(left, right, axis=1) + + return net + +def _make_fire_conv(net, channels, kernel_size, padding=0): + net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size), + padding=(padding, padding)) + net = sym.relu(net) + return net + +# Net +def get_symbol(num_classes, version, **kwargs): + """Get symbol of SqueezeNet + + Parameters + ---------- + num_classes: int + The number of classification results + + version : str, optional + "1.0" or "1.1" of SqueezeNet + """ + assert version == '1.1', ("Unsupported SqueezeNet version {version}:" + "1.1 expected".format(version=version)) + net = sym.Variable("data") + + net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2)) + net = sym.relu(net) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 16, 64, 64) + net = _make_fire(net, 16, 64, 64) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 32, 128, 128) + net = _make_fire(net, 32, 128, 128) + net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2)) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 48, 192, 192) + net = _make_fire(net, 64, 256, 256) + net = _make_fire(net, 64, 256, 256) + + net = sym.dropout(net, rate=0.5) + net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1)) + net = sym.relu(net) + net = sym.global_avg_pool2d(net) + return sym.softmax(net, axis=1) + +def get_workload(batch_size=1, num_classes=1000, version='1.0', + image_shape=(3, 224, 224), dtype="float32", **kwargs): + """Get benchmark workload for SqueezeNet + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + version : str, optional + "1.0" or "1.1" of SqueezeNet + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, version=version, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/super_resolution.py b/nnvm/tests/python/frontend/onnx/model_zoo/super_resolution.py new file mode 100644 index 000000000000..a98478e58307 --- /dev/null +++ b/nnvm/tests/python/frontend/onnx/model_zoo/super_resolution.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""NNVM symbol corresponding to super_resolution.onnx example.""" +from nnvm import sym + +def get_super_resolution(): + factor = 3 + size = 224 + data = sym.Variable(name='9') + conv1 = sym.conv2d(data, channels=64, kernel_size=(5, 5), padding=(2, 2), use_bias=False) + relu1 = sym.relu(conv1 + sym.expand_dims(sym.Variable(name='2', shape=(64)), axis=1, num_newaxis=2)) + conv2 = sym.conv2d(relu1, channels=64, kernel_size=(3, 3), padding=(1, 1), use_bias=False) + relu2 = sym.relu(conv2 + sym.expand_dims(sym.Variable(name='4', shape=(64)), axis=1, num_newaxis=2)) + conv3 = sym.conv2d(relu2, channels=32, kernel_size=(3, 3), padding=(1, 1), use_bias=False) + relu3 = sym.relu(conv3 + sym.expand_dims(sym.Variable(name='6', shape=(32)), axis=1, num_newaxis=2)) + conv4 = sym.conv2d(relu3, channels=factor**2, kernel_size=(3, 3), padding=(1, 1), use_bias=False) + conv4 = conv4 + sym.expand_dims(sym.Variable(name='8', shape=(factor**2)), axis=1, num_newaxis=2) + # TODO(zhreshold): allow shape inference for batch size > 1 + r1 = sym.reshape(conv4, shape=(1, 1, factor, factor, size, size)) + t1 = sym.transpose(r1, axes=(0, 1, 4, 2, 5, 3)) + r2 = sym.reshape(t1, shape=(1, 1, size * factor, size * factor)) + return r2 diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py new file mode 100644 index 000000000000..8cb6876956c4 --- /dev/null +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -0,0 +1,1099 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
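+# The tests below all follow the same compile-and-compare pattern. A minimal
+# sketch of that flow (the model file name is illustrative):
+#
+#   model = onnx.load_model('super_resolution.onnx')   # ONNX protobuf
+#   sym, params = nnvm.frontend.from_onnx(model)       # import into NNVM
+#   graph, lib, params = nnvm.compiler.build(
+#       sym, target, {input_name: x.shape}, params=params)
+#   m = graph_runtime.create(graph, lib, ctx)
+#   m.set_input(input_name, tvm.nd.array(x))
+#   m.set_input(**params)
+#   m.run()
+#   tvm_out = m.get_output(0).asnumpy()                # compare vs. reference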
+import numpy as np
+import math
+import nnvm
+import topi
+import topi.testing
+import tvm
+from tvm.contrib import graph_runtime
+from nnvm.testing.config import ctx_list
+import onnx
+from model_zoo import super_resolution, squeezenet1_1, lenet, resnet18_1_0
+from onnx import helper, TensorProto
+
+def get_tvm_output(graph_def, input_data, target, ctx, output_shape=None, output_dtype='float32'):
+    """ Generic function to execute and get tvm output"""
+
+    sym, params = nnvm.frontend.from_onnx(graph_def)
+    # compile for the target requested by the caller (see ctx_list())
+    if isinstance(input_data, list):
+        input_names = {}
+        shape_dict = {}
+        dtype_dict = {}
+        for i, _ in enumerate(input_data):
+            input_names[i] = graph_def.graph.input[i].name
+            shape_dict[input_names[i]] = input_data[i].shape
+            dtype_dict[input_names[i]] = input_data[i].dtype
+    else:
+        input_names = graph_def.graph.input[0].name
+        shape_dict = {input_names: input_data.shape}
+        dtype_dict = {input_names: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+                                             dtype=dtype_dict, params=params)
+
+    # run on the matching context (graph_runtime is already imported at the
+    # top of this file)
+    m = graph_runtime.create(graph, lib, ctx)
+    # set inputs
+    if isinstance(input_data, list):
+        for i, e in enumerate(input_names):
+            m.set_input(input_names[i], tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
+    else:
+        m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype)))
+
+    m.set_input(**params)
+    # execute
+    m.run()
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, _ in enumerate(output_shape):
+            tvm_output = m.get_output(i)
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0)
+        return tvm_output.asnumpy()
+
+def get_caffe2_output(model, x, dtype='float32'):
+    import caffe2.python.onnx.backend
+    prepared_backend = caffe2.python.onnx.backend.prepare(model)
+    W = {model.graph.input[0].name: x.astype(dtype)}
+    c2_out = prepared_backend.run(W)[0]
+    return c2_out
+
+
+def verify_onnx_forward_impl(graph_file, data_shape, out_shape):
+    dtype = 'float32'
+    x = np.random.uniform(size=data_shape)
+    model = onnx.load_model(graph_file)
+    c2_out = get_caffe2_output(model, x, dtype)
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype)
+        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def verify_super_resolution_example():
+    verify_onnx_forward_impl(super_resolution, (1, 1, 224, 224), (1, 1, 672, 672))
+
+def verify_squeezenet1_1():
+    verify_onnx_forward_impl(squeezenet1_1, (1, 3, 224, 224), (1, 1000))
+
+def verify_lenet():
+    verify_onnx_forward_impl(lenet, (1, 1, 28, 28), (1, 10))
+
+def verify_resnet18():
+    verify_onnx_forward_impl(resnet18_1_0, (1, 3, 224, 224), (1, 1000))
+
+
+def test_reshape():
+    in_shape = (4, 3, 3, 4)
+    ref_shape = (3, 4, 4, 3)
+
+    ref_array = np.array(ref_shape)
+    ref_node = onnx.helper.make_node('Constant',
+                                     inputs=[],
+                                     outputs=['ref_in'],
+                                     value=onnx.helper.make_tensor(name = 'const_tensor',
+                                                                   data_type = onnx.TensorProto.INT32,
+                                                                   dims = ref_array.shape,
+                                                                   vals = ref_array.flatten().astype(int)))
+    reshape_node = helper.make_node("Reshape", ["in", "ref_in"], ["out"])
+
+    graph = helper.make_graph([ref_node, reshape_node],
+                              "reshape_test",
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                             TensorProto.FLOAT, list(ref_shape))])
+
+    model =
helper.make_model(graph, producer_name='reshape_test') + + for target, ctx in ctx_list(): + x = np.random.uniform(size=in_shape).astype('int32') + tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') + + tvm.testing.assert_allclose(ref_shape, tvm_out.shape) + +def test_reshape_like(): + in_shape = (4, 3, 3, 4) + ref_shape = (3, 4, 4, 3) + + ref_array = np.random.uniform(size=ref_shape).astype('float32') + ref_node = onnx.helper.make_node('Constant', + inputs=[], + outputs=['ref_in'], + value=onnx.helper.make_tensor(name = 'const_tensor', + data_type = onnx.TensorProto.FLOAT, + dims = ref_array.shape, + vals = ref_array.flatten().astype(float))) + copy_node = helper.make_node("Identity", ["ref_in"], ["copy_in"]) + reshape_node = helper.make_node("Reshape", ["in", "copy_in"], ["out"]) + + graph = helper.make_graph([ref_node, copy_node, reshape_node], + "reshape_like_test", + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(ref_shape))]) + + model = helper.make_model(graph, producer_name='reshape_like_test') + + for target, ctx in ctx_list(): + x = np.random.uniform(size=in_shape).astype('float32') + tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') + + tvm.testing.assert_allclose(ref_shape, tvm_out.shape) + +def _test_power_iteration(x_shape, y_shape): + if isinstance(y_shape, int): + y_shape = [y_shape] + + x = np.random.uniform(size=x_shape).astype(np.float32) + y = np.random.uniform(size=y_shape).astype(np.float32) + + np_res = np.power(x, y).astype(np.float32) + + res = helper.make_node("Pow", ['x', 'y'], ['out']) + + graph = helper.make_graph([res], + 'power_test', + inputs = [helper.make_tensor_value_info("x", + TensorProto.FLOAT, list(x_shape)), + helper.make_tensor_value_info("y", + TensorProto.FLOAT, list(y_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(np_res.shape))]) + + model = helper.make_model(graph, producer_name='power_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape) + tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5) + +def test_power(): + _test_power_iteration((1, 3), (1)) + _test_power_iteration((2, 3), (2, 3)) + _test_power_iteration((2, 3), (1, 3)) + +def test_squeeze(): + in_shape = (1, 3, 1, 3, 1, 1) + out_shape = (3, 3) + y = helper.make_node("Squeeze", ['in'], ['out'], axes=[0, 2, 4, 5]) + + graph = helper.make_graph([y], + 'squeeze_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='squeeze_test') + + for target, ctx in ctx_list(): + x = np.random.uniform(size=in_shape).astype('float32') + tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') + + tvm.testing.assert_allclose(out_shape, tvm_out.shape) + +def test_unsqueeze(): + in_shape = (3, 3) + axis = (0, 3, 4) + out_shape = (1, 3, 3, 1, 1) + y = helper.make_node("Unsqueeze", ['in'], ['out'], axes=list(axis)) + + graph = helper.make_graph([y], + 'squeeze_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='squeeze_test') + + for target, ctx in ctx_list(): + x = 
np.random.uniform(size=in_shape).astype('float32') + tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') + + tvm.testing.assert_allclose(out_shape, tvm_out.shape) + +def verify_gather(in_shape, indices, axis, dtype): + x = np.random.uniform(size=in_shape).astype(dtype) + indices = np.array(indices, dtype="int32") + out_np = np.take(x, indices, axis=axis) + + y = helper.make_node("Gather", ['in', 'indices'], ['out'], axis=axis) + + graph = helper.make_graph([y], + 'gather_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("indices", + TensorProto.INT32, list(indices.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_np.shape))]) + model = helper.make_model(graph, producer_name='gather_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape) + tvm.testing.assert_allclose(out_np, tvm_out) + +def test_gather(): + verify_gather((4,), [1], 0, 'int32') + verify_gather((1,4), [0], 0, 'int32') + verify_gather((4,), [[[1,0],[0,1]]], 0, 'float32') + verify_gather((2,2), [[[1,0],[0,1]]], 1, 'int32') + verify_gather((3,3,3), [[[1,0]]], -1, 'int32') + verify_gather((4,3,5,6), [[2,1,0,0]], 0, 'float32') + +def _test_slice_iteration(indata, outdata, starts, ends, axes=None): + if axes: + y = helper.make_node("Slice", ['in'], ['out'], axes=axes, starts=starts, ends=ends) + else: + y = helper.make_node("Slice", ['in'], ['out'], starts=starts, ends=ends) + + graph = helper.make_graph([y], + 'slice_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(indata.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(outdata.shape))]) + + model = helper.make_model(graph, producer_name='slice_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32') + + tvm.testing.assert_allclose(outdata, tvm_out) + +def test_slice(): + x = np.random.randn(20, 10, 5).astype(np.float32) + _test_slice_iteration(x, x[0:3, 0:10], (0, 0), (3, 10), (0, 1)) + _test_slice_iteration(x, x[:, :, 3:4], (0, 0, 3), (20, 10, 4)) + _test_slice_iteration(x, x[:, 1:1000], (1), (1000), (1)) + _test_slice_iteration(x, x[:, 0:-1], (0), (-1), (1)) + +def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs, rtol=1e-7, atol=1e-7): + indata = np.random.uniform(-1, 1, size=inshape).astype(dtype) + outdata = outfunc(indata, **npargs) + + y = helper.make_node(opname, ['in'], ['out'], **kwargs) + + graph = helper.make_graph([y], + opname+'_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(indata.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(outdata.shape))]) + + model = helper.make_model(graph, producer_name=opname+'_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype) + + tvm.testing.assert_allclose(outdata, tvm_out, rtol=rtol, atol=atol) + +def test_floor(): + _test_onnx_op_elementwise((2, 4, 5, 6), np.floor, {}, 'float32', 'Floor', {}) + +def test_ceil(): + _test_onnx_op_elementwise((2, 4, 5, 6), np.ceil, {}, 'float32', 'Ceil', {}) + +def test_clip(): + _test_onnx_op_elementwise((2, 4, 5, 6), + np.clip, + {'a_min': -1.0, 'a_max': 1.0}, + 'float32', + 'Clip', + {'min': -1.0, 'max': 1.0}) + +def test_matmul(): + a_shape = (4, 3) + b_shape = (3, 4) + + a_array = 
np.random.uniform(size=a_shape).astype('float32') + b_array = np.random.uniform(size=b_shape).astype('float32') + out_np = np.matmul(a_array, b_array) + + mul_node = helper.make_node("MatMul", ["a", "b"], ["out"]) + + graph = helper.make_graph([mul_node], + "matmul_test", + inputs = [helper.make_tensor_value_info("a", + TensorProto.FLOAT, list(a_shape)), + helper.make_tensor_value_info("b", + TensorProto.FLOAT, list(b_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_np.shape))]) + + model = helper.make_model(graph, producer_name='matmul_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape) + tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) + +def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None): + in_array = np.random.uniform(size=shape).astype(dtype) + + if alpha == None and beta == None and bias==None: + alpha = 0.0001 + beta = 0.75 + bias = 1.0 + node = onnx.helper.make_node('LRN', inputs=['in'], outputs=['out'], size=nsize) + else: + node = onnx.helper.make_node('LRN', inputs=['in'], outputs=['out'], alpha=alpha, + beta=beta, bias=bias, size=nsize) + + graph = helper.make_graph([node], + "lrn_test", + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(shape))]) + model = helper.make_model(graph, producer_name='lrn_test') + + def _get_python_lrn(): + square_sum = np.zeros(shape).astype(dtype) + for n, c, h, w in np.ndindex(in_array.shape): + square_sum[n, c, h, w] = sum(in_array[n, + max(0, c - int(math.floor((nsize - 1) / 2))): \ + min(5, c + int(math.ceil((nsize - 1) / 2)) + 1), + h, + w] ** 2) + py_out = in_array / ((bias + (alpha / nsize) * square_sum) ** beta) + return py_out + + for target, ctx in ctx_list(): + new_sym, params = nnvm.frontend.from_onnx(model) + + input_name = model.graph.input[0].name + shape_dict = {input_name: in_array.shape} + dtype_dict = {input_name: dtype} + graph, lib, params = nnvm.compiler.build(new_sym, target, + shape_dict, dtype_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input(input_name, tvm.nd.array(in_array.astype(dtype))) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(shape, dtype)) + py_out = _get_python_lrn() + tvm.testing.assert_allclose(py_out, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5) + +def test_lrn(): + verify_lrn((5, 5, 5, 5), 3, 'float32') + verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0) + +def _test_upsample_nearest(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.upsampling_python(in_array, (scale, scale), "NCHW") + + graph = helper.make_graph([y], + 'upsample_nearest_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_nearest_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + tvm.testing.assert_allclose(out_array, tvm_out) + +def _test_upsample_bilinear(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = 
(1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW", align_corners=False) + + graph = helper.make_graph([y], + 'upsample_bilinear_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + +def _test_upsample_bilinear_opset9(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in','scales'], ['out'], mode='linear') + scales=[1.0, 1.0, 2.0, 2.0] + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW", align_corners=False) + + ref_array = np.array(scales) + ref_node = helper.make_node('Constant', + inputs=[], + outputs=['scales'], + value=onnx.helper.make_tensor(name = 'const_tensor', + data_type = TensorProto.FLOAT, + dims = ref_array.shape, + vals = ref_array.flatten().astype(float))) + + graph = helper.make_graph([ref_node, y], + 'upsample_bilinear_opset9_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_opset9_test') + inputs = [] + inputs.append(in_array) + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, 'float32') + tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + +def test_upsample(): + _test_upsample_nearest() + _test_upsample_bilinear() + _test_upsample_bilinear_opset9() + +def _test_softmax(inshape, axis): + opname = 'Softmax' + indata = np.random.uniform(size=inshape).astype(np.float32) + outshape = inshape + outdata = topi.testing.softmax_python(indata) + if isinstance(axis, int): + y = helper.make_node(opname, ['in'], ['out'], axis = axis) + elif axis is None: + y = helper.make_node(opname, ['in'], ['out']) + + graph = helper.make_graph([y], + opname+'_test', + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(indata.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(outdata.shape))]) + + model = helper.make_model(graph, producer_name=opname+'_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, indata, target, ctx, outshape, 'float32') + tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) + +def test_softmax(): + _test_softmax((1, 10), None) + _test_softmax((1, 10), 1) + +def verify_min(input_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.min((a_np1, a_np2, a_np3), axis=0) + + min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"]) + + graph = helper.make_graph([min_node], + "Min_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.FLOAT, list(input_dim)), + 
helper.make_tensor_value_info("a_np2", + TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np3", + TensorProto.FLOAT, list(input_dim))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='Min_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_min(): + verify_min((1, 3, 20, 20)) + verify_min((20, 20)) + +def verify_max(input_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.max((a_np1, a_np2, a_np3), axis=0) + + max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"]) + + graph = helper.make_graph([max_node], + "Max_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np2", + TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np3", + TensorProto.FLOAT, list(input_dim))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='Max_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_max(): + verify_max((1, 3, 20, 20)) + verify_max((20, 20)) + +def verify_mean(input_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.mean((a_np1, a_np2, a_np3), axis=0) + + mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"]) + + graph = helper.make_graph([mean_node], + "Mean_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np2", + TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np3", + TensorProto.FLOAT, list(input_dim))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='Mean_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_mean(): + verify_mean((1, 3, 20, 20)) + verify_mean((20, 20)) + +def verify_hardsigmoid(input_dim, alpha, beta): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.clip(a_np1 * alpha + beta, 0, 1) + + hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta) + + graph = helper.make_graph([hardsigmoid_node], + "HardSigmoid_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.FLOAT, list(input_dim))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='HardSigmoid_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_hardsigmoid(): + 
verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6) + verify_hardsigmoid((20, 20), 0.3, 0.4) + +def verify_argmin(input_dim, axis=None, keepdims=None): + def _argmin_numpy(data, axis=0, keepdims=True): + result = np.argmin(data, axis=axis) + if (keepdims == 1): + result = np.expand_dims(result, axis) + return result.astype(data.dtype) + + a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32) + if keepdims is None and axis is None: + b_np = _argmin_numpy(a_np1) + node = onnx.helper.make_node('ArgMin', + inputs=['a_np1'], + outputs=['out']) + elif axis is None: + b_np = _argmin_numpy(a_np1, keepdims=keepdims) + node = onnx.helper.make_node('ArgMin', + inputs=['a_np1'], + outputs=['out'], + keepdims=keepdims) + elif keepdims is None: + b_np = _argmin_numpy(a_np1, axis=axis) + node = onnx.helper.make_node('ArgMin', + inputs=['a_np1'], + outputs=['out'], + axis=axis) + else: + b_np = _argmin_numpy(a_np1, axis=axis, keepdims=keepdims) + node = onnx.helper.make_node('ArgMin', + inputs=['a_np1'], + outputs=['out'], + axis=axis, + keepdims=keepdims) + graph = helper.make_graph([node], + "argmin_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.INT32, list(a_np1.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.INT32, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='argmin_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def verify_argmax(input_dim, axis=None, keepdims=None): + def _argmax_numpy(data, axis=0, keepdims=True): + result = np.argmax(data, axis=axis) + if (keepdims == 1): + result = np.expand_dims(result, axis) + return result.astype(data.dtype) + + a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32) + + if keepdims is None and axis is None: + b_np = _argmax_numpy(a_np1) + node = onnx.helper.make_node('ArgMax', + inputs=['a_np1'], + outputs=['out']) + elif axis is None: + b_np = _argmax_numpy(a_np1, keepdims=keepdims) + node = onnx.helper.make_node('ArgMax', + inputs=['a_np1'], + outputs=['out'], + keepdims=keepdims) + elif keepdims is None: + b_np = _argmax_numpy(a_np1, axis=axis) + node = onnx.helper.make_node('ArgMax', + inputs=['a_np1'], + outputs=['out'], + axis=axis) + else: + b_np = _argmax_numpy(a_np1, axis=axis, keepdims=keepdims) + node = onnx.helper.make_node('ArgMax', + inputs=['a_np1'], + outputs=['out'], + axis=axis, + keepdims=keepdims) + + graph = helper.make_graph([node], + "argmax_test", + inputs = [helper.make_tensor_value_info("a_np1", + TensorProto.INT32, list(a_np1.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.INT32, list(b_np.shape))]) + + model = helper.make_model(graph, producer_name='argmax_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype) + tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_arg_min_max(): + '''Verify argmin and argmax''' + verify_argmin([3,4,4]) + verify_argmax([3,4,4]) + verify_argmin([3,4,4], axis=1) + verify_argmax([3,4,4], axis=0) + verify_argmin([3,4,4], keepdims=0) + verify_argmax([3,4,4], keepdims=1) + for axis in [0,1,2]: + for keepdims in [True,False]: + verify_argmin([3,4,4], axis, keepdims) + verify_argmax([3,4,4], axis, keepdims) + +def verify_constantfill(is_shape, input_dim, out_dim, value, dtype, **kwargs): + input_a = np.random.uniform(size=input_dim).astype(dtype) + out = 
np.empty(shape=out_dim, dtype=dtype)
+    out.fill(value)
+
+    if is_shape:
+        fill_node = helper.make_node("ConstantFill", [], ["out"], shape=input_dim, value=value, **kwargs)
+    else:
+        fill_node = helper.make_node("ConstantFill", ["input_a"], ["out"], value=value, dtype=dtype, **kwargs)
+
+    graph = helper.make_graph([fill_node],
+                              "fill_test",
+                              inputs = [helper.make_tensor_value_info("input_a",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                             TensorProto.FLOAT, list(out.shape))])
+
+    model = helper.make_model(graph, producer_name='fill_test')
+
+    for target, ctx in ctx_list():
+        if is_shape:
+            tvm_out = get_tvm_output(model, [], target, ctx, out.shape)
+        else:
+            tvm_out = get_tvm_output(model, [input_a], target, ctx, out.shape)
+
+        tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_constantfill():
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(False, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5, 4, 5, 6), 10, 'float32', extra_shape=(4, 5, 6))
+
+
+def verify_pad(indata, pads, value=0.0):
+    indata = np.array(indata).astype(np.float32)
+    # numpy expect result
+    len_dim = len(pads) // 2
+    np_pads = [(pads[i], pads[i+len_dim]) for i in range(len_dim)]
+    outdata = np.pad(indata, pad_width=np_pads, mode='constant', constant_values=value)
+    # onnx graph
+    node = helper.make_node(
+        'Pad',
+        inputs=['input'],
+        outputs=['output'],
+        mode='constant',
+        pads=pads,
+        value=value
+    )
+    graph = helper.make_graph([node],
+                              'pad_test',
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                             TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='pad_test')
+    # tvm result
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
+        tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_pad():
+    verify_pad(np.random.randn(2, 2).astype(np.float32), [0, 1, 0, 0], 0.0)
+    verify_pad(np.random.randn(2, 3).astype(np.float32), [1, 0, 0, 1], 0.0)
+    verify_pad(np.random.randn(3, 2).astype(np.float32), [0, 0, 1, 0], 5.0)
+
+def verify_reduce_x(name, indata, axis, keepdims):
+    indata = np.array(indata).astype(np.float32)
+    # numpy expect result
+    if name == 'ReduceMax':
+        outdata = np.maximum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMin':
+        outdata = np.minimum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceSum':
+        outdata = np.sum(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMean':
+        outdata = np.mean(indata, axis=axis, keepdims=keepdims == 1)
+    else:
+        raise Exception('unsupported op: {}'.format(name))
+    if len(np.asarray(outdata).shape) == 0:
+        outdata = np.asarray([outdata])
+    # onnx graph
+    if axis is None:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                keepdims=keepdims)
+    else:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                axis=axis, keepdims=keepdims)
+    graph = helper.make_graph([node],
+                              '{}_test'.format(name),
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                             TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='{}_test'.format(name))
+    # tvm result
+    for target, ctx
in ctx_list(): + tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32') + tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) + +def test_reduce_max(): + verify_reduce_x("ReduceMax", + np.random.randn(3, 2, 2).astype(np.float32), + axis=None, keepdims=1) + verify_reduce_x("ReduceMax", + np.random.randn(3, 2, 3).astype(np.float32), + axis=None, keepdims=0) + verify_reduce_x("ReduceMax", + np.random.randn(3, 3, 3).astype(np.float32), + axis=(1,), keepdims=1) + +def test_reduce_min(): + verify_reduce_x("ReduceMin", + np.random.randn(3, 2, 2).astype(np.float32), + axis=None, keepdims=1) + verify_reduce_x("ReduceMin", + np.random.randn(3, 2, 3).astype(np.float32), + axis=None, keepdims=0) + verify_reduce_x("ReduceMin", + np.random.randn(3, 3, 3).astype(np.float32), + axis=(1,), keepdims=1) + +def test_reduce_sum(): + verify_reduce_x("ReduceSum", + np.random.randn(3, 2, 2).astype(np.float32), + axis=None, keepdims=1) + verify_reduce_x("ReduceSum", + np.random.randn(3, 2, 3).astype(np.float32), + axis=None, keepdims=0) + verify_reduce_x("ReduceSum", + np.random.randn(3, 3, 3).astype(np.float32), + axis=(1,), keepdims=1) + +def test_reduce_mean(): + verify_reduce_x("ReduceMean", + np.random.randn(3, 2, 2).astype(np.float32), + axis=None, keepdims=1) + verify_reduce_x("ReduceMean", + np.random.randn(3, 2, 3).astype(np.float32), + axis=None, keepdims=0) + verify_reduce_x("ReduceMean", + np.random.randn(3, 3, 3).astype(np.float32), + axis=(1,), keepdims=1) + +def verify_split(indata, outdatas, split, axis=0): + indata = np.array(indata).astype(np.float32) + outdatas = [np.array(o).astype(np.float32) for o in outdatas] + node = helper.make_node( + 'Split', + inputs=['input'], + outputs=['output_{}'.format(i) for i in range(len(split))], + axis=axis, + split=split + ) + graph = helper.make_graph([node], + 'split_test', + inputs = [helper.make_tensor_value_info("input", + TensorProto.FLOAT, list(indata.shape))], + outputs = [helper.make_tensor_value_info("output_{}".format(i), + TensorProto.FLOAT, list(outdatas[i].shape)) + for i in range(len(split)) + ]) + model = helper.make_model(graph, producer_name='split_test') + + for target, ctx in ctx_list(): + output_shape = [o.shape for o in outdatas] + output_type = ['float32', 'float32', 'float32'] + tvm_out = get_tvm_output(model, indata, target, ctx, output_shape, output_type) + for o, t in zip(outdatas, tvm_out): + tvm.testing.assert_allclose(o, t) + +def test_split(): + # 1D + verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3., 4.], [5., 6.]], [2, 2, 2], 0) + verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3.], [4., 5., 6.]], [2, 1, 3], 0) + # 2D + verify_split([[1., 2., 3., 4.], [7., 8., 9., 10.]], + [[[1., 2.], [7., 8.]], [[3., 4.], [9., 10.]]], [2, 2], 1) + +def test_binary_ops(): + in_shape = (1, 2, 3, 3) + dtype = "float32" + out_shape = in_shape + + def verify_binary_ops(op, x, y, out_np, broadcast=None, rtol=1e-7, atol=1e-7): + if broadcast is None: + z = helper.make_node(op, ['in1', 'in2'], ['out']) + else: + z = helper.make_node(op, ['in1', 'in2'], ['out'], broadcast=1) + graph = helper.make_graph([z], + '_test', + inputs = [helper.make_tensor_value_info("in1", + TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("in2", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) + model = helper.make_model(graph, producer_name='_test') + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [x, y], 
target, ctx) + tvm.testing.assert_allclose(out_np, tvm_out, rtol=rtol, atol=atol) + + x = np.random.uniform(size=in_shape).astype(dtype) + y = np.random.uniform(size=in_shape).astype(dtype) + z = np.random.uniform(size=(3,)).astype(dtype) + verify_binary_ops("Add",x, y, x + y, broadcast=None) + verify_binary_ops("Add", x, z, x + z, broadcast=True) + verify_binary_ops("Sub", x, y, x - y, broadcast=None) + verify_binary_ops("Sub", x, z, x - z, broadcast=True) + verify_binary_ops("Mul",x, y, x * y, broadcast=None) + verify_binary_ops("Mul", x, z, x * z, broadcast=True) + verify_binary_ops("Div", x, y, x / y, broadcast=None, rtol=1e-5, atol=1e-5) + verify_binary_ops("Div", x, z, x / z, broadcast=True, rtol=1e-5, atol=1e-5) + verify_binary_ops("Sum", x, y, x + y, broadcast=None) + +def test_single_ops(): + in_shape = (1, 2, 3, 3) + dtype = "float32" + out_shape = in_shape + + def verify_single_ops(op, x, out_np, rtol=1e-7, atol=1e-7): + z = helper.make_node(op, ['in1'], ['out']) + graph = helper.make_graph([z], + '_test', + inputs = [helper.make_tensor_value_info("in1", + TensorProto.FLOAT, list(in_shape)),], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) + model = helper.make_model(graph, producer_name='_test') + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, [x], target, ctx) + tvm.testing.assert_allclose(out_np, tvm_out, rtol=rtol, atol=atol) + + x = np.random.uniform(size=in_shape).astype(dtype) + verify_single_ops("Neg",x, -x) + verify_single_ops("Abs",x, np.abs(x)) + verify_single_ops("Reciprocal",x, 1/x, rtol=1e-5, atol=1e-5) + verify_single_ops("Sqrt",x, np.sqrt(x), rtol=1e-5, atol=1e-5) + verify_single_ops("Relu",x, np.maximum(x, 0)) + verify_single_ops("Exp",x, np.exp(x), rtol=1e-5, atol=1e-5) + verify_single_ops("Log",x, np.log(x), rtol=1e-5, atol=1e-5) + verify_single_ops("Log",x, np.log(x), rtol=1e-5, atol=1e-5) + verify_single_ops("Tanh",x, np.tanh(x), rtol=1e-5, atol=1e-5) + verify_single_ops("Sigmoid",x, 1 / (1 + np.exp(-x)), rtol=1e-5, atol=1e-5) + verify_single_ops("Softsign",x, x / (1 + np.abs(x)), rtol=1e-5, atol=1e-5) + verify_single_ops("SoftPlus",x, np.log(1 + np.exp(x)), rtol=1e-5, atol=1e-5) + +def test_leaky_relu(): + def leaky_relu_x(x, alpha): + return np.where(x >= 0, x, x * alpha) + _test_onnx_op_elementwise((2, 4, 5, 6), + leaky_relu_x, + {'alpha': 0.25}, + 'float32', + 'LeakyRelu', + {'alpha': 0.25}) + +def test_elu(): + def elu_x(x, alpha): + return np.where(x > 0, x, alpha * (np.exp(x) - 1.0)) + _test_onnx_op_elementwise((2, 4, 5, 6), + elu_x, + {'alpha': 0.25}, + 'float32', + 'Elu', + {'alpha': 0.25}) + +def test_selu(): + def selu_x(x, alpha, gamma): + return gamma * np.where(x > 0, x, alpha * (np.exp(x) - 1.0)) + _test_onnx_op_elementwise((2, 4, 5, 6), + selu_x, + {'alpha': 0.25, 'gamma': 0.3}, + 'float32', + 'Selu', + {'alpha': 0.25, 'gamma': 0.3}) + +def test_ThresholdedRelu(): + def ThresholdedRelu_x(x, alpha): + out_np = np.clip(x, alpha, np.inf) + out_np[out_np == alpha] = 0 + return out_np + _test_onnx_op_elementwise((2, 4, 5, 6), + ThresholdedRelu_x, + {'alpha': 0.25}, + 'float32', + 'ThresholdedRelu', + {'alpha': 0.25}) + +def test_ScaledTanh(): + def ScaledTanh_x(x, alpha, beta): + return alpha * np.tanh(beta * x) + _test_onnx_op_elementwise((2, 4, 5, 6), + ScaledTanh_x, + {'alpha': 0.25, 'beta': 0.3}, + 'float32', + 'ScaledTanh', + {'alpha': 0.25, 'beta': 0.3}) + +def test_ParametricSoftplus(): + def ParametricSoftplus_x(x, alpha, beta): + return alpha * np.log(np.exp(beta * x) + 1) + 
_test_onnx_op_elementwise((2, 4, 5, 6), + ParametricSoftplus_x, + {'alpha': 0.25, 'beta': 0.3}, + 'float32', + 'ParametricSoftplus', + {'alpha': 0.25, 'beta': 0.3}) + +def test_Scale(): + def Scale_x(x, scale): + return scale * x + _test_onnx_op_elementwise((2, 4, 5, 6), + Scale_x, + {'scale': 0.25}, + 'float32', + 'Scale', + {'scale': 0.25}) + +def test_LogSoftmax(): + _test_onnx_op_elementwise((1, 4), + topi.testing.log_softmax_python, + {}, + 'float32', + 'LogSoftmax', + {'axis': 1}, + rtol=1e-5, + atol=1e-5) + +if __name__ == '__main__': + # verify_super_resolution_example() + # verify_squeezenet1_1() + # verify_lenet() + verify_resnet18() + test_reshape() + test_reshape_like() + test_power() + test_squeeze() + test_unsqueeze() + test_slice() + test_floor() + test_ceil() + test_clip() + test_matmul() + test_gather() + test_lrn() + test_upsample() + test_forward_min() + test_forward_max() + test_forward_mean() + test_forward_hardsigmoid() + test_forward_arg_min_max() + test_softmax() + test_constantfill() + test_pad() + test_reduce_max() + test_reduce_min() + test_reduce_sum() + test_reduce_mean() + test_split() + test_binary_ops() + test_single_ops() + test_leaky_relu() + test_elu() + test_selu() + test_ThresholdedRelu() + test_ScaledTanh() + test_ParametricSoftplus() + test_Scale() + test_LogSoftmax() diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py new file mode 100644 index 000000000000..5d313fbacb3e --- /dev/null +++ b/nnvm/tests/python/unittest/test_correct_layout.py @@ -0,0 +1,379 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
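+# The helper below drives everything in this file: create a graph from a
+# symbol, pin the input layouts, run the "CorrectLayout" pass, and read the
+# resulting per-entry layouts back. A minimal sketch (names illustrative):
+#
+#   g = graph.create(sym.dense(sym.Variable("data", shape=(10, 20)),
+#                              units=30, name="fc"))
+#   graph_attr.set_layout_inputs(g, "HW")
+#   g = g.apply("CorrectLayout")
+#   layouts = g.json_attr("layout")   # one layout string per node entry
+#
+# Running the pass a second time with a different input layout (e.g.
+# "HW16w") inserts __layout_transform__ nodes, which appear below under
+# names such as "data_HW".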
+import nnvm
+import nnvm.symbol as sym
+import nnvm.graph as graph
+from nnvm.compiler import graph_attr
+
+def correct_layout(g, layout=None):
+    if isinstance(g, nnvm.symbol.Symbol):
+        g = graph.create(g)
+    if layout:
+        graph_attr.set_layout_inputs(g, layout)
+    g = g.apply("CorrectLayout")
+    ldict = {}
+    vlayout = g.json_attr("layout")
+    entry_ptr = g.index.entry_ptr
+    for i, n in enumerate(g.index.nodes):
+        begin, end = entry_ptr[i], entry_ptr[i + 1]
+        ldict[n["name"]] = vlayout[begin:end]
+    return g, ldict
+
+
+# Level 1
+def test_dense():
+    x = sym.Variable("data", shape=(10, 20))
+    y = sym.dense(x, units=30, name="fc")
+    g, ldict = correct_layout(y, "HW")
+    assert(ldict["data"][0] == "HW")
+    assert(ldict["fc"][0] == "HW")
+    assert(ldict["fc_bias"][0] == "__undef__")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, "HW16w")
+    assert(ldict["data"][0] == "HW16w")
+    assert(ldict["data_HW"][0] == "HW")
+    assert(ldict["fc"][0] == "HW")
+    assert(ldict["fc_bias"][0] == "__undef__")
+
+
+def test_matmul():
+    a = sym.Variable("a", shape=(10, 20))
+    b = sym.Variable("b", shape=(20, 30))
+    c = sym.matmul(a, b, name="matmul")
+    g, ldict = correct_layout(c, {"a" : "HW", "b" : "WC"})
+    assert(ldict["a"][0] == "HW")
+    assert(ldict["b"][0] == "WC")
+    assert(ldict["matmul"][0] == "HC")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, {"a" : "HW16w", "b" : "WC16c"})
+    assert(ldict["a"][0] == "HW16w")
+    assert(ldict["a_HW"][0] == "HW")
+    assert(ldict["b"][0] == "WC16c")
+    assert(ldict["b_WC"][0] == "WC")
+    assert(ldict["matmul"][0] == "HC")
+    a = sym.Variable("a", shape=(20, 10))
+    c = sym.matmul(a, b, name="matmul", transpose_a=True)
+    g, ldict = correct_layout(c, {"a" : "HW", "b" : "HC"})
+    assert(ldict["a"][0] == "HW")
+    assert(ldict["b"][0] == "HC")
+    assert(ldict["matmul"][0] == "WC")
+    b = sym.Variable("b", shape=(30, 20))
+    c = sym.matmul(a, b, name="matmul", transpose_b=True)
+    g, ldict = correct_layout(c, {"a" : "HW", "b" : "CW"})
+    assert(ldict["a"][0] == "HW")
+    assert(ldict["b"][0] == "CW")
+    assert(ldict["matmul"][0] == "HC")
+    a = sym.Variable("a", shape=(20, 10))
+    b = sym.Variable("b", shape=(30, 20))
+    c = sym.matmul(a, b, name="matmul", transpose_a=True, transpose_b=True)
+    g, ldict = correct_layout(c, {"a" : "HW", "b" : "CH"})
+    assert(ldict["a"][0] == "HW")
+    assert(ldict["b"][0] == "CH")
+    assert(ldict["matmul"][0] == "WC")
+
+
+def test_concatenate():
+    x1 = sym.Variable("x", shape=(10, 20))
+    x2 = sym.Variable("y", shape=(10, 30))
+    z = sym.concatenate(x1, x2, name="concat")
+    g, ldict = correct_layout(z, {"x": "HW", "y": "HW"})
+    assert(ldict["x"][0] == "HW")
+    assert(ldict["y"][0] == "HW")
+    assert(ldict["concat"][0] == "HW")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"})
+    assert(ldict["x"][0] == "HW16w")
+    assert(ldict["y"][0] == "HW16w")
+    assert(ldict["concat"][0] == "HW16w")
+
+    x1 = sym.Variable("x", shape=(10, 20, 60))
+    x2 = sym.Variable("y", shape=(10, 20, 40))
+    z = sym.concatenate(x1, x2, axis=2, name="concat")
+    g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"})
+    assert(ldict["x"][0] == "H20wW")
+    assert(ldict["y"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, {"x": "HW", "y": "HW"})
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["y_H20wW"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
+
+
+def test_expand_dims():
+    x = sym.Variable("x",
shape=(10, 20)) + y = sym.expand_dims(x, axis=1, name="y") + g, ldict = correct_layout(y, "HW") + assert(ldict["x"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + _, ldict = correct_layout(g, "HW16w") + assert(ldict["x"][0] == "HW16w") + assert(ldict["x_HW"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + + +def test_split(): + x = sym.Variable("x", shape=(10, 20)) + y = sym.split(x, indices_or_sections=[11], name="y") + g, ldict = correct_layout(y, "HW") + assert(ldict["x"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + _, ldict = correct_layout(g, "HW16w") + assert(ldict["x"][0] == "HW16w") + assert(ldict["x_HW"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + + +def test_batchnorm(): + x = sym.Variable("data", shape=(10, 20, 30, 40)) + y = sym.batch_norm(x, axis=1, epsilon=2e-5, name="bn") + g, ldict = correct_layout(y, "NCHW") + assert(ldict["data"][0] == "NCHW") + assert(ldict["bn"][0] == "NCHW") + assert(ldict["bn"][1] == "C") + assert(ldict["bn"][2] == "C") + assert(ldict["bn_beta"][0] == "C") + assert(ldict["bn_gamma"][0] == "C") + assert(ldict["bn_moving_mean"][0] == "C") + assert(ldict["bn_moving_var"][0] == "C") + # batch_norm can deal with sub-dim of C at the last dim. + g, ldict = correct_layout(g, "NCHW16c") + assert(ldict["data"][0] == "NCHW16c") + assert(ldict["bn"][0] == "NCHW16c") + assert(ldict["bn"][1] == "C16c") + assert(ldict["bn"][2] == "C16c") + assert(ldict["bn_beta"][0] == "C") + assert(ldict["bn_beta_C16c"][0] == "C16c") + assert(ldict["bn_gamma"][0] == "C") + assert(ldict["bn_gamma_C16c"][0] == "C16c") + assert(ldict["bn_moving_mean"][0] == "C") + assert(ldict["bn_moving_mean_C16c"][0] == "C16c") + assert(ldict["bn_moving_var"][0] == "C") + assert(ldict["bn_moving_var_C16c"][0] == "C16c") + # but for other layout, it does a layout transform for data + g, ldict = correct_layout(g, "NCH16cW") + assert(ldict["data"][0] == "NCH16cW") + assert(ldict["data_NCHW16c"][0] == "NCHW16c") + assert(ldict["bn"][0] == "NCHW16c") + assert(ldict["bn"][1] == "C16c") + assert(ldict["bn"][2] == "C16c") + assert(ldict["bn_beta"][0] == "C") + assert(ldict["bn_beta_C16c"][0] == "C16c") + assert(ldict["bn_gamma"][0] == "C") + assert(ldict["bn_gamma_C16c"][0] == "C16c") + assert(ldict["bn_moving_mean"][0] == "C") + assert(ldict["bn_moving_mean_C16c"][0] == "C16c") + assert(ldict["bn_moving_var"][0] == "C") + assert(ldict["bn_moving_var_C16c"][0] == "C16c") + + +def test_flatten(): + x = sym.Variable("x", shape=(10, 20, 10, 10)) + y = sym.flatten(x, name="y") + g, ldict = correct_layout(y, "NCHW") + assert(ldict["x"][0] == "NCHW") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + _, ldict = correct_layout(g, "NCHW16c") + assert(ldict["x"][0] == "NCHW16c") + assert(ldict["x_NCHW"][0] == "NCHW") + assert(ldict["y"][0] == "__undef__") + + +def test_softmax(): + x = sym.Variable("x", shape=(10, 20, 10, 10)) + y = sym.softmax(x, name="y") + g, ldict = correct_layout(y, "NCHW") + assert(ldict["x"][0] == "NCHW") + assert(ldict["y"][0] == "NCHW") + # second pass will insert layout transform + _, ldict = correct_layout(g, "NCHW16c") + assert(ldict["x"][0] == "NCHW16c") + assert(ldict["x_NCHW"][0] == "NCHW") + assert(ldict["y"][0] == "NCHW") + + +# Level 2 +def test_conv2d(): + x = sym.Variable("data", shape=(1, 32, 512, 512)) + y = sym.conv2d(x, name="conv", channels=12, + kernel_size=(3,3), padding=(1,1), layout="NCHW") + _, ldict = correct_layout(y) 
+ assert(ldict["data"][0] == "NCHW") + assert(ldict["conv_weight"][0] == "OIHW") + assert(ldict["conv_bias"][0] == "C") + assert(ldict["conv"][0] == "NCHW") + y = sym.conv2d(x, name="conv", channels=12, + kernel_size=(3,3), padding=(1,1), layout="NCHW16c", + kernel_layout="OIHW16i16o", out_layout="NCHW8c") + _, ldict = correct_layout(y) + assert(ldict["data"][0] == "NCHW16c") + assert(ldict["conv_weight"][0] == "OIHW16i16o") + assert(ldict["conv_bias"][0] == "C8c") + assert(ldict["conv"][0] == "NCHW8c") + y = sym.conv2d(x, name="conv", channels=12, + kernel_size=(3,3), padding=(1,1), layout="N16cHWC") + _, ldict = correct_layout(y) + assert(ldict["data"][0] == "N16cHWC") + assert(ldict["conv_weight"][0] == "OIHW") + assert(ldict["conv_bias"][0] == "16cC") + assert(ldict["conv"][0] == "N16cHWC") + + +def test_conv2d_transpose(): + x = sym.Variable("data", shape=(1, 32, 512, 512)) + y = sym.conv2d_transpose(x, name="conv", channels=12, + kernel_size=(3,3), padding=(1,1), layout="NCHW") + _, ldict = correct_layout(y) + assert(ldict["data"][0] == "NCHW") + assert(ldict["conv_weight"][0] == "OIHW") + assert(ldict["conv_bias"][0] == "C") + assert(ldict["conv"][0] == "NCHW") + + +def test_max_pool2d(): + x = sym.Variable("data", shape=(1, 32, 512, 512)) + y = sym.max_pool2d(x, name="pool", pool_size=(3,3), + padding=(1,1), layout="NCHW") + g, ldict = correct_layout(y) + assert(ldict["data"][0] == "NCHW") + assert(ldict["pool"][0] == "NCHW") + # if index of H and W remain the same, + # pool2d does not convert the layout. + g, ldict = correct_layout(g, "NCHW16c") + assert(ldict["data"][0] == "NCHW16c") + assert(ldict["pool"][0] == "NCHW16c") + # for other layout it requires a layout transform. + g, ldict = correct_layout(g, "NHWC") + assert(ldict["data"][0] == "NHWC") + assert(ldict["data_NCHW"][0] == "NCHW") + assert(ldict["pool"][0] == "NCHW") + + +def test_global_pool2d(): + x = sym.Variable("data", shape=(1, 32, 512, 512)) + y = sym.global_max_pool2d(x, name="pool", layout="NCHW") + g, ldict = correct_layout(y) + assert(ldict["data"][0] == "NCHW") + assert(ldict["pool"][0] == "NCHW") + # if index of H and W remain the same, + # pool2d does not convert the layout. + g, ldict = correct_layout(g, "NCHW16c") + assert(ldict["data"][0] == "NCHW16c") + assert(ldict["pool"][0] == "NCHW16c") + # for other layout it requires a layout transform. 
+ g, ldict = correct_layout(g, "NHWC") + assert(ldict["data"][0] == "NHWC") + assert(ldict["data_NCHW"][0] == "NCHW") + assert(ldict["pool"][0] == "NCHW") + + +# Level 3 +def test_reshape(): + x = sym.Variable("x", shape=(4,)) + y = sym.reshape(x, shape=(2,2), name="y") + g, ldict = correct_layout(y, "C") + assert(ldict["x"][0] == "C") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + g, ldict = correct_layout(g, "C16c") + assert(ldict["x"][0] == "C16c") + assert(ldict["x_C"][0] == "C") + assert(ldict["y"][0] == "__undef__") + + +def test_transpose(): + x = sym.Variable("x", shape=(1, 32, 512, 512)) + y = sym.transpose(x, name="y", axes=(0, 2, 3, 1)) + g, ldict = correct_layout(y, "NCHW") + assert(ldict["x"][0] == "NCHW") + assert(ldict["y"][0] == "NHWC") + # second pass will insert layout transform + g, ldict = correct_layout(g, "NCHW16c") + assert(ldict["x"][0] == "NCHW16c") + assert(ldict["x_NCHW"][0] == "NCHW") + assert(ldict["y"][0] == "NHWC") + + +def test_broadcast_to(): + x = sym.Variable("x", shape=(4, 1)) + y = sym.broadcast_to(x, shape=(0, 4), name="y") + g, ldict = correct_layout(y, "HW") + assert(ldict["x"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + g, ldict = correct_layout(g, "HW16h") + assert(ldict["x"][0] == "HW16h") + assert(ldict["x_HW"][0] == "HW") + assert(ldict["y"][0] == "__undef__") + + +def test_broadcast_binary(): + x = sym.Variable("x", shape=(1, 16, 512, 512)) + y = sym.Variable("y", shape=(16, 512, 512)) + z = sym.broadcast_add(x, y, name="z") + g, ldict = correct_layout(z, {"x": "NCHW", "y": "CHW"}) + assert(ldict["x"][0] == "NCHW") + assert(ldict["y"][0] == "CHW") + assert(ldict["z"][0] == "NCHW") + # prior to keep the left layout if they do not match. + g, ldict = correct_layout(g, {"x": "NCHW16c", "y": "CHW"}) + assert(ldict["x"][0] == "NCHW16c") + assert(ldict["y"][0] == "CHW") + assert(ldict["y_CHW16c"][0] == "CHW16c") + assert(ldict["z"][0] == "NCHW16c") + # broadcast_add(HCW16c, N16nCH16cW) + g, ldict = correct_layout(z, {"x": "HCW16c", "y": "N16nCH16cW"}) + assert(ldict["x"][0] == "HCW16c") + assert(ldict["y"][0] == "N16nCH16cW") + assert(ldict["x_CH16cW"][0] == "CH16cW") + assert(ldict["z"][0] == "N16nCH16cW") + + +def test_reduce(): + x = sym.Variable("x", shape=(1, 16, 512, 512)) + y = sym.sum(x, name="y", axis=1) + g, ldict = correct_layout(y, "NCHW") + assert(ldict["x"][0] == "NCHW") + assert(ldict["y"][0] == "__undef__") + # second pass will insert layout transform + g, ldict = correct_layout(g, "NCHW16c") + assert(ldict["x"][0] == "NCHW16c") + assert(ldict["x_NCHW"][0] == "NCHW") + assert(ldict["y"][0] == "__undef__") + + +if __name__ == "__main__": + test_dense() + test_matmul() + test_concatenate() + test_expand_dims() + test_split() + test_batchnorm() + test_flatten() + test_softmax() + test_conv2d() + test_conv2d_transpose() + test_max_pool2d() + test_global_pool2d() + test_reshape() + test_transpose() + test_broadcast_to() + test_broadcast_binary() + test_reduce() diff --git a/nnvm/tests/python/unittest/test_graph.py b/nnvm/tests/python/unittest/test_graph.py new file mode 100644 index 000000000000..1ba0a2487cee --- /dev/null +++ b/nnvm/tests/python/unittest/test_graph.py @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import json +import nnvm.symbol as sym +import nnvm.graph as graph +import nnvm.compiler.graph_util as graph_util + +def test_json_pass(): + x = sym.Variable('x') + y = sym.dense(data=x, name='conv', units=30) + g = graph.create(y) + ret = g.apply('SaveJSON') + ret._set_json_attr('json', ret.json_attr('json')) + g2 = ret.apply('LoadJSON') + assert g2.apply('SaveJSON').json_attr('json') == ret.json_attr('json') + json = g.json() + g2 = graph.load_json(json) + assert json == g2.json() + + +def test_json_pass_with_attr(): + x = sym.Variable('x') + y = sym.dense(data=x, name='fc', units=30) + g = graph.create(y) + g._set_json_attr('version', '0.1.0') + ret = g.apply('SaveJSON') + json_str = ret.json_attr('json') + ret._set_json_attr('json', json_str) + g2 = ret.apply('LoadJSON') + assert g2.json_attr('version') == '0.1.0' + + +def test_graph_json_attr(): + x = sym.Variable('x') + y = sym.dense(data=x, name='fc', units=30) + g = graph.create(y) + g._set_json_attr('ilist', [1,2,3], 'list_int') + assert g.json_attr('ilist') == [1,2,3] + +def test_list_args(): + x = sym.Variable('x') + z = sym.Variable('z') + y = sym.dense(data=x, name='fc', units=30) + y = sym.elemwise_add(y, z, name='add1') + +def test_infer_shape(): + x = sym.Variable('x', shape=(2, 4, 2)) + y = sym.elemwise_add(x, x, name='add1') + y = sym.flatten(y, name="flatten") + g = graph.create(y) + g._set_json_attr("shape_attr_key", "shape") + g = g.apply('InferShape') + jgraph = json.loads(g.apply('SaveJSON').json_attr('json')) + jnodes = jgraph['nodes'] + jnode_row_ptr = jgraph['node_row_ptr'] + nindex = {n['name']: i for i, n in enumerate(jnodes)} + assert g.json_attr('shape')[jnode_row_ptr[nindex["flatten"]]] == [2, 8] + assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [2, 4, 2] + +def test_infer_shape_known_partial(): + x = sym.Variable('x') + y = sym.elemwise_add(x, x, name='add1') + y = sym.flatten(y, name="flatten1") + g = graph.create(y) + jgraph = json.loads(g.apply('SaveJSON').json_attr('json')) + shape = [[2, 4, 2], [] , []] + g._set_json_attr("shape", shape, 'list_shape') + g = g.apply("InferShape") + jnodes = jgraph['nodes'] + jnode_row_ptr = jgraph['node_row_ptr'] + nindex = {n['name']: i for i, n in enumerate(jnodes)} + assert g.json_attr('shape')[jnode_row_ptr[nindex["flatten1"]]] == [2, 8] + assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [2, 4, 2] + +def test_infer_type(): + x = sym.Variable('x', dtype=0) + y = sym.elemwise_add(x, x, name='add1') + y = sym.cast(y, dtype="float64", name="cast1") + g = graph.create(y) + g._set_json_attr("dtype_attr_key", "dtype") + g = g.apply('InferType') + jgraph = json.loads(g.apply('SaveJSON').json_attr('json')) + jnodes = jgraph['nodes'] + jnode_row_ptr = jgraph['node_row_ptr'] + nindex = {n['name']: i for i, n in enumerate(jnodes)} + assert g.json_attr('dtype')[jnode_row_ptr[nindex["cast1"]]] == 1 + assert g.json_attr('dtype')[jnode_row_ptr[nindex["add1"]]] == 0 + +def 
test_plan_memory(): + x = sym.Variable('x', shape=(4, 2)) + x2 = sym.elemwise_add(x, x, name='addk') + y = sym.flatten(x2, name="reshapek") + y = sym.elemwise_add(y, x2, name="add2") + y = sym.elemwise_add(y, y) + g = graph.create(y) + g._set_json_attr("shape_attr_key", "shape") + g = g.apply(["InferShape", "InferType", "PlanMemory"]) + jgraph = json.loads(g.apply('SaveJSON').json_attr('json')) + jnodes = jgraph['nodes'] + jnode_row_ptr = jgraph['node_row_ptr'] + storage_id = g.json_attr('storage_id') + nindex = {n['name']: i for i, n in enumerate(jnodes)} + assert (storage_id[jnode_row_ptr[nindex["addk"]]] != + storage_id[jnode_row_ptr[nindex["reshapek"]]]) + assert (storage_id[jnode_row_ptr[nindex["add2"]]] == + storage_id[jnode_row_ptr[nindex["reshapek"]]]) + +def test_print_graph_ir(): + x = sym.Variable("x", shape=(1, 1, 10, 20)) + y = sym.conv2d(x + 1, name="y", channels=10, kernel_size=(3,3)) + g = graph.create(y) + g = g.apply("InferShape") + ir1 = g.ir() + ir2 = g.ir(join_entry_attrs=["shape"]) + assert("y_bias" in ir1) + assert("shape=" in ir2) + +def test_gradient(): + x = sym.Variable("x") + y = sym.Variable("y") + z1 = sym.elemwise_add(x, sym.sqrt(y)) + z2 = sym.log(x) + gradient = graph_util.gradients([z1, z2], [x, y]) + assert len(gradient) == 2 + + g1 = sym.Variable("g1") + g2 = sym.Variable("g2") + grad_ys = [g1, g2] + gradient = graph_util.gradients(sym.Group([z1, z2]), + sym.Group([x, y]), grad_ys=grad_ys) + g_graph = graph.create(sym.Group(gradient)).ir() + assert len(gradient) == 2 + assert "g1" in g_graph + assert "g2" in g_graph + +if __name__ == "__main__": + test_print_graph_ir() + test_json_pass_with_attr() + test_graph_json_attr() + test_json_pass() + test_infer_shape() + test_infer_shape_known_partial() + test_infer_type() + test_plan_memory() + test_list_args() + test_gradient() diff --git a/nnvm/tests/python/unittest/test_graph_gradient.py b/nnvm/tests/python/unittest/test_graph_gradient.py new file mode 100644 index 000000000000..4ae6053c946f --- /dev/null +++ b/nnvm/tests/python/unittest/test_graph_gradient.py @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
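+# A minimal sketch of the two gradient utilities exercised below (shapes
+# and names are illustrative): graph_util.gradients returns one gradient
+# symbol per requested input, while graph_util.get_gradient_graph returns
+# the full backward graph so that shape/dtype inference can be checked on it.
+#
+#   x = sym.Variable("x", shape=(4, 2))
+#   y = sym.relu(x, name="y")
+#   grads = graph_util.gradients(y, [x])            # [grad_x]
+#   grad_g = graph_util.get_gradient_graph(y, [x])
+#   in_shapes, out_shapes = graph_util.infer_shape(grad_g)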
+import nnvm.symbol as sym +from nnvm.compiler import graph_util + +def test_cnn_gradients(): + # input data + h = 128 + w = 128 + data_shape = (1000, 3, h, w) + data = sym.Variable('data', shape=data_shape, dtype=0) + + # conv2d + num_channels = 64 + kernel_size = 32 + conv_w_shape = (num_channels, 3, kernel_size, kernel_size) + conv_b_shape = (num_channels,) + conv_w = sym.Variable('conv_w', shape=conv_w_shape) + conv_b = sym.Variable('conv_b', shape=conv_b_shape) + conv1 = sym.conv2d(data=data, weight=conv_w, bias=conv_b, + channels=num_channels, kernel_size=(kernel_size, kernel_size), + name='conv1') + # relu1 + relu1 = sym.relu(data=conv1, name='relu1') + # max pooling + max_pooling1 = sym.max_pool2d(data=relu1, pool_size=(2, 2), name='max_pooling1') + # flatten + flatten1 = sym.flatten(data=max_pooling1) + # shape after flatten + flatten_out_shape = (h - kernel_size) * (w - kernel_size) * num_channels + # dense1 + dense1_hidden_units = 100 + dense1 = sym.dense(data=flatten1, name='dense1', units=dense1_hidden_units) + # relu2 + relu2 = sym.relu(data=dense1, name='relu2') + # dense2 + dense2_hidden_units = 10 + dense2 = sym.dense(data=relu2, name='dense2', units=dense2_hidden_units) + # softmax + mlp = sym.softmax(data=dense2, name='softmax') + # fake non-sparse label + label = sym.full_like(mlp, fill_value=1) + # cross entropy loss + ce_loss = sym.sum( + sym.elemwise_mul(sym.log_softmax(dense2), label), + axis=1, + keepdims=True, + name="ce_loss") + + # input variables: + # print grad_g.symbol.list_input_names() + # >> ['data', 'conv_w', 'conv_b', + # 'dense1_weight', 'dense1_bias', + # 'dense2_weight', 'dense2_bias'] + + # output gradient variables: + # print grad_g.symbol.list_output_names() + # >> ['conv1_grad_data', 'conv1_grad_weight', 'conv1_grad_bias', + # 'dense1_grad_weight', 'dense1_grad_bias', + # 'dense2_grad_weight', 'dense2_grad_bias'] + grad_g = graph_util.get_gradient_graph(ce_loss, ce_loss.list_input_variables()) + + # infer shape + in_shapes, out_shapes = graph_util.infer_shape(grad_g) + + # forward graph shape + assert in_shapes == [list(data_shape), list(conv_w_shape), list(conv_b_shape), + [dense1_hidden_units, flatten_out_shape], [dense1_hidden_units], + [dense2_hidden_units, dense1_hidden_units], [dense2_hidden_units]] + # input grads shape should be equal with input shape + assert in_shapes == out_shapes + + # output grads w.r.t input variables + grads = graph_util.gradients(ce_loss, ce_loss.list_input_variables()) + + # gradients number should be equal with grad_input number + assert len(grads) == len(ce_loss.list_input_variables()) + + # infer type + in_dtypes, out_dtypes = graph_util.infer_dtype(grad_g) + assert out_dtypes == ['float32', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32'] + +def test_multi_loss_graph_gradients(): + # input data + shape1 = (1000, 100) + data1 = sym.Variable('data1', shape=(1000, 100), dtype=0) + + # fake non-sparse label + label = sym.full(fill_value=3) + + # square loss + sub1 = sym.elemwise_sub(data1, label, name="sub1") + square_loss = sym.sum(data=sub1**2, axis=1, name="square_loss") + + # fake loss1 + shape2 = (1000, ) + data2 = sym.Variable('data2', shape=shape2, dtype=0) + loss1 = sym.sqrt(data2, name="loss1") + + # fake loss2 + loss2 = sym.relu(data1, name='loss2') + + # block loss1 + total_loss = sym.elemwise_sum( + sym.block_grad(loss1), + square_loss, + num_args=2, + name="total_loss") + + # grad_g.symbol.list_output_names() + # >> ['loss1_grad_0_output', 'grad_sum_output'] + grad_g = 
graph_util.get_gradient_graph([total_loss, loss2], total_loss.list_input_variables()) + # infer shape + in_shapes, out_shapes = graph_util.infer_shape(grad_g) + assert out_shapes == [list(shape2), list(shape1)] + + # grad_data1 is elemwise_sum of grad_loss2, grad_square_loss + grad_data1 = grad_g.symbol[1] + assert grad_data1.list_attr()['num_args'] == '2' + + # block grad should return zero grad + grad_data2 = grad_g.symbol[0] + assert 'zeros_like' in grad_g.ir() + + # test reverse infer shape for label + assert grad_g.apply('InferShape').json_attr('shape_num_unknown_nodes') == 0 + + # infer type + in_dtypes, out_dtypes = graph_util.infer_dtype(grad_g) + assert out_dtypes == ['float32', 'float32'] + + # test reverse infer type for label + assert grad_g.apply('InferType').json_attr('dtype_num_unknown_nodes') == 0 + + +if __name__ == "__main__": + test_cnn_gradients() + test_multi_loss_graph_gradients() diff --git a/nnvm/tests/python/unittest/test_infer_shape.py b/nnvm/tests/python/unittest/test_infer_shape.py new file mode 100644 index 000000000000..c394fab562f2 --- /dev/null +++ b/nnvm/tests/python/unittest/test_infer_shape.py @@ -0,0 +1,415 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import json +import nnvm.symbol as sym +import nnvm.graph as graph + +def infer_shape(sym): + g = graph.create(sym) + g._set_json_attr("shape_attr_key", "shape") + g = g.apply("InferShape") + sdict = {} + vshape = g.json_attr("shape") + entry_ptr = g.index.entry_ptr + for i, n in enumerate(g.index.nodes): + begin, end = entry_ptr[i], entry_ptr[i + 1] + sdict[n["name"]] = vshape[begin:end] + return sdict + +# Level 1 +def test_dense(): + x = sym.Variable("x", shape=(10, 20)) + y = sym.dense(x, units=30, name="fc") + sdict = infer_shape(y) + assert(sdict["fc"][0] == [10, 30]) + assert(sdict["fc_bias"][0] == [30]) + + +def test_matmul(): + a = sym.Variable('a', shape=(10, 20)) + b = sym.Variable('b', shape=(20, 30)) + c = sym.matmul(a, b, name="matmul") + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 30]) + a = sym.Variable('a', shape=(20, 10)) + c = sym.matmul(a, b, name="matmul", transpose_a=True) + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 30]) + b = sym.Variable('b', shape=(30, 20)) + c = sym.matmul(a, b, name="matmul", transpose_a=True, transpose_b=True) + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 30]) + a = sym.Variable('a', shape=(10, 20)) + c = sym.matmul(a, b, name="matmul", transpose_b=True) + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 30]) + a = sym.Variable('a', shape=(10, 20, 30)) + b = sym.Variable('b', shape=(30, 40, 50)) + c = sym.matmul(a, b, name="matmul") + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 20, 40, 50]) + a = sym.Variable('a', shape=(30, 20, 10)) + b = sym.Variable('b', shape=(50, 40, 30)) + c = sym.matmul(a, b, name="matmul", transpose_a=True, transpose_b=True) + sdict = infer_shape(c) + assert(sdict["matmul"][0] == [10, 20, 40, 50]) + + +def test_concatenate(): + x1 = sym.Variable("x", shape=(10, 20)) + x2 = sym.Variable("y", shape=(10, 30)) + z = sym.concatenate(x1, x2, name="concat") + sdict = infer_shape(z) + assert(sdict["concat"][0] == [10, 50]) + z = sym.concatenate(x1, x1, axis=0, name="concat") + sdict = infer_shape(z) + assert(sdict["concat"][0] == [20, 20]) + + +def test_expand_dims(): + x = sym.Variable("x", shape=(10, 20)) + y = sym.expand_dims(x, axis=1, name="y") + sdict = infer_shape(y) + assert(sdict["y"][0] == [10, 1, 20]) + y = sym.expand_dims(x, axis=-1, name="y", num_newaxis=2) + sdict = infer_shape(y) + assert(sdict["y"][0] == [10, 20, 1, 1]) + + +def test_split(): + x1 = sym.Variable("x", shape=(10, 20)) + z = sym.split(x1, indices_or_sections=[11], name="y") + sdict = infer_shape(z) + assert(sdict["y"][0] == [10, 11]) + assert(sdict["y"][1] == [10, 9]) + z = sym.split(x1, indices_or_sections=2, name="y") + sdict = infer_shape(z) + assert(sdict["y"][0] == [10, 10]) + assert(sdict["y"][1] == [10, 10]) + z = sym.split(x1, indices_or_sections=[6], axis=-1, name="y") + sdict = infer_shape(z) + assert(sdict["y"][0] == [10, 6]) + assert(sdict["y"][1] == [10, 14]) + + +def test_batchnorm(): + x = sym.Variable("x", shape=(10, 20)) + y = sym.batch_norm(1 / x, name="bn") + sdict = infer_shape(y) + assert(sdict["bn_gamma"][0] == [20]) + + x = sym.Variable("x", shape=(10, 20, 30, 40)) + y = sym.batch_norm(data=x, axis=0, epsilon=2e-5, name='bn') + sdict = infer_shape(y) + assert(sdict['bn_moving_var'][0] == [10]) + + y = sym.batch_norm(data=x, axis=1, epsilon=2e-5, name='bn') + sdict = infer_shape(y) + assert(sdict['bn_gamma'][0] == [20]) + + y = sym.batch_norm(data=x, axis=2, epsilon=2e-5, name='bn') + sdict = infer_shape(y) + assert(sdict['bn_beta'][0] == [30]) + + y = 
sym.batch_norm(data=x, axis=3, epsilon=2e-5, name='bn') + sdict = infer_shape(y) + assert(sdict['bn_moving_mean'][0] == [40]) + +def test_flatten(): + x = sym.Variable("x", shape=(10, 20, 10)) + y = sym.flatten(x) * 2 + y = sym.exp(y, name="y") + sdict = infer_shape(y) + assert(sdict["y"][0] == [10, 200]) + +def test_squeeze(): + x = sym.Variable("x", shape=(1, 1, 1, 10)) + y = sym.squeeze(x, axis=(1,2), name='squeeze') + sdict = infer_shape(y) + assert(sdict['squeeze'][0] == [1, 10]) + + x = sym.Variable("x", shape=(1, 3, 1)) + y = sym.squeeze(x, name='squeeze') + sdict = infer_shape(y) + assert(sdict['squeeze'][0] == [3]) + + y = sym.squeeze(x, axis=(0), name='squeeze') + sdict = infer_shape(y) + assert(sdict['squeeze'][0] == [3, 1]) + + y = sym.squeeze(x, axis=(0,2), name='squeeze') + sdict = infer_shape(y) + assert(sdict['squeeze'][0] == [3]) + +# Level 2 +def test_conv2d(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.conv2d(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 10, 10, 12), + (4, 12, 10, 12), + channels=12, + kernel_size=(3,3), + padding=(1,1)) + check((4, 10, 12, 4), + (4, 8, 8, 5), + channels=5, + kernel_size=(3, 5), + layout="NHWC") + check((4, 10, 12, 4), + (4, 6, 8, 5), + channels=5, + dilation=(2, 2), + kernel_size=(3, 3), + layout="NHWC") + check((4, 10, 12, 4), + (4, 5, 6, 5), + channels=5, + strides=(2, 2), + kernel_size=(3, 3), + padding=(1, 1), + layout="NHWC") + + +def test_conv2d_packed(): + def check(in_shape, + out_shape, + kernel_shape, + **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.conv2d(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + assert(tuple(sdict["y_weight"][0]) == tuple(kernel_shape)) + + check((4, 10, 10, 12, 1, 8), + (4, 10, 10, 2, 1, 8), + (2, 12, 3, 3, 8, 8), + channels=8 * 2, + kernel_size=(3,3), + padding=(1,1), + layout="NHWC1n8c", + kernel_layout="OIHW8o8i") + + +def test_conv2d_transpose(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.conv2d_transpose(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 10, 10, 12), + (4, 15, 10, 12), + channels=15, + kernel_size=(3,3), + padding=(1,1)) + check((4, 10, 10, 12), + (4, 15, 10, 14), + channels=15, + kernel_size=(3, 5), + padding=(1, 1)) + check((4, 10, 10, 12), + (4, 15, 11, 15), + channels=15, + kernel_size=(3, 5), + padding=(1, 1), + output_padding=(1, 1)) + check((4, 10, 10, 12), + (4, 15, 15, 11), + channels=11, + kernel_size=(5, 5), + output_padding=(1, 1), + layout="NHWC") + + +def test_max_pool2d(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.max_pool2d(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 10, 12, 12), + (4, 10, 12, 12), + pool_size=(3,3), + padding=(1,1)) + check((4, 10, 12, 12), + (4, 10, 6, 6), + pool_size=(3, 3), + padding=(1, 1), + strides=(2, 2)) + check((4, 10, 12, 12), + (4, 10, 7, 7), + pool_size=(3, 3), + padding=(1, 1), + strides=(2, 2), + ceil_mode=True) + check((4, 12, 14, 10), + (4, 6, 7, 10), + pool_size=(3, 3), + padding=(1, 1), + strides=(2, 2), + layout="NHWC") + + +def test_global_pool2d(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.global_max_pool2d(x, name="y", **kwargs) + sdict = infer_shape(y) + 
assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 10, 12, 12), + (4, 10, 1, 1)) + check((4, 10, 12, 12), + (4, 1, 1, 12), + layout="NHWC") + + +# Level 3 +def test_reshape(): + def check(in_shape, tshape, out_shape): + x = sym.Variable("x", shape=in_shape) + y = sym.reshape(x, shape=tshape, name="y") + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4,), (2, 2), (2, 2)) + check((2, 3, 4), (4, 0, 2), (4, 3, 2)) + check((2, 3, 4), (2, 0, 0), (2, 3, 4)) + check((2, 3, 4), (6, 1, -1), (6, 1, 4)) + check((2, 3, 4), (3, -1, 8), (3, 1, 8)) + check((2, 3, 4), (-1,), (24,)) + check((2, 3, 4), (-2,), (2, 3, 4)) + check((2, 3, 4), (2, -2), (2, 3, 4)) + check((2, 3, 4), (-2, 1, 1), (2, 3, 4, 1, 1)) + check((2, 3, 4), (-3, 4), (6, 4)) + check((2, 3, 4, 5), (-3, -3), (6, 20)) + check((2, 3, 4), (0, -3), (2, 12)) + check((2, 3, 4), (-3, -2), (6, 4)) + check((2, 3, 4), (-4, 1, 2, -2), (1, 2, 3, 4)) + check((2, 3, 4), (2, -4, -1, 3, -2), (2, 1, 3, 4)) + + +def test_prelu(): + def check(in_shape, axis, out_shape): + x = sym.Variable("x", shape=in_shape) + w = sym.Variable("w") + y = sym.prelu(x, w, axis=axis, name="y") + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + check((1, 3, 2, 2), 1, (1, 3, 2, 2)) + check((1, 2, 2, 3), 3, (1, 2, 2, 3)) + + +# Level 4 +def test_transpose(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.transpose(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 1), (1, 4)) + check((0, 1, 2, 3), (1, 2, 3, 0), axes=(1, 2, 3, 0)) + + +def test_broadcast_to(): + def check(in_shape, tshape, out_shape): + x = sym.Variable("x", shape=in_shape) + y = sym.broadcast_to(x, shape=tshape, name="y") + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 1), (0, 4), (4, 4)) + check((4, 1, 5), (0, 4, 5), (4, 4, 5)) + + +def test_broadcast_binary(): + def check(lhs_shape, rhs_shape, out_shape): + x = sym.Variable("x", shape=lhs_shape) + y = sym.Variable("y", shape=rhs_shape) + z = sym.broadcast_add(x, y, name="y") + sdict = infer_shape(z) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 1), (4), (4, 4)) + check((5, 1, 1), (1, 4, 4), (5, 4, 4)) + check((6, 1, 4), (5, 4), (6, 5, 4)) + + +def test_reduce(): + def check(in_shape, out_shape, **kwargs): + x = sym.Variable("x", shape=in_shape) + y = sym.sum(x, name="y", **kwargs) + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4, 5), (4,), axis=1) + check((4, 5), (4, 1), axis=1, keepdims=True) + check((4, 5), (1, 5), axis=0, keepdims=True) + check((4, 5), (1, 1), axis=(), keepdims=True) + check((4, 5), (1,), axis=()) + check((4, 5, 10), (5,), axis=(0, 2)) + check((4, 5, 10), (1, 5, 1), axis=(0, 2), keepdims=True) + + +def test_gather_nd(): + def check(data_shape, indices_shape, out_shape): + x = sym.Variable("x", shape=data_shape) + indices = sym.Variable("indices", shape=indices_shape) + y = sym.gather_nd(x, indices, name="y") + sdict = infer_shape(y) + assert(tuple(sdict["y"][0]) == tuple(out_shape)) + + check((4,), (1, 1), (1,)) + check((4,), (1, 3), (3,)) + check((2, 3), (1, 1), (1, 3)) + check((2, 3), (2, 1), (1,)) + check((2, 3), (2, 5, 6), (5, 6)) + check((2, 3, 4), (1, 1), (1, 3, 4)) + check((2, 3, 4), (2, 1), (1, 4)) + check((2, 3, 4), (2, 5), (5, 4)) + check((2, 3, 4), (2, 5, 6), (5, 6, 4)) + check((2, 3, 4, 5), (2, 6, 7), (6, 7, 4, 5)) + + +if __name__ == "__main__": + 
test_conv2d_packed() + test_expand_dims() + test_dense() + test_matmul() + test_concatenate() + test_split() + test_batchnorm() + test_flatten() + test_conv2d() + test_conv2d_transpose() + test_max_pool2d() + test_global_pool2d() + test_reshape() + test_broadcast_to() + test_broadcast_binary() + test_reduce() + test_transpose() + test_prelu() + test_squeeze() + test_gather_nd() diff --git a/nnvm/tests/python/unittest/test_pass_saveload_json.py b/nnvm/tests/python/unittest/test_pass_saveload_json.py new file mode 100644 index 000000000000..a8b067c8fe24 --- /dev/null +++ b/nnvm/tests/python/unittest/test_pass_saveload_json.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import nnvm +from tvm.contrib import util + + +def test_variable_node_parsed(): + sym = nnvm.sym.Variable('data') + tempdir = util.tempdir() + json_filename = 'test_nnvm_symbol.json' + with open(tempdir.relpath(json_filename), 'w') as fo: + fo.write(nnvm.graph.create(sym).json()) + sym_str = open(tempdir.relpath(json_filename), 'r').read() + sym = nnvm.graph.load_json(sym_str).symbol() + sym = nnvm.sym.relu(sym) + + +if __name__ == '__main__': + test_variable_node_parsed() diff --git a/nnvm/tests/python/unittest/test_symbol.py b/nnvm/tests/python/unittest/test_symbol.py new file mode 100644 index 000000000000..a54dec170aae --- /dev/null +++ b/nnvm/tests/python/unittest/test_symbol.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import nnvm.symbol as sym +from nnvm import NNVMError + +def test_dense(): + x = sym.Variable('x') + y = sym.dense(x, units=30, name="fc") + assert y.list_input_names() == ["x", "fc_weight", "fc_bias"] + +def test_batch_norm(): + x = sym.Variable('x') + y = sym.dense(x, units=30, name="fc") + z = sym.batch_norm(x, name='bn') + assert z.list_input_names('aux_state') == ['bn_moving_mean', 'bn_moving_var'] + assert z.list_input_names('read_only') == ['x', 'bn_gamma', 'bn_beta'] + +def test_compose(): + x = sym.Variable('x') + z = sym.Variable('z') + y = sym.exp(sym.elemwise_add(x, x, name='add', gpu=2), + name='exp', gpu=1, attr={"kk": "1"}) + + assert y.list_input_names() == ['x'] + assert y.list_output_names() == ["exp_output"] + assert y.list_attr()['gpu'] == '1' + z = y.get_internals() + assert z['add_output'].list_output_names() == ['add_output'] + assert y.list_attr(recursive=True)['add$gpu'] == '2' + +def test_default_input(): + x = sym.Variable('x') + y = sym.dense(data=x, units=30, name='fc', use_bias=False) + assert y.list_input_names() == ['x', 'fc_weight'] + tname = [z.list_output_names()[0] for z in y.list_input_variables()] + assert tname == y.list_input_names() + try: + z = sym.elemwise_add(x) + assert False + except NNVMError: + pass + +def test_copy(): + x = sym.Variable('x') + z = sym.Variable('z') + y = sym.exp(sym.elemwise_add(x, x, name='add', gpu=2), + name='exp', gpu=1, attr={"kk": "1"}) + assert y.__copy__().debug_str() == y.debug_str() + + +def test_op_name(): + x = sym.Variable('x') + y = sym.exp(x) + op_name = y.attr("op_name") + op_func = sym.__dict__[op_name] + z = op_func(x) + +if __name__ == "__main__": + test_op_name() + test_copy() + test_default_input() + test_compose() + test_batch_norm() diff --git a/nnvm/tests/python/unittest/test_top_level1.py b/nnvm/tests/python/unittest/test_top_level1.py new file mode 100644 index 000000000000..2d646dc16ae4 --- /dev/null +++ b/nnvm/tests/python/unittest/test_top_level1.py @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import nnvm.symbol as sym +import nnvm.graph as graph + +def test_dense(): + x = sym.Variable('x') + x1 = sym.dense(x, units=3, name="dense") + x2 = sym.flatten(x1) + x3 = sym.softmax(x2) + assert x3.list_input_names() == ['x', 'dense_weight', 'dense_bias'] + + +def test_concatenate_split(): + x = sym.Variable('x') + y = sym.Variable('y') + y = sym.concatenate(x, y) + assert y.list_input_names() == ['x', 'y'] + z = sym.split(y, indices_or_sections=10) + assert len(z.list_output_names()) == 10 + z = sym.split(y, indices_or_sections=[10, 20]) + assert len(z.list_output_names()) == 3 + +def test_expand_dims(): + x = sym.Variable('x') + y = sym.expand_dims(x, axis=1, num_newaxis=2) + assert y.list_input_names() == ['x'] + + +def test_unary(): + x = sym.Variable('x') + x = sym.exp(x) + x = sym.log(x) + x = sym.sigmoid(x) + x = sym.tanh(x) + x = sym.relu(x) + assert x.list_input_names() == ['x'] + + +def test_batchnorm(): + x = sym.Variable('x') + x = sym.batch_norm(x, name="bn") + assert x.list_input_names() == [ + "x", "bn_gamma", "bn_beta", "bn_moving_mean", "bn_moving_var"] + + +if __name__ == "__main__": + test_concatenate_split() + test_expand_dims() + test_dense() + test_unary() + test_batchnorm() diff --git a/nnvm/tests/python/unittest/test_top_level2.py b/nnvm/tests/python/unittest/test_top_level2.py new file mode 100644 index 000000000000..b327356b5cc0 --- /dev/null +++ b/nnvm/tests/python/unittest/test_top_level2.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import nnvm.symbol as sym + +def test_conv2d(): + x = sym.Variable('x') + y = sym.conv2d(x, channels=3, kernel_size=(3, 3), + name="y", use_bias=False) + assert y.list_input_names() == ["x", "y_weight"] + + +def test_max_pool2d(): + x = sym.Variable('x') + y = sym.max_pool2d(x, pool_size=(3, 3), name="y") + y = sym.global_max_pool2d(y) + assert y.list_input_names() == ["x"] + + +if __name__ == "__main__": + test_conv2d() + test_max_pool2d() diff --git a/nnvm/tests/python/unittest/test_top_level3.py b/nnvm/tests/python/unittest/test_top_level3.py new file mode 100644 index 000000000000..f19e1fd4376e --- /dev/null +++ b/nnvm/tests/python/unittest/test_top_level3.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import nnvm.symbol as sym + +def test_reshape(): + x = sym.Variable("x") + y = sym.reshape(x, shape=(10, 20), name="y") + assert(y.list_input_names() == ["x"]) + + +def test_scalar_op(): + x = sym.Variable("x") + y = (1 / (x * 2) - 1) ** 2 + assert(y.list_input_names() == ["x"]) + +def test_leaky_relu(): + x = sym.Variable("x") + y = sym.leaky_relu(x, alpha=0.1) + assert(y.list_input_names() == ["x"]) + +def test_prelu(): + x = sym.Variable("x") + w = sym.Variable("w") + y = sym.prelu(x, w) + assert(y.list_input_names()[0] == 'x') + assert(y.list_input_names()[1] == 'w') + +if __name__ == "__main__": + test_scalar_op() + test_reshape() + test_leaky_relu() + test_prelu() diff --git a/nnvm/tests/python/unittest/test_top_level4.py b/nnvm/tests/python/unittest/test_top_level4.py new file mode 100644 index 000000000000..ad0829b59283 --- /dev/null +++ b/nnvm/tests/python/unittest/test_top_level4.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import nnvm.symbol as sym + +def test_binary_broadcast(): + x = sym.Variable('x') + y = sym.Variable('y') + z = x + y + z = x * y + z = x - y + z = x / y + + +def test_broadcast_to(): + x = sym.Variable('x') + y = sym.broadcast_to(x, shape=(3, 3)) + assert y.list_input_names() == ["x"] + + +if __name__ == "__main__": + test_binary_broadcast() + test_broadcast_to() diff --git a/nnvm/tutorials/.gitignore b/nnvm/tutorials/.gitignore new file mode 100644 index 000000000000..5f8a03c46b89 --- /dev/null +++ b/nnvm/tutorials/.gitignore @@ -0,0 +1,11 @@ +*.pb +*.mlmodel +*.ttf +*.txt +*synset*txt +*.cfg +ssd_model +*.names +*.jpg +*.pbtxt +*.weights diff --git a/nnvm/tutorials/README.txt b/nnvm/tutorials/README.txt new file mode 100644 index 000000000000..334409cd8a28 --- /dev/null +++ b/nnvm/tutorials/README.txt @@ -0,0 +1,4 @@ +.. _tutorial-nnvm: + +NNVM Compiler Tutorials +----------------------- diff --git a/nnvm/tutorials/deploy_model_on_mali_gpu.py b/nnvm/tutorials/deploy_model_on_mali_gpu.py new file mode 100644 index 000000000000..d90b0955048c --- /dev/null +++ b/nnvm/tutorials/deploy_model_on_mali_gpu.py @@ -0,0 +1,229 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-deploy-model-on-mali-gpu:
+
+Deploy the Pretrained Model on ARM Mali GPU
+===========================================
+**Author**: `Lianmin Zheng `_, `Ziheng Jiang `_
+
+This is an example of using NNVM to compile a ResNet model and
+deploy it on Firefly-RK3399 with ARM Mali GPU. We will use the
+Mali-T860 MP4 GPU on this board to accelerate the inference.
+"""
+
+import tvm
+import nnvm.compiler
+import nnvm.testing
+from tvm import rpc
+from tvm.contrib import util, graph_runtime as runtime
+from tvm.contrib.download import download_testdata
+
+######################################################################
+# Build TVM Runtime on Device
+# ---------------------------
+#
+# The first step is to build the TVM runtime on the remote device.
+#
+# .. note::
+#
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. RK3399, and we assume it
+#   is running Linux.
+#
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the TVM runtime on
+# the remote device. Make sure you have the OpenCL driver on your board.
+# You can refer to `tutorial `_
+# to set up the OS and OpenCL driver for the RK3399.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/apache/incubator-tvm tvm
+#   cd tvm
+#   cp cmake/config.cmake .
+#   sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake
+#   make runtime -j4
+#
+# After building the runtime successfully, we need to set environment variables
+# in the :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.
+
+######################################################################
+# Set Up RPC Server on Device
+# ---------------------------
+# To start an RPC server, run the following command on your remote device
+# (which is the RK3399 in our example).
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
+#
+# If you see the line below, it means the RPC server started
+# successfully on your device.
+#
+# .. code-block:: bash
+#
+#   INFO:root:RPCServer: bind to 0.0.0.0:9090
+#
+
+######################################################################
+# Prepare the Pre-trained Model
+# -----------------------------
+# Back to the host machine, which should have a full TVM installed (with LLVM).
+#
+# We will use a pre-trained model from the
+# `MXNet Gluon model zoo `_.
+# You can find more details about this part in the tutorial :ref:`tutorial-from-mxnet`.
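+
+######################################################################
+# .. note::
+#
+#   The next steps run on the host and require the ``mxnet`` and
+#   ``Pillow`` Python packages. If they are missing, one typical way to
+#   get them (the exact command may differ in your environment) is:
+#
+#   .. code-block:: bash
+#
+#     pip install --user mxnet Pillow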
+
+from mxnet.gluon.model_zoo.vision import get_model
+from PIL import Image
+import numpy as np
+
+# only one line to get the model
+block = get_model('resnet18_v1', pretrained=True)
+
+######################################################################
+# In order to test our model, here we download an image of a cat and
+# transform its format.
+img_name = 'cat.png'
+img_path = download_testdata('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true',
+                             img_name, module='data')
+image = Image.open(img_path).resize((224, 224))
+
+def transform_image(image):
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+x = transform_image(image)
+
+######################################################################
+# The synset is used to transform the label from an ImageNet class index
+# to a word humans can understand.
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+
+synset_name = 'imagenet1000_clsid_to_human.txt'
+synset_path = download_testdata(synset_url, synset_name, module='data')
+with open(synset_path) as f:
+    synset = eval(f.read())
+
+######################################################################
+# Now we would like to port the Gluon model to a portable computational graph.
+# It's as easy as several lines.
+
+# We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
+net, params = nnvm.frontend.from_mxnet(block)
+# we want a probability so add a softmax operator
+net = nnvm.sym.softmax(net)
+
+######################################################################
+# Here are some basic data workload configurations.
+batch_size = 1
+num_classes = 1000
+image_shape = (3, 224, 224)
+data_shape = (batch_size,) + image_shape
+
+######################################################################
+# Compile The Graph
+# -----------------
+# To compile the graph, we call the :any:`nnvm.compiler.build` function
+# with the graph configuration and parameters. As we use OpenCL for
+# GPU computing, TVM will generate both OpenCL kernel code and ARM
+# CPU host code. The CPU host code is used for calling OpenCL kernels.
+# In order to generate correct CPU code, we need to specify the target
+# triplet for the host ARM device by setting the parameter :code:`target_host`.
+
+######################################################################
+# If we run the example on our x86 server for demonstration, we can simply
+# set it as :code:`llvm`. If running it on the RK3399, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you
+# want to run this tutorial with a real device.
+
+local_demo = True
+
+if local_demo:
+    target_host = "llvm"
+    target = "llvm"
+else:
+    # Here is the setting for my rk3399 board
+    # If you don't use rk3399, you can query your target triple by
+    # executing `gcc -v` on your board.
+    target_host = "llvm -target=aarch64-linux-gnu"
+
+    # set target as `tvm.target.mali` instead of 'opencl' to enable
+    # optimization for mali
+    target = tvm.target.mali()
+
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(net, target=target,
+            shape={"data": data_shape}, params=params, target_host=target_host)
+
+# After `nnvm.compiler.build`, you will get three return values: graph,
+# library, and the new parameters, since we do some optimization that will
+# change the parameters but keep the result of the model the same.
+
+# Save the library to a local temporary directory.
+tmp = util.tempdir()
+lib_fname = tmp.relpath('net.tar')
+lib.export_library(lib_fname)
+
+######################################################################
+# Deploy the Model Remotely by RPC
+# --------------------------------
+# With RPC, you can deploy the model remotely from your host machine
+# to the remote device.
+
+# obtain an RPC session from the remote device.
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment, change this to the IP address of your target device
+    host = '10.77.1.145'
+    port = 9090
+    remote = rpc.connect(host, port)
+
+# upload the library to the remote device and load it
+remote.upload(lib_fname)
+rlib = remote.load_module('net.tar')
+
+# create the remote runtime module
+ctx = remote.cl(0) if not local_demo else remote.cpu(0)
+module = runtime.create(graph, rlib, ctx)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
+# set input data
+module.set_input('data', tvm.nd.array(x.astype('float32')))
+# run
+module.run()
+# get output
+out = module.get_output(0)
+# get top1 result
+top1 = np.argmax(out.asnumpy())
+print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/nnvm/tutorials/deploy_model_on_rasp.py b/nnvm/tutorials/deploy_model_on_rasp.py
new file mode 100644
index 000000000000..576b517f3aa5
--- /dev/null
+++ b/nnvm/tutorials/deploy_model_on_rasp.py
@@ -0,0 +1,220 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-deploy-model-on-rasp:
+
+Deploy the Pretrained Model on Raspberry Pi
+===========================================
+**Author**: `Ziheng Jiang `_
+
+This is an example of using NNVM to compile a ResNet model and deploy
+it on Raspberry Pi.
+"""
+
+import tvm
+import nnvm.compiler
+import nnvm.testing
+from tvm import rpc
+from tvm.contrib import util, graph_runtime as runtime
+from tvm.contrib.download import download_testdata
+
+######################################################################
+# .. _build-tvm-runtime-on-device:
+#
+# Build TVM Runtime on Device
+# ---------------------------
+#
+# The first step is to build the TVM runtime on the remote device.
+#
+# .. note::
+#
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. the Raspberry Pi, and we assume it
+#   is running Linux.
+#
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the TVM runtime on
+# the remote device.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/apache/incubator-tvm tvm
+#   cd tvm
+#   make runtime -j4
+#
+# After building the runtime successfully, we need to set environment variables
+# in the :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.
+
+######################################################################
+# Set Up RPC Server on Device
+# ---------------------------
+# To start an RPC server, run the following command on your remote device
+# (which is the Raspberry Pi in our example).
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
+#
+# If you see the line below, it means the RPC server started
+# successfully on your device.
+#
+# .. code-block:: bash
+#
+#   INFO:root:RPCServer: bind to 0.0.0.0:9090
+#
+
+######################################################################
+# Prepare the Pre-trained Model
+# -----------------------------
+# Back to the host machine, which should have a full TVM installed (with LLVM).
+#
+# We will use a pre-trained model from the
+# `MXNet Gluon model zoo `_.
+# You can find more details about this part in the tutorial :ref:`tutorial-from-mxnet`.
+
+from mxnet.gluon.model_zoo.vision import get_model
+from PIL import Image
+import numpy as np
+
+# one line to get the model
+block = get_model('resnet18_v1', pretrained=True)
+
+######################################################################
+# In order to test our model, here we download an image of a cat and
+# transform its format.
+img_name = 'cat.png'
+img_path = download_testdata('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true',
+                             img_name, module='data')
+image = Image.open(img_path).resize((224, 224))
+
+def transform_image(image):
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+x = transform_image(image)
+
+######################################################################
+# The synset is used to transform the label from an ImageNet class index
+# to a word humans can understand.
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+synset_name = 'imagenet1000_clsid_to_human.txt'
+synset_path = download_testdata(synset_url, synset_name, module='data')
+with open(synset_path) as f:
+    synset = eval(f.read())
+
+######################################################################
+# Now we would like to port the Gluon model to a portable computational graph.
+# It's as easy as several lines.
+
+# We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
+net, params = nnvm.frontend.from_mxnet(block)
+# we want a probability so add a softmax operator
+net = nnvm.sym.softmax(net)
+
+######################################################################
+# Here are some basic data workload configurations.
+batch_size = 1
+num_classes = 1000
+image_shape = (3, 224, 224)
+data_shape = (batch_size,) + image_shape
+
+######################################################################
+# Compile The Graph
+# -----------------
+# To compile the graph, we call the :any:`nnvm.compiler.build` function
+# with the graph configuration and parameters. However, you cannot deploy
+# an x86 program on a device with an ARM instruction set. This means NNVM
+# also needs to know the compilation options of the target device, apart
+# from the arguments :code:`net` and :code:`params` that specify the deep
+# learning workload. The options matter: different options will lead to
+# very different performance.
+
+######################################################################
+# If we run the example on our x86 server for demonstration, we can simply
+# set it as :code:`llvm`. If running it on the Raspberry Pi, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you want
+# to run this tutorial with a real device.
+
+local_demo = True
+
+if local_demo:
+    target = tvm.target.create('llvm')
+else:
+    target = tvm.target.arm_cpu('rasp3b')
+    # The above line is a simple form of
+    # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
+
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(
+        net, target, shape={"data": data_shape}, params=params)
+
+# After `nnvm.compiler.build`, you will get three return values: graph,
+# library, and the new parameters, since we do some optimization that will
+# change the parameters but keep the result of the model the same.
+
+# Save the library to a local temporary directory.
+tmp = util.tempdir()
+lib_fname = tmp.relpath('net.tar')
+lib.export_library(lib_fname)
+
+######################################################################
+# Deploy the Model Remotely by RPC
+# --------------------------------
+# With RPC, you can deploy the model remotely from your host machine
+# to the remote device.
+
+# obtain an RPC session from the remote device.
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment, change this to the IP address of your target device
+    host = '10.77.1.162'
+    port = 9090
+    remote = rpc.connect(host, port)
+
+# upload the library to the remote device and load it
+remote.upload(lib_fname)
+rlib = remote.load_module('net.tar')
+
+# create the remote runtime module
+ctx = remote.cpu(0)
+module = runtime.create(graph, rlib, ctx)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
+# set input data
+module.set_input('data', tvm.nd.array(x.astype('float32')))
+# run
+module.run()
+# get output
+out = module.get_output(0)
+# get top1 result
+top1 = np.argmax(out.asnumpy())
+print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/nnvm/tutorials/deploy_ssd_mxnet.py b/nnvm/tutorials/deploy_ssd_mxnet.py
new file mode 100644
index 000000000000..c88c61984293
--- /dev/null
+++ b/nnvm/tutorials/deploy_ssd_mxnet.py
@@ -0,0 +1,180 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy Single Shot Multibox Detector (SSD) Model
+================================================
+**Author**: `Yao Wang `_, \
+`Leyuan Wang `_
+
+This article is an introductory tutorial to deploy SSD models with TVM.
+We will use the MXNet pretrained SSD model with ResNet50 as the body network
+and convert it to an NNVM graph.
+"""
+import os
+import zipfile
+import tvm
+import mxnet as mx
+import cv2
+import numpy as np
+
+from nnvm import compiler
+from nnvm.frontend import from_mxnet
+from tvm import relay
+from tvm.contrib.download import download_testdata
+from tvm.contrib import graph_runtime
+from mxnet.model import load_checkpoint
+
+
+######################################################################
+# Preliminary and Set parameters
+# ------------------------------
+# We should build TVM with sort support; in the TVM root directory:
+#
+# .. code-block:: bash
+#
+#   echo "set(USE_SORT ON)" > config.mk
+#   make -j8
+#
+
+model_name = "ssd_resnet50_512"
+model_file = "%s.zip" % model_name
+test_image = "dog.jpg"
+dshape = (1, 3, 512, 512)
+dtype = "float32"
+
+# Target settings
+# Use these commented settings to build for cuda.
+#target = 'cuda'
+#ctx = tvm.gpu(0)
+# Use these commented settings to build for opencl.
+#target = 'opencl'
+#ctx = tvm.opencl(0)
+target = "llvm"
+ctx = tvm.cpu()

+######################################################################
+# Download MXNet SSD pre-trained model and demo image
+# ---------------------------------------------------
+# The pre-trained model is available at
+# https://github.com/apache/incubator-mxnet/tree/master/example/ssd
+
+model_url = "https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/" \
+            "resnet50_ssd_512_voc0712_trainval.zip"
+image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \
+            "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg"
+inference_symbol_folder = \
+    "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
+inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \
+                       "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip"
+
+model_file_path = download_testdata(model_url, model_file, module=["mxnet", "ssd_model"])
+inference_symbol_path = download_testdata(inference_symbol_url, "inference_model.zip",
+                                          module=["mxnet", "ssd_model"])
+test_image_path = download_testdata(image_url, test_image, module="data")
+model_dir = os.path.dirname(model_file_path)
+
+zip_ref = zipfile.ZipFile(model_file_path, 'r')
+zip_ref.extractall(model_dir)
+zip_ref.close()
+zip_ref = zipfile.ZipFile(inference_symbol_path)
+zip_ref.extractall(model_dir)
+zip_ref.close()
+
+######################################################################
+# Convert and compile model with NNVM or Relay for CPU.
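+
+# The frontend is selected with a command-line flag, handled by the
+# ``argparse`` block below. For example, since this script lives at
+# ``nnvm/tutorials/deploy_ssd_mxnet.py``, the Relay path can be chosen with:
+#
+# .. code-block:: bash
+#
+#   python deploy_ssd_mxnet.py --frontend relay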
+ +sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (model_dir, inference_symbol_folder)) +_, arg_params, aux_params = load_checkpoint("%s/%s" % (model_dir, model_name), 0) + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument( + "-f", "--frontend", + help="Frontend for compilation, nnvm or relay", + type=str, + default="nnvm") +args = parser.parse_args() +if args.frontend == "relay": + net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params, \ + aux_params=aux_params) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) +elif args.frontend == "nnvm": + net, params = from_mxnet(sym, arg_params, aux_params) + with compiler.build_config(opt_level=3): + graph, lib, params = compiler.build( + net, target, {"data": dshape}, params=params) +else: + parser.print_help() + parser.exit() + +###################################################################### +# Create TVM runtime and do inference + +# Preprocess image +image = cv2.imread(test_image_path) +img_data = cv2.resize(image, (dshape[2], dshape[3])) +img_data = img_data[:, :, (2, 1, 0)].astype(np.float32) +img_data -= np.array([123, 117, 104]) +img_data = np.transpose(np.array(img_data), (2, 0, 1)) +img_data = np.expand_dims(img_data, axis=0) +# Build TVM runtime +m = graph_runtime.create(graph, lib, ctx) +m.set_input('data', tvm.nd.array(img_data.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0) + + +###################################################################### +# Display result + +class_names = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"] +def display(img, out, thresh=0.5): + import random + import matplotlib as mpl + import matplotlib.pyplot as plt + mpl.rcParams['figure.figsize'] = (10, 10) + pens = dict() + plt.clf() + plt.imshow(img) + for det in out: + cid = int(det[0]) + if cid < 0: + continue + score = det[1] + if score < thresh: + continue + if cid not in pens: + pens[cid] = (random.random(), random.random(), random.random()) + scales = [img.shape[1], img.shape[0]] * 2 + xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)] + rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, + edgecolor=pens[cid], linewidth=3) + plt.gca().add_patch(rect) + text = class_names[cid] + plt.gca().text(xmin, ymin-2, '{:s} {:.3f}'.format(text, score), + bbox=dict(facecolor=pens[cid], alpha=0.5), + fontsize=12, color='white') + plt.show() + +image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) +display(image, tvm_output.asnumpy()[0], thresh=0.45) diff --git a/nnvm/tutorials/from_coreml.py b/nnvm/tutorials/from_coreml.py new file mode 100644 index 000000000000..3eaced18728e --- /dev/null +++ b/nnvm/tutorials/from_coreml.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile CoreML Models
+=====================
+**Author**: `Joshua Z. Zhang `_
+
+This article is an introductory tutorial to deploy CoreML models with NNVM.
+
+To begin, the coremltools module is required to be installed.
+
+A quick solution is to install via pip
+
+.. code-block:: bash
+
+    pip install -U coremltools --user
+
+or please refer to the official site
+https://github.com/apple/coremltools
+"""
+import nnvm
+import tvm
+import coremltools as cm
+import numpy as np
+from PIL import Image
+from tvm.contrib.download import download_testdata
+
+######################################################################
+# Load pretrained CoreML model
+# ----------------------------
+# We will download and load a pretrained mobilenet classification network
+# provided by Apple in this example.
+model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'
+model_file = 'mobilenet.mlmodel'
+model_path = download_testdata(model_url, model_file, module='coreml')
+# now you have mobilenet.mlmodel on disk
+mlmodel = cm.models.MLModel(model_path)
+# we can load the graph as an NNVM-compatible model
+sym, params = nnvm.frontend.from_coreml(mlmodel)
+
+######################################################################
+# Load a test image
+# -----------------
+# A single cat dominates the examples!
+from PIL import Image
+img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
+img_path = download_testdata(img_url, 'cat.png', module='data')
+img = Image.open(img_path).resize((224, 224))
+#x = np.transpose(img, (2, 0, 1))[np.newaxis, :]
+image = np.asarray(img)
+image = image.transpose((2, 0, 1))
+x = image[np.newaxis, :]
+
+######################################################################
+# Compile the model on NNVM
+# -------------------------
+# We should be familiar with the process by now.
+import nnvm.compiler
+target = 'cuda'
+shape_dict = {'image': x.shape}
+with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+
+######################################################################
+# Execute on TVM
+# --------------
+# The process is no different from other examples.
+from tvm.contrib import graph_runtime
+ctx = tvm.gpu(0)
+dtype = 'float32'
+m = graph_runtime.create(graph, lib, ctx)
+# set inputs
+m.set_input('image', tvm.nd.array(x.astype(dtype)))
+m.set_input(**params)
+# execute
+m.run()
+# get outputs
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
+
+#####################################################################
+# Look up synset name
+# -------------------
+# Look up the prediction's top-1 index in the 1000-class synset.
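+
+# If you also want the top-5 predictions, a minimal sketch using the
+# ``tvm_output`` computed above would be:
+#
+# .. code-block:: python
+#
+#   top5 = np.argsort(tvm_output.asnumpy()[0])[::-1][:5]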
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+synset_name = 'imagenet1000_clsid_to_human.txt'
+synset_path = download_testdata(synset_url, synset_name, module='data')
+with open(synset_path) as f:
+    synset = eval(f.read())
+print('Top-1 id', top1, 'class name', synset[top1])
diff --git a/nnvm/tutorials/from_darknet.py b/nnvm/tutorials/from_darknet.py
new file mode 100644
index 000000000000..d2ab647da1b3
--- /dev/null
+++ b/nnvm/tutorials/from_darknet.py
@@ -0,0 +1,177 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile YOLO-V2 and YOLO-V3 in DarkNet Models
+=============================================
+**Author**: `Siju Samuel `_
+
+This article is an introductory tutorial to deploy darknet models with NNVM.
+All the required models and libraries will be downloaded from the internet by the script.
+This script runs the YOLO-V2 and YOLO-V3 models with bounding boxes.
+Darknet parsing depends on the CFFI and CV2 libraries, so please install
+CFFI and CV2 before executing this script.
+
+.. code-block:: bash
+
+  pip install cffi
+  pip install opencv-python
+"""
+
+import nnvm
+import nnvm.frontend.darknet
+import tvm.relay.testing.yolo_detection
+import tvm.relay.testing.darknet
+import matplotlib.pyplot as plt
+import numpy as np
+import tvm
+import sys
+
+from ctypes import *
+from tvm.contrib.download import download_testdata
+from tvm.relay.testing.darknet import __darknetffi__
+
+# Model name
+MODEL_NAME = 'yolov3'
+
+######################################################################
+# Download required files
+# -----------------------
+# Download the cfg and weights files if running for the first time.
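+
+# .. note::
+#
+#   The same script also handles YOLO-V2: changing ``MODEL_NAME`` above is
+#   enough, since the file names below and the output decoding later in
+#   this script are derived from it, e.g.:
+#
+#   .. code-block:: python
+#
+#     MODEL_NAME = 'yolov2'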
+CFG_NAME = MODEL_NAME + '.cfg' +WEIGHTS_NAME = MODEL_NAME + '.weights' +REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/' +CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' +WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME + +cfg_path = download_testdata(CFG_URL, CFG_NAME, module="darknet") +weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module="darknet") + +# Download and Load darknet library +if sys.platform in ['linux', 'linux2']: + DARKNET_LIB = 'libdarknet2.0.so' + DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true' +elif sys.platform == 'darwin': + DARKNET_LIB = 'libdarknet_mac2.0.so' + DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true' +else: + err = "Darknet lib is not supported on {} platform".format(sys.platform) + raise NotImplementedError(err) + +lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module="darknet") + +DARKNET_LIB = __darknetffi__.dlopen(lib_path) +net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) +dtype = 'float32' +batch_size = 1 + +print("Converting darknet to nnvm symbols...") +sym, params = nnvm.frontend.darknet.from_darknet(net, dtype) + +###################################################################### +# Compile the model on NNVM +# ------------------------- +# compile the model +target = 'llvm' +ctx = tvm.cpu(0) +data = np.empty([batch_size, net.c, net.h, net.w], dtype) +shape = {'data': data.shape} +print("Compiling the model...") +dtype_dict = {} +with nnvm.compiler.build_config(opt_level=2): + graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype_dict, params) + +[neth, netw] = shape['data'][2:] # Current image shape is 608x608 +###################################################################### +# Load a test image +# -------------------------------------------------------------------- +test_image = 'dog.jpg' +print("Loading the test image...") +img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \ + test_image + '?raw=true' +img_path = download_testdata(img_url, test_image, "data") + +data = tvm.relay.testing.darknet.load_image(img_path, netw, neth) +###################################################################### +# Execute on TVM Runtime +# ---------------------- +# The process is no different from other examples. 
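+# (An illustrative aside, not part of the original tutorial: before
+# running, the three build artifacts could also be persisted to disk for
+# later deployment, as the other NNVM tutorials do; the file names below
+# are placeholders.)
+#
+# .. code-block:: python
+#
+#     lib.export_library("deploy_lib.so")
+#     with open("deploy_graph.json", "w") as f:
+#         f.write(graph.json())
+#     with open("deploy_param.params", "wb") as f:
+#         f.write(nnvm.compiler.save_param_dict(params))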
+from tvm.contrib import graph_runtime + +m = graph_runtime.create(graph, lib, ctx) + +# set inputs +m.set_input('data', tvm.nd.array(data.astype(dtype))) +m.set_input(**params) +# execute +print("Running the test image...") + +m.run() +# get outputs +tvm_out = [] +if MODEL_NAME == 'yolov2': + layer_out = {} + layer_out['type'] = 'Region' + # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, background) + layer_attr = m.get_output(2).asnumpy() + layer_out['biases'] = m.get_output(1).asnumpy() + out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0], + layer_attr[2], layer_attr[3]) + layer_out['output'] = m.get_output(0).asnumpy().reshape(out_shape) + layer_out['classes'] = layer_attr[4] + layer_out['coords'] = layer_attr[5] + layer_out['background'] = layer_attr[6] + tvm_out.append(layer_out) + +elif MODEL_NAME == 'yolov3': + for i in range(3): + layer_out = {} + layer_out['type'] = 'Yolo' + # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) + layer_attr = m.get_output(i*4+3).asnumpy() + layer_out['biases'] = m.get_output(i*4+2).asnumpy() + layer_out['mask'] = m.get_output(i*4+1).asnumpy() + out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0], + layer_attr[2], layer_attr[3]) + layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape) + layer_out['classes'] = layer_attr[4] + tvm_out.append(layer_out) + +# do the detection and bring up the bounding boxes +thresh = 0.5 +nms_thresh = 0.45 +img = tvm.relay.testing.darknet.load_image_color(img_path) +_, im_h, im_w = img.shape +dets = tvm.relay.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, + 1, tvm_out) +last_layer = net.layers[net.n - 1] +tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) + +coco_name = 'coco.names' +coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true' +font_name = 'arial.ttf' +font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true' +coco_path = download_testdata(coco_url, coco_name, module='data') +font_path = download_testdata(font_url, font_name, module='data') + +with open(coco_path) as f: + content = f.readlines() + +names = [x.strip() for x in content] + +tvm.relay.testing.yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) +plt.imshow(img.transpose(1, 2, 0)) +plt.show() diff --git a/nnvm/tutorials/from_mxnet.py b/nnvm/tutorials/from_mxnet.py new file mode 100644 index 000000000000..e4a30aa2c0e0 --- /dev/null +++ b/nnvm/tutorials/from_mxnet.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-from-mxnet: + +Compile MXNet Models +==================== +**Author**: `Joshua Z. 
Zhang `_
+
+This article is an introductory tutorial to deploy mxnet models with NNVM.
+
+To begin, the mxnet module is required to be installed.
+
+A quick solution is
+
+.. code-block:: bash
+
+    pip install mxnet --user
+
+or please refer to the official installation guide:
+https://mxnet.incubator.apache.org/versions/master/install/index.html
+"""
+# some standard imports
+import mxnet as mx
+import numpy as np
+import nnvm
+import tvm
+from tvm.contrib.download import download_testdata
+
+######################################################################
+# Download Resnet18 model from Gluon Model Zoo
+# ---------------------------------------------
+# In this section, we download a pretrained imagenet model and classify an image.
+from mxnet.gluon.model_zoo.vision import get_model
+from PIL import Image
+from matplotlib import pyplot as plt
+block = get_model('resnet18_v1', pretrained=True)
+img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
+img_name = 'cat.png'
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+synset_name = 'imagenet1000_clsid_to_human.txt'
+img_path = download_testdata(img_url, img_name, module='data')
+synset_path = download_testdata(synset_url, synset_name, module='data')
+with open(synset_path) as f:
+    synset = eval(f.read())
+image = Image.open(img_path).resize((224, 224))
+plt.imshow(image)
+plt.show()
+
+def transform_image(image):
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+x = transform_image(image)
+print('x', x.shape)
+
+######################################################################
+# Compile the Graph
+# -----------------
+# Now we would like to port the Gluon model to a portable computational graph.
+# It's as easy as several lines.
+# We support MXNet static graphs (symbol) and HybridBlocks in mxnet.gluon.
+sym, params = nnvm.frontend.from_mxnet(block)
+# we want a probability so add a softmax operator
+sym = nnvm.sym.softmax(sym)
+
+######################################################################
+# now compile the graph
+import nnvm.compiler
+target = 'cuda'
+shape_dict = {'data': x.shape}
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+
+######################################################################
+# Execute the portable graph on TVM
+# ---------------------------------
+# Now, we would like to reproduce the same forward computation using TVM.
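+#
+# (A sanity check, as an illustrative aside that is not part of the
+# original tutorial: the TVM result below could be compared against
+# Gluon's own forward pass on the same preprocessed input.)
+#
+# .. code-block:: python
+#
+#     mx_prob = block(mx.nd.array(x)).softmax()
+#     print('MXNet top-1:', np.argmax(mx_prob.asnumpy()[0]))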
+from tvm.contrib import graph_runtime
+ctx = tvm.gpu(0)
+dtype = 'float32'
+m = graph_runtime.create(graph, lib, ctx)
+# set inputs
+m.set_input('data', tvm.nd.array(x.astype(dtype)))
+m.set_input(**params)
+# execute
+m.run()
+# get outputs
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
+print('TVM prediction top-1:', top1, synset[top1])
+
+######################################################################
+# Use MXNet symbol with pretrained weights
+# ----------------------------------------
+# MXNet often uses `arg_params` and `aux_params` to store network parameters
+# separately; here we show how to use these weights with the existing API.
+def block2symbol(block):
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    args = {}
+    auxs = {}
+    for k, v in block.collect_params().items():
+        args[k] = mx.nd.array(v.data().asnumpy())
+    return sym, args, auxs
+mx_sym, args, auxs = block2symbol(block)
+# usually we would save/load it as a checkpoint
+mx.model.save_checkpoint('resnet18_v1', 0, mx_sym, args, auxs)
+# there are 'resnet18_v1-0000.params' and 'resnet18_v1-symbol.json' on disk
+
+######################################################################
+# for a normal mxnet model, we start from here
+mx_sym, args, auxs = mx.model.load_checkpoint('resnet18_v1', 0)
+# now we use the same API to get an NNVM compatible symbol
+nnvm_sym, nnvm_params = nnvm.frontend.from_mxnet(mx_sym, args, auxs)
+# repeat the same steps to run this model using TVM
diff --git a/nnvm/tutorials/from_mxnet_to_webgl.py b/nnvm/tutorials/from_mxnet_to_webgl.py
new file mode 100644
index 000000000000..a54704cca381
--- /dev/null
+++ b/nnvm/tutorials/from_mxnet_to_webgl.py
@@ -0,0 +1,515 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy Deep Learning Models to OpenGL and WebGL
+===============================================
+**Author**: `Zhixun Tan `_
+
+This example shows how to build a neural network with the NNVM python frontend and
+generate a runtime library for WebGL running in a browser with TVM.
+To run this notebook, you need to install tvm and nnvm.
+Notice that you need to build tvm with OpenGL.
+"""
+
+######################################################################
+# Overview
+# --------
+# In this tutorial, we will download a pre-trained resnet18 model from Gluon
+# Model Zoo, and run image classification in 3 different ways:
+#
+# - Run locally:
+#   We will compile the model into a TVM library with OpenGL device code and
+#   directly run it locally.
+#
+# - Run in a browser through RPC:
+#   We will compile the model into a JavaScript TVM library with WebGL device
+#   code, and upload it to an RPC server that is hosting a JavaScript TVM runtime
+#   to run it.
+# +# - Export a JavaScript library and run in a browser: +# We will compile the model into a JavaScript TVM library with WebGL device +# code, combine it with JavaScript TVM runtime, and pack everything together. +# Then we will run it directly in a browser. +# +from __future__ import print_function + +import numpy as np +import tvm +from tvm.contrib.download import download_testdata +import nnvm.compiler +import nnvm.testing + +# This tutorial must be run with OpenGL backend enabled in TVM. +# The NNVM CI does not enable OpenGL yet. But the user can run this script. +opengl_enabled = tvm.module.enabled("opengl") + +# To run the local demo, set this flag to True. +run_deploy_local = False + +# To run the RPC demo, set this flag to True. +run_deploy_rpc = False + +# To run the WebGL deploy demo, set this flag to True. +run_deploy_web = False + +###################################################################### +# Download a Pre-trained Resnet18 Model +# ------------------------------------- +# Here we define 2 functions: +# +# - A function that downloads a pre-trained resnet18 model from Gluon Model Zoo. +# The model that we download is in MXNet format, we then transform it into an +# NNVM computation graph. +# +# - A function that downloads a file that contains the name of all the image +# classes in this model. +# +def load_mxnet_resnet(): + """Load a pretrained resnet model from MXNet and transform that into NNVM + format. + + Returns + ------- + net : nnvm.Symbol + The loaded resnet computation graph. + + params : dict[str -> NDArray] + The pretrained model parameters. + + data_shape: tuple + The shape of the input tensor (an image). + + out_shape: tuple + The shape of the output tensor (probability of all classes). + """ + + print("Loading pretrained resnet model from MXNet...") + + # Download a pre-trained mxnet resnet18_v1 model. + from mxnet.gluon.model_zoo.vision import get_model + block = get_model('resnet18_v1', pretrained=True) + + # Transform the mxnet model into NNVM. + # We want a probability so add a softmax operator. + sym, params = nnvm.frontend.from_mxnet(block) + sym = nnvm.sym.softmax(sym) + + print("- Model loaded!") + return sym, params, (1, 3, 224, 224), (1, 1000) + +def download_synset(): + """Download a dictionary from class index to name. + This lets us know what our prediction actually is. + + Returns + ------- + synset : dict[int -> str] + The loaded synset. + """ + + print("Downloading synset...") + + url = "https://gist.githubusercontent.com/zhreshold/" + \ + "4d0b62f3d01426887599d4f7ede23ee5/raw/" + \ + "596b27d23537e5a1b5751d2b0481ef172f58b539/" + \ + "imagenet1000_clsid_to_human.txt" + file_name = "imagenet1000_clsid_to_human.txt" + + file_path = download_testdata(url, file_name, module='data') + with open(file_path) as f: + synset = eval(f.read()) + + print("- Synset downloaded!") + return synset + +###################################################################### +# Download Input Image +# -------------------- +# Here we define 2 functions that prepare an image that we want to perform +# classification on. +# +# - A function that downloads a cat image. +# +# - A function that performs preprocessing to an image so that it fits the +# format required by the resnet18 model. +# +def download_image(): + """Download a cat image and resize it to 224x224 which fits resnet. + + Returns + ------- + image : PIL.Image.Image + The loaded and resized image. 
+    """
+
+    print("Downloading cat image...")
+
+    from matplotlib import pyplot as plt
+    from PIL import Image
+
+    url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
+    img_name = "cat.png"
+
+    img_path = download_testdata(url, img_name, module='data')
+    image = Image.open(img_path).resize((224, 224))
+
+    print("- Cat image downloaded!")
+
+    plt.imshow(image)
+    plt.show()
+
+    return image
+
+def transform_image(image):
+    """Perform necessary preprocessing to input image.
+
+    Parameters
+    ----------
+    image : numpy.ndarray
+        The raw image.
+
+    Returns
+    -------
+    image : numpy.ndarray
+        The preprocessed image.
+    """
+
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+######################################################################
+# Compile the Model
+# -----------------
+# Here we define a function that invokes the NNVM compiler.
+#
+def compile_net(net, target_host, target, data_shape, params):
+    """Compiles an NNVM computation graph.
+
+    Parameters
+    ----------
+    net : nnvm.Graph
+        The NNVM computation graph.
+
+    target_host : str
+        The target to compile the host portion of the library.
+
+    target : str
+        The target to compile the device portion of the library.
+
+    data_shape : tuple
+        The shape of the input data (image).
+
+    params : dict[str -> NDArray]
+        Model parameters.
+
+    Returns
+    -------
+    graph : Graph
+        The final execution graph.
+
+    libmod : tvm.Module
+        The module that comes with the execution graph.
+
+    params : dict[str -> NDArray]
+        The updated parameters of the graph if params is passed.
+        This can be different from the params passed in.
+    """
+
+    print("Compiling the neural network...")
+
+    with nnvm.compiler.build_config(opt_level=0):
+        deploy_graph, lib, deploy_params = nnvm.compiler.build(
+            net,
+            target_host=target_host,
+            target=target,
+            shape={"data": data_shape},
+            params=params)
+
+    print("- Compilation completed!")
+    return deploy_graph, lib, deploy_params
+
+######################################################################
+# Demo 1: Deploy Locally
+# ----------------------
+# In this demo, we will compile the model targeting the local machine.
+#
+# Then we will demonstrate how to save the compiled model as a shared library
+# and load it back.
+#
+# Finally, we will run the model.
+#
+def deploy_local():
+    """Runs the demo that deploys a model locally.
+    """
+
+    # Load resnet model.
+    net, params, data_shape, out_shape = load_mxnet_resnet()
+
+    # Compile the model.
+    # Note that we specify the host target as "llvm".
+    deploy_graph, lib, deploy_params = compile_net(
+        net,
+        target_host="llvm",
+        target="opengl",
+        data_shape=data_shape,
+        params=params)
+
+    # Save the compiled module.
+    # Note we need to save all three files returned from the NNVM compiler.
+    print("Saving the compiled module...")
+    from tvm.contrib import util
+    temp = util.tempdir()
+
+    path_lib = temp.relpath("deploy_lib.so")
+    path_graph_json = temp.relpath("deploy_graph.json")
+    path_params = temp.relpath("deploy_param.params")
+
+    lib.export_library(path_lib)
+    with open(path_graph_json, "w") as fo:
+        fo.write(deploy_graph.json())
+    with open(path_params, "wb") as fo:
+        fo.write(nnvm.compiler.save_param_dict(deploy_params))
+
+    print("- Saved files:", temp.listdir())
+
+    # Load the module back.
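+    # (tvm.module.load picks the loader from the file extension; a
+    # ".so" file such as the one saved above goes through the system
+    # dynamic loader.)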
+    print("Loading the module back...")
+    loaded_lib = tvm.module.load(path_lib)
+    with open(path_graph_json) as fi:
+        loaded_graph_json = fi.read()
+    with open(path_params, "rb") as fi:
+        loaded_params = bytearray(fi.read())
+    print("- Module loaded!")
+
+    # Run the model! We will perform prediction on an image.
+    print("Running the graph...")
+    from tvm.contrib import graph_runtime
+
+    module = graph_runtime.create(loaded_graph_json, loaded_lib, tvm.opengl(0))
+    module.load_params(loaded_params)
+
+    image = transform_image(download_image())
+    input_data = tvm.nd.array(image.astype("float32"), ctx=tvm.opengl(0))
+
+    module.set_input("data", input_data)
+    module.run()
+
+    # Retrieve the output.
+    out = module.get_output(0, tvm.nd.empty(out_shape, ctx=tvm.opengl(0)))
+    top1 = np.argmax(out.asnumpy())
+    synset = download_synset()
+    print('TVM prediction top-1:', top1, synset[top1])
+
+if run_deploy_local and opengl_enabled:
+    deploy_local()
+
+######################################################################
+# Demo 2: Deploy the Model to WebGL Remotely with RPC
+# -------------------------------------------------------
+# Following the steps above, we can also compile the model for WebGL.
+# TVM provides the rpc module to help with remote deployment.
+#
+# When we deploy a model locally to OpenGL, the model consists of two parts:
+# the host LLVM part and the device GLSL part. Now that we want to deploy to
+# WebGL, we need to leverage Emscripten to transform LLVM into JavaScript. In
+# order to do that, we will need to specify the host target as
+# ``llvm -target=asmjs-unknown-emscripten -system-lib``. Then we call Emscripten
+# to compile the LLVM binary output into a JavaScript file.
+#
+# First, we need to manually start an RPC server. Please follow the instructions
+# in `tvm/web/README.md`. After following the steps, you should have a web page
+# opened in a browser, and a Python script running a proxy.
+#
+def deploy_rpc():
+    """Runs the demo that deploys a model remotely through RPC.
+    """
+    from tvm import rpc
+    from tvm.contrib import util, emscripten
+
+    # As usual, load the resnet18 model.
+    net, params, data_shape, out_shape = load_mxnet_resnet()
+
+    # Compile the model.
+    # Note that this time we are changing the target.
+    # This is because we want to translate the host library into JavaScript
+    # through Emscripten.
+    graph, lib, params = compile_net(
+        net,
+        target_host="llvm -target=asmjs-unknown-emscripten -system-lib",
+        target="opengl",
+        data_shape=data_shape,
+        params=params)
+
+    # Now we want to deploy our model through RPC.
+    # First we need to prepare the module files locally.
+    print("Saving the compiled module...")
+
+    temp = util.tempdir()
+    path_obj = temp.relpath("deploy.bc")  # host LLVM part
+    path_dso = temp.relpath("deploy.js")  # host JavaScript part
+    path_gl = temp.relpath("deploy.gl")   # device GLSL part
+    path_json = temp.relpath("deploy.tvm_meta.json")
+
+    lib.save(path_obj)
+    emscripten.create_js(path_dso, path_obj, side_module=True)
+    lib.imported_modules[0].save(path_gl)
+
+    print("- Saved files:", temp.listdir())
+
+    # Connect to the RPC server.
+    print("Connecting to RPC server...")
+    proxy_host = 'localhost'
+    proxy_port = 9090
+    remote = rpc.connect(proxy_host, proxy_port, key="js")
+    print("- Connected to RPC server!")
+
+    # Upload module to RPC server.
+    print("Uploading module to RPC server...")
+    remote.upload(path_dso, "deploy.dso")
+    remote.upload(path_gl)
+    remote.upload(path_json)
+    print("- Upload completed!")
+
+    # Load remote library.
+ print("Loading remote library...") + fdev = remote.load_module("deploy.gl") + fhost = remote.load_module("deploy.dso") + fhost.import_module(fdev) + rlib = fhost + print("- Remote library loaded!") + + ctx = remote.opengl(0) + + # Upload the parameters. + print("Uploading parameters...") + rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} + print("- Parameters uploaded!") + + # Create the remote runtime module. + print("Running remote module...") + from tvm.contrib import graph_runtime + module = graph_runtime.create(graph, rlib, ctx) + + # Set parameter. + module.set_input(**rparams) + + # Set input data. + input_data = np.random.uniform(size=data_shape) + module.set_input('data', tvm.nd.array(input_data.astype('float32'))) + + # Run. + module.run() + print("- Remote module execution completed!") + + out = module.get_output(0, out=tvm.nd.empty(out_shape, ctx=ctx)) + # Print first 10 elements of output. + print(out.asnumpy()[0][0:10]) + +if run_deploy_rpc and opengl_enabled: + deploy_rpc() + +###################################################################### +# Demo 3: Deploy the Model to WebGL SystemLib +# ----------------------------------------------- +# This time we are not using RPC. Instead, we will compile the model and link it +# with the entire tvm runtime into a single giant JavaScript file. Then we will +# run the model using JavaScript. +# +def deploy_web(): + """Runs the demo that deploys to web. + """ + + import base64 + import json + import os + import shutil + import SimpleHTTPServer, SocketServer + + from tvm.contrib import emscripten + + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(os.getcwd()))) + working_dir = os.getcwd() + output_dir = os.path.join(working_dir, "resnet") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # As usual, load the resnet18 model. + net, params, data_shape, out_shape = load_mxnet_resnet() + + # As usual, compile the model. + graph, lib, params = compile_net( + net, + target_host="llvm -target=asmjs-unknown-emscripten -system-lib", + target="opengl", + data_shape=data_shape, + params=params) + + # Now we save the model and link it with the TVM web runtime. + path_lib = os.path.join(output_dir, "resnet.js") + path_graph = os.path.join(output_dir, "resnet.json") + path_params = os.path.join(output_dir, "resnet.params") + path_data_shape = os.path.join(output_dir, "data_shape.json") + path_out_shape = os.path.join(output_dir, "out_shape.json") + + lib.export_library(path_lib, emscripten.create_js, options=[ + "-s", "USE_GLFW=3", + "-s", "USE_WEBGL2=1", + "-lglfw", + "-s", "TOTAL_MEMORY=1073741824", + ]) + with open(path_graph, "w") as fo: + fo.write(graph.json()) + with open(path_params, "w") as fo: + fo.write(base64.b64encode(nnvm.compiler.save_param_dict(params))) + + shutil.copyfile(os.path.join(curr_path, "../tvm/web/tvm_runtime.js"), + os.path.join(output_dir, "tvm_runtime.js")) + shutil.copyfile(os.path.join(curr_path, "web/resnet.html"), + os.path.join(output_dir, "resnet.html")) + + # Now we want to save some extra files so that we can execute the model from + # JavaScript. 
+    # - data shape
+    with open(path_data_shape, "w") as fo:
+        json.dump(list(data_shape), fo)
+    # - out shape
+    with open(path_out_shape, "w") as fo:
+        json.dump(list(out_shape), fo)
+    # - input image
+    image = download_image()
+    image.save(os.path.join(output_dir, "data.png"))
+    # - synset
+    synset = download_synset()
+    with open(os.path.join(output_dir, "synset.json"), "w") as fo:
+        json.dump(synset, fo)
+
+    print("Output files are in", output_dir)
+
+    # Finally, we fire up a simple web server to serve all the exported files.
+    print("Now running a simple server to serve the files...")
+    os.chdir(output_dir)
+    port = 8080
+    handler = SimpleHTTPServer.SimpleHTTPRequestHandler
+    httpd = SocketServer.TCPServer(("", port), handler)
+    print("Please open http://localhost:" + str(port) + "/resnet.html")
+    httpd.serve_forever()
+
+if run_deploy_web and opengl_enabled:
+    deploy_web()
diff --git a/nnvm/tutorials/from_onnx.py b/nnvm/tutorials/from_onnx.py
new file mode 100644
index 000000000000..97d154615e67
--- /dev/null
+++ b/nnvm/tutorials/from_onnx.py
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile ONNX Models
+===================
+**Author**: `Joshua Z. Zhang `_
+
+This article is an introductory tutorial to deploy ONNX models with NNVM.
+
+To begin, the onnx module is required to be installed.
+
+A quick solution is to install the protobuf compiler, and
+
+.. code-block:: bash
+
+    pip install onnx --user
+
+or please refer to the official site:
+https://github.com/onnx/onnx
+"""
+import nnvm
+import tvm
+from tvm.contrib.download import download_testdata
+import onnx
+import numpy as np
+
+######################################################################
+# Load pretrained ONNX model
+# ---------------------------------------------
+# The example super resolution model used here is exactly the same model as in the onnx tutorial
+# http://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html
+# we skip the pytorch model construction part, and download the saved onnx model
+model_url = ''.join(['https://gist.github.com/zhreshold/',
+                     'bcda4716699ac97ea44f791c24310193/raw/',
+                     '93672b029103648953c4e5ad3ac3aadf346a4cdc/',
+                     'super_resolution_0.2.onnx'])
+model_path = download_testdata(model_url, 'super_resolution.onnx', module='onnx')
+# now you have super_resolution.onnx on disk
+onnx_model = onnx.load_model(model_path)
+# we can load the graph as an NNVM compatible model
+sym, params = nnvm.frontend.from_onnx(onnx_model)
+
+######################################################################
+# Load a test image
+# ---------------------------------------------
+# A single cat dominates the examples!
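+# (The super resolution model operates on the luminance channel only;
+# this is why the image is converted to YCbCr below and only the Y
+# plane is fed to the network, while Cb/Cr are upsampled separately at
+# the end.)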
+from PIL import Image
+img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
+img_path = download_testdata(img_url, 'cat.png', module='data')
+img = Image.open(img_path).resize((224, 224))
+img_ycbcr = img.convert("YCbCr")  # convert to YCbCr
+img_y, img_cb, img_cr = img_ycbcr.split()
+x = np.array(img_y)[np.newaxis, np.newaxis, :, :]
+
+######################################################################
+# Compile the model on NNVM
+# ---------------------------------------------
+# We should be familiar with the process by now.
+import nnvm.compiler
+target = 'cuda'
+# assume the first input name is data
+input_name = sym.list_input_names()[0]
+shape_dict = {input_name: x.shape}
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+
+######################################################################
+# Execute on TVM
+# ---------------------------------------------
+# The process is no different from other examples.
+from tvm.contrib import graph_runtime
+ctx = tvm.gpu(0)
+dtype = 'float32'
+m = graph_runtime.create(graph, lib, ctx)
+# set inputs
+m.set_input(input_name, tvm.nd.array(x.astype(dtype)))
+m.set_input(**params)
+# execute
+m.run()
+# get outputs
+output_shape = (1, 1, 672, 672)
+tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
+
+######################################################################
+# Display results
+# ---------------------------------------------
+# We put the input and output images side by side.
+from matplotlib import pyplot as plt
+out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode='L')
+out_cb = img_cb.resize(out_y.size, Image.BICUBIC)
+out_cr = img_cr.resize(out_y.size, Image.BICUBIC)
+result = Image.merge('YCbCr', [out_y, out_cb, out_cr]).convert('RGB')
+canvas = np.full((672, 672*2, 3), 255)
+canvas[0:224, 0:224, :] = np.asarray(img)
+canvas[:, 672:, :] = np.asarray(result)
+plt.imshow(canvas.astype(np.uint8))
+plt.show()
diff --git a/nnvm/tutorials/from_tensorflow.py b/nnvm/tutorials/from_tensorflow.py
new file mode 100644
index 000000000000..6a30443dba60
--- /dev/null
+++ b/nnvm/tutorials/from_tensorflow.py
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile Tensorflow Models
+=========================
+This article is an introductory tutorial to deploy tensorflow models with TVM.
+
+To begin, the tensorflow python module is required to be installed.
+
+Please refer to https://www.tensorflow.org/install
+"""
+
+# tvm and nnvm
+import nnvm
+import tvm
+
+# os and numpy
+import numpy as np
+import os.path
+
+# Tensorflow imports
+import tensorflow as tf
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+
+# Tensorflow utility functions
+import tvm.relay.testing.tf as tf_testing
+
+# Base location for model related files.
+repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
+
+# Test image
+img_name = 'elephant-299.jpg'
+image_url = os.path.join(repo_base, img_name)
+
+######################################################################
+# Tutorials
+# ---------
+# .. note::
+#
+#     protobuf should be exported with the :any:`add_shapes=True` option.
+#     You could use https://github.com/dmlc/web-data/tree/master/tensorflow/scripts/tf-to-nnvm.py
+#     to add shapes to existing models.
+#
+# Please refer to docs/frontend/tensorflow.md for more details on various models
+# from tensorflow.
+
+model_name = 'classify_image_graph_def-with_shapes.pb'
+model_url = os.path.join(repo_base, model_name)
+
+# Image label map
+map_proto = 'imagenet_2012_challenge_label_map_proto.pbtxt'
+map_proto_url = os.path.join(repo_base, map_proto)
+
+# Human readable text for labels
+label_map = 'imagenet_synset_to_human_label_map.txt'
+label_map_url = os.path.join(repo_base, label_map)
+
+# Target settings
+# Use these commented settings to build for cuda.
+#target = 'cuda'
+#target_host = 'llvm'
+#layout = "NCHW"
+#ctx = tvm.gpu(0)
+target = 'llvm'
+target_host = 'llvm'
+layout = None
+ctx = tvm.cpu(0)
+
+######################################################################
+# Download required files
+# -----------------------
+# Download the files listed above.
+from tvm.contrib.download import download_testdata
+
+img_path = download_testdata(image_url, img_name, module='data')
+model_path = download_testdata(model_url, model_name, module=['tf', 'InceptionV1'])
+map_proto_path = download_testdata(map_proto_url, map_proto, module='data')
+label_path = download_testdata(label_map_url, label_map, module='data')
+
+######################################################################
+# Import model
+# ------------
+# Creates a tensorflow graph definition from the protobuf file.
+
+with tf.gfile.FastGFile(model_path, 'rb') as f:
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+    graph = tf.import_graph_def(graph_def, name='')
+    # Call the utility to import the graph definition into default graph.
+    graph_def = tf_testing.ProcessGraphDefParam(graph_def)
+    # Add shapes to the graph.
+    with tf.Session() as sess:
+        graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax')
+
+######################################################################
+# Decode image
+# ------------
+# .. note::
+#
+#     The tensorflow frontend import doesn't support preprocessing ops like JpegDecode.
+#     JpegDecode is bypassed (it just returns the source node).
+#     Hence we supply a decoded frame to TVM instead.
+#
+
+from PIL import Image
+image = Image.open(img_path).resize((299, 299))
+
+x = np.array(image)
+
+######################################################################
+# Import the graph to NNVM
+# ------------------------
+# Import the tensorflow graph definition to nnvm.
+#
+# Results:
+#   sym: nnvm graph for the given tensorflow protobuf.
+#   params: params converted from tensorflow params (tensor protobuf).
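+#
+# (An illustrative aside, not part of the original tutorial: after the
+# import, the input names the compiler expects can be listed, which
+# helps when a protobuf uses non-obvious node names.)
+#
+# .. code-block:: python
+#
+#     # should include 'DecodeJpeg/contents' for this model
+#     print(sym.list_input_names()[:5])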
+sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout) + +print("Tensorflow protobuf imported as nnvm graph") +###################################################################### +# NNVM Compilation +# ---------------- +# Compile the graph to llvm target with given input specification. +# +# Results: +# graph: Final graph after compilation. +# params: final params after compilation. +# lib: target library which can be deployed on target with tvm runtime. + +import nnvm.compiler +shape_dict = {'DecodeJpeg/contents': x.shape} +dtype_dict = {'DecodeJpeg/contents': 'uint8'} +graph, lib, params = nnvm.compiler.build(sym, shape=shape_dict, target=target, target_host=target_host, dtype=dtype_dict, params=params) + +###################################################################### +# Execute the portable graph on TVM +# --------------------------------- +# Now we can try deploying the NNVM compiled model on target. + +from tvm.contrib import graph_runtime +dtype = 'uint8' +m = graph_runtime.create(graph, lib, ctx) +# set inputs +m.set_input('DecodeJpeg/contents', tvm.nd.array(x.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), 'float32')) + +###################################################################### +# Process the output +# ------------------ +# Process the model output to human readable text for InceptionV1. +predictions = tvm_output.asnumpy() +predictions = np.squeeze(predictions) + +# Creates node ID --> English string lookup. +node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, + uid_lookup_path=label_path) + +# Print top 5 predictions from TVM output. +top_k = predictions.argsort()[-5:][::-1] +for node_id in top_k: + human_string = node_lookup.id_to_string(node_id) + score = predictions[node_id] + print('%s (score = %.5f)' % (human_string, score)) + +###################################################################### +# Inference on tensorflow +# ----------------------- +# Run the corresponding model on tensorflow + +def create_graph(): + """Creates a graph from saved GraphDef file and returns a saver.""" + # Creates graph from saved graph_def.pb. + with tf.gfile.FastGFile(model_path, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + graph = tf.import_graph_def(graph_def, name='') + # Call the utility to import the graph definition into default graph. + graph_def = tf_testing.ProcessGraphDefParam(graph_def) + +def run_inference_on_image(image): + """Runs inference on an image. + + Parameters + ---------- + image: String + Image file name. + + Returns + ------- + Nothing + """ + if not tf.gfile.Exists(image): + tf.logging.fatal('File does not exist %s', image) + image_data = tf.gfile.FastGFile(image, 'rb').read() + + # Creates graph from saved GraphDef. + create_graph() + + with tf.Session() as sess: + softmax_tensor = sess.graph.get_tensor_by_name('softmax:0') + predictions = sess.run(softmax_tensor, + {'DecodeJpeg/contents:0': image_data}) + + predictions = np.squeeze(predictions) + + # Creates node ID --> English string lookup. + node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, + uid_lookup_path=label_path) + + # Print top 5 predictions from tensorflow. 
+        top_k = predictions.argsort()[-5:][::-1]
+        print("===== TENSORFLOW RESULTS =======")
+        for node_id in top_k:
+            human_string = node_lookup.id_to_string(node_id)
+            score = predictions[node_id]
+            print('%s (score = %.5f)' % (human_string, score))
+
+run_inference_on_image(img_path)
diff --git a/nnvm/tutorials/get_started.py b/nnvm/tutorials/get_started.py
new file mode 100644
index 000000000000..46f711e7d347
--- /dev/null
+++ b/nnvm/tutorials/get_started.py
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Get Started with NNVM
+=====================
+**Author**: `Tianqi Chen `_
+
+This article is an introductory tutorial to the workflow in NNVM.
+"""
+import nnvm.compiler
+import nnvm.symbol as sym
+
+######################################################################
+# Declare Computation
+# -------------------
+# We start by describing our computation with a computational graph.
+# Most deep learning frameworks use computation graphs to describe
+# their computation. In this example, we directly use
+# NNVM's API to construct the computational graph.
+#
+# .. note::
+#
+#     In a typical deep learning compilation workflow,
+#     we can get the models from :any:`nnvm.frontend`
+#
+# The following code snippet describes :math:`z = x + \sqrt{y}`
+# and creates an nnvm graph from the description.
+# We can print out the graph IR to check the graph content.
+
+x = sym.Variable("x")
+y = sym.Variable("y")
+z = sym.elemwise_add(x, sym.sqrt(y))
+compute_graph = nnvm.graph.create(z)
+print("-------compute graph-------")
+print(compute_graph.ir())
+
+######################################################################
+# Compile
+# -------
+# We can call :any:`nnvm.compiler.build` to compile the graph.
+# The build function takes a shape parameter which specifies the
+# input shape requirement. Here we only need to pass in the shape of ``x``
+# and the other one will be inferred automatically by NNVM.
+#
+# The function returns three values. ``deploy_graph`` contains
+# the final compiled graph structure. ``lib`` is a :any:`tvm.module.Module`
+# that contains compiled CUDA functions. We do not need the ``params``
+# in this case.
+shape = (4,)
+deploy_graph, lib, params = nnvm.compiler.build(
+    compute_graph, target="cuda", shape={"x": shape}, dtype="float32")
+
+######################################################################
+# We can print out the IR of ``deploy_graph`` to understand what just
+# happened under the hood. We can find that ``deploy_graph`` only
+# contains a single operator ``tvm_op``. This is because NNVM
+# automatically fused the operators together into one operator.
+#
+print("-------deploy graph-------")
+print(deploy_graph.ir())
+
+######################################################################
+# Let us also peek into the content of ``lib``.
+# Typically a compiled TVM CUDA module contains a host module (``lib``)
+# and a device module (``lib.imported_modules[0]``) that contains the CUDA code.
+# We print out the generated device code here.
+# This is exactly a fused CUDA version of the kernel that the graph points to.
+#
+print("-------deploy library-------")
+print(lib.imported_modules[0].get_source())
+
+######################################################################
+# Deploy and Run
+# --------------
+# Now that we have the compiled module, let us run it.
+# We can use :any:`graph_runtime `
+# in tvm to create a deployable :any:`GraphModule `.
+# We can use the :any:`set_input `,
+# :any:`run ` and
+# :any:`get_output ` functions
+# to set the input, execute the graph and get the output we need.
+#
+import tvm
+import numpy as np
+from tvm.contrib import graph_runtime, util
+
+module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
+x_np = np.array([1, 2, 3, 4]).astype("float32")
+y_np = np.array([4, 4, 4, 4]).astype("float32")
+# set input to the graph module
+module.set_input(x=x_np, y=y_np)
+# run forward computation
+module.run()
+# get the first output
+out = module.get_output(0, out=tvm.nd.empty(shape))
+print(out.asnumpy())
+
+######################################################################
+# Provide Model Parameters
+# ------------------------
+# Most deep learning models contain two types of inputs: parameters
+# that remain fixed during inference and data inputs that need to
+# change for each inference task. It is helpful to provide this
+# information to NNVM. Let us assume that ``y`` is the parameter
+# in our example. We can provide the model parameter information
+# by the params argument to :any:`nnvm.compiler.build`.
+#
+deploy_graph, lib, params = nnvm.compiler.build(
+    compute_graph, target="cuda", shape={"x": shape}, params={"y": y_np})
+
+######################################################################
+# This time we will need the params value returned by :any:`nnvm.compiler.build`.
+# NNVM applies optimizations to pre-compute the intermediate values in
+# the graph that can be determined by parameters. In this case
+# :math:`\sqrt{y}` can be pre-computed. The pre-computed values
+# are returned as new params. We can print out the new compiled library
+# to confirm that the fused kernel now only contains the add.
+#
+print("-----optimized params-----")
+print(params)
+print("-------deploy library-------")
+print(lib.imported_modules[0].get_source())
+
+######################################################################
+# Save the Deployed Module
+# ------------------------
+# We can save the ``deploy_graph``, ``lib`` and ``params`` separately
+# and load them back later. We can use :any:`tvm.module.Module` to export
+# the compiled library. ``deploy_graph`` is saved in json format and ``params``
+# is serialized into a bytearray.
+#
+temp = util.tempdir()
+path_lib = temp.relpath("deploy.so")
+lib.export_library(path_lib)
+with open(temp.relpath("deploy.json"), "w") as fo:
+    fo.write(deploy_graph.json())
+with open(temp.relpath("deploy.params"), "wb") as fo:
+    fo.write(nnvm.compiler.save_param_dict(params))
+print(temp.listdir())
+
+######################################################################
+# We can load the module back.
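+# (A quick numerical check, as an illustrative aside that is not part
+# of the original tutorial: the output of the reloaded module below
+# should match :math:`z = x + \sqrt{y}` computed directly in numpy.)
+#
+# .. code-block:: python
+#
+#     np.testing.assert_allclose(
+#         out.asnumpy(), x_np + np.sqrt(y_np), rtol=1e-5)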
+loaded_lib = tvm.module.load(path_lib)
+loaded_json = open(temp.relpath("deploy.json")).read()
+loaded_params = bytearray(open(temp.relpath("deploy.params"), "rb").read())
+module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
+params = nnvm.compiler.load_param_dict(loaded_params)
+# directly load from byte array
+module.load_params(loaded_params)
+module.run(x=x_np)
+# get the first output
+out = module.get_output(0, out=tvm.nd.empty(shape))
+print(out.asnumpy())
+
+######################################################################
+# Deploy using Another Language
+# -----------------------------
+# We use python in this example for demonstration.
+# We can also deploy the compiled modules with other languages
+# supported by TVM such as C++, Java, and JavaScript.
+# The graph module itself is fully embedded in the TVM runtime.
+#
+# The following block demonstrates how we can directly use TVM's
+# runtime API to execute the compiled module.
+# You can find a similar runtime API in the TVMRuntime of other languages.
+#
+fcreate = tvm.get_global_func("tvm.graph_runtime.create")
+ctx = tvm.gpu(0)
+gmodule = fcreate(loaded_json, loaded_lib, ctx.device_type, ctx.device_id)
+set_input, get_output, run = gmodule["set_input"], gmodule["get_output"], gmodule["run"]
+set_input("x", tvm.nd.array(x_np))
+gmodule["load_params"](loaded_params)
+run()
+out = tvm.nd.empty(shape)
+get_output(0, out)
+print(out.asnumpy())
diff --git a/nnvm/tutorials/nlp/from_darknet_rnn.py b/nnvm/tutorials/nlp/from_darknet_rnn.py
new file mode 100644
index 000000000000..1bc9627dd62f
--- /dev/null
+++ b/nnvm/tutorials/nlp/from_darknet_rnn.py
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compile Darknet Models for RNN
+==============================
+**Author**: `Siju Samuel `_
+
+This article is an introductory tutorial to deploy darknet rnn models with NNVM.
+
+This script will run a character prediction model.
+Each module consists of 3 fully-connected layers. The input layer propagates information from the
+input to the current state. The recurrent layer propagates information through time from the
+previous state to the current one.
+
+The input to the network is a 1-hot encoding of ASCII characters. We train the network to predict
+the next character in a stream of characters. The output is constrained to be a probability
+distribution using a softmax layer.
+
+Since each recurrent layer contains information about the current character and the past
+characters, it can use this context to predict the future characters in a word or phrase.
+
+All the required models and libraries will be downloaded from the internet
+by the script.
+""" +import random +import numpy as np +import tvm +from tvm.contrib import graph_runtime +from tvm.contrib.download import download_testdata +from nnvm.testing.darknet import __darknetffi__ +import nnvm +import nnvm.frontend.darknet + +# Set the parameters +# ----------------------- +# Set the seed value and the number of characters to predict + +#Model name +MODEL_NAME = 'rnn' +#Seed value +seed = 'Thus' +#Number of characters to predict +num = 1000 + +# Download required files +# ----------------------- +# Download cfg and weights file if first time. +CFG_NAME = MODEL_NAME + '.cfg' +WEIGHTS_NAME = MODEL_NAME + '.weights' +REPO_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/' +CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' +WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true' + +cfg_path = download_testdata(CFG_URL, CFG_NAME, module='darknet') +weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module='darknet') + +# Download and Load darknet library +DARKNET_LIB = 'libdarknet.so' +DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true' +lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module='darknet') +DARKNET_LIB = __darknetffi__.dlopen(lib_path) +net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) +dtype = 'float32' +batch_size = 1 + +# Import the graph to NNVM +# ------------------------ +# Import darknet graph definition to nnvm. +# +# Results: +# sym: nnvm graph for rnn model +# params: params converted from darknet weights +print("Converting darknet rnn model to nnvm symbols...") +sym, params = nnvm.frontend.darknet.from_darknet(net, dtype) + +# Compile the model on NNVM +data = np.empty([1, net.inputs], dtype)#net.inputs + +target = 'llvm' +shape = {'data': data.shape} +print("Compiling the model...") + +shape_dict = {'data': data.shape} +dtype_dict = {'data': data.dtype} + +with nnvm.compiler.build_config(opt_level=2): + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, dtype_dict, params) + +# Execute the portable graph on TVM +# --------------------------------- +# Now we can try deploying the NNVM compiled model on cpu target. 
+
+# Set the cpu context
+ctx = tvm.cpu(0)
+# Create graph runtime
+m = graph_runtime.create(graph, lib, ctx)
+# Set the params to runtime
+m.set_input(**params)
+
+def _init_state_memory(rnn_cells_count, dtype):
+    '''Initialize memory for states'''
+    states = {}
+    state_shape = (1024,)
+    for i in range(rnn_cells_count):
+        k = 'rnn' + str(i) + '_state'
+        states[k] = tvm.nd.array(np.zeros(state_shape, dtype).astype(dtype))
+    return states
+
+def _set_state_input(runtime, states):
+    '''Set the state inputs'''
+    for state in states:
+        runtime.set_input(state, states[state])
+
+def _get_state_output(runtime, states):
+    '''Get the state outputs and save'''
+    i = 1
+    for state in states:
+        data = states[state]
+        states[state] = runtime.get_output((i), tvm.nd.empty(data.shape, data.dtype))
+        i += 1
+
+def _proc_rnn_output(out_data):
+    '''Generate the characters from the output array'''
+    sum_array = 0
+    n = out_data.size
+    r = random.uniform(0, 1)
+    for j in range(n):
+        if out_data[j] < 0.0001:
+            out_data[j] = 0
+        sum_array += out_data[j]
+
+    for j in range(n):
+        out_data[j] *= float(1.0) / sum_array
+        r = r - out_data[j]
+        if r <= 0:
+            return j
+    return n-1
+
+print("RNN generating text...")
+
+out_shape = (net.outputs,)
+rnn_cells_count = 3
+
+# Initialize state memory
+# -----------------------
+states = _init_state_memory(rnn_cells_count, dtype)
+
+len_seed = len(seed)
+count = len_seed + num
+out_txt = ""
+
+# Initialize random seed
+random.seed(0)
+c = ord(seed[0])
+inp_data = np.zeros([net.inputs], dtype)
+
+# Run the model
+# -------------
+
+# Predict character by character until `num` characters are generated
+for i in range(count):
+    inp_data[c] = 1
+
+    # Set the input data
+    m.set_input('data', tvm.nd.array(inp_data.astype(dtype)))
+    inp_data[c] = 0
+
+    # Set the state inputs
+    _set_state_input(m, states)
+
+    # Run the model
+    m.run()
+
+    # Get the output
+    tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+
+    # Get the state outputs
+    _get_state_output(m, states)
+
+    # Get the predicted character and keep buffering it
+    c = ord(seed[i]) if i < len_seed else _proc_rnn_output(tvm_out)
+    out_txt += chr(c)
+
+print("Predicted Text =", out_txt)
diff --git a/nnvm/tutorials/nlp/keras_s2s_translate.py b/nnvm/tutorials/nlp/keras_s2s_translate.py
new file mode 100644
index 000000000000..16c737418c6f
--- /dev/null
+++ b/nnvm/tutorials/nlp/keras_s2s_translate.py
@@ -0,0 +1,254 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Keras LSTM Sequence to Sequence Model for Translation
+=====================================================
+**Author**: `Siju Samuel `_
+
+This script demonstrates how to implement a basic character-level sequence-to-sequence model.
+We apply it to translating short English sentences into short French sentences,
+character-by-character.
+
+# Summary of the algorithm
+
+- We start with input sequences from a domain (e.g. English sentences)
+  and corresponding target sequences from another domain
+  (e.g. French sentences).
+- An encoder LSTM turns input sequences to 2 state vectors
+  (we keep the last LSTM state and discard the outputs).
+- A decoder LSTM is trained to turn the target sequences into
+  the same sequence but offset by one timestep in the future,
+  a training process called "teacher forcing" in this context.
+  It uses the state vectors from the encoder as its initial state.
+  Effectively, the decoder learns to generate `targets[t+1...]`
+  given `targets[...t]`, conditioned on the input sequence.
+
+This script loads the s2s.h5 model saved in the repository
+https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/lstm_seq2seq.py
+and generates sequences from it. It assumes that no changes have been made (for example:
+latent_dim is unchanged, and the input data and model architecture are unchanged).
+
+# References
+
+- Sequence to Sequence Learning with Neural Networks
+  https://arxiv.org/abs/1409.3215
+- Learning Phrase Representations using
+  RNN Encoder-Decoder for Statistical Machine Translation
+  https://arxiv.org/abs/1406.1078
+
+See lstm_seq2seq.py for more details on the model architecture and how it is trained.
+"""
+
+from keras.models import Model, load_model
+from keras.layers import Input
+import random
+import os
+import numpy as np
+import keras
+import tvm
+import nnvm
+
+######################################################################
+# Download required files
+# -----------------------
+# Download the files listed below from the dmlc web-data repo.
+model_file = "s2s_translate.h5"
+data_file = "fra-eng.txt"
+
+# Base location for model related files.
+repo_base = 'https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/'
+model_url = os.path.join(repo_base, model_file)
+data_url = os.path.join(repo_base, data_file)
+
+# Download files listed below.
+from tvm.contrib.download import download_testdata
+model_path = download_testdata(model_url, model_file, module='keras')
+data_path = download_testdata(data_url, data_file, module='data')
+
+latent_dim = 256  # Latent dimensionality of the encoding space.
+test_samples = 10000  # Number of samples used for testing.
+
+######################################################################
+# Process the data file
+# ---------------------
+# Vectorize the data. We use the same approach as the training script.
+# NOTE: the data must be identical, in order for the character -> integer
+# mappings to be consistent.
+input_texts = []
+target_texts = []
+input_characters = set()
+target_characters = set()
+with open(data_path, 'r', encoding='utf-8') as f:
+    lines = f.read().split('\n')
+test_samples = min(test_samples, len(lines))
+max_encoder_seq_length = 0
+max_decoder_seq_length = 0
+for line in lines[:test_samples]:
+    input_text, target_text = line.split('\t')
+    # We use "tab" as the "start sequence" character
+    # for the targets, and "\n" as the "end sequence" character.
+    target_text = '\t' + target_text + '\n'
+    max_encoder_seq_length = max(max_encoder_seq_length, len(input_text))
+    max_decoder_seq_length = max(max_decoder_seq_length, len(target_text))
+    for char in input_text:
+        if char not in input_characters:
+            input_characters.add(char)
+    for char in target_text:
+        if char not in target_characters:
+            target_characters.add(char)
+
+input_characters = sorted(list(input_characters))
+target_characters = sorted(list(target_characters))
+num_encoder_tokens = len(input_characters)
+num_decoder_tokens = len(target_characters)
+input_token_index = dict(
+    [(char, i) for i, char in enumerate(input_characters)])
+target_token_index = dict(
+    [(char, i) for i, char in enumerate(target_characters)])
+
+# Reverse-lookup token index to decode sequences back to something readable.
+reverse_target_char_index = dict(
+    (i, char) for char, i in target_token_index.items())
+
+######################################################################
+# Load Keras Model
+# ----------------
+# Restore the model and construct the encoder and decoder.
+model = load_model(model_path)
+encoder_inputs = model.input[0]  # input_1
+
+encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
+encoder_states = [state_h_enc, state_c_enc]
+encoder_model = Model(encoder_inputs, encoder_states)
+
+decoder_inputs = model.input[1]  # input_2
+decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
+decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')
+decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+decoder_lstm = model.layers[3]
+decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
+    decoder_inputs, initial_state=decoder_states_inputs)
+decoder_states = [state_h_dec, state_c_dec]
+decoder_dense = model.layers[4]
+decoder_outputs = decoder_dense(decoder_outputs)
+decoder_model = Model(
+    [decoder_inputs] + decoder_states_inputs,
+    [decoder_outputs] + decoder_states)
+
+######################################################################
+# Compile both the encoder and decoder models with NNVM
+# ------------------------------------------------------
+# Create the NNVM graph definitions from the Keras models.
+from tvm.contrib import graph_runtime
+target = 'llvm'
+ctx = tvm.cpu(0)
+
+# Parse Encoder model
+sym, params = nnvm.frontend.from_keras(encoder_model)
+inp_enc_shape = (1, max_encoder_seq_length, num_encoder_tokens)
+shape_dict = {'input_1': inp_enc_shape}
+
+# Build Encoder model
+with nnvm.compiler.build_config(opt_level=2):
+    enc_graph, enc_lib, enc_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Encoder build ok.")
+
+# Create graph runtime for encoder model
+tvm_enc = graph_runtime.create(enc_graph, enc_lib, ctx)
+tvm_enc.set_input(**enc_params)
+
+# Parse Decoder model
+inp_dec_shape = (1, 1, num_decoder_tokens)
+shape_dict = {'input_2': inp_dec_shape,
+              'input_3': (1, latent_dim),
+              'input_4': (1, latent_dim)}
+
+# Build Decoder model
+sym, params = nnvm.frontend.from_keras(decoder_model)
+with nnvm.compiler.build_config(opt_level=2):
+    dec_graph, dec_lib, dec_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Decoder build ok.")
+
+# Create graph runtime for decoder model
+tvm_dec = graph_runtime.create(dec_graph, dec_lib, ctx)
+tvm_dec.set_input(**dec_params)
+
+# Decodes an input sequence.
+def decode_sequence(input_seq):
+    # Set the input for encoder model.
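+    # (input_seq is a one-hot array of shape
+    # (1, max_encoder_seq_length, num_encoder_tokens),
+    # as produced by generate_input_seq below.)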
+    tvm_enc.set_input('input_1', input_seq)
+
+    # Run encoder model
+    tvm_enc.run()
+
+    # Get states from encoder network
+    h = tvm_enc.get_output(0).asnumpy()
+    c = tvm_enc.get_output(1).asnumpy()
+
+    # Populate the first character of target sequence with the start character.
+    sampled_token_index = target_token_index['\t']
+
+    # Sampling loop for a batch of sequences
+    decoded_sentence = ''
+    while True:
+        # Generate empty target sequence of length 1.
+        target_seq = np.zeros((1, 1, num_decoder_tokens), dtype='float32')
+        # Update the target sequence (of length 1).
+        target_seq[0, 0, sampled_token_index] = 1.
+
+        # Set the input and states for decoder model.
+        tvm_dec.set_input('input_2', target_seq)
+        tvm_dec.set_input('input_3', h)
+        tvm_dec.set_input('input_4', c)
+        # Run decoder model
+        tvm_dec.run()
+
+        output_tokens = tvm_dec.get_output(0).asnumpy()
+        h = tvm_dec.get_output(1).asnumpy()
+        c = tvm_dec.get_output(2).asnumpy()
+
+        # Sample a token
+        sampled_token_index = np.argmax(output_tokens[0, -1, :])
+        sampled_char = reverse_target_char_index[sampled_token_index]
+
+        # Exit condition: either hit max length or find stop character.
+        if sampled_char == '\n':
+            break
+
+        # Update the sentence
+        decoded_sentence += sampled_char
+        if len(decoded_sentence) > max_decoder_seq_length:
+            break
+    return decoded_sentence
+
+def generate_input_seq(input_text):
+    input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
+    for t, char in enumerate(input_text):
+        input_seq[0, t, input_token_index[char]] = 1.
+    return input_seq
+
+######################################################################
+# Run the model
+# -------------
+# Randomly pick some text from the test samples and translate it.
+for seq_index in range(100):
+    # Take one sentence randomly and try to decode.
+    index = random.randint(0, test_samples - 1)
+    input_text, _ = lines[index].split('\t')
+    input_seq = generate_input_seq(input_text)
+    decoded_sentence = decode_sequence(input_seq)
+    print((seq_index + 1), ": ", input_text, "==>", decoded_sentence)
diff --git a/nnvm/tutorials/tune_nnvm_arm.py b/nnvm/tutorials/tune_nnvm_arm.py
new file mode 100644
index 000000000000..d61130b852cc
--- /dev/null
+++ b/nnvm/tutorials/tune_nnvm_arm.py
@@ -0,0 +1,427 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network for ARM CPU (NNVM)
+======================================================
+**Author**: `Lianmin Zheng `_, `Zhao Wu `_
+
+Auto-tuning for a specific ARM device is critical for getting the best
+performance. This is a tutorial on how to tune a whole convolutional
+network.
+
+The operator implementation for ARM CPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some ARM devices. You can go to
+`ARM CPU Benchmark `_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as the FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined networks from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of the code on the board.
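+#
+# Under the hood this goes through :code:`tvm.rpc`. As a minimal sketch of such
+# a session (assuming a device is already registered under the key "rk3399";
+# see the following sections for starting the tracker and registering devices):
+#
+# .. code-block:: python
+#
+#   from tvm import rpc
+#   tracker = rpc.connect_tracker('localhost', 9190)
+#   remote = tracker.request('rk3399')  # blocks until a device is free
+#   print(remote.cpu(0))  # a context on the remote device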
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the tvm runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   the tvm runtime on the device. Then register the device to the tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page `_ to
+#   install the tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then your device is already registered. During tuning, you have to go to developer options
+#   and enable "Keep screen awake during charging", and charge your phone to keep it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here we use an RK3399 board
+# as an example. In your setting, you should modify the target and device_key accordingly.
+# Set :code:`use_android` to True if you use an Android phone.
+
+#### DEVICE CONFIG ####
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+target = tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu')
+
+# Also replace this with the device key in your tracker
+device_key = 'rk3399'
+
+# Set this to True if you use an android phone
+use_android = False
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.%s.log" % (device_key, network)
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 800,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=5,
+            timeout=4,
+        ),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+#   If your device runs very slowly or your conv2d operators have many GFLOPs, consider
+#   setting a larger timeout.
+#
+#   If your model has depthwise convolution, you could consider setting
+#   :code:`try_spatial_pack_depthwise` to :code:`True`, which generally performs better
+#   than the default optimization. For example, on an ARM CPU A53 at 2.0GHz, we found it
+#   could boost the performance of depthwise convolution in the Mobilenet V1 model by about 1.6x.
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True,
+               try_spatial_pack_depthwise=False):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
+            except Exception:
+                pass
+
+    # if we want to use spatial pack for depthwise convolution
+    if try_spatial_pack_depthwise:
+        tuner = 'xgb_knob'
+        for i in range(len(tasks)):
+            if tasks[i].name == 'topi_nn_depthwise_conv2d_nchw':
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host,
+                                          'contrib_spatial_pack')
+                tasks[i] = tsk
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'xgb_knob':
+            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        n_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=n_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
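+#
+# Before launching a long tuning run, it can be worth sanity-checking the RPC
+# setup by requesting a session manually, using the same helper that the code
+# below uses (a sketch; it assumes the tracker and device are running):
+#
+# .. code-block:: python
+#
+#   remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+#                                           timeout=10000)
+#   print(remote.context(str(target), 0))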
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        if use_android:
+            from tvm.contrib import ndk
+            filename = "net.so"
+            lib.export_library(tmp.relpath(filename), ndk.create_shared)
+        else:
+            filename = "net.tar"
+            lib.export_library(tmp.relpath(filename))
+
+        # upload module to device
+        print("Upload...")
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+                                                timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
+
+        # upload parameters to device
+        ctx = remote.context(str(target), 0)
+        module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them.
+# So a high-performance CPU is recommended.
+# One sample output is listed below.
+# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
+#
+# .. code-block:: bash
+#
+#   Extract tasks...
+#   Tuning...
+#   [Task  1/12]  Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+#   [Task  2/12]  Current/Best:  6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+#   [Task  3/12]  Current/Best:  4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+#   [Task  4/12]  Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+#   [Task  5/12]  Current/Best:  1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+#   [Task  6/12]  Current/Best:  2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+#   [Task  7/12]  Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+#   [Task  8/12]  Current/Best:  1.13/ 17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#   [Task  9/12]  Current/Best: 14.45/ 22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#   [Task 10/12]  Current/Best:  3.22/ 15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#   [Task 11/12]  Current/Best: 11.03/ 32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#   [Task 12/12]  Current/Best:  8.00/ 21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
+#   Compile...
+#   Upload...
+#   Evaluate inference time cost...
+#   Mean inference time (std dev): 162.59 ms (0.06 ms)

+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
diff --git a/nnvm/tutorials/tune_nnvm_cuda.py b/nnvm/tutorials/tune_nnvm_cuda.py
new file mode 100644
index 000000000000..be3f79992cb6
--- /dev/null
+++ b/nnvm/tutorials/tune_nnvm_cuda.py
@@ -0,0 +1,391 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network for NVIDIA GPU (NNVM)
+=========================================================
+**Author**: `Lianmin Zheng `_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole convolutional
+network for NVIDIA GPU.
+
+The operator implementation for NVIDIA GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some NVIDIA GPUs. You can go to
+`NVIDIA GPU Benchmark `_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as the FFI of tvm. In the root directory of tvm, execute:
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define Network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined networks from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we apply some configurations.
+
+#### DEVICE CONFIG ####
+target = tvm.target.cuda()
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.log" % network
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#
+#   If you have a large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+#
+#   If you have multiple devices, you can use all of them for measurement to
+#   accelerate the tuning process (see the `Scale up measurement` section below).
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
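+# As a usage sketch, it is driven below (in ``tune_and_evaluate``) simply as:
+#
+# .. code-block:: python
+#
+#   tasks = autotvm.task.extract_from_graph(net, target=target,
+#                                           shape={'data': input_shape},
+#                                           dtype=dtype,
+#                                           symbols=(nnvm.sym.conv2d,))
+#   tune_tasks(tasks, **tuning_option)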
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
+            except Exception:
+                pass
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=100)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        n_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=n_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+        # load parameters
+        ctx = tvm.context(str(target), 0)
+        module = runtime.create(graph, lib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them.
+# So a high-performance CPU is recommended. One sample output is listed below.
+# It takes about 4 hours to get the following output on a 32T AMD Ryzen Threadripper.
+# The tuning target is NVIDIA 1080 Ti.
+# (You can see some errors during compilation. If the tuning is not stuck, it is okay.)
+#
+# .. code-block:: bash
+#
+#   Extract tasks...
+#   Tuning...
+#   [Task  1/12]  Current/Best:  541.83/3570.66 GFLOPS | Progress: (960/2000) | 1001.31 s Done.
+#   [Task  2/12]  Current/Best:    0.56/ 803.33 GFLOPS | Progress: (704/2000) | 608.08 s Done.
+#   [Task  3/12]  Current/Best:  103.69/1141.25 GFLOPS | Progress: (768/2000) | 702.13 s Done.
+#   [Task  4/12]  Current/Best: 2905.03/3925.15 GFLOPS | Progress: (864/2000) | 745.94 sterminate called without an active exception
+#   [Task  4/12]  Current/Best: 2789.36/3925.15 GFLOPS | Progress: (1056/2000) | 929.40 s Done.
+#   [Task  5/12]  Current/Best:   89.06/1076.24 GFLOPS | Progress: (704/2000) | 601.73 s Done.
+#   [Task  6/12]  Current/Best:   40.39/2129.02 GFLOPS | Progress: (1088/2000) | 1125.76 s Done.
+#   [Task  7/12]  Current/Best: 4090.53/5007.02 GFLOPS | Progress: (800/2000) | 903.90 s Done.
+#   [Task  8/12]  Current/Best:    4.78/1272.28 GFLOPS | Progress: (768/2000) | 749.14 s Done.
+#   [Task  9/12]  Current/Best: 1391.45/2325.08 GFLOPS | Progress: (992/2000) | 1084.87 s Done.
+#   [Task 10/12]  Current/Best: 1995.44/2383.59 GFLOPS | Progress: (864/2000) | 862.60 s Done.
+#   [Task 11/12]  Current/Best: 4093.94/4899.80 GFLOPS | Progress: (224/2000) | 240.92 sterminate called without an active exception
+#   [Task 11/12]  Current/Best: 3487.98/4909.91 GFLOPS | Progress: (480/2000) | 534.96 sterminate called without an active exception
+#   [Task 11/12]  Current/Best: 4636.84/4912.17 GFLOPS | Progress: (1184/2000) | 1381.16 sterminate called without an active exception
+#   [Task 11/12]  Current/Best:   50.12/4912.17 GFLOPS | Progress: (1344/2000) | 1602.81 s Done.
+#   [Task 12/12]  Current/Best: 3581.31/4286.30 GFLOPS | Progress: (736/2000) | 943.52 s Done.
+#   Compile...
+#   Evaluate inference time cost...
+#   Mean inference time (std dev): 1.07 ms (0.05 ms)
+#
+# As a reference baseline, the time cost of MXNet + TensorRT on resnet-18 is 1.30ms. So we are a little faster.

+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai


+#################################################################
+# Scale up measurement by using multiple devices
+# ----------------------------------------------
+#
+# If you have multiple devices, you can use all of them for measurement.
+# TVM uses the RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 GPU cards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+#
+# Then open another new terminal for the RPC server. We need to start one server
+# for each dedicated device. We use a string key to distinguish the types of devices.
+# You can pick a name you like.
+# (Note: For the rocm backend, there are some internal errors with the compiler,
+# so we need to add `--no-fork` to the argument list.)
+#
+# .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=localhost:9190 --key=1080ti
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=localhost --port=9190
+#
+# For example, if we have four 1080ti, two titanx and one gfx900, the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    1080ti       4      4     0
+#    titanx       2      2     0
+#    gfx900       1      1     0
+#    ----------------------------------
+#
+# Finally, we need to change the tuning option to use RPCRunner. Use the code below
+# to replace the corresponding part above.

+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.RPCRunner(
+            '1080ti',  # change the device key to your key
+            'localhost', 9190,
+            number=20, repeat=3, timeout=4, min_repeat_ms=150),
+    ),
+}
diff --git a/nnvm/tutorials/tune_nnvm_mobile_gpu.py b/nnvm/tutorials/tune_nnvm_mobile_gpu.py
new file mode 100644
index 000000000000..8946dc1833bd
--- /dev/null
+++ b/nnvm/tutorials/tune_nnvm_mobile_gpu.py
@@ -0,0 +1,416 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network for Mobile GPU (NNVM)
+=========================================================
+**Author**: `Lianmin Zheng `_
+
+Auto-tuning for a specific device is critical for getting the best
+performance. This is a tutorial on how to tune a whole convolutional
+network.
+
+The operator implementation for Mobile GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+We will tune all convolution, depthwise convolution and dense operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some ARM devices. You can go to
+`Mobile GPU Benchmark `_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as the FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined networks from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of the code on the board.
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the tvm runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   the tvm runtime on the device. Then register the device to the tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page `_ to
+#   install the tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then your device is already registered. During tuning, you have to go to developer options
+#   and enable "Keep screen awake during charging", and charge your phone to keep it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here we use an RK3399 board
+# as an example. In your setting, you should modify the target and device_key accordingly.
+# Set :code:`use_android` to True if you use an Android phone.
+
+#### DEVICE CONFIG ####
+
+target = tvm.target.create('opencl -device=mali')
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+target_host = 'llvm -target=aarch64-linux-gnu'
+
+# Also replace this with the device key in your tracker
+device_key = 'rk3399'
+
+# Set this to True if you use an android phone
+use_android = False
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.%s.log" % (device_key, network)
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 450,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=10,
+            timeout=5,
+        ),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+
+#   If your device runs very slowly or your conv2d operators have many GFLOPs, consider
+#   setting a larger timeout.
+#

+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.

+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                tasks.append(tsk)
+            except Exception:
+                pass

+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)

+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))

+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)

+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

+        # do tuning
+        n_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=n_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])

+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)


+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
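+#
+# (Note that for Mali we pass both ``target`` and ``target_host`` to
+# ``nnvm.compiler.build`` in the function below: the kernels are compiled for
+# the OpenCL device, while the host-side glue code is cross-compiled for the
+# ARM CPU.)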
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, target_host=target_host,
+                shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        if use_android:
+            from tvm.contrib import ndk
+            filename = "net.so"
+            lib.export_library(tmp.relpath(filename), ndk.create_shared)
+        else:
+            filename = "net.tar"
+            lib.export_library(tmp.relpath(filename))
+
+        # upload module to device
+        print("Upload...")
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+                                                timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
+
+        # upload parameters to device
+        ctx = remote.context(str(target), 0)
+        module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them.
+# So a high-performance CPU is recommended.
+# One sample output is listed below. It takes about 3 hours on a 32T AMD Ryzen Threadripper.
+#
+# .. code-block:: bash
+#
+#   Extract tasks...
+#   Tuning...
+#   [Task  1/17]  Current/Best: 25.30/ 39.12 GFLOPS | Progress: (992/1000) | 751.22 s Done.
+#   [Task  2/17]  Current/Best: 40.70/ 45.50 GFLOPS | Progress: (736/1000) | 545.46 s Done.
+#   [Task  3/17]  Current/Best: 38.83/ 42.35 GFLOPS | Progress: (992/1000) | 1549.85 s Done.
+#   [Task  4/17]  Current/Best: 23.31/ 31.02 GFLOPS | Progress: (640/1000) | 1059.31 s Done.
+#   [Task  5/17]  Current/Best:  0.06/  2.34 GFLOPS | Progress: (544/1000) | 305.45 s Done.
+#   [Task  6/17]  Current/Best: 10.97/ 17.20 GFLOPS | Progress: (992/1000) | 1050.00 s Done.
+#   [Task  7/17]  Current/Best:  8.98/ 10.94 GFLOPS | Progress: (928/1000) | 421.36 s Done.
+#   [Task  8/17]  Current/Best:  4.48/ 14.86 GFLOPS | Progress: (704/1000) | 582.60 s Done.
+#   [Task  9/17]  Current/Best: 10.30/ 25.99 GFLOPS | Progress: (864/1000) | 899.85 s Done.
+#   [Task 10/17]  Current/Best: 11.73/ 12.52 GFLOPS | Progress: (608/1000) | 304.85 s Done.
+#   [Task 11/17]  Current/Best: 15.26/ 18.68 GFLOPS | Progress: (800/1000) | 747.52 s Done.
+#   [Task 12/17]  Current/Best: 17.48/ 26.71 GFLOPS | Progress: (1000/1000) | 1166.40 s Done.
+#   [Task 13/17]  Current/Best:  0.96/ 11.43 GFLOPS | Progress: (960/1000) | 611.65 s Done.
+#   [Task 14/17]  Current/Best: 17.88/ 20.22 GFLOPS | Progress: (672/1000) | 670.29 s Done.
+#   [Task 15/17]  Current/Best: 11.62/ 13.98 GFLOPS | Progress: (736/1000) | 449.25 s Done.
+#   [Task 16/17]  Current/Best: 19.90/ 23.83 GFLOPS | Progress: (608/1000) | 708.64 s Done.
+#   [Task 17/17]  Current/Best: 17.98/ 22.75 GFLOPS | Progress: (736/1000) | 1122.60 s Done.
+#   Compile...
+#   Upload...
+#   Evaluate inference time cost...
+#   Mean inference time (std dev): 128.05 ms (7.74 ms)
+#

+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
diff --git a/nnvm/tutorials/tune_nnvm_x86.py b/nnvm/tutorials/tune_nnvm_x86.py
new file mode 100644
index 000000000000..b7426271f06b
--- /dev/null
+++ b/nnvm/tutorials/tune_nnvm_x86.py
@@ -0,0 +1,236 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network for x86 CPU (NNVM)
+======================================================
+**Author**: `Yao Wang `_
+
+This is a tutorial on how to tune convolutional neural networks
+for x86 CPUs.
+"""
+import os
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined networks from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+#
+# In this tutorial, we choose resnet-18 as the tuning example.
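+#
+# As a usage sketch, the helper below is invoked in ``tune_and_evaluate`` as:
+#
+# .. code-block:: python
+#
+#   net, params, data_shape, out_shape = get_network('resnet-18', batch_size=1)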
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+# Replace "llvm" with the correct target of your CPU.
+# For example, for an AWS EC2 c5 instance with Intel Xeon
+# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512".
+# For an AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be
+# "llvm -mcpu=core-avx2".
+target = "llvm"
+
+batch_size = 1
+dtype = "float32"
+model_name = "resnet-18"
+log_file = "%s.log" % model_name
+
+# Set the number of threads used for tuning based on the number of
+# physical CPU cores on your machine.
+num_threads = 1
+os.environ["TVM_NUM_THREADS"] = str(num_threads)
+
+
+#################################################################
+# Configure tensor tuning settings and create tasks
+# -------------------------------------------------
+# To get better kernel execution performance on x86 CPUs,
+# we need to change the data layout of convolution kernels from
+# "NCHW" to "NCHWc". To deal with this situation, we define the
+# conv2d_NCHWc operator in topi. We will tune this operator
+# instead of plain conv2d.
+#
+# We will use local mode for tuning configuration. RPC tracker
+# mode can be set up similarly to the approach in the
+# :ref:`tune_nnvm_arm` tutorial.
+
+tuning_option = {
+    'log_filename': log_file,
+    'tuner': 'random',
+    'early_stopping': None,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=10, repeat=1,
+                                   min_repeat_ms=1000),
+    ),
+}
+
+# You can skip the implementation of this function for this tutorial.
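+# (Inside ``tune_kernels`` each extracted ``conv2d`` task is re-created against
+# the ``topi_x86_conv2d_NCHWc`` template, so the tuner searches the NCHWc
+# schedule space rather than the plain NCHW one.)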
+def tune_kernels(tasks, + measure_option, + tuner='gridsearch', + early_stopping=None, + log_filename='tuning.log'): + + for i, tsk in enumerate(tasks): + prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) + + # converting conv2d tasks to conv2d_NCHWc tasks + op_name = tsk.workload[0] + if op_name == 'conv2d': + func_create = 'topi_x86_conv2d_NCHWc' + elif op_name == 'depthwise_conv2d_nchw': + func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw' + else: + raise ValueError("Tuning {} is not supported on x86".format(op_name)) + + task = autotvm.task.create(func_create, args=tsk.args, + target=target, template_key='direct') + task.workload = tsk.workload + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(task, loss_type='rank') + elif tuner == 'ga': + tuner_obj = GATuner(task, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(task) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(task) + else: + raise ValueError("Invalid tuner: " + tuner) + + # do tuning + n_trial=len(task.config_space) + tuner_obj.tune(n_trial=n_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial, prefix=prefix), + autotvm.callback.log_to_file(log_filename)]) + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. + +def tune_and_evaluate(tuning_opt): + # extract workloads from nnvm graph + print("Extract tasks...") + net, params, data_shape, out_shape = get_network(model_name, batch_size) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': data_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d,)) + + # run tuning tasks + print("Tuning...") + tune_kernels(tasks, **tuning_opt) + + # compile kernels with history best records + with autotvm.apply_history_best(log_file): + print("Compile...") + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build( + net, target=target, shape={'data': data_shape}, params=params, dtype=dtype) + + # upload parameters to device + ctx = tvm.cpu() + data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) + module = runtime.create(graph, lib, ctx) + module.set_input('data', data_tvm) + module.set_input(**params) + + # evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3) + prof_res = np.array(ftimer().results) * 1000 # convert to millisecond + print("Mean inference time (std dev): %.2f ms (%.2f ms)" % + (np.mean(prof_res), np.std(prof_res))) + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +# tune_and_evaluate(tuning_option) + +###################################################################### +# Sample Output +# ------------- +# The tuning needs to compile many programs and extract feature from them. +# So a high performance CPU is recommended. +# One sample output is listed below. +# +# .. code-block:: bash +# +# Extract tasks... +# Tuning... +# [Task 1/12] Current/Best: 598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done. +# [Task 2/12] Current/Best: 522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done. +# [Task 3/12] Current/Best: 447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done. +# [Task 4/12] Current/Best: 481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done. 
+# [Task 5/12] Current/Best: 414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done. +# [Task 6/12] Current/Best: 508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done. +# [Task 7/12] Current/Best: 469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done. +# [Task 8/12] Current/Best: 230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done. +# [Task 9/12] Current/Best: 487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done. +# [Task 10/12] Current/Best: 182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done. +# [Task 11/12] Current/Best: 372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done. +# [Task 12/12] Current/Best: 215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done. +# Compile... +# Evaluate inference time cost... +# Mean inference time (std dev): 3.16 ms (0.03 ms) diff --git a/nnvm/tutorials/using_external_lib.py b/nnvm/tutorials/using_external_lib.py new file mode 100644 index 000000000000..cc52652ffa37 --- /dev/null +++ b/nnvm/tutorials/using_external_lib.py @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Using External Libraries in NNVM +================================ +**Author**: `Masahiro Masuda `_ + +This is a short tutorial on how to use external libraries such as cuDNN or cuBLAS with NNVM. + +NNVM uses TVM internally to generate target-specific code. For example, with the CUDA backend TVM generates CUDA kernels for all layers in the user-provided network. +But sometimes it is also helpful to incorporate external libraries developed by various vendors into NNVM. +Luckily, TVM has a mechanism to transparently call into these libraries. +For NNVM users, all we need to do is set a target string appropriately. + +Before we can use external libraries from NNVM, your TVM needs to be built with the libraries you want to use. +For example, to use cuDNN, the USE_CUDNN option in tvm/make/config.mk needs to be enabled, and cuDNN include and library directories need to be specified. + +To begin with, we import NNVM and TVM. +""" +import tvm +import numpy as np +from tvm.contrib import graph_runtime as runtime +import nnvm.symbol as sym +import nnvm.compiler +from nnvm.testing import utils + +###################################################################### +# Create a simple network +# ----------------------- +# Let's create a very simple network for demonstration. +# It consists of convolution, batch normalization, and ReLU activation.
+ +out_channels = 16 +data = sym.Variable(name="data") +simple_net = sym.conv2d(data=data, kernel_size=(3,3), channels=out_channels, padding = (1, 1), use_bias=True) +simple_net = sym.batch_norm(data=simple_net) +simple_net = sym.relu(data=simple_net) + +batch_size = 1 +data_shape = (batch_size, 3, 224, 224) +net, params = utils.create_workload(simple_net, batch_size, data_shape[1:]) + +###################################################################### +# Build and run with cuda backend +# ------------------------------- +# We build and run this network with cuda backend, as usual. +# By setting the logging level to DEBUG, the result of NNVM graph compilation will be dumped as pseudo code. +import logging +logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion + +target = "cuda" +graph, lib, params = nnvm.compiler.build( + net, target, shape={"data": data_shape}, params=params) + +ctx = tvm.context(target, 0) +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +module = runtime.create(graph, lib, ctx) +module.set_input(**params) +module.set_input("data", data) +module.run() +out_shape = (batch_size, out_channels, 224, 224) +out = module.get_output(0, tvm.nd.empty(out_shape)) +out_cuda = out.asnumpy() + +###################################################################### +# The generated pseudo code should look something like below. +# Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel. +# TVM generates a single, fused kernel from this representation. +# +# .. code-block:: text +# +# produce compute { +# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 112 +# // attr [input1.shared] storage_scope = "shared" +# allocate input1.shared[float32 * 16 * 3 * 3 * 3] +# // attr [compute] storage_scope = "local" +# allocate compute[float32 * 16 * 1 * 1 * 1 * 1] +# // attr [pad_temp.global.global.shared] storage_scope = "shared" +# allocate pad_temp.global.global.shared[float32 * 1 * 1 * 4 * 57 * 4] +# // attr [iter_var(threadIdx.x, Range(min=0, extent=448), threadIdx.x)] thread_extent = 448 +# produce compute { +# produce input1.shared { +# for (ax0, 0, 16) { +# if (likely((threadIdx.x < 27))) { +# input1.shared[(threadIdx.x + (ax0*27))] = input1[((((((blockIdx.x/112)*48) + (threadIdx.x/9))*9) + (threadIdx.x % 9)) + (ax0*27))] +# } +# } +# } +# compute[0] = 0.000000f +# compute[1] = 0.000000f +# compute[2] = 0.000000f +# compute[3] = 0.000000f +# compute[4] = 0.000000f +# compute[5] = 0.000000f +# compute[6] = 0.000000f +# compute[7] = 0.000000f +# compute[8] = 0.000000f +# compute[9] = 0.000000f +# compute[10] = 0.000000f +# compute[11] = 0.000000f +# compute[12] = 0.000000f +# compute[13] = 0.000000f +# compute[14] = 0.000000f +# compute[15] = 0.000000f +# for (rc, 0, 3) { +# produce pad_temp.global.global.shared { +# if (likely((threadIdx.x < 228))) { +# if (likely(((blockIdx.x*2) < (226 - (threadIdx.x/57))))) { +# pad_temp.global.global.shared[ramp((threadIdx.x*4), 1, 4)] = pad_temp[ramp(((((((blockIdx.x*2) + (threadIdx.x/57))*57) + (threadIdx.x % 57)) + (rc*12882))*4), 1, 4)] +# } +# } +# } +# for (ry, 0, 3) { +# for (rx, 0, 3) { +# compute[0] = (compute[0] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[((((rc*3) + ry)*3) + rx)])) +# compute[1] = (compute[1] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 27)])) +# compute[2] = 
(compute[2] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 54)])) +# compute[3] = (compute[3] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 81)])) +# compute[4] = (compute[4] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 108)])) +# compute[5] = (compute[5] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 135)])) +# compute[6] = (compute[6] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 162)])) +# compute[7] = (compute[7] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 189)])) +# compute[8] = (compute[8] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 216)])) +# compute[9] = (compute[9] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 243)])) +# compute[10] = (compute[10] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 270)])) +# compute[11] = (compute[11] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 297)])) +# compute[12] = (compute[12] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 324)])) +# compute[13] = (compute[13] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 351)])) +# compute[14] = (compute[14] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 378)])) +# compute[15] = (compute[15] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 405)])) +# } +# } +# } +# } +# compute[(((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224))] = max((((compute[0] + input2[((blockIdx.x/112)*16)])*input3[((blockIdx.x/112)*16)]) + input4[((blockIdx.x/112)*16)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 50176)] = max((((compute[1] + input2[(((blockIdx.x/112)*16) + 1)])*input3[(((blockIdx.x/112)*16) + 1)]) + input4[(((blockIdx.x/112)*16) + 1)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 100352)] = max((((compute[2] + input2[(((blockIdx.x/112)*16) + 2)])*input3[(((blockIdx.x/112)*16) + 2)]) + input4[(((blockIdx.x/112)*16) + 2)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 150528)] = max((((compute[3] + input2[(((blockIdx.x/112)*16) + 3)])*input3[(((blockIdx.x/112)*16) + 3)]) + input4[(((blockIdx.x/112)*16) + 3)]), 
0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 200704)] = max((((compute[4] + input2[(((blockIdx.x/112)*16) + 4)])*input3[(((blockIdx.x/112)*16) + 4)]) + input4[(((blockIdx.x/112)*16) + 4)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 250880)] = max((((compute[5] + input2[(((blockIdx.x/112)*16) + 5)])*input3[(((blockIdx.x/112)*16) + 5)]) + input4[(((blockIdx.x/112)*16) + 5)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 301056)] = max((((compute[6] + input2[(((blockIdx.x/112)*16) + 6)])*input3[(((blockIdx.x/112)*16) + 6)]) + input4[(((blockIdx.x/112)*16) + 6)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 351232)] = max((((compute[7] + input2[(((blockIdx.x/112)*16) + 7)])*input3[(((blockIdx.x/112)*16) + 7)]) + input4[(((blockIdx.x/112)*16) + 7)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 401408)] = max((((compute[8] + input2[(((blockIdx.x/112)*16) + 8)])*input3[(((blockIdx.x/112)*16) + 8)]) + input4[(((blockIdx.x/112)*16) + 8)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 451584)] = max((((compute[9] + input2[(((blockIdx.x/112)*16) + 9)])*input3[(((blockIdx.x/112)*16) + 9)]) + input4[(((blockIdx.x/112)*16) + 9)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 501760)] = max((((compute[10] + input2[(((blockIdx.x/112)*16) + 10)])*input3[(((blockIdx.x/112)*16) + 10)]) + input4[(((blockIdx.x/112)*16) + 10)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 551936)] = max((((compute[11] + input2[(((blockIdx.x/112)*16) + 11)])*input3[(((blockIdx.x/112)*16) + 11)]) + input4[(((blockIdx.x/112)*16) + 11)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 602112)] = max((((compute[12] + input2[(((blockIdx.x/112)*16) + 12)])*input3[(((blockIdx.x/112)*16) + 12)]) + input4[(((blockIdx.x/112)*16) + 12)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 652288)] = max((((compute[13] + input2[(((blockIdx.x/112)*16) + 13)])*input3[(((blockIdx.x/112)*16) + 13)]) + input4[(((blockIdx.x/112)*16) + 13)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 702464)] = max((((compute[14] + input2[(((blockIdx.x/112)*16) + 14)])*input3[(((blockIdx.x/112)*16) + 14)]) + input4[(((blockIdx.x/112)*16) + 14)]), 0.000000f) +# compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 752640)] = max((((compute[15] + input2[(((blockIdx.x/112)*16) + 15)])*input3[(((blockIdx.x/112)*16) + 15)]) + input4[(((blockIdx.x/112)*16) + 15)]), 0.000000f) +# } +# + +###################################################################### +# Use cuDNN for a convolutional layer +# ----------------------------------- +# We can use cuDNN to replace convolution kernels with cuDNN ones. +# To do that, all we need to do is to append the option " -libs=cudnn" to the target string. 
+net, params = utils.create_workload(simple_net, batch_size, data_shape[1:]) +target = "cuda -libs=cudnn" # use cudnn for convolution +graph, lib, params = nnvm.compiler.build( + net, target, shape={"data": data_shape}, params=params) + +ctx = tvm.context(target, 0) +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +module = runtime.create(graph, lib, ctx) +module.set_input(**params) +module.set_input("data", data) +module.run() +out_shape = (batch_size, out_channels, 224, 224) +out = module.get_output(0, tvm.nd.empty(out_shape)) +out_cudnn = out.asnumpy() + +###################################################################### +# Note that if you use cuDNN, NNVM cannot fuse convolution with layers following it. +# This is because layer fusion happens at the level of TVM internal representation(IR). +# NNVM treats external libraries as black box, so there is no way to fuse them with TVM IR. +# +# The pseudo code below shows that cuDNN convolution + bias add + batch norm + ReLU turned into two stages of computation, one for cuDNN call and the other for the rest of operations. +# +# .. code-block:: text +# +# allocate y[float32 * 1 * 16 * 224 * 224] +# produce y { +# // attr [0] extern_scope = 0 +# tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(input0, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(input1, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0)) +# } +# produce compute { +# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1568 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512 +# compute[((((((blockIdx.x*512) + threadIdx.x)/50176) + ((((blockIdx.x*512) + threadIdx.x)/802816)*16))*50176) + ((((((blockIdx.x*512) + threadIdx.x)/224) % 224)*224) + (((blockIdx.x*64) + threadIdx.x) % 224)))] = max((((y[((((((blockIdx.x*512) + threadIdx.x)/50176) + ((((blockIdx.x*512) + threadIdx.x)/802816)*16))*50176) + ((((((blockIdx.x*512) + threadIdx.x)/224) % 224)*224) + (((blockIdx.x*64) + threadIdx.x) % 224)))] + input2[(((blockIdx.x*512) + threadIdx.x)/50176)])*input3[(((blockIdx.x*512) + threadIdx.x)/50176)]) + input4[(((blockIdx.x*512) + threadIdx.x)/50176)]), 0.000000f) +# } +# + +###################################################################### +# Verify the result +# ----------------- +# We can check that the results of two runs match. + +tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5) + +##################################################################### +# Conclusion +# ---------- +# This tutorial covered the usage of cuDNN with NNVM. +# We also have support for cuBLAS. If cuBLAS is enabled, it will be used inside a fully connected layer (nnvm.symbol.dense). +# To use cuBLAS, set a target string as "cuda -libs=cublas". +# You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas". +# +# For ROCm backend, we have support for MIOpen and rocBLAS. +# They can be enabled with target "rocm -libs=miopen,rocblas". +# +# Being able to use external libraries is great, but we need to keep in mind some cautions. +# +# First, the use of external libraries may restrict your usage of TVM and NNVM. +# For example, MIOpen only supports NCHW layout and fp32 data type at the moment, so you cannot use other layouts or data type in TVM. 
+# +# Second, and more importantly, external libraries restrict the possibility of operator fusion during graph compilation, as shown above. +# TVM and NNVM aim to achieve the best performance on a variety of hardware, with joint operator-level and graph-level optimization. +# To achieve this goal, we should continue developing better optimizations for TVM and NNVM, while using external libraries as a nice way to fall back to existing implementations when necessary. diff --git a/nnvm/tutorials/web/resnet.html b/nnvm/tutorials/web/resnet.html new file mode 100644 index 000000000000..13531a3809c0 --- /dev/null +++ b/nnvm/tutorials/web/resnet.html @@ -0,0 +1,204 @@ [204 lines of HTML for the "NNVM WebGL Test Page", reduced here to its recoverable text: the page title, an "Input Image:" selector, and a "Log:" panel; the markup itself did not survive extraction]
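To make the cuBLAS fallback described in the conclusion above concrete, here is a minimal sketch that is not part of the original tutorial; the input shape and the utils.create_workload call simply mirror the convolution example:

.. code-block:: python

    import numpy as np
    import tvm
    import nnvm.compiler
    import nnvm.symbol as sym
    from nnvm.testing import utils
    from tvm.contrib import graph_runtime as runtime

    # a single fully connected layer; with "-libs=cublas" its matmul is dispatched to cuBLAS
    data = sym.Variable(name="data")
    fc = sym.dense(data=sym.flatten(data=data), units=10)
    net, params = utils.create_workload(fc, batch_size=1, image_shape=(3, 224, 224))

    graph, lib, params = nnvm.compiler.build(
        net, "cuda -libs=cublas", shape={"data": (1, 3, 224, 224)}, params=params)

    module = runtime.create(graph, lib, tvm.gpu(0))
    module.set_input(**params)
    module.set_input("data", np.random.uniform(size=(1, 3, 224, 224)).astype("float32"))
    module.run()
    out = module.get_output(0, tvm.nd.empty((1, 10)))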
+ + + + + + + + + + + + + diff --git a/python/setup.py b/python/setup.py index bc53060f95cf..bc3390a63f28 100644 --- a/python/setup.py +++ b/python/setup.py @@ -96,7 +96,6 @@ def config_cython(): "../3rdparty/dmlc-core/include", "../3rdparty/dlpack/include", ], - extra_compile_args=["-std=c++11"], library_dirs=library_dirs, libraries=libraries, language="c++")) @@ -160,7 +159,7 @@ def get_package_data_files(): 'attrs', 'psutil', ], - extras_require={'test': ['pillow', + extras_require={'test': ['PIL', 'matplotlib'], 'extra_feature': ['tornado', 'psutil', diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index c572947c8d19..9367160b811b 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -20,7 +20,7 @@ import ctypes from ..base import _LIB, check_call, c_str -from ..runtime_ctypes import TVMArrayHandle +from ..runtime_ctypes import TVMArrayHandle, TVMNDArrayContainerHandle from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle @@ -85,16 +85,6 @@ def __del__(self): def _tvm_handle(self): return ctypes.cast(self.handle, ctypes.c_void_p).value - def _copyto(self, target_nd): - """Internal function that implements copy to target ndarray.""" - check_call(_LIB.TVMArrayCopyFromTo(self.handle, target_nd.handle, None)) - return target_nd - - @property - def shape(self): - """Shape of this array""" - return tuple(self.handle.contents.shape[i] for i in range(self.handle.contents.ndim)) - def to_dlpack(self): """Produce an array from a DLPack Tensor without copying memory @@ -110,17 +100,12 @@ def to_dlpack(self): def _make_array(handle, is_view, is_container): global _TVM_ND_CLS handle = ctypes.cast(handle, TVMArrayHandle) - if is_container: - tindex = ctypes.c_uint() - check_call(_LIB.TVMArrayGetTypeIndex(handle, ctypes.byref(tindex))) - cls = _TVM_ND_CLS.get(tindex.value, _CLASS_NDARRAY) - else: - cls = _CLASS_NDARRAY - - ret = cls.__new__(cls) - ret.handle = handle - ret.is_view = is_view - return ret + fcreate = _CLASS_NDARRAY + if is_container and _TVM_ND_CLS: + array_type_info = ctypes.cast(handle, TVMNDArrayContainerHandle).array_type_info.value + if array_type_info > 0: + fcreate = _TVM_ND_CLS[array_type_info] + return fcreate(handle, is_view) _TVM_COMPATS = () @@ -134,9 +119,9 @@ def _reg_extension(cls, fcreate): _TVM_ND_CLS = {} -def _register_ndarray(index, cls): +def _reg_ndarray(cls, fcreate): global _TVM_ND_CLS - _TVM_ND_CLS[index] = cls + _TVM_ND_CLS[cls._array_type_code] = fcreate _CLASS_NDARRAY = None diff --git a/python/tvm/_ffi/_ctypes/object.py b/python/tvm/_ffi/_ctypes/object.py index b8b8aefea131..c3ae56822198 100644 --- a/python/tvm/_ffi/_ctypes/object.py +++ b/python/tvm/_ffi/_ctypes/object.py @@ -21,7 +21,7 @@ import ctypes from ..base import _LIB, check_call from .types import TypeCode, RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func -from .ndarray import _register_ndarray, NDArrayBase +from ..node_generic import _set_class_node_base ObjectHandle = ctypes.c_void_p @@ -39,9 +39,6 @@ def _set_class_node(node_class): def _register_object(index, cls): """register object class""" - if issubclass(cls, NDArrayBase): - _register_ndarray(index, cls) - return OBJECT_TYPE[index] = cls @@ -94,3 +91,6 @@ def __init_handle_by_constructor__(self, fconstructor, *args): if not isinstance(handle, ObjectHandle): handle = ObjectHandle(handle) self.handle = handle + + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 
7ccb6279fed0..4b7b2c88ffa5 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -19,7 +19,7 @@ from ..base import get_last_ffi_error from libcpp.vector cimport vector from cpython.version cimport PY_MAJOR_VERSION from cpython cimport pycapsule -from libc.stdint cimport int32_t, int64_t, uint64_t, uint32_t, uint8_t, uint16_t +from libc.stdint cimport int32_t, int64_t, uint64_t, uint8_t, uint16_t import ctypes cdef enum TVMTypeCode: @@ -78,11 +78,14 @@ ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle ctypedef void* ObjectHandle -ctypedef struct TVMObject: - uint32_t type_index_ - int32_t ref_counter_ - void (*deleter_)(TVMObject* self) +ctypedef struct TVMNDArrayContainer: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor* self) + int32_t array_type_info + +ctypedef TVMNDArrayContainer* TVMNDArrayContainerHandle ctypedef int (*TVMPackedCFunc)( TVMValue* args, diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi index 9fd3aa43841f..402c9de24ebc 100644 --- a/python/tvm/_ffi/_cython/ndarray.pxi +++ b/python/tvm/_ffi/_cython/ndarray.pxi @@ -68,11 +68,6 @@ cdef class NDArrayBase: def __set__(self, value): self._set_handle(value) - @property - def shape(self): - """Shape of this array""" - return tuple(self.chandle.shape[i] for i in range(self.chandle.ndim)) - def __init__(self, handle, is_view): self._set_handle(handle) self.c_is_view = is_view @@ -81,11 +76,6 @@ cdef class NDArrayBase: if self.c_is_view == 0: CALL(TVMArrayFree(self.chandle)) - def _copyto(self, target_nd): - """Internal function that implements copy to target ndarray.""" - CALL(TVMArrayCopyFromTo(self.chandle, (target_nd).chandle, NULL)) - return target_nd - def to_dlpack(self): """Produce an array from a DLPack Tensor without copying memory @@ -100,34 +90,17 @@ cdef class NDArrayBase: return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter) -# Import limited object-related function from C++ side to improve the speed -# NOTE: can only use POD-C compatible object in FFI. 
-cdef extern from "tvm/runtime/ndarray.h" namespace "tvm::runtime": - cdef void* TVMArrayHandleToObjectHandle(DLTensorHandle handle) - - cdef c_make_array(void* chandle, is_view, is_container): global _TVM_ND_CLS - - if is_container: - tindex = ( - TVMArrayHandleToObjectHandle(chandle)).type_index_ - if tindex < len(_TVM_ND_CLS): - cls = _TVM_ND_CLS[tindex] - if cls is not None: - ret = cls.__new__(cls) - else: - ret = _CLASS_NDARRAY.__new__(_CLASS_NDARRAY) - else: - ret = _CLASS_NDARRAY.__new__(_CLASS_NDARRAY) - (ret).chandle = chandle - (ret).c_is_view = is_view - return ret - else: - ret = _CLASS_NDARRAY.__new__(_CLASS_NDARRAY) - (ret).chandle = chandle - (ret).c_is_view = is_view - return ret + cdef int32_t array_type_info + fcreate = _CLASS_NDARRAY + if is_container and len(_TVM_ND_CLS) > 0: + array_type_info = (chandle).array_type_info + if array_type_info > 0: + fcreate = _TVM_ND_CLS[array_type_info] + ret = fcreate(None, is_view) + (ret).chandle = chandle + return ret cdef _TVM_COMPATS = () @@ -140,16 +113,11 @@ def _reg_extension(cls, fcreate): if fcreate: _TVM_EXT_RET[cls._tvm_tcode] = fcreate -cdef list _TVM_ND_CLS = [] +cdef _TVM_ND_CLS = {} -cdef _register_ndarray(int index, object cls): - """register object class""" +def _reg_ndarray(cls, fcreate): global _TVM_ND_CLS - while len(_TVM_ND_CLS) <= index: - _TVM_ND_CLS.append(None) - - _TVM_ND_CLS[index] = cls - + _TVM_ND_CLS[cls._array_type_code] = fcreate def _make_array(handle, is_view, is_container): cdef unsigned long long ptr diff --git a/python/tvm/_ffi/_cython/object.pxi b/python/tvm/_ffi/_cython/object.pxi index 6d20723fd188..9561eab94ea2 100644 --- a/python/tvm/_ffi/_cython/object.pxi +++ b/python/tvm/_ffi/_cython/object.pxi @@ -16,15 +16,12 @@ # under the License. """Maps object type to its constructor""" -cdef list OBJECT_TYPE = [] +from ..node_generic import _set_class_node_base + +OBJECT_TYPE = [] def _register_object(int index, object cls): """register object class""" - if issubclass(cls, NDArrayBase): - _register_ndarray(index, cls) - return - - global OBJECT_TYPE while len(OBJECT_TYPE) <= index: OBJECT_TYPE.append(None) OBJECT_TYPE[index] = cls @@ -34,13 +31,14 @@ cdef inline object make_ret_object(void* chandle): global OBJECT_TYPE global _CLASS_NODE cdef unsigned tindex + cdef list object_type cdef object cls cdef object handle object_type = OBJECT_TYPE handle = ctypes_handle(chandle) CALL(TVMObjectGetTypeIndex(chandle, &tindex)) - if tindex < len(OBJECT_TYPE): - cls = OBJECT_TYPE[tindex] + if tindex < len(object_type): + cls = object_type[tindex] if cls is not None: obj = cls.__new__(cls) else: @@ -101,3 +99,6 @@ cdef class ObjectBase: (fconstructor).chandle, kObjectHandle, args, &chandle) self.chandle = chandle + + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index 716091e96ed0..c61c5c445442 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -35,13 +35,8 @@ # this function is needed for python3 # to convert ctypes.char_p .value back to python str if sys.platform == "win32": - def _py_str(x): - try: - return x.decode('utf-8') - except UnicodeDecodeError: - encoding = 'cp' + str(ctypes.cdll.kernel32.GetACP()) - return x.decode(encoding) - py_str = _py_str + encoding = 'cp' + str(ctypes.cdll.kernel32.GetACP()) + py_str = lambda x: x.decode(encoding) else: py_str = lambda x: x.decode('utf-8') else: @@ -61,7 +56,7 @@ def _load_lib(): # version number __version__ = libinfo.__version__ -# library instance +# library instance of nnvm _LIB, 
_LIB_NAME = _load_lib() # Whether we are runtime only diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 23d95ebbf66b..60e7aeb9aec5 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -22,7 +22,6 @@ import sys import ctypes from .base import _LIB, check_call, py_str, c_str, string_types, _FFI_MODE -from .node_generic import _set_class_objects IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError @@ -33,21 +32,15 @@ if sys.version_info >= (3, 0): from ._cy3.core import _set_class_function, _set_class_module from ._cy3.core import FunctionBase as _FunctionBase - from ._cy3.core import NDArrayBase as _NDArrayBase - from ._cy3.core import ObjectBase as _ObjectBase from ._cy3.core import convert_to_tvm_func else: from ._cy2.core import _set_class_function, _set_class_module from ._cy2.core import FunctionBase as _FunctionBase - from ._cy2.core import NDArrayBase as _NDArrayBase - from ._cy2.core import ObjectBase as _ObjectBase from ._cy2.core import convert_to_tvm_func except IMPORT_EXCEPT: # pylint: disable=wrong-import-position from ._ctypes.function import _set_class_function, _set_class_module from ._ctypes.function import FunctionBase as _FunctionBase - from ._ctypes.ndarray import NDArrayBase as _NDArrayBase - from ._ctypes.object import ObjectBase as _ObjectBase from ._ctypes.function import convert_to_tvm_func FunctionHandle = ctypes.c_void_p @@ -89,9 +82,6 @@ def __init__(self, handle): def __del__(self): check_call(_LIB.TVMModFree(self.handle)) - def __hash__(self): - return ctypes.cast(self.handle, ctypes.c_void_p).value - @property def entry_func(self): """Get the entry function @@ -332,4 +322,3 @@ def _init_api_prefix(module_name, prefix): setattr(target_module, ff.__name__, ff) _set_class_function(Function) -_set_class_objects((_ObjectBase, _NDArrayBase, ModuleBase)) diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index 650f01dd5409..da0783e10410 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -35,16 +35,16 @@ if sys.version_info >= (3, 0): from ._cy3.core import _set_class_ndarray, _make_array, _from_dlpack from ._cy3.core import NDArrayBase as _NDArrayBase - from ._cy3.core import _reg_extension + from ._cy3.core import _reg_extension, _reg_ndarray else: from ._cy2.core import _set_class_ndarray, _make_array, _from_dlpack from ._cy2.core import NDArrayBase as _NDArrayBase - from ._cy2.core import _reg_extension + from ._cy2.core import _reg_extension, _reg_ndarray except IMPORT_EXCEPT: # pylint: disable=wrong-import-position from ._ctypes.ndarray import _set_class_ndarray, _make_array, _from_dlpack from ._ctypes.ndarray import NDArrayBase as _NDArrayBase - from ._ctypes.ndarray import _reg_extension + from ._ctypes.ndarray import _reg_extension, _reg_ndarray def context(dev_type, dev_id=0): @@ -157,6 +157,10 @@ def from_dlpack(dltensor): class NDArrayBase(_NDArrayBase): """A simple Device/CPU Array object in runtime.""" + @property + def shape(self): + """Shape of this array""" + return tuple(self.handle.contents.shape[i] for i in range(self.handle.contents.ndim)) @property def dtype(self): @@ -236,7 +240,6 @@ def copyfrom(self, source_array): except: raise TypeError('array must be an array_like data,' + 'type %s is not supported' % str(type(source_array))) - t = TVMType(self.dtype) shape, dtype = self.shape, self.dtype if t.lanes > 1: @@ -291,12 +294,28 @@ def copyto(self, target): target : NDArray The target array to be copied, must have same shape as this 
array. """ + if isinstance(target, TVMContext): + target = empty(self.shape, self.dtype, target) if isinstance(target, NDArrayBase): - return self._copyto(target) - elif isinstance(target, TVMContext): - res = empty(self.shape, self.dtype, target) - return self._copyto(res) - raise ValueError("Unsupported target type %s" % str(type(target))) + check_call(_LIB.TVMArrayCopyFromTo( + self.handle, target.handle, None)) + else: + raise ValueError("Unsupported target type %s" % str(type(target))) + return target + + +def free_extension_handle(handle, type_code): + """Free C++ extension type handle + + Parameters + ---------- + handle : ctypes.c_void_p + The handle to the extension type. + + type_code : int + The type code + """ + check_call(_LIB.TVMExtTypeFree(handle, ctypes.c_int(type_code))) def register_extension(cls, fcreate=None): @@ -348,8 +367,13 @@ def __init__(self): def _tvm_handle(self): return self.handle.value """ - assert hasattr(cls, "_tvm_tcode") - if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: - raise ValueError("Cannot register create when extension tcode is same as buildin") - _reg_extension(cls, fcreate) + if issubclass(cls, _NDArrayBase): + assert fcreate is not None + assert hasattr(cls, "_array_type_code") + _reg_ndarray(cls, fcreate) + else: + assert hasattr(cls, "_tvm_tcode") + if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: + raise ValueError("Cannot register create when extension tcode is same as buildin") + _reg_extension(cls, fcreate) return cls diff --git a/python/tvm/_ffi/node_generic.py b/python/tvm/_ffi/node_generic.py index 8ee7fc5f2b5b..e89812685eb2 100644 --- a/python/tvm/_ffi/node_generic.py +++ b/python/tvm/_ffi/node_generic.py @@ -23,11 +23,11 @@ from .base import string_types # Node base class -_CLASS_OBJECTS = None +_CLASS_NODE_BASE = None -def _set_class_objects(cls): - global _CLASS_OBJECTS - _CLASS_OBJECTS = cls +def _set_class_node_base(cls): + global _CLASS_NODE_BASE + _CLASS_NODE_BASE = cls def _scalar_type_inference(value): @@ -67,7 +67,7 @@ def convert_to_node(value): node : Node The corresponding node value.
""" - if isinstance(value, _CLASS_OBJECTS): + if isinstance(value, _CLASS_NODE_BASE): return value if isinstance(value, bool): return const(value, 'uint1x1') @@ -81,7 +81,7 @@ def convert_to_node(value): if isinstance(value, dict): vlist = [] for item in value.items(): - if (not isinstance(item[0], _CLASS_OBJECTS) and + if (not isinstance(item[0], _CLASS_NODE_BASE) and not isinstance(item[0], string_types)): raise ValueError("key of map must already been a container type") vlist.append(item[0]) diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index a7947dbc38a2..2dbb67dfbf73 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -271,3 +271,12 @@ class TVMArray(ctypes.Structure): ("byte_offset", ctypes.c_uint64)] TVMArrayHandle = ctypes.POINTER(TVMArray) + +class TVMNDArrayContainer(ctypes.Structure): + """TVM NDArray::Container""" + _fields_ = [("dl_tensor", TVMArray), + ("manager_ctx", ctypes.c_void_p), + ("deleter", ctypes.c_void_p), + ("array_type_info", ctypes.c_int32)] + +TVMNDArrayContainerHandle = ctypes.POINTER(TVMNDArrayContainer) diff --git a/python/tvm/api.py b/python/tvm/api.py index ef121bc880b2..f0261be37e41 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -179,7 +179,7 @@ def var(name="tindex", dtype=int32): name : str The name - dtype : str + dtype : int The data type Returns diff --git a/python/tvm/autotvm/database.py b/python/tvm/autotvm/database.py index 07f3766acb1d..f820c1234832 100644 --- a/python/tvm/autotvm/database.py +++ b/python/tvm/autotvm/database.py @@ -156,7 +156,7 @@ def filter(self, func): Examples -------- get records for a target - >>> db.filter(lambda inp, results: "cuda" in inp.target.keys) + >>> db.filter(lambda inp, resulst: "cuda" in inp.target.keys) get records with errors >>> db.filter(lambda inp, results: any(r.error_no != 0 for r in results)) """ diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index fbf4a08f7b0c..14efb7bd9239 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -183,13 +183,7 @@ def load_from_file(filename): """ for row in open(filename): if row and not row.startswith('#'): - inp, res = decode(row) - # Avoid loading the record with an empty config. The TOPI schedule with no entities - # will result in an empty entity map (e.g., depthwise_conv2d_nchw on x86). - # Using an empty config will cause problems when applying alter op like NCHW to NCHWc. - if not inp.config._entity_map: - continue - yield (inp, res) + yield decode(row) def split_workload(in_file, clean=True): diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index f249f6bacb90..0a0e6e1e8ac7 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -30,4 +30,5 @@ from .topi_integration import register_topi_compute, register_topi_schedule, \ TaskExtractEnv +from .nnvm_integration import extract_from_graph, extract_from_multiple_graph from .relay_integration import extract_from_program, extract_from_multiple_program diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py new file mode 100644 index 000000000000..9161822d173c --- /dev/null +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +# pylint: disable=unused-variable,invalid-name +""" +Decorator and utilities for the integration with TOPI and NNVM + +""" +import threading +import warnings +import logging + + +from .task import create +from .topi_integration import TaskExtractEnv + +logger = logging.getLogger('autotvm') + + +def extract_from_graph(graph, shape, dtype, target, symbols, params=None, target_host=None): + """ Extract tuning tasks from an nnvm graph. + + This function collects tuning tasks by building the graph + and tracing all the calls to topi. + + Parameters + ---------- + graph : Graph + The graph to tune + shape : dict of str to tuple + The input shape to the graph + dtype : str or dict of str to str + The input types to the graph + target: tvm.target.Target + The compilation target + symbols : Array of nnvm.symbol + Array of nnvm symbols to be tuned + params : dict of str to NDArray + The parameter dictionary. + target_host: tvm.target.Target + The host compilation target + + Returns + ------- + task: Array of autotvm.task.Task + collected tasks + """ + import nnvm.compiler + import nnvm + import topi + + env = TaskExtractEnv.get() + + # NOTE: To add more symbols, you only need to change the following lists + # nnvm symbol -> topi compute + SYMBOL2TOPI = { + nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, + topi.nn.group_conv2d_nchw], + nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], + nnvm.sym.dense: [topi.nn.dense], + } + + topi_funcs = [] + for sym_name in symbols: + if sym_name in SYMBOL2TOPI: + topi_funcs.extend(SYMBOL2TOPI[sym_name]) + else: + warnings.warn("Symbol %s is not tunable, ignored" % sym_name) + + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + nnvm.compiler.engine.clear_cache() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state + + # create tasks for target + tasks = [] + for task_name, args in env.get_tasks(): + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") + + return tasks + + +def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, params, target_host=None): + """ Extract tuning tasks from multiple nnvm graphs.
+ + This function is the multiple graph version of extract_from_graph + + Parameters + ---------- + graphs : List of Graph + The list of graphs to tune + shapes : List of dict of str to tuple + The input shapes to the graphs + dtypes : List of str or dict of str to str + The input types to the graphs + target: tvm.target.Target + The compilation target + symbols : Array of nnvm.symbol + Array of nnvm symbols to be tuned + params : dict of str to NDArray + The parameter dictionary. + target_host: tvm.target.Target + The host compilation target + + Returns + ------- + task: Array of autotvm.task.Task + collected tasks + """ + import nnvm.compiler + import nnvm + import topi + + env = TaskExtractEnv.get() + + #NOTE: To add more symbols, you only need to change the following lists + #nnvm symbol -> topi compute + SYMBOL2TOPI = { + nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, + topi.nn.group_conv2d_nchw], + nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], + nnvm.sym.dense: [topi.nn.dense], + } + + topi_funcs = [] + for sym_name in symbols: + if sym_name in SYMBOL2TOPI: + topi_funcs.extend(SYMBOL2TOPI[sym_name]) + else: + warnings.warn("Symbol %s is not tunable, ignored" % sym_name) + + # run compiler to collect all TOPI calls during compilation + env.reset(topi_funcs) + with env: + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + for graph, shape, dtype in zip(graphs, shapes, dtypes): + nnvm.compiler.engine.clear_cache() + # wrap build call in thread to avoid multiprocessing problems + build_thread = threading.Thread(target=nnvm.compiler.build, + args=(graph, + target, + shape, + dtype, + params, + target_host)) + build_thread.start() + build_thread.join() + + logger.disabled = old_state + + # create tasks for target + tasks = [] + for task_name, args in env.get_tasks(): + try: + tsk = create(task_name, args, + target=target, target_host=target_host, + template_key='direct') + tasks.append(tsk) + except topi.InvalidShapeError: + print("[Warning] Invalid shape during AutoTVM task creation") + + return tasks diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 4a407714b414..b65c5d428e4b 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -128,7 +128,6 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None, tvm.relay.op.nn.dense: [topi.nn.dense], tvm.relay.op.nn.batch_matmul: [topi.nn.batch_matmul], tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], - tvm.relay.op.nn.conv1d_transpose: [topi.nn.conv1d_transpose_ncw], } topi_funcs = [] diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 8b3ba35e92ab..7bfc313de6e9 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -69,9 +69,9 @@ def deserialize_args(args): return ret -# Task extractor for relay program +# Task extractor for nnvm graph, relay program class TaskExtractEnv: - """Global environment for extracting tuning tasks from graph""" + """Global environment for extracting tuning tasks from nnvm graph""" current = None registered = None @@ -92,7 +92,6 @@ def __init__(self, allow_duplicate=False): topi.nn.bitserial_conv2d_nhwc: "topi_nn_bitserial_conv2d_nhwc", topi.nn.bitserial_dense: "topi_nn_bitserial_dense", topi.nn.deformable_conv2d_nchw: "topi_nn_deformable_conv2d_nchw", -
topi.nn.conv1d_transpose_ncw: "topi_nn_conv1d_transpose_ncw", } self.topi_to_schedule = { @@ -110,7 +109,6 @@ def __init__(self, allow_duplicate=False): topi.nn.bitserial_conv2d_nhwc: [topi.generic.schedule_bitserial_conv2d_nhwc], topi.nn.bitserial_dense: [topi.generic.schedule_bitserial_dense], topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], - topi.nn.conv1d_transpose_ncw: [topi.generic.schedule_conv1d_transpose_ncw], } # function reflection for tracing @@ -127,7 +125,6 @@ def __init__(self, allow_duplicate=False): topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x), topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x), topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x), - topi.nn.conv1d_transpose_ncw: lambda x: setattr(topi.nn, 'conv1d_transpose_ncw', x), } self.allow_duplicate = allow_duplicate @@ -182,15 +179,12 @@ def _topi_nn_conv2d(*args, **kwargs): args = deserialize_args(args) A, W = args[:2] layout = args[-2] + assert layout == 'NCHW' or layout == 'HWCN', "only support NCHW/HWCN currently" C = topi.nn.conv2d(*args, **kwargs) if layout == 'NCHW': s = topi.generic.schedule_conv2d_nchw([C]) - elif layout == 'HWCN': - s = topi.generic.schedule_conv2d_hwcn([C]) - elif layout == 'NHWC': - s = topi.generic.schedule_conv2d_nhwc([C]) else: - raise ValueError("Unsupported layout {}".format(layout)) + s = topi.generic.schedule_conv2d_hwcn([C]) return s, [A, W, C] @register("topi_nn_depthwise_conv2d_nchw") @@ -220,15 +214,6 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): s = topi.generic.schedule_conv2d_transpose_nchw([C]) return s, [A, W, C] - @register("topi_nn_conv1d_transpose_ncw") - def _topi_nn_conv1d_transpose_ncw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv1d_transpose_ncw(*args, **kwargs) - s = topi.generic.schedule_conv1d_transpose_ncw([C]) - return s, [A, W, C] - @register("topi_nn_dense") def _topi_nn_dense(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" @@ -313,7 +298,7 @@ def get_tasks(self): Returns ------- tasks: List of tuple(name, args) - A list of tasks extracted from the graph + A list of tasks extracted from the nnvm graph """ return self.task_collection diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index ebfe6ee3a354..95e9acb23dff 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -18,7 +18,8 @@ TopHub: Tensor Operator Hub To get the best performance, we typically need auto-tuning for the specific devices. TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets. -TVM will download these parameters for you when you call relay.build. +TVM will download these parameters for you when you call +nnvm.compiler.build_module or relay.build. 
""" # pylint: disable=invalid-name @@ -223,7 +224,7 @@ def load_reference_log(backend, model, workload_name, template_key): if model == inp.target.model: find = True break - # if device model is not find, use the device model with the most tuned workloads + # if device model is not find, use the device model with the most tuned worklaods if not find and counts: model = max(counts.items(), key=lambda k: k[1])[0] diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 34f4c03e224b..265365144639 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -51,7 +51,7 @@ class XGBoostCostModel(CostModel): 'itervar' is more accurate but 'knob' is much faster. There are some constraints on 'itervar', if you meet problems with feature extraction when using 'itervar', - you can switch to 'knob'. + you can swith to 'knob'. For cross-shape tuning (e.g. many convolutions with different shapes), 'itervar' and 'curve' has better transferability, diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py index 2ebea86d8e3e..a7498c3b6309 100644 --- a/python/tvm/autotvm/tuner/xgboost_tuner.py +++ b/python/tvm/autotvm/tuner/xgboost_tuner.py @@ -40,7 +40,7 @@ class XGBTuner(ModelBasedTuner): 'itervar' is more accurate but 'knob' is much faster. There are some constraints on 'itervar', if you meet problems with feature extraction when using 'itervar', - you can switch to 'knob'. + you can swith to 'knob'. For cross-shape tuning (e.g. many convolutions with different shapes), 'itervar' and 'curve' has better transferability, diff --git a/python/tvm/contrib/cublaslt.py b/python/tvm/contrib/cublaslt.py deleted file mode 100644 index 5470fd0b4c18..000000000000 --- a/python/tvm/contrib/cublaslt.py +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""External function interface to cuBLASlt libraries.""" -from __future__ import absolute_import as _abs - -from .. import api as _api -from .. import intrin as _intrin - -def matmul(lhs, rhs, transa=False, transb=False, n=0, m=0, dtype=None): - """Create an extern op that compute matrix mult of A and rhs with cuBLAS - - Parameters - ---------- - lhs : Tensor - The left matrix operand - rhs : Tensor - The right matrix operand - transa : bool - Whether transpose lhs - transb : bool - Whether transpose rhs - - Returns - ------- - C : Tensor - The result tensor. 
- """ - if n == 0: - n = lhs.shape[1] if transa else lhs.shape[0] - if m == 0: - m = rhs.shape[0] if transb else rhs.shape[1] - dtype = dtype if dtype is not None else lhs.dtype - return _api.extern( - (n, m), [lhs, rhs], - lambda ins, outs: _intrin.call_packed( - "tvm.contrib.cublaslt.matmul", - ins[0], ins[1], outs[0], transa, transb), dtype=dtype, name="C") diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 3fc0d1574b8c..8ee99d7561bd 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -40,7 +40,7 @@ class DebugResult(object): Parameters ---------- graph_json : str - The graph to be deployed in json format output by graph compiler. Each operator (tvm_op) + The graph to be deployed in json format output by nnvm graph. Each operator (tvm_op) in the graph will have a one to one mapping with the symbol in libmod which is used to construct a "PackedFunc" . @@ -57,12 +57,12 @@ def __init__(self, graph_json, dump_path): self.dump_graph_json(graph_json) def _parse_graph(self, graph_json): - """Parse and extract the JSON graph and update the nodes, shapes and dltype. + """Parse and extract the NNVM graph and update the nodes, shapes and dltype. Parameters ---------- graph_json : str or graph class - The graph to be deployed in json format output by JSON graph. + The graph to be deployed in json format output by nnvm graph. """ json_obj = json.loads(graph_json) self._nodes_list = json_obj['nodes'] @@ -197,7 +197,7 @@ def dump_graph_json(self, graph): Parameters ---------- graph : json format - json formatted JSON graph contain list of each node's + json formatted NNVM graph contain list of each node's name, shape and type. """ graph_dump_file_name = GRAPH_DUMP_FILE_NAME diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 7d150c7c3d34..c71cbd2b0c2d 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -35,7 +35,7 @@ def create(graph_json_str, libmod, ctx, dump_root=None): Parameters ---------- graph_json_str : str or graph class - The graph to be deployed in json format output by graph compiler. + The graph to be deployed in json format output by nnvm graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. @@ -85,7 +85,7 @@ class GraphModuleDebug(graph_runtime.GraphModule): Parameters ---------- module : Module - The internal tvm module that holds the actual graph functions. + The interal tvm module that holds the actual graph functions. ctx : TVMContext The context this module is under. 
@@ -188,7 +188,7 @@ def _run_debug(self): out_tensor = array(out_tensor) self.debug_datum._output_tensor_list.append(out_tensor) - def debug_get_output(self, node, out=None): + def debug_get_output(self, node, out): """Run graph up to node and get the output to out Parameters @@ -199,11 +199,12 @@ def debug_get_output(self, node, out=None): out : NDArray The output array container """ + ret = None if isinstance(node, str): output_tensors = self.debug_datum.get_output_tensors() try: - out = output_tensors[node] - except KeyError: + ret = output_tensors[node] + except: node_list = output_tensors.keys() raise RuntimeError( "Node " @@ -214,10 +215,10 @@ def debug_get_output(self, node, out=None): ) elif isinstance(node, int): output_tensors = self.debug_datum._output_tensor_list - out = output_tensors[node] + ret = output_tensors[node] else: raise RuntimeError("Require node index or name only.") - return out + return ret def run(self, **input_dict): """Run forward execution of the graph with debug @@ -243,6 +244,7 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0): ret = self._run_individual(number, repeat, min_repeat_ms) return ret.strip(",").split(",") if ret else [] + def exit(self): """Exits the dump folder and all its contents""" self._remove_dump_root() diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 2c945d2fca95..f4ee2f7db28d 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -22,13 +22,12 @@ from .._ffi.runtime_ctypes import TVMContext from ..rpc import base as rpc_base - def create(graph_json_str, libmod, ctx): """Create a runtime executor module given a graph and module. Parameters ---------- graph_json_str : str or graph class - The graph to be deployed in json format output by json graph. + The graph to be deployed in json format output by nnvm graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. libmod : tvm.Module @@ -58,7 +57,6 @@ def create(graph_json_str, libmod, ctx): return GraphModule(fcreate(graph_json_str, libmod, *device_type_id)) - def get_device_ctx(libmod, ctx): """Parse and validate all the device context(s). Parameters @@ -114,12 +112,12 @@ class GraphModule(object): Parameters ---------- module : Module - The internal tvm module that holds the actual graph functions. + The interal tvm module that holds the actual graph functions. Attributes ---------- module : Module - The internal tvm module that holds the actual graph functions. + The interal tvm module that holds the actual graph functions. """ def __init__(self, module): @@ -144,7 +142,7 @@ def set_input(self, key=None, value=None, **params): The input key params : dict of str to NDArray - Additional arguments + Additonal arguments """ if key is not None: self._get_input(key).copyfrom(value) @@ -213,7 +211,7 @@ def get_output(self, index, out=None): return self._get_output(index) def debug_get_output(self, node, out): - """Run graph up to node and get the output to out + """Run graph upto node and get the output to out Parameters ---------- diff --git a/python/tvm/contrib/verilog.py b/python/tvm/contrib/verilog.py new file mode 100644 index 000000000000..30fa6ce2c7dd --- /dev/null +++ b/python/tvm/contrib/verilog.py @@ -0,0 +1,316 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Verilog simulator modules.""" +from __future__ import absolute_import + +import subprocess +import sys +import os +import ctypes + +from .. import _api_internal +from .._ffi.base import string_types +from .._ffi.node import NodeBase, register_node +from .._ffi.function import register_func +from . import util + +@register_node +class VPISession(NodeBase): + """Verilog session""" + def __init__(self, handle): + super(VPISession, self).__init__(handle) + self.proc = None + self.execpath = None + self.yield_callbacks = [] + + def __del__(self): + self.proc.kill() + try: + super(VPISession, self).__del__() + except AttributeError: + pass + + def arg(self, index): + """Get handle passed to host session. + + Parameters + ---------- + index : int + The index value. + + Returns + ------- + handle : VPIHandle + The handle + """ + return _api_internal._vpi_SessGetArg(self, index) + + def __getitem__(self, name): + if not isinstance(name, string_types): + raise ValueError("have to be string types") + return _api_internal._vpi_SessGetHandleByName(self, name) + + def __getattr__(self, name): + return _api_internal._vpi_SessGetHandleByName(self, name) + + def yield_until_next_cycle(self): + """Yield until next posedge""" + for f in self.yield_callbacks: + f() + return _api_internal._vpi_SessYield(self) + + def shutdown(self): + """Shutdown the simulator""" + return _api_internal._vpi_SessShutdown(self) + + +@register_node +class VPIHandle(NodeBase): + """Handle to a verilog variable.""" + def __init__(self, handle): + super(VPIHandle, self).__init__(handle) + self._name = None + self._size = None + + def get_int(self): + """Get integer value from handle. + + Returns + ------- + value : int + """ + return _api_internal._vpi_HandleGetInt(self) + + def put_int(self, value): + """Put integer value to handle. 
+ + Parameters + ---------- + value : int + The value to put + """ + return _api_internal._vpi_HandlePutInt(self, value) + + @property + def name(self): + if self._name is None: + self._name = _api_internal._vpi_HandleGetName(self) + return self._name + + @property + def size(self): + if self._size is None: + self._size = _api_internal._vpi_HandleGetSize(self) + return self._size + + def __getitem__(self, name): + if not isinstance(name, string_types): + raise ValueError("have to be string types") + return _api_internal._vpi_HandleGetHandleByName(self, name) + + def __getattr__(self, name): + return _api_internal._vpi_HandleGetHandleByName(self, name) + + +def _find_vpi_path(): + curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + api_path = os.path.join(curr_path, '../../../lib/') + vpi_path = [curr_path, api_path] + vpi_path = [os.path.join(p, 'tvm_vpi.vpi') for p in vpi_path] + vpi_found = [p for p in vpi_path if os.path.exists(p) and os.path.isfile(p)] + if vpi_found: + return os.path.dirname(vpi_found[0]) + raise ValueError("Cannot find tvm_vpi.vpi, make sure you did `make verilog`") + +def search_path(): + """Get the search directories.""" + curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + ver_path = [os.path.join(curr_path, '../../../verilog/')] + ver_path += [os.path.join(curr_path, '../../../tests/verilog/unittest/')] + ver_path += [os.path.join(curr_path, '../../../tests/verilog/integration/')] + return ver_path + + +def find_file(file_name): + """Find file in the search directories. + + Parameters + ---------- + file_name : str + The file name + + Returns + ------- + file_name : str + The absolute path to the file, raise Error if cannot find it. + """ + ver_path = search_path() + flist = [os.path.join(p, file_name) for p in ver_path] + found = [p for p in flist if os.path.exists(p) and os.path.isfile(p)] + if not found: + raise ValueError("Cannot find %s in %s" % (file_name, flist)) + return found[0] + + +def compile_file(file_name, file_target, options=None): + """Compile verilog via iverilog + + Parameters + ---------- + file_name : str or list of str + The verilog source file(s). + + file_target : str + The target file. + """ + cmd = ["iverilog"] + for path in search_path(): + cmd += ["-I%s" % path] + + cmd += ["-o", file_target] + if options: + cmd += options + + if isinstance(file_name, string_types): + file_name = [file_name] + cmd += file_name + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + + if proc.returncode != 0: + raise ValueError("Compilation error:\n%s" % out) + + +def session(file_names, codes=None): + """Create a new iverilog session by compiling the files. + + Parameters + ---------- + file_names : str or list of str + The name of the file + + codes : str or list of str + The code in str. + + Returns + ------- + sess : VPISession + The created session.
+ """ + if isinstance(file_names, string_types): + file_names = [file_names] + path = util.tempdir() + + if codes: + if isinstance(codes, (list, tuple)): + codes = '\n'.join(codes) + fcode = path.relpath("temp_code.v") + with open(fcode, "w") as out_file: + out_file.write(codes) + file_names.append(fcode) + + for name in file_names: + if not os.path.exists(name): + raise ValueError("Cannot find file %s" % name) + + target = path.relpath(os.path.basename(file_names[0].rsplit(".", 1)[0])) + compile_file(file_names, target) + vpi_path = _find_vpi_path() + + cmd = ["vvp"] + cmd += ["-M", vpi_path] + cmd += ["-m", "tvm_vpi"] + cmd += [target] + env = os.environ.copy() + + read_device, write_host = os.pipe() + read_host, write_device = os.pipe() + + if sys.platform == "win32": + import msvcrt + env['TVM_DREAD_PIPE'] = str(msvcrt.get_osfhandle(read_device)) + env['TVM_DWRITE_PIPE'] = str(msvcrt.get_osfhandle(write_device)) + read_host = msvcrt.get_osfhandle(read_host) + write_host = msvcrt.get_osfhandle(write_host) + else: + env['TVM_DREAD_PIPE'] = str(read_device) + env['TVM_DWRITE_PIPE'] = str(write_device) + + env['TVM_HREAD_PIPE'] = str(read_host) + env['TVM_HWRITE_PIPE'] = str(write_host) + + try: + # close_fds does not work well for all python3 + # Use pass_fds instead. + # pylint: disable=unexpected-keyword-arg + pass_fds = (read_device, write_device, read_host, write_host) + proc = subprocess.Popen(cmd, pass_fds=pass_fds, env=env) + except TypeError: + # This is effective for python2 + proc = subprocess.Popen(cmd, close_fds=False, env=env) + + # close device side pipe + os.close(read_device) + os.close(write_device) + + sess = _api_internal._vpi_SessMake(read_host, write_host) + sess.proc = proc + sess.execpath = path + return sess + + +@register_func +def tvm_callback_verilog_simulator(code, *args): + """Callback by TVM runtime to invoke verilog simulator + + Parameters + ---------- + code : str + The verilog code to be simulated + + args : list + Additional arguments to be set. 
+ """ + libs = [ + find_file("tvm_vpi_mmap.v") + ] + sess = session(libs, code) + for i, value in enumerate(args): + vpi_h = sess.main["tvm_arg%d" % i] + if isinstance(value, ctypes.c_void_p): + int_value = int(value.value) + elif isinstance(value, int): + int_value = value + else: + raise ValueError( + "Do not know how to handle value type %s" % type(value)) + vpi_h.put_int(int_value) + + rst = sess.main.rst + done = sess.main.done + # start driving + rst.put_int(1) + sess.yield_until_next_cycle() + rst.put_int(0) + sess.yield_until_next_cycle() + while not done.get_int(): + sess.yield_until_next_cycle() + sess.yield_until_next_cycle() + sess.shutdown() diff --git a/python/tvm/module.py b/python/tvm/module.py index 163ad3bc8822..fb350a2d131e 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -136,27 +136,21 @@ def export_library(self, self.save(file_name) return - modules = self._collect_dso_modules() + if not (self.type_key == "llvm" or self.type_key == "c"): + raise ValueError("Module[%s]: Only llvm and c support export shared" % self.type_key) temp = _util.tempdir() - files = [] - is_system_lib = False - has_c_module = False - for index, module in enumerate(modules): - if fcompile is not None and hasattr(fcompile, "object_format"): - object_format = fcompile.object_format + if fcompile is not None and hasattr(fcompile, "object_format"): + object_format = fcompile.object_format + else: + if self.type_key == "llvm": + object_format = "o" else: - if module.type_key == "llvm": - object_format = "o" - else: - assert module.type_key == "c" - object_format = "cc" - has_c_module = True - path_obj = temp.relpath("lib" + str(index) + "." + object_format) - module.save(path_obj) - files.append(path_obj) - is_system_lib = (module.type_key == "llvm" and - module.get_function("__tvm_is_system_module")()) - + assert self.type_key == "c" + object_format = "cc" + path_obj = temp.relpath("lib." + object_format) + self.save(path_obj) + files = [path_obj] + is_system_lib = self.type_key == "llvm" and self.get_function("__tvm_is_system_module")() if self.imported_modules: path_cc = temp.relpath("devc.cc") with open(path_cc, "w") as f: @@ -167,15 +161,13 @@ def export_library(self, fcompile = _tar.tar else: fcompile = _cc.create_shared - - if has_c_module: + if self.type_key == "c": options = [] if "options" in kwargs: opts = kwargs["options"] options = opts if isinstance(opts, (list, tuple)) else [opts] opts = options + ["-I" + path for path in find_include_path()] kwargs.update({'options': opts}) - fcompile(file_name, files, **kwargs) def time_evaluator(self, func_name, ctx, number=10, repeat=1, min_repeat_ms=0): @@ -236,25 +228,6 @@ def evaluator(*args): except NameError: raise NameError("time_evaluate is only supported when RPC is enabled") - def _collect_dso_modules(self): - """Helper function to collect dso modules, then return it.""" - visited, stack, dso_modules = set(), [], [] - # append root module - visited.add(self) - stack.append(self) - while stack: - module = stack.pop() - if module._dso_exportable(): - dso_modules.append(module) - for m in module.imported_modules: - if m not in visited: - visited.add(m) - stack.append(m) - return dso_modules - - def _dso_exportable(self): - return self.type_key == "llvm" or self.type_key == "c" - def system_lib(): """Get system-wide library module singleton. 
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index b19db6627ac6..f9c7cc6c5403 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -27,11 +27,8 @@ from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase from ._ffi.ndarray import context, empty, from_dlpack from ._ffi.ndarray import _set_class_ndarray -from ._ffi.ndarray import register_extension -from ._ffi.object import register_object +from ._ffi.ndarray import register_extension, free_extension_handle - -@register_object class NDArray(NDArrayBase): """Lightweight NDArray class of TVM runtime. diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 45822c56ede2..71e5bfaadff0 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -135,15 +135,12 @@ def __call__(self, args, attrs, type_args): "nn.dense": op.nn.dense, "nn.bias_add": op.nn.bias_add, "nn.max_pool2d": op.nn.max_pool2d, - "nn.max_pool3d": op.nn.max_pool3d, "nn.global_max_pool2d": op.nn.global_max_pool2d, "nn.avg_pool2d": op.nn.avg_pool2d, - "nn.avg_pool3d": op.nn.avg_pool3d, "nn.global_avg_pool2d": op.nn.global_avg_pool2d, "nn.softmax": op.nn.softmax, "reshape": op.reshape, "nn.conv2d_transpose": op.nn.conv2d_transpose, - "nn.conv1d_transpose": op.nn.conv1d_transpose, "concatenate": op.concatenate, "nn.dropout": op.nn.dropout_raw, "zeros": op.zeros, diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index f0468e7c232b..57ee227694db 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -362,7 +362,7 @@ def _convert_flatten(inexpr, keras_layer, _): def _convert_pooling(inexpr, keras_layer, etab): _check_data_format(keras_layer) pool_type = type(keras_layer).__name__ - # global pool in keras = global pool + flatten in relay + # global pool in keras = global pool + flatten in nnvm/relay if pool_type == 'GlobalMaxPooling2D': return _convert_flatten(_op.nn.global_max_pool2d(inexpr), keras_layer, etab) if pool_type == 'GlobalAveragePooling2D': diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index a1a357883a83..abef45d498a1 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -207,23 +207,29 @@ def _mx_conv1d_transpose(inputs, attrs): if data_layout != "NCW": raise tvm.error.OpAttributeInvalid( 'Only "NCW" data layout is supported for 1D Convolution') + data_layout = "NCHW" channel_axis = 1 - kernel_layout = "OIW" + kernel_layout = "OIHW" + new_attrs = {} new_attrs["channels"] = attrs.get_int("num_filter") - new_attrs["kernel_size"] = attrs.get_int_tuple("kernel") - new_attrs["strides"] = attrs.get_int_tuple("stride", (1,)) - new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0,)) - new_attrs["padding"] = attrs.get_int_tuple("pad", (0,)) - new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1,)) + new_attrs["kernel_size"] = (1,) + attrs.get_int_tuple("kernel") + new_attrs["strides"] = (1,) + attrs.get_int_tuple("stride", (1,)) + new_attrs["output_padding"] = (0,) + attrs.get_int_tuple("adj", (0,)) + new_attrs["padding"] = (0,) + attrs.get_int_tuple("pad", (0,)) + new_attrs["dilation"] = (1,) + attrs.get_int_tuple("dilate", (1,)) new_attrs["groups"] = attrs.get_int("num_group", 1) new_attrs["data_layout"] = data_layout new_attrs["kernel_layout"] = kernel_layout use_bias = not attrs.get_bool("no_bias", True) - res = _op.nn.conv1d_transpose(inputs[0], inputs[1], **new_attrs) + data = _op.expand_dims(inputs[0], axis=2) + kernel = _op.expand_dims(inputs[1], 
axis=2) + res = _op.nn.conv2d_transpose(data, kernel, **new_attrs) + if use_bias: assert len(inputs) == 3 res = _op.nn.bias_add(res, inputs[2], axis=channel_axis) + res = _op.squeeze(res, axis=[2]) return res diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index c7764db729ee..3d90d15e1916 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -66,17 +66,6 @@ def revert_caffe2_pad(pads): return pads -def get_pad_pair(input1d, kernel1d, stride1d): - """infer pad size""" - if input1d % stride1d == 0: - pad = max(kernel1d - stride1d, 0) - else: - pad = max(kernel1d - (input1d % stride1d), 0) - pad_before = pad // 2 - pad_after = pad - pad_before - return [pad_before, pad_after] - - def onnx_storage_order2layout(storage_order): """converter of onnx storage order parameter to tvm storage order format""" if storage_order not in (0, 1): @@ -213,37 +202,14 @@ class Conv(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - # infer pads for auto_pad - if 'auto_pad' in attr: - attr['auto_pad'] = attr['auto_pad'].decode('utf-8') - if attr['auto_pad'] in ('SAME_UPPER', 'SAME_LOWER'): - input_shape = infer_shape(inputs[0]) - in_h, in_w = input_shape[2], input_shape[3] - stride_h, stride_w = attr['strides'] - kernel_h, kernel_w = attr['kernel_shape'] - dilation_h, dilation_w = attr['dilations'] - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_v = get_pad_pair(in_h, dilated_kernel_h, stride_h) - pad_h = get_pad_pair(in_w, dilated_kernel_w, stride_w) - attr['pads'] = (pad_v[0], pad_h[0], pad_v[1], pad_h[1]) - elif attr['auto_pad'] == 'VALID': - attr['pads'] = (0, 0) - elif attr['auto_pad'] == 'NOTSET': - pass - else: - msg = 'Value {} in attribute "auto_pad" of operator Conv is invalid.' 
- raise tvm.error.OpAttributeInvalid(msg.format(attr['auto_pad'])) - attr.pop('auto_pad') - - out = AttrCvt( - op_name=dimension_picker('conv'), - transforms={ - 'kernel_shape': 'kernel_size', - 'dilations': ('dilation', (0, 0)), - 'pads': ('padding', (0, 0), revert_caffe2_pad), - 'group': ('groups', 1)}, - custom_check=dimension_constraint())(inputs[:2], attr, params) + out = AttrCvt(op_name=dimension_picker('conv'), + transforms={ + 'kernel_shape': 'kernel_size', + 'dilations': ('dilation', (0, 0)), + 'pads': ('padding', (0, 0), revert_caffe2_pad), + 'group': ('groups', 1)}, + ignores=['auto_pad'], + custom_check=dimension_constraint())(inputs[:2], attr, params) use_bias = len(inputs) == 3 if use_bias: out = _op.nn.bias_add(out, inputs[2]) @@ -260,29 +226,6 @@ def _impl_v1(cls, inputs, attr, params): attr['channels'] = channels groups = attr.pop('group') attr['groups'] = groups - # infer pads for auto_pad - if 'auto_pad' in attr: - attr['auto_pad'] = attr['auto_pad'].decode('utf-8') - if attr['auto_pad'] in ('SAME_UPPER', 'SAME_LOWER'): - input_shape = infer_shape(inputs[0]) - in_h, in_w = input_shape[2], input_shape[3] - stride_h, stride_w = attr['strides'] - kernel_h, kernel_w = attr['kernel_shape'] - dilation_h, dilation_w = attr['dilations'] - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_v = get_pad_pair(in_h, dilated_kernel_h, stride_h) - pad_h = get_pad_pair(in_w, dilated_kernel_w, stride_w) - attr['pads'] = (pad_v[0], pad_h[0], pad_v[1], pad_h[1]) - elif attr['auto_pad'] == 'VALID': - attr['pads'] = (0, 0) - elif attr['auto_pad'] == 'NOTSET': - pass - else: - msg = 'Value {} in attribute "auto_pad" of operator Conv is invalid.' - raise tvm.error.OpAttributeInvalid(msg.format(attr['auto_pad'])) - attr.pop('auto_pad') - out = AttrCvt( op_name=dimension_picker('conv', '_transpose'), transforms={ @@ -520,8 +463,7 @@ def _impl_v1(cls, inputs, attr, params): @classmethod def _impl_v5(cls, inputs, attr, params): if get_name(inputs[1]) in params: - # pop shape out of parameters since it wont be needed later. - shape = tuple(params.pop(inputs[1].name_hint).asnumpy()) + shape = tuple(params[inputs[1].name_hint].asnumpy()) out = _op.reshape(inputs[0], shape) else: data, shape = inputs @@ -540,7 +482,34 @@ def _impl_v11(cls, inputs, attr, params): block_size = int(attr['blocksize']) mode = attr.get("mode", "DCR") - return _op.nn.depth_to_space(inputs[0], block_size, mode=mode) + + # handle NCHW layout + indata = infer_value_simulated(inputs[0], params) + in_n, in_c, in_h, in_w = indata.shape + + # reshape to proper output + new_c = int(in_c / (block_size * block_size)) + new_h = in_h * block_size + new_w = in_w * block_size + newshape = (in_n, new_c, new_h, new_w) + + if mode == "DCR": + # expand input to larger dimension. + expanded = _op.reshape(inputs[0], + newshape=(in_n, block_size, block_size, new_c, in_h, in_w)) + # reorder to expand spatial blocks. + transposed = _op.transpose(expanded, axes=(0, 3, 4, 1, 5, 2)) + + else: # CRD mode + # expand input to larger dimension. + expanded = _op.reshape(inputs[0], + newshape=(in_n, new_c, block_size, block_size, in_h, in_w)) + # reorder to expand spatial blocks. 
+ transposed = _op.transpose(expanded, axes=(0, 1, 4, 2, 5, 3)) + + return AttrCvt(op_name="reshape", + extras={'newshape': newshape}, + ignores=['mode', 'blocksize'])([transposed], attr) class SpaceToDepth(OnnxOpConverter): @@ -551,7 +520,26 @@ class SpaceToDepth(OnnxOpConverter): def _impl_v1(cls, inputs, attr, params): block_size = int(attr['blocksize']) - return _op.nn.space_to_depth(inputs[0], block_size) + + # handle NCHW layout + indata = infer_value_simulated(inputs[0], params) + in_n, in_c, in_h, in_w = indata.shape + + # reshape to proper output + new_c = in_c * (block_size * block_size) + new_h = int(in_h / block_size) + new_w = int(in_w / block_size) + newshape = (in_n, new_c, new_h, new_w) + + # expand input to larger dimension. + expanded = _op.reshape(inputs[0], + newshape=(in_n, in_c, new_h, block_size, new_w, block_size)) + # reorder to expand spatial blocks. + transposed = _op.transpose(expanded, axes=(0, 3, 5, 1, 2, 4)) + + return AttrCvt(op_name="reshape", + extras={'newshape': newshape}, + ignores=['blocksize'])([transposed], attr) class Concat(OnnxOpConverter): @@ -698,7 +686,8 @@ class Shape(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - return _op.shape_of(inputs[0], "int64") + # TODO(@jroesch): use shape_of once it has been fixed) + return _op.shape_of(inputs[0]) class Cast(OnnxOpConverter): """ Operator converter for Cast. @@ -1091,52 +1080,6 @@ class Or(Elemwise): def _impl_v7(cls, inputs, attr, params): return _op.logical_or(inputs[0], inputs[1]) -class Expand(OnnxOpConverter): - """ Operator converter for Expand. - """ - @classmethod - def _impl_v8(cls, inputs, attr, params): - in_shape = np.array(infer_shape(inputs[0])).astype('int32') - if get_name(inputs[1]) in params: - shape = params[inputs[1].name_hint].asnumpy().astype('int32') - else: - shape = infer_value_simulated(inputs[1], params).asnumpy().astype('int32') - - # Currently 'op.broadcast_to' expect the rank of the given 'shape' - # (the 2nd input) is always higher than that of the given 'input' (the 1st input) - # However, ONNX Expand supports multi-directional broadcasting, which allows - # above pattern and also some extent of 'shape' can be smaller than the corresponding - # extent of 'input'. In this case, the extent of 'shape' must be 1. - # https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md - # In above cases, we cannot directorly apply 'op.broadcast_to' instead of 'expand' - # so, here we solved this problem by expanding the given 'shape' itself. - def expand_shape(in_shape, shape): - """ A function expands the shape when the rank is lower than that of the given - intput. Also it replaces the extent of the shape with the corresponding extent - of the intput when it is 1. - """ - - # here we flip the shapes because this can be more simply written - # when the innermost dimension is located at the index 0. - in_shape = np.flip(in_shape, axis=0) - shape = np.flip(shape, axis=0) - - if in_shape.size < shape.size: - for i in range(shape.size): - if i < in_shape.size and in_shape[i] > shape[i]: - shape[i] = in_shape[i] - else: - for i in range(in_shape.size): - if i >= shape.size: - np.append(shape, in_shape[i]) - elif shape[i] == 1: - shape[i] = in_shape[i] - - new_shape = np.flip(shape, axis=0) - return new_shape - - shape = expand_shape(in_shape, shape) - return _op.broadcast_to(inputs[0], shape=tuple(shape)) # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -1244,7 +1187,6 @@ def _get_convert_map(opset): # defs/tensor 'Cast': Cast.get_converter(opset), 'Reshape': Reshape.get_converter(opset), - 'Expand': Expand.get_converter(opset), 'Concat': Concat.get_converter(opset), 'Split': Split.get_converter(opset), 'Slice': Slice.get_converter(opset), @@ -1368,6 +1310,8 @@ def from_onnx(self, graph, opset): self._num_param += 1 # We should convert scalar integers to int32, to normalize. array = self._parse_array(t_proto) + if len(array.shape) == 0 and array.dtype == 'int64': + array = _nd.array(array.asnumpy().astype('int32')) self._params[node.output[0]] = array self._nodes[node.output[0]] = new_var( node.output[0], diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index f748fe828bfd..460a14699a77 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -122,70 +122,6 @@ def _impl(inputs, attr, params): return get_relay_op(name)(*inputs) return _impl -def _pool3d(name): - def _impl(inputs, attr, params): - attr['data_format'] = attr['data_format'].decode("utf-8") - flip_layout = False - - input_shape = attr['_input_shapes'][inputs[0]] - - if attr['data_format'] == 'NDHWC': - attr['kernel_shape'] = (attr['ksize'][1], attr['ksize'][2], attr['ksize'][3]) - attr['strides'] = (attr['strides'][1], attr['strides'][2], attr['strides'][3]) - elif attr['data_format'] == 'NCDHW': - attr['kernel_shape'] = (attr['ksize'][2], attr['ksize'][3], attr['ksize'][4]) - attr['strides'] = (attr['strides'][2], attr['strides'][3], attr['strides'][4]) - else: - msg = 'Value {} of attribute "data_format" of operator Pooling ' \ - 'is not valid.' - raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) - if attr['data_format'] == "NDHWC": - input_shape = [attr['_input_shapes'][inputs[0]][i] for i in (0, 4, 1, 2, 3)] - inputs[0] = _op.transpose(inputs[0], axes=(0, 4, 1, 2, 3)) - attr['data_format'] = "NCDHW" - attr['_input_shapes'][inputs[0]] = input_shape - flip_layout = True - - attr['padding'] = attr['padding'].decode("utf-8") - - if attr['padding'] == 'VALID': - attr['padding'] = [0, 0, 0, 0, 0, 0] - elif attr['padding'] == 'SAME': - stride_d, stride_h, stride_w = attr['strides'] - kernel_d, kernel_h, kernel_w = attr['kernel_shape'] - if attr['data_format'] == 'NDHWC': - in_d = input_shape[1] - in_h = input_shape[2] - in_w = input_shape[3] - else: - in_d = input_shape[2] - in_h = input_shape[3] - in_w = input_shape[4] - pad_d = _get_pad_pair(in_d, kernel_d, stride_d) - pad_v = _get_pad_pair(in_h, kernel_h, stride_h) - pad_h = _get_pad_pair(in_w, kernel_w, stride_w) - - attr['padding'] = [pad_d[0], pad_v[0], pad_h[0], pad_d[1], pad_v[1], pad_h[1]] - else: - msg = 'Value {} in attribute "padding" of operator Pooling is ' \ - 'not valid.' 
- raise tvm.error.OpAttributeInvalid(msg.format(attr['padding'])) - - if name == "avg_pool": - attr['count_include_pad'] = False - attr['ceil_mode'] = False - out = AttrCvt( - op_name=name, - transforms={ - 'kernel_shape': 'pool_size', - 'data_format': 'layout'}, - ignores=['ksize'])(inputs, attr) - if flip_layout: - out = _op.transpose(out, axes=(0, 2, 3, 4, 1)) - return out - - return _impl - def _pooling(name): def _impl(inputs, attr, params): @@ -269,12 +205,6 @@ def _impl(inputs, attr, params): attr['strides'][1], attr['strides'][2], attr['strides'][3] = \ attr['strides'][3], attr['strides'][1], attr['strides'][2] attr['data_format'] = 'NCHW' - - if opname == 'conv_transpose' and len(attr['_output_shapes']) > 0: - tmp_shape = attr['_output_shapes'][0] - tmp_shape = [tmp_shape[ii] for ii in (0, 3, 1, 2)] - attr['_output_shapes'][0] = tmp_shape - flip_layout = True inputs_data = inputs[0] if opname != 'conv_transpose' else inputs[2] @@ -351,17 +281,12 @@ def _impl(inputs, attr, params): elif attr['padding'] == 'SAME': stride_h, stride_w = attr['strides'] kernel_h, kernel_w = attr['kernel_shape'] - - pdata_shape = input_shape - if opname == 'conv_transpose' and len(attr['_output_shapes']) > 0: - pdata_shape = attr['_output_shapes'][0] - if attr['data_format'] == 'NHWC': - in_h = pdata_shape[1] - in_w = pdata_shape[2] + in_h = input_shape[1] + in_w = input_shape[2] else: - in_h = pdata_shape[2] - in_w = pdata_shape[3] + in_h = input_shape[2] + in_w = input_shape[3] dilation_h = attr['dilations'][0] dilation_w = attr['dilations'][1] @@ -370,23 +295,21 @@ def _impl(inputs, attr, params): pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h) pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w) - if opname != 'conv_transpose': - if attr['data_format'] == 'NHWC': - inputs_data = _op.nn.pad(data=inputs_data, - pad_width=((0, 0), - (pad_v[0], pad_v[1]), - (pad_h[0], pad_h[1]), - (0, 0))) - else: - inputs_data = _op.nn.pad(data=inputs_data, - pad_width=((0, 0), - (0, 0), - (pad_v[0], pad_v[1]), - (pad_h[0], pad_h[1]))) - attr['padding'] = [0, 0] + if attr['data_format'] == 'NHWC': + inputs_data = _op.nn.pad(data=inputs_data, + pad_width=((0, 0), + (pad_v[0], pad_v[1]), + (pad_h[0], pad_h[1]), + (0, 0))) else: - attr['padding'] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]] + inputs_data = _op.nn.pad(data=inputs_data, + pad_width=((0, 0), + (0, 0), + (pad_v[0], pad_v[1]), + (pad_h[0], pad_h[1]))) + + attr['padding'] = [0, 0] else: msg = 'Value {} in attribute "padding" of operator Conv is not ' \ @@ -741,18 +664,76 @@ def _impl(inputs, attr, params): def _depth_to_space(): def _impl(inputs, attr, params): + # Need to handle data layouts differently. + input_shape = attr['_input_shapes'][inputs[0]] block_size = int(attr['block_size']) - layout = attr['data_format'].decode("utf-8") - return _op.nn.depth_to_space(inputs[0], block_size, layout) + if attr['data_format'].decode("utf-8") == 'NHWC': + in_n, in_h, in_w, in_c = input_shape + new_c = int(in_c / (block_size * block_size)) + + # First expand input to larger dimension. + expanded = _op.reshape( + inputs[0], newshape=(in_n, in_h, in_w, block_size, block_size, new_c)) + # Now reorder to expand spatial blocks. + transposed = _op.transpose(expanded, axes=(0, 1, 3, 2, 4, 5)) + # Finally reshape to proper output. 
+ new_h = in_h * block_size + new_w = in_w * block_size + newshape = (in_n, new_h, new_w, new_c) + + else: # Handle NCHW layout + in_n, in_c, in_h, in_w = input_shape + new_c = int(in_c / (block_size * block_size)) + + expanded = _op.reshape( + inputs[0], newshape=(in_n, block_size, block_size, new_c, in_h, in_w)) + transposed = _op.transpose(expanded, axes=(0, 3, 4, 1, 5, 2)) + new_h = in_h * block_size + new_w = in_w * block_size + newshape = (in_n, new_c, new_h, new_w) + + return AttrCvt( + op_name="reshape", + extras={'newshape': newshape}, + ignores=['data_format', 'block_size'])([transposed], attr) return _impl def _space_to_depth(): def _impl(inputs, attr, params): + # Need to handle data layouts differently. + input_shape = attr['_input_shapes'][inputs[0]] block_size = int(attr['block_size']) - layout = attr['data_format'].decode("utf-8") - return _op.nn.space_to_depth(inputs[0], block_size, layout) + if attr['data_format'].decode("utf-8") == 'NHWC': + in_n, in_h, in_w, in_c = input_shape + new_h = int(in_h / block_size) + new_w = int(in_w / block_size) + + # First expand input to larger dimension. + expanded = _op.reshape( + inputs[0], newshape=(in_n, new_h, block_size, new_w, block_size, in_c)) + # Now reorder to expand spatial blocks. + transposed = _op.transpose(expanded, axes=(0, 1, 3, 2, 4, 5)) + # Finally reshape to proper output. + new_c = in_c * block_size * block_size + newshape = (in_n, new_h, new_w, new_c) + + else: # Handle NCHW layout + in_n, in_c, in_h, in_w = input_shape + new_h = int(in_h / block_size) + new_w = int(in_w / block_size) + + expanded = _op.reshape( + inputs[0], newshape=(in_n, in_c, new_h, block_size, new_w, block_size)) + transposed = _op.transpose(expanded, axes=(0, 3, 5, 1, 2, 4)) + new_c = int(in_c * block_size * block_size) + newshape = (in_n, new_c, new_h, new_w) + + return AttrCvt( + op_name="reshape", + extras={'newshape': newshape}, + ignores=['data_format', 'block_size'])([transposed], attr) return _impl @@ -1428,7 +1409,6 @@ def _impl(inputs, attr, params): 'ArgMin' : _argx(_op.argmin, 'argmin'), 'Assert' : _assert(), 'AvgPool' : _pooling('avg_pool'), - 'AvgPool3D' : _pool3d('avg_pool3d'), 'BatchMatMul' : _batch_matmul(), 'BatchMatMulV2' : _batch_matmul(), 'BatchNormWithGlobalNormalization' : _batch_norm(), @@ -1480,7 +1460,6 @@ def _impl(inputs, attr, params): 'MatMul' : _matmul(), 'Max' : _reduce('max'), 'MaxPool' : _pooling('max_pool'), - 'MaxPool3D' : _pool3d('max_pool3d'), 'Maximum' : _elemwise('maximum'), 'Mean' : _mean(), 'Min' : _reduce('min'), diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 944e51e636f5..d55cad7c7a2d 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -379,9 +379,9 @@ def log_softmax_grad(orig, grad): @register_gradient("nn.bias_add") def bias_add_grad(orig, grad): """Returns gradient of bias_add""" - data = orig.args[0] + data, bias = orig.args return [collapse_sum_like(grad, data), - _sum(grad, orig.attrs.axis, keepdims=False, exclude=True)] + collapse_sum_like(grad, bias)] @register_gradient("nn.dense") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 322325819fba..cd8a1311eaba 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -251,47 +251,6 @@ def legalize_conv2d(attrs, inputs, types): """ return topi.nn.conv2d_legalize(attrs, inputs, types) - -@reg.register_convert_op_layout("nn.conv2d") -def convert_conv2d(attrs, inputs, tinfos, desired_layout): - """Convert 
Layout pass registration for conv2d op. - - Parameters - ---------- - attrs : tvm.attrs.Attrs - Attributes of current convolution - inputs : list of tvm.relay.Expr - The args of the Relay expr to be legalized - tinfos : list of types - List of input and output types - desired_layout : str - The desired layout - - Returns - ------- - result : tvm.relay.Expr - The transformed expr - """ - - from tvm import relay - data_layout = attrs['data_layout'] - kernel_layout = attrs['kernel_layout'] - data, weight = inputs - assert desired_layout == 'NCHW', \ - "Currently only transformation to NCHW layout is supported." - if desired_layout == 'NCHW': - new_attrs = dict(attrs) - new_attrs['data_layout'] = desired_layout - new_attrs['kernel_layout'] = 'OIHW' - - if data_layout == 'NHWC' and kernel_layout == 'HWIO': - # Convert (NHWC, HWIO) to (NCHW, OIHW) - return relay.nn.conv2d(data, weight, **new_attrs) - if data_layout == 'NHWC' and kernel_layout == 'HWOI': - # Convert (NHWC, HWOI) to (NCHW, OIHW). Depthwise conv2d. - return relay.nn.conv2d(data, weight, **new_attrs) - return None - reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @@ -389,37 +348,6 @@ def legalize_conv2d_transpose(attrs, inputs, types): reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) -# conv1d_transpose -@reg.register_compute("nn.conv1d_transpose") -def compute_conv1d_transpose(attrs, inputs, out_dtype, target): - """Compute definition of conv1d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - assert layout == "NCW", "conv1d_transpose ncw only supported" - assert dilation == (1,), "conv1d_transpose dilation is not supported" - assert groups == 1, "conv1d_transpose groups == 1 only supported" - out = topi.nn.conv1d_transpose_ncw( - inputs[0], inputs[1], strides, padding, out_dtype) - output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, - [0, 0, 0], [0, 0, output_padding[0]]) - return [out] - - -@reg.register_schedule("nn.conv1d_transpose") -def schedule_conv1d_transpose(attrs, outs, target): - """Schedule definition of conv1d_transpose""" - with target: - return topi.generic.schedule_conv1d_transpose_ncw(outs) - -reg.register_pattern("nn.conv1d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) - # bias_add reg.register_schedule("nn.bias_add", schedule_injective) reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) @@ -437,18 +365,6 @@ def schedule_max_pool2d(attrs, outs, target): reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) -# max_pool3d -@reg.register_schedule("nn.max_pool3d") -def schedule_max_pool3d(attrs, outs, target): - """Schedule definition of max_pool3d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - -reg.register_pattern("nn.max_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) - - # avg_pool2d @reg.register_schedule("nn.avg_pool2d") def schedule_avg_pool2d(attrs, outs, target): @@ -457,19 +373,8 @@ def schedule_avg_pool2d(attrs, outs, target): with target: return topi.generic.schedule_pool(outs, layout) -reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) - -# avg_pool3d -@reg.register_schedule("nn.avg_pool3d") -def schedule_avg_pool3d(attrs, outs, target): - """Schedule definition of avg_pool3d""" - layout = 
attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - -reg.register_pattern("nn.avg_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) +reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d_grad @@ -582,25 +487,6 @@ def compute_upsampling(attrs, inputs, out_dtype, target): align_corners = attrs.align_corners return [topi.nn.upsampling(inputs[0], scale_h, scale_w, layout, method, align_corners)] -# upsampling3d -reg.register_schedule("nn.upsampling3d", reg.schedule_injective) - -def schedule_upsampling3d(_, outs, target): - """Schedule definition of upsampling3d""" - with target: - return topi.generic.schedule_injective(outs) - -@reg.register_compute("nn.upsampling3d") -def compute_upsampling3d(attrs, inputs, out_dtype, target): - scale_d = attrs.scale_d - scale_h = attrs.scale_h - scale_w = attrs.scale_w - layout = attrs.layout - method = attrs.method - coordinate_transformation_mode = attrs.coordinate_transformation_mode - return [topi.nn.upsampling3d(inputs[0], scale_d, scale_h, scale_w, layout, method,\ - coordinate_transformation_mode)] - # pad reg.register_schedule("nn.pad", schedule_broadcast) @@ -964,28 +850,6 @@ def compute_cross_entropy_with_logits(attrs, inputs, out_dtype, target): x, y = inputs return [-topi.sum(x * y) / x.shape[0]] - -@reg.register_compute("nn.depth_to_space") -def compute_depth_to_space(attrs, inputs, out_dtype, target): - block_size = attrs.block_size - layout = attrs.layout - mode = attrs.mode - return [topi.nn.depth_to_space(inputs[0], block_size, layout=layout, mode=mode)] - -reg.register_schedule("nn.depth_to_space", schedule_injective) -reg.register_pattern("nn.depth_to_space", OpPattern.INJECTIVE) - - -@reg.register_compute("nn.space_to_depth") -def compute_space_to_depth(attrs, inputs, out_dtype, target): - block_size = attrs.block_size - layout = attrs.layout - return [topi.nn.space_to_depth(inputs[0], block_size, layout=layout)] - -reg.register_schedule("nn.space_to_depth", schedule_injective) -reg.register_pattern("nn.space_to_depth", OpPattern.INJECTIVE) - - # shape func @script def _conv2d_NCHWc_shape_func(dshape, kshape, strides, padding, dilation, oc_bn): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ec360af6dd48..5e1c6a8c2616 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -257,72 +257,6 @@ def conv2d_transpose(data, kernel_layout, out_layout, output_padding, out_dtype) -def conv1d_transpose(data, - weight, - strides=(1,), - padding=(0,), - dilation=(1,), - groups=1, - channels=None, - kernel_size=None, - data_layout="NCW", - kernel_layout="OIW", - out_layout="", - output_padding=(0,), - out_dtype=""): - """One dimensional transposed convolution operator. - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - weight : tvm.relay.Expr - The weight expressions. - - strides : Tuple[int], optional - The strides of convolution. - - padding : Tuple[int], optional - The padding of convolution on both sides of inputs. - - dilation : Tuple[int], optional - Specifies the dilation rate to be used for dilated convolution. - - channels : int, optional - Number of output channels of this convolution. - - kernel_size : tuple of int, optional - The spatial of the convolution kernel. - - groups : int, optional - Number of groups for grouped convolution. - - data_layout : str, optional - Layout of the input. - - kernel_layout : str, optional - Layout of the weight. 
- - out_layout : Optional[str] - Layout of the output, by default, out_layout is the same as data_layout - - output_padding : Tuple[int], optional - Additional zero-padding to be added to one side of the output. - - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - return _make.conv1d_transpose(data, weight, strides, padding, dilation, - groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, output_padding, out_dtype) - - def softmax(data, axis=-1): r"""Computes softmax. @@ -425,51 +359,6 @@ def max_pool2d(data, return _make.max_pool2d(data, pool_size, strides, padding, layout, ceil_mode) -def max_pool3d(data, - pool_size=(1, 1, 1), - strides=(1, 1, 1), - padding=(0, 0, 0), - layout="NCDHW", - ceil_mode=False): - r"""3D maximum pooling operator. - - This operator takes data as input and does 3D max value calculation - with in pool_size sized window by striding defined by stride. - - - In the default case, where the data_layout is `NCDHW` - a data Tensor with shape `(batch_size, channels, depth, height, width)`, - to produce an output Tensor. - - The ceil_mode is used to take ceil or floor while computing out shape. - count_include_pad indicates including or excluding padded input values in computation. - This operator accepts data layout specification. - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - strides : tuple of int, optional - The strides of pooling. - - padding : tuple of int, optional - The padding for pooling. - - layout : str, optional - Layout of the input. - - ceil_mode : bool, optional - To enable or disable ceil while pooling. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - return _make.max_pool3d(data, pool_size, strides, padding, - layout, ceil_mode) - def avg_pool2d(data, pool_size=(1, 1), strides=(1, 1), @@ -527,55 +416,6 @@ def avg_pool2d(data, return _make.avg_pool2d(data, pool_size, strides, padding, layout, ceil_mode, count_include_pad) -def avg_pool3d(data, - pool_size=(1, 1, 1), - strides=(1, 1, 1), - padding=(0, 0, 0), - layout="NCDHW", - ceil_mode=False, - count_include_pad=False): - r"""3D average pooling operator. - - This operator takes data as input and does 3D average value calculation - with in pool_size sized window by striding defined by stride - - - In the default case, where the data_layout is `NCDHW` - a data Tensor with shape `(batch_size, channels, depthm height, width)`, - to produce an output Tensor. - - The ceil_mode is used to take ceil or floor while computing out shape. - count_include_pad indicates including or excluding padded input values in computation. - This operator accepts data layout specification. - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - strides : tuple of int, optional - The strides of pooling. - - padding : tuple of int, optional - The padding for pooling. - - layout : str, optional - Layout of the input. - - ceil_mode : bool, optional - To enable or disable ceil while pooling. - - count_include_pad : bool, optional - To include padding to compute the average. - - Returns - ------- - result : tvm.relay.Expr - The computed result. 
- """ - return _make.avg_pool3d(data, pool_size, strides, padding, - layout, ceil_mode, count_include_pad) - def max_pool2d_grad(out_grad, data, pool_size=(1, 1), @@ -771,58 +611,6 @@ def upsampling(data, return _make.upsampling(data, scale_h, scale_w, layout, method, align_corners) -def upsampling3d(data, - scale_d=1, - scale_h=1, - scale_w=1, - layout="NCDHW", - method="nearest_neighbor", - coordinate_transformation_mode="half_pixel"): - """3D Upsampling. - - This operator takes data as input and does 3D scaling to the given scale factor. - In the default case, where the data_layout is `NCDHW` - with data of shape (n, c, d, h, w) - out will have a shape (n, c, d*scale_d, h*scale_h, w*scale_w) - - method indicates the algorithm to be used while calculating the out value - and method can be one of ("trilinear", "nearest_neighbor") - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - scale_d : tvm.relay.Expr - The scale factor for depth upsampling. - - scale_h : tvm.relay.Expr - The scale factor for height upsampling. - - scale_w : tvm.relay.Expr - The scale factor for width upsampling. - - layout : str, optional - Layout of the input. - - method : str, optional - Scale method to used [nearest_neighbor, trilinear]. - - coordinate_transformation_mode: string, optional - Describes how to transform the coordinate in the resized tensor - to the coordinate in the original tensor. - Refer to the ONNX Resize operator specification for details. - Available options are "half_pixel", "align_corners" and "asymmetric". - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - return _make.upsampling3d(data, scale_d, scale_h, scale_w, layout, method, - coordinate_transformation_mode) - - def batch_flatten(data): """BatchFlatten. @@ -2131,53 +1919,3 @@ def cross_entropy_with_logits(predictions, targets): The computed result. """ return _make.cross_entropy_with_logits(predictions, targets) - - -def depth_to_space(data, block_size, layout='NCHW', mode='DCR'): - """Convert channels into spatial blocks. - - Parameters - ---------- - data : tvm.relay.Expr - Input data with channels divisible by block_size**2 - - block_size : int - Size of blocks to convert channels into. - - layout : string - One of NCHW or NHWC, indicates channel axis. - - mode : string - One of DCR or CDR, indicates which order channels - are accessed in. - - Returns - ------- - result : tvm.relay.Expr - Tensor with shape [in_batch, in_channel / block_size * block_size, - in_height * block_size, in_width * block_size] - """ - return _make.depth_to_space(data, block_size, layout, mode) - - -def space_to_depth(data, block_size, layout='NCHW'): - """Convert spatial blocks into channels. - - Parameters - ---------- - data : tvm.relay.Expr - Input data with spatial dimensions divisible by block_size - - block_size : int - Size of blocks to decompose into channels. - - layout : string - One of NCHW or NHWC, indicates channel axis. 
- - Returns - ------- - result : tvm.relay.Expr - Tensor with shape [in_batch, in_channel * block_size * block_size, - in_height / block_size, in_width / block_size] - """ - return _make.space_to_depth(data, block_size, layout) diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 382f667b86a9..355496e42b48 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -196,23 +196,6 @@ def register_alter_op_layout(op_name, alter_layout=None, level=10): return register(op_name, "FTVMAlterOpLayout", alter_layout, level) -def register_convert_op_layout(op_name, convert_layout=None, level=10): - """Register convert op layout function for an op - - Parameters - ---------- - op_name : str - The name of the operator - - convert_layout: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr - The function for changing the layout or replacing the operator - - level : int - The priority level - """ - return register(op_name, "FTVMConvertOpLayout", convert_layout, level) - - def register_legalize(op_name, legal_op=None, level=10): """Register legal transformation function for an op diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index e0887e5ff872..35b2c053f8cf 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -63,10 +63,6 @@ class FIFOBufferAttrs(Attrs): class UpSamplingAttrs(Attrs): """Attributes for nn.upsampling""" -@register_relay_attr_node -class UpSampling3DAttrs(Attrs): - """Attributes for nn.upsampling3d""" - @register_relay_attr_node class PadAttrs(Attrs): """Attributes for nn.pad""" @@ -275,16 +271,6 @@ class AvgPool2DAttrs(Attrs): """Attributes used in avg_pool2d operators""" -@register_relay_attr_node -class MaxPool3DAttrs(Attrs): - """Attributes used in max_pool3d operators""" - - -@register_relay_attr_node -class AvgPool3DAttrs(Attrs): - """Attributes used in avg_pool3d operators""" - - @register_relay_attr_node class BitPackAttrs(Attrs): """Attributes used in bitpack operator""" @@ -303,8 +289,3 @@ class BinaryDenseAttrs(Attrs): @register_relay_attr_node class Conv2DTransposeAttrs(Attrs): """Attributes used in Transposed Conv2D operators""" - - -@register_relay_attr_node -class SubPixelAttrs(Attrs): - """Attributes used in depth to space and space to depth operators""" diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py index 21254fa61e8e..aae50519b132 100644 --- a/python/tvm/relay/quantize/_calibrate.py +++ b/python/tvm/relay/quantize/_calibrate.py @@ -53,18 +53,11 @@ def collect_stats(mod, dataset): logging.info("collecting statistics for calibration...") func = mod['main'] func = _quantize.CreateStatsCollector(func) - - if tvm.target.current_target(): - target = tvm.target.current_target() - ctx = tvm.context(target.target_name) - else: - target = 'llvm' - ctx = tvm.context(target) - + target = tvm.target.current_target() or 'llvm' with _transform.build_config(opt_level=3): graph, lib, params = _build_module.build(func, target=target) outputs = [] - runtime = graph_runtime.create(graph, lib, ctx) + runtime = graph_runtime.create(graph, lib, tvm.context(target)) runtime.set_input(**params) num_outputs = runtime.get_num_outputs() diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py index e3d6e7df0b98..79d0d8257953 100644 --- a/python/tvm/relay/testing/tf.py +++ b/python/tvm/relay/testing/tf.py @@ -28,12 +28,8 @@ # Tensorflow imports import tensorflow as tf from tensorflow.core.framework import graph_pb2 -from 
tvm.contrib.download import download_testdata -try: - tf_compat_v1 = tf.compat.v1 -except ImportError: - tf_compat_v1 = tf +from tvm.contrib.download import download_testdata ###################################################################### # Some helper functions @@ -84,7 +80,7 @@ def AddShapesToGraphDef(session, out_node): """ - graph_def = tf_compat_v1.graph_util.convert_variables_to_constants( + graph_def = tf.compat.v1.graph_util.convert_variables_to_constants( session, session.graph.as_graph_def(add_shapes=True), [out_node], @@ -116,13 +112,13 @@ def load(self, label_lookup_path, uid_lookup_path): dict from integer node ID to human-readable string. """ - if not tf_compat_v1.gfile.Exists(uid_lookup_path): + if not tf.compat.v1.io.gfile.exists(uid_lookup_path): tf.logging.fatal('File does not exist %s', uid_lookup_path) - if not tf_compat_v1.gfile.Exists(label_lookup_path): + if not tf.compat.v1.io.gfile.exists(label_lookup_path): tf.logging.fatal('File does not exist %s', label_lookup_path) # Loads mapping from string UID to human-readable string - proto_as_ascii_lines = tf_compat_v1.gfile.GFile(uid_lookup_path).readlines() + proto_as_ascii_lines = tf.compat.v1.gfile.GFile(uid_lookup_path).readlines() uid_to_human = {} p = re.compile(r'[n\d]*[ \S,]*') for line in proto_as_ascii_lines: @@ -133,7 +129,7 @@ def load(self, label_lookup_path, uid_lookup_path): # Loads mapping from string UID to integer node ID. node_id_to_uid = {} - proto_as_ascii = tf_compat_v1.gfile.GFile(label_lookup_path).readlines() + proto_as_ascii = tf.compat.v1.gfile.GFile(label_lookup_path).readlines() for line in proto_as_ascii: if line.startswith(' target_class:'): target_class = int(line.split(': ')[1]) @@ -213,7 +209,7 @@ def get_workload(model_path, model_sub_path=None): path_model = download_testdata(model_url, model_path, module='tf') # Creates graph from saved graph_def.pb. - with tf_compat_v1.gfile.FastGFile(path_model, 'rb') as f: + with tf.compat.v1.gfile.FastGFile(path_model, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) graph = tf.import_graph_def(graph_def, name='') @@ -303,7 +299,7 @@ def _create_ptb_vocabulary(data_dir): file_name = 'ptb.train.txt' def _read_words(filename): """Read the data for creating vocabulary""" - with tf_compat_v1.gfile.GFile(filename, "r") as f: + with tf.compat.v1.gfile.GFile(filename, "r") as f: return f.read().encode("utf-8").decode("utf-8").replace("\n", "").split() def _build_vocab(filename): diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index 1f91272769b4..540c1f5b79cd 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -460,34 +460,6 @@ def AlterOpLayout(): return _transform.AlterOpLayout() -def ConvertLayout(desired_layout): - """ Given a dest layout, this pass transforms the expr such that most of the ops input data - layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, - one at the start and one at the end. - - This pass is not a part of relay.build and is expected to be called between framework-relay - parser and relay.build call. This is very helpful for hardware backends that support/prefer only - type of data layout. - - RFC - https://discuss.tvm.ai/t/layout-conversion-pass/4009 - - This pass uses most of the AlterOpLayout and InferCorrectLayout infrastructure. We can define - new layouts for conv2d ops for now. Most of the other operators try to adapt to their input - layout using the InferCorrectLayout infrastructure. 
- - Parameters - ---------- - desired_layout : str - The desired layout for the transformed expr. - - Returns - ------- - pass: FunctionPass - The pass. - """ - return _transform.ConvertLayout(desired_layout) - - def Legalize(legalize_map_attr_name="FTVMLegalize"): """Legalizes an expression with another expression. This pass can be used to replace an expr with another expr for target diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 8467f6a92ea8..02e2c7c67c99 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -22,7 +22,7 @@ members = [ "runtime", "runtime/tests/test_tvm_basic", "runtime/tests/test_tvm_dso", - "runtime/tests/test_nn", + "runtime/tests/test_nnvm", "frontend", "frontend/tests/basics", "frontend/tests/callback", diff --git a/rust/frontend/Cargo.toml b/rust/frontend/Cargo.toml index 3f99188a40f5..c6b56800ef59 100644 --- a/rust/frontend/Cargo.toml +++ b/rust/frontend/Cargo.toml @@ -23,7 +23,7 @@ description = "Rust frontend support for TVM" repository = "https://github.com/apache/incubator-tvm" homepage = "https://github.com/apache/incubator-tvm" readme = "README.md" -keywords = ["rust", "tvm"] +keywords = ["rust", "tvm", "nnvm"] categories = ["api-bindings", "science"] authors = ["TVM Contributors"] edition = "2018" diff --git a/rust/frontend/README.md b/rust/frontend/README.md index c61ba847c1f2..b77a4bd156ef 100644 --- a/rust/frontend/README.md +++ b/rust/frontend/README.md @@ -35,12 +35,14 @@ Here's a Python snippet for downloading and building a pretrained Resnet18 via A ```python block = get_model('resnet18_v1', pretrained=True) - -sym, params = relay.frontend.from_mxnet(block, shape_dict) + +sym, params = nnvm.frontend.from_mxnet(block) +# add the softmax layer for prediction +net = nnvm.sym.softmax(sym) # compile the model -with relay.build_config(opt_level=opt_level): - graph, lib, params = relay.build( - net, target, params=params) +with nnvm.compiler.build_config(opt_level=opt_level): + graph, lib, params = nnvm.compiler.build( + net, target, shape={"data": data_shape}, params=params) # same the model artifacts lib.save(os.path.join(target_dir, "deploy_lib.o")) cc.create_shared(os.path.join(target_dir, "deploy_lib.so"), @@ -49,7 +51,7 @@ cc.create_shared(os.path.join(target_dir, "deploy_lib.so"), with open(os.path.join(target_dir, "deploy_graph.json"), "w") as fo: fo.write(graph.json()) with open(os.path.join(target_dir,"deploy_param.params"), "wb") as fo: - fo.write(relay.save_param_dict(params)) + fo.write(nnvm.compiler.save_param_dict(params)) ``` Now, we need to input the artifacts to create and run the *Graph Runtime* to detect our input cat image @@ -111,7 +113,7 @@ and the model correctly predicts the input image as **tiger cat**. Please follow TVM [installations](https://docs.tvm.ai/install/index.html), `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`. -*Note:* To run the end-to-end examples and tests, `tvm` and `topi` need to be added to your `PYTHONPATH` or it's automatic via an Anaconda environment when it is installed individually. +*Note:* To run the end-to-end examples and tests, `tvm`, `nnvm` and `topi` need to be added to your `PYTHONPATH` or it's automatic via an Anaconda environment when it is installed individually. 
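As a counterpart to the build-and-save snippet above, a sketch of loading the saved artifacts back with the Python graph runtime before handing them to the Rust frontend; the paths follow the snippet and the input shape is illustrative:

```python
import numpy as np
import tvm
from tvm.contrib import graph_runtime

lib = tvm.module.load("deploy_lib.so")
graph_json = open("deploy_graph.json").read()
params_bytes = bytearray(open("deploy_param.params", "rb").read())

module = graph_runtime.create(graph_json, lib, tvm.cpu(0))
module.load_params(params_bytes)
module.set_input("data", np.random.uniform(size=(1, 3, 224, 224)).astype("float32"))
module.run()
top1 = module.get_output(0).asnumpy().argmax()
```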
## Supported TVM Functionalities diff --git a/rust/frontend/examples/resnet/README.md b/rust/frontend/examples/resnet/README.md index 29274743a1ce..3ce4a778e4bd 100644 --- a/rust/frontend/examples/resnet/README.md +++ b/rust/frontend/examples/resnet/README.md @@ -18,11 +18,11 @@ ## Resnet example This end-to-end example shows how to: -* build `Resnet 18` with `tvm` from Python +* build `Resnet 18` with `tvm` and `nnvm` from Python * use the provided Rust frontend API to test for an input image -To run the example with pretrained resnet weights, first `tvm` and `mxnet` must be installed for the python build. To install mxnet for cpu, run `pip install mxnet` -and to install `tvm` with `llvm` follow the [TVM installation guide](https://docs.tvm.ai/install/index.html). +To run the example with pretrained resnet weights, first `tvm`, `nnvm` and `mxnet` must be installed for the python build. To install mxnet for cpu, run `pip install mxnet` +and to install `tvm` and `nnvm` with `llvm` follow the [TVM installation guide](https://docs.tvm.ai/install/index.html). * **Build the example**: `cargo build diff --git a/rust/runtime/Cargo.toml b/rust/runtime/Cargo.toml index f0d24595f0a1..34acc77899e9 100644 --- a/rust/runtime/Cargo.toml +++ b/rust/runtime/Cargo.toml @@ -22,7 +22,7 @@ license = "Apache-2.0" description = "A static TVM runtime" repository = "https://github.com/apache/incubator-tvm" readme = "README.md" -keywords = ["tvm"] +keywords = ["tvm", "nnvm"] categories = ["api-bindings", "science"] authors = ["TVM Contributors"] edition = "2018" diff --git a/rust/runtime/src/graph.rs b/rust/runtime/src/graph.rs index 42b9458223a6..cacd7a38a97f 100644 --- a/rust/runtime/src/graph.rs +++ b/rust/runtime/src/graph.rs @@ -440,7 +440,7 @@ named!( ) ); -/// Loads a param dict saved using `relay.save_param_dict`. +/// Loads a param dict saved using `nnvm.compiler.save_param_dict`. pub fn load_param_dict(bytes: &[u8]) -> Result, GraphFormatError> { if let Ok((remaining_bytes, param_dict)) = parse_param_dict(bytes) { if remaining_bytes.len() == 0 { diff --git a/rust/runtime/src/threading.rs b/rust/runtime/src/threading.rs index f05faf73566c..3f25309741ec 100644 --- a/rust/runtime/src/threading.rs +++ b/rust/runtime/src/threading.rs @@ -296,7 +296,7 @@ pub(crate) fn sgx_join_threads() { ocall_packed!("__sgx_thread_group_join__", 0); } -// @see issue 988 for information on why this function is used. +// @see https://github.com/apache/incubator-tvm/issues/988 for information on why this function is used. #[no_mangle] pub extern "C" fn TVMBackendParallelBarrier(_task_id: usize, penv: *const TVMParallelGroupEnv) { let barrier: &Arc = unsafe { &*((*penv).sync_handle as *const Arc) }; diff --git a/rust/runtime/tests/build_model.py b/rust/runtime/tests/build_model.py index e3da95f24fd8..bed3c0aa2da8 100755 --- a/rust/runtime/tests/build_model.py +++ b/rust/runtime/tests/build_model.py @@ -16,37 +16,56 @@ # specific language governing permissions and limitations # under the License. 
-"""Builds a simple graph for testing.""" +"""Builds a simple NNVM graph for testing.""" from os import path as osp +import nnvm +from nnvm import sym +from nnvm.compiler import graph_util +from nnvm.testing import init import numpy as np import tvm -from tvm import relay -from tvm.relay import testing CWD = osp.dirname(osp.abspath(osp.expanduser(__file__))) + def _get_model(dshape): - data = relay.var('data', shape=dshape) - fc = relay.nn.dense(data, relay.var("dense_weight"), units=dshape[-1]*2) - fc = relay.nn.bias_add(data, relay.var("dense_bias")) - left, right = relay.split(fc, indices_or_sections=2, axis=1) - one = relay.const(1, dtype="float32") - return relay.Tuple([(left + one), (right - one), fc]) + data = sym.Variable('data', shape=dshape) + fc1 = sym.dense(data, units=dshape[-1]*2, use_bias=True) + left, right = sym.split(fc1, indices_or_sections=2, axis=1) + return sym.Group(((left + 1), (right - 1))) + +def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10): + if isinstance(graph, sym.Symbol): + graph = nnvm.graph.create(graph) + ishapes, _ = graph_util.infer_shape(graph, **input_shapes) + param_shapes = dict(zip(graph.index.input_names, ishapes)) + np.random.seed(seed) + params = {} + for param, shape in param_shapes.items(): + if param in {'data', 'label'} or not shape: + continue + init_value = np.empty(shape).astype('float32') + initializer(param, init_value) + params[param] = tvm.nd.array(init_value) + return params def main(): dshape = (32, 16) net = _get_model(dshape) - mod, params = testing.create_workload(net) - graph, lib, params = relay.build( - mod, 'llvm', params=params) + ishape_dict = {'data': dshape} + params = _init_params(net, ishape_dict) + graph, lib, params = nnvm.compiler.build(net, 'llvm', + shape=ishape_dict, + params=params, + dtype='float32') with open(osp.join(CWD, 'graph.json'), 'w') as f_resnet: - f_resnet.write(graph) + f_resnet.write(graph.json()) with open(osp.join(CWD, 'graph.params'), 'wb') as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(nnvm.compiler.save_param_dict(params)) if __name__ == '__main__': main() diff --git a/rust/runtime/tests/test_nn/src/build_test_graph.py b/rust/runtime/tests/test_nn/src/build_test_graph.py deleted file mode 100755 index dd7621b921f7..000000000000 --- a/rust/runtime/tests/test_nn/src/build_test_graph.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Builds a simple graph for testing.""" - -from os import path as osp -import sys - -import numpy as np -import tvm -from tvm import relay -from tvm.relay import testing - - -def _get_model(dshape): - data = relay.var('data', shape=dshape) - fc = relay.nn.dense(data, relay.var("dense_weight"), units=dshape[-1]*2) - fc = relay.nn.bias_add(data, relay.var("dense_bias")) - left, right = relay.split(fc, indices_or_sections=2, axis=1) - one = relay.const(1, dtype="float32") - return relay.Tuple([(left + one), (right - one), fc]) - -def main(): - dshape = (4, 8) - net = _get_model(dshape) - mod, params = testing.create_workload(net) - graph, lib, params = relay.build( - mod, 'llvm --system-lib', params=params) - - out_dir = sys.argv[1] - lib.save(osp.join(sys.argv[1], 'graph.o')) - with open(osp.join(out_dir, 'graph.json'), 'w') as f_resnet: - f_resnet.write(graph) - - with open(osp.join(out_dir, 'graph.params'), 'wb') as f_params: - f_params.write(relay.save_param_dict(params)) - -if __name__ == '__main__': - main() diff --git a/rust/runtime/tests/test_nn/Cargo.toml b/rust/runtime/tests/test_nnvm/Cargo.toml similarity index 98% rename from rust/runtime/tests/test_nn/Cargo.toml rename to rust/runtime/tests/test_nnvm/Cargo.toml index afd218817104..93fdef4f61c3 100644 --- a/rust/runtime/tests/test_nn/Cargo.toml +++ b/rust/runtime/tests/test_nnvm/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [package] -name = "test-nn" +name = "test-nnvm" version = "0.0.0" license = "Apache-2.0" authors = ["TVM Contributors"] diff --git a/rust/runtime/tests/test_nn/build.rs b/rust/runtime/tests/test_nnvm/build.rs similarity index 100% rename from rust/runtime/tests/test_nn/build.rs rename to rust/runtime/tests/test_nnvm/build.rs diff --git a/rust/runtime/tests/test_nnvm/src/build_test_graph.py b/rust/runtime/tests/test_nnvm/src/build_test_graph.py new file mode 100755 index 000000000000..69ec6d24dbef --- /dev/null +++ b/rust/runtime/tests/test_nnvm/src/build_test_graph.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Builds a simple NNVM graph for testing.""" + +from os import path as osp +import sys + +import nnvm +from nnvm import sym +from nnvm.compiler import graph_util +from nnvm.testing import init +import numpy as np +import tvm + + +def _get_model(dshape): + data = sym.Variable('data', shape=dshape) + fc = sym.dense(data, units=dshape[-1]*2, use_bias=True) + left, right = sym.split(fc, indices_or_sections=2, axis=1) + return sym.Group(((left + 1), (right - 1), fc)) + + +def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10): + if isinstance(graph, sym.Symbol): + graph = nnvm.graph.create(graph) + + ishapes, _ = graph_util.infer_shape(graph, **input_shapes) + param_shapes = dict(zip(graph.index.input_names, ishapes)) + np.random.seed(seed) + params = {} + for param, shape in param_shapes.items(): + if param in {'data', 'label'} or not shape: + continue + + init_value = np.arange(np.product(shape), 0, -1).reshape(*shape).astype('float32') + if param.endswith('_bias'): + params[param] = tvm.nd.array(init_value) + continue + + init_value = np.empty(shape).astype('float32') + initializer(param, init_value) + # init_value /= init_value.sum() + 1e-10 + params[param] = tvm.nd.array(init_value) + + return params + +def main(): + dshape = (4, 8) + net = _get_model(dshape) + ishape_dict = {'data': dshape} + params = _init_params(net, ishape_dict) + graph, lib, params = nnvm.compiler.build(net, 'llvm --system-lib', + shape=ishape_dict, + params=params, + dtype='float32') + + out_dir = sys.argv[1] + lib.save(osp.join(sys.argv[1], 'graph.o')) + with open(osp.join(out_dir, 'graph.json'), 'w') as f_resnet: + f_resnet.write(graph.json()) + + with open(osp.join(out_dir, 'graph.params'), 'wb') as f_params: + f_params.write(nnvm.compiler.save_param_dict(params)) + +if __name__ == '__main__': + main() diff --git a/rust/runtime/tests/test_nn/src/main.rs b/rust/runtime/tests/test_nnvm/src/main.rs similarity index 100% rename from rust/runtime/tests/test_nn/src/main.rs rename to rust/runtime/tests/test_nnvm/src/main.rs diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index 03f37b171782..9312c5532302 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -30,7 +30,7 @@ namespace tvm { namespace ir { TVM_REGISTER_API("_Var") -.set_body_typed([](std::string s, DataType t) { +.set_body_typed([](std::string s, Type t) { return Variable::make(t, s); }); @@ -75,7 +75,7 @@ TVM_REGISTER_API("make.For") TVM_REGISTER_API("make.Load") .set_body([](TVMArgs args, TVMRetValue *ret) { - DataType t = args[0]; + Type t = args[0]; if (args.size() == 3) { *ret = Load::make(t, args[1], args[2], const_true(t.lanes())); } else { @@ -87,7 +87,7 @@ TVM_REGISTER_API("make.Store") .set_body([](TVMArgs args, TVMRetValue *ret) { Expr value = args[1]; if (args.size() == 3) { - *ret = Store::make(args[0], value, args[2], const_true(value.dtype().lanes())); + *ret = Store::make(args[0], value, args[2], const_true(value.type().lanes())); } else { *ret = Store::make(args[0], value, args[2], args[3]); } @@ -97,8 +97,8 @@ TVM_REGISTER_API("make.Realize") .set_body_typed(Realize::make); TVM_REGISTER_API("make.Call") -.set_body_typed, int, FunctionRef, int)>([]( - DataType type, std::string name, +.set_body_typed, int, FunctionRef, int)>([]( + Type type, std::string name, Array args, int call_type, FunctionRef func, int value_index ) { @@ -166,8 +166,8 @@ TVM_REGISTER_API("make.Block") // has default args TVM_REGISTER_API("make.Allocate") - .set_body_typed, Expr, Stmt)>([]( - VarExpr buffer_var, DataType type, Array extents, 
Expr condition, Stmt body + .set_body_typed, Expr, Stmt)>([]( + VarExpr buffer_var, Type type, Array extents, Expr condition, Stmt body ){ return Allocate::make(buffer_var, type, extents, condition, body); }); diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index 8a74fe5cdb7d..f3d6c5f6ab62 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -35,10 +35,10 @@ namespace tvm { TVM_REGISTER_API("_min_value") -.set_body_typed(min_value); +.set_body_method(&DataType::min); TVM_REGISTER_API("_max_value") -.set_body_typed(max_value); +.set_body_method(&DataType::max); TVM_REGISTER_API("_const") .set_body([](TVMArgs args, TVMRetValue* ret) { @@ -67,7 +67,7 @@ TVM_REGISTER_API("_Array") } auto node = make_node(); node->data = std::move(data); - *ret = Array(node); + *ret = runtime::ObjectRef(node); }); TVM_REGISTER_API("_ArrayGetItem") @@ -100,28 +100,28 @@ TVM_REGISTER_API("_Map") for (int i = 0; i < args.num_args; i += 2) { CHECK(args[i].type_code() == kStr) << "key of str map need to be str"; - CHECK(args[i + 1].IsObjectRef()) + CHECK(args[i + 1].type_code() == kObjectHandle) << "value of the map to be NodeRef"; data.emplace(std::make_pair(args[i].operator std::string(), args[i + 1].operator ObjectRef())); } auto node = make_node(); node->data = std::move(data); - *ret = Map(node); + *ret = node; } else { // Container node. MapNode::ContainerType data; for (int i = 0; i < args.num_args; i += 2) { - CHECK(args[i].IsObjectRef()) - << "key of str map need to be object"; - CHECK(args[i + 1].IsObjectRef()) + CHECK(args[i].type_code() == kObjectHandle) + << "key of str map need to be str"; + CHECK(args[i + 1].type_code() == kObjectHandle) << "value of map to be NodeRef"; data.emplace(std::make_pair(args[i].operator ObjectRef(), args[i + 1].operator ObjectRef())); } auto node = make_node(); node->data = std::move(data); - *ret = Map(node); + *ret = node; } }); @@ -191,7 +191,7 @@ TVM_REGISTER_API("_MapItems") rkvs->data.push_back(kv.first); rkvs->data.push_back(kv.second); } - *ret = Array(rkvs); + *ret = rkvs; } else { auto* n = static_cast(ptr); auto rkvs = make_node(); @@ -199,7 +199,7 @@ TVM_REGISTER_API("_MapItems") rkvs->data.push_back(ir::StringImm::make(kv.first)); rkvs->data.push_back(kv.second); } - *ret = Array(rkvs); + *ret = rkvs; } }); @@ -287,8 +287,8 @@ TVM_REGISTER_API("_TensorHash") }); TVM_REGISTER_API("_Placeholder") -.set_body_typed, DataType, std::string)>([]( - Array shape, DataType dtype, std::string name +.set_body_typed, Type, std::string)>([]( + Array shape, Type dtype, std::string name ) { return placeholder(shape, dtype, name); }); diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index c62cc8ad16a0..4210788d52b5 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -159,7 +159,9 @@ REGISTER_PASS(InjectPrefetch); REGISTER_PASS(InjectDoubleBuffer); REGISTER_PASS(LoopPartition); REGISTER_PASS(RemoveNoOp); +REGISTER_PASS(SplitPipeline); REGISTER_PASS(LiftAttrScope); +REGISTER_PASS(NarrowChannelAccess); REGISTER_PASS(LowerThreadAllreduce); REGISTER_PASS(LowerWarpMemory); REGISTER_PASS(RemapThreadAxis); diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc index 19f045241915..31fedcc72cde 100644 --- a/src/arithmetic/bound_deducer.cc +++ b/src/arithmetic/bound_deducer.cc @@ -132,7 +132,7 @@ class BoundDeducer: public IRVisitor { Expr target_var = left ? 
op->a : op->b; SignType sign_operand; - if (operand.dtype().is_uint()) { + if (operand.type().is_uint()) { sign_operand = kPositive; } else { sign_operand = expr_map_[operand].sign_type(); diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc index 022dd8e94dbb..1b576a645824 100644 --- a/src/arithmetic/canonical_simplify.cc +++ b/src/arithmetic/canonical_simplify.cc @@ -115,7 +115,7 @@ class SplitExprNode : public CanonicalExprNode { Expr NormalizeWithScale(int64_t sscale) const { Expr res = this->index; - DataType dtype = this->dtype; + Type dtype = this->type; if (this->scale == 0) { return make_const(dtype, 0); } @@ -190,9 +190,9 @@ class SumExprNode : public CanonicalExprNode { Expr Normalize() const final { // quick path 1. if (this->args.size() == 0) { - return make_const(this->dtype, this->base); + return make_const(this->type, this->base); } - return Normalize_(this->dtype, + return Normalize_(this->type, SimplifySplitExprs(args), base); } @@ -379,7 +379,7 @@ class SumExprNode : public CanonicalExprNode { std::stable_sort(args.begin(), args.end(), fcompare); return args; } - static Expr Normalize_(DataType dtype, + static Expr Normalize_(Type dtype, const std::vector& args, int64_t base) { // Positive scales first @@ -508,7 +508,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { expr = op->Normalize(); } NodePtr n = make_node(); - n->dtype = expr.dtype(); + n->type = expr.type(); n->index = std::move(expr); n->div_mode = kTruncDiv; return SplitExpr(n); @@ -545,7 +545,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { return GetRef(op); } NodePtr n = make_node(); - n->dtype = expr.dtype(); + n->type = expr.type(); if (const auto* op = expr.as()) { n->base = op->value; return SumExpr(n); @@ -560,7 +560,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { Expr CanonicalSimplifier::Impl:: Mutate_(const Add* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } // normalize @@ -586,7 +586,7 @@ Mutate_(const Add* op, const Expr& self) { Expr CanonicalSimplifier::Impl:: Mutate_(const Sub* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } // normalize @@ -613,7 +613,7 @@ Mutate_(const Sub* op, const Expr& self) { Expr CanonicalSimplifier::Impl:: Mutate_(const Mul* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } // normalize @@ -657,8 +657,8 @@ SeparateDivisibleParts(const SumExprNode* psum, SumExpr* out_non_divisible) { auto divisible = make_node(); auto non_divisible = make_node(); - divisible->dtype = psum->dtype; - non_divisible->dtype = psum->dtype; + divisible->type = psum->type; + non_divisible->type = psum->type; if (psum->base % coeff == 0) { divisible->base = psum->base; @@ -698,11 +698,11 @@ SplitDivConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { return lhs; } else if (lhs->upper_factor <= (lhs->lower_factor * scaled_cval)) { // (x % c1) / c2 => 0 when c2 >= c1 - return ToSplitExpr(make_zero(lhs.dtype())); + return ToSplitExpr(make_zero(lhs.type())); } else { // move the upper_factor modular into index. 
lhs.CopyOnWrite()->index = - ModImpl(lhs->index, make_const(lhs.dtype(), lhs->upper_factor), div_mode); + ModImpl(lhs->index, make_const(lhs.type(), lhs->upper_factor), div_mode); lhs.CopyOnWrite()->upper_factor = SplitExprNode::kPosInf; lhs.CopyOnWrite()->scale = 1; lhs.CopyOnWrite()->lower_factor *= scaled_cval; @@ -720,7 +720,7 @@ SplitDivConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { Expr CanonicalSimplifier::Impl:: Mutate_(const Div* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } @@ -764,7 +764,7 @@ Mutate_(const Div* op, const Expr& self) { // if a >= 0 && a < cval, then result == 0 auto cbound = analyzer_->const_int_bound(Normalize(a)); if (cbound->min_value >= 0 && cbound->max_value < cval) { - return make_zero(a.dtype()); + return make_zero(a.type()); } } return SplitDivConst(ToSplitExpr(std::move(a)), cval, kTruncDiv); @@ -781,7 +781,7 @@ Mutate_(const Div* op, const Expr& self) { Expr CanonicalSimplifier::Impl:: Mutate_(const FloorDiv* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } Expr a = this->CanonicalMutate(op->a); @@ -820,7 +820,7 @@ Mutate_(const FloorDiv* op, const Expr& self) { // if a >= 0 && a < cval, then result == 0 auto cbound = analyzer_->const_int_bound(Normalize(a)); if (cbound->min_value >= 0 && cbound->max_value < cval) { - return make_zero(a.dtype()); + return make_zero(a.type()); } } return SplitDivConst(ToSplitExpr(std::move(a)), cval, kFloorDiv); @@ -859,7 +859,7 @@ SplitModConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { if (new_upper_factor < lhs->upper_factor && lhs->upper_factor != SplitExprNode::kPosInf) { auto updated = ToSplitExpr(Mutate(ModImpl( - lhs->index, make_const(lhs.dtype(), new_upper_factor), div_mode))); + lhs->index, make_const(lhs.type(), new_upper_factor), div_mode))); // re-apply the lower_factor if (lhs->lower_factor != 1) { return SplitDivConst(updated, lhs->lower_factor, div_mode); @@ -887,7 +887,7 @@ SplitModConst(SplitExpr lhs, int64_t cval, DivMode div_mode) { Expr CanonicalSimplifier::Impl:: Mutate_(const Mod* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } // normalize @@ -906,7 +906,7 @@ Mutate_(const Mod* op, const Expr& self) { SumExpr lhs, extra; SeparateDivisibleParts(psum, cval, &lhs, &extra); if (extra->IsZero()) { - return make_zero(a.dtype()); + return make_zero(a.type()); } // both lhs and extra are non-negative if (analyzer_->CanProveGreaterEqual(lhs->Normalize(), 0) && @@ -957,7 +957,7 @@ Mutate_(const Mod* op, const Expr& self) { Expr CanonicalSimplifier::Impl:: Mutate_(const FloorMod* op, const Expr& self) { - if (!IsIndexType(op->dtype)) { + if (!IsIndexType(op->type)) { return Rewriter::Mutate_(op, self); } // normalize diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h index 806587ab75aa..4b001cfb8610 100644 --- a/src/arithmetic/compute_expr.h +++ b/src/arithmetic/compute_expr.h @@ -56,7 +56,7 @@ inline Expr ComputeReduce( const Array& values, Expr empty_value); inline bool GetConst(Expr e, int64_t* out) { - if (e.dtype().is_vector()) return false; + if (e.type().is_vector()) return false; const int64_t* v = as_const_int(e); if (v) { *out = *v; return true; diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h index 93bf708a113f..86f1927f2abe 100644 --- a/src/arithmetic/const_fold.h +++ b/src/arithmetic/const_fold.h @@ 
-70,7 +70,7 @@ inline Expr TryConstFold(Expr a); * \param type The type to represent index. * \return the checked result. */ -inline bool IsIndexType(const DataType& type) { +inline bool IsIndexType(const Type& type) { return type.is_int() && type.lanes() == 1 && (type.bits() == 32 || type.bits() == 64); } @@ -92,8 +92,8 @@ inline bool IsIndexType(const DataType& type) { using ir::UIntImm; \ const IntImm* pa = a.as(); \ const IntImm* pb = b.as(); \ - const DataType& ta = a.dtype(); \ - const DataType& tb = b.dtype(); \ + const Type& ta = a.type(); \ + const Type& tb = b.type(); \ if (arith::IsIndexType(ta) && arith::IsIndexType(tb)) { \ BODY; \ } \ @@ -103,7 +103,7 @@ inline bool IsIndexType(const DataType& type) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, pa->value + pb->value); if (pa && pa->value == 0) return b; if (pb && pb->value == 0) return a; @@ -117,7 +117,7 @@ inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, pa->value - pb->value); if (pb && pb->value == 0) return a; if (fa && fb) return FloatImm::make(rtype, fa->value - fb->value); @@ -129,7 +129,7 @@ inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, pa->value * pb->value); if (pa) { if (pa->value == 1) return b; @@ -155,7 +155,7 @@ inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) { // because division and mod can have different modes // NOTE: this assumes trunc div.
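As a reader aid (not part of the patch): the `TryConstFold` specializations touched above fold arithmetic on two constant operands at expression-construction time, while one-constant identities such as `x + 0` are left to the rewrite simplifier. A minimal sketch of both behaviors from Python, assuming the pre-unified-IR API of this branch (`tvm.var`, `tvm.const`, `tvm.arith.Analyzer`):

```python
import tvm

x = tvm.var("x", dtype="int32")

# Two constant operands fold eagerly via TryConstFold<Mul>:
# this should print a single IntImm rather than a Mul node.
print(tvm.const(6, "int32") * tvm.const(7, "int32"))

# One-constant identities go through the rewrite simplifier instead.
analyzer = tvm.arith.Analyzer()
print(analyzer.rewrite_simplify(x + 0))        # -> x
print(analyzer.rewrite_simplify((x * 4) / 2))  # -> x*2 (exact under trunc div)
```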
@@ -184,7 +184,7 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) { return IntImm::make(rtype, pa->value % pb->value); } @@ -202,7 +202,7 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) { CHECK_NE(pb->value, 0) << "Divide by zero"; return IntImm::make(rtype, arith::floordiv(pa->value, pb->value)); @@ -229,7 +229,7 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) { return IntImm::make(rtype, arith::floormod(pa->value, pb->value)); } @@ -247,7 +247,7 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); }); @@ -258,7 +258,7 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); }); @@ -269,8 +269,8 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value > pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value > fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value > fb->value); }); return Expr(); } @@ -278,8 +278,8 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value >= pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value >= fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value >= fb->value); }); return Expr(); } @@ -287,8 +287,8 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value < pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value < fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value < fb->value); }); return Expr(); } @@ -296,8 +296,8 @@ inline Expr TryConstFold(Expr a, Expr b) { template<> inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value <= pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value <= fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value <= fb->value); }); return Expr(); } @@ 
-305,8 +305,8 @@ inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value == pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value == fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value == fb->value); }); return Expr(); } @@ -314,8 +314,8 @@ inline Expr TryConstFold(Expr a, Expr b) { TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(DataType::UInt(1), pa->value != pb->value); - if (fa && fb) return UIntImm::make(DataType::UInt(1), fa->value != fb->value); + if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value != fb->value); }); return Expr(); } @@ -349,7 +349,7 @@ inline Expr TryConstFold(Expr a) { using ir::UIntImm; const UIntImm* pa = a.as(); if (pa) { - return UIntImm::make(DataType::UInt(1), !(pa->value)); + return UIntImm::make(UInt(1), !(pa->value)); } return Expr(); } diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc index c0519107d5b8..6e119695a8c8 100644 --- a/src/arithmetic/const_int_bound.cc +++ b/src/arithmetic/const_int_bound.cc @@ -125,7 +125,7 @@ class ConstIntBoundAnalyzer::Impl : // Override visitor behaviors Entry VisitExprDefault_(const Node* op) final { return Everything( - static_cast(op)->dtype); + static_cast(op)->type); } Entry VisitExpr(const Expr& expr) final { @@ -142,7 +142,7 @@ class ConstIntBoundAnalyzer::Impl : Entry VisitExpr_(const Cast* op) final { Entry a = VisitExpr(op->value); - Entry b = Everything(op->dtype); + Entry b = Everything(op->type); return Intersect(a, b); } @@ -154,7 +154,7 @@ class ConstIntBoundAnalyzer::Impl : if (op->value <= static_cast(kPosInf)) { return MakeBound(op->value, op->value); } else { - return Everything(op->dtype); + return Everything(op->type); } } @@ -211,7 +211,7 @@ class ConstIntBoundAnalyzer::Impl : CHECK(!b.is_const(0)) << "mod by zero"; // mod by negative value is rare, // and we just use the simplest rule. - return Everything(op->dtype); + return Everything(op->type); } } @@ -242,7 +242,7 @@ class ConstIntBoundAnalyzer::Impl : CHECK(!b.is_const(0)) << "floormod by zero"; // mod by negative value is rare, // and we just use the simplest rule. - return Everything(op->dtype); + return Everything(op->type); } } @@ -278,7 +278,7 @@ class ConstIntBoundAnalyzer::Impl : } else if (op->is_intrinsic(Call::bitwise_and)) { return VisitBitwiseAnd(op); } else { - return Everything(op->dtype); + return Everything(op->type); } } @@ -288,7 +288,7 @@ class ConstIntBoundAnalyzer::Impl : if (it != var_map_.end()) { return it->second; } else { - return Everything(op->dtype); + return Everything(op->type); } } @@ -311,7 +311,7 @@ class ConstIntBoundAnalyzer::Impl : if (a.min_value >= 0) { return MakeBound(0, a.max_value); } - return Everything(op->dtype); + return Everything(op->type); } } @@ -466,7 +466,7 @@ class ConstIntBoundAnalyzer::Impl : * \param dtype The data type. * \return Bound that represents everything dtype can represent.
*/ - static Entry Everything(DataType dtype) { + static Entry Everything(Type dtype) { if (!dtype.is_int() && !dtype.is_uint()) { return MakeBound(kNegInf, kPosInf); } diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc index cf37545502ba..8c7f4f2bb738 100644 --- a/src/arithmetic/detect_linear_equation.cc +++ b/src/arithmetic/detect_linear_equation.cc @@ -53,10 +53,10 @@ class LinearEqDetector *ret = VisitExpr(e, e); if (fail_) return false; if (!ret->base.defined()) { - ret->base = make_zero(var_.dtype()); + ret->base = make_zero(var_.type()); } if (!ret->coeff.defined()) { - ret->coeff = make_zero(var_.dtype()); + ret->coeff = make_zero(var_.type()); } return true; } @@ -100,7 +100,7 @@ class LinearEqDetector LinearEqEntry VisitExpr_(const Variable* op, const Expr& e) final { LinearEqEntry ret; if (op == var_.get()) { - ret.coeff = make_const(op->dtype, 1); + ret.coeff = make_const(op->type, 1); } else { ret.base = e; } @@ -190,16 +190,16 @@ bool DetectClipBound( // canonical form: exp >= 0 Expr canonical; if (const LT* op = cond.as()) { - if (!op->a.dtype().is_int()) return false; - canonical = op->b - op->a - make_const(op->a.dtype(), 1); + if (!op->a.type().is_int()) return false; + canonical = op->b - op->a - make_const(op->a.type(), 1); } else if (const LE* op = cond.as()) { - if (!op->a.dtype().is_int()) return false; + if (!op->a.type().is_int()) return false; canonical = op->b - op->a; } else if (const GT* op = cond.as()) { - if (!op->a.dtype().is_int()) return false; - canonical = op->a - op->b - make_const(op->a.dtype(), 1); + if (!op->a.type().is_int()) return false; + canonical = op->a - op->b - make_const(op->a.type(), 1); } else if (const GE* op = cond.as()) { - if (!op->a.dtype().is_int()) return false; + if (!op->a.type().is_int()) return false; canonical = op->a - op->b; } else { return false; diff --git a/src/arithmetic/domain_touched.cc b/src/arithmetic/domain_touched.cc index 947f0050c6cb..c28346ed2e33 100644 --- a/src/arithmetic/domain_touched.cc +++ b/src/arithmetic/domain_touched.cc @@ -72,7 +72,7 @@ class FuncTouchedDomain final : public IRVisitor { const IterVarNode* thread_axis = op->node.as(); CHECK(thread_axis); const Variable* var = thread_axis->var.get(); - dom_map_[var] = IntSet::range(Range(make_zero(op->value.dtype()), op->value)); + dom_map_[var] = IntSet::range(Range(make_zero(op->value.type()), op->value)); IRVisitor::Visit_(op); dom_map_.erase(var); } else { diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index e4f2042a19d7..9f8effb6c612 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -33,8 +33,8 @@ namespace tvm { namespace arith { -Expr SymbolicLimits::pos_inf_ = Var("pos_inf", DataType::Handle()); -Expr SymbolicLimits::neg_inf_ = Var("neg_inf", DataType::Handle()); +Expr SymbolicLimits::pos_inf_ = Var("pos_inf", Handle()); +Expr SymbolicLimits::neg_inf_ = Var("neg_inf", Handle()); IntervalSet::IntervalSet(Expr min_value, Expr max_value) { auto node = make_node(); @@ -54,8 +54,8 @@ TVM_REGISTER_API("arith._make_IntervalSet") IntervalSet Intersect(Analyzer* analyzer, IntervalSet a, IntervalSet b) { Expr max_value = min(a->max_value, b->max_value); Expr min_value = max(a->min_value, b->min_value); - if ((max_value.dtype().is_int() || max_value.dtype().is_uint()) && - (min_value.dtype().is_int() || min_value.dtype().is_uint()) && + if ((max_value.type().is_int() || max_value.type().is_uint()) && + (min_value.type().is_int() || min_value.type().is_uint()) && 
analyzer->CanProveGreaterEqual(min_value - max_value, 1)) { return IntervalSet::Empty(); } else { @@ -105,8 +105,8 @@ inline IntervalSet Combine(Analyzer* analyzer, return IntervalSet::SinglePoint(res); } if (is_logical_op::value) { - return IntervalSet(make_const(a->min_value.dtype(), 0), - make_const(a->min_value.dtype(), 1)); + return IntervalSet(make_const(a->min_value.type(), 0), + make_const(a->min_value.type(), 1)); } if (a->IsEmpty()) return a; if (b->IsEmpty()) return b; @@ -177,7 +177,7 @@ inline IntervalSet Combine(Analyzer* analyzer, return IntervalSet(min_value, max_value); } else if (a->HasUpperBound() && a->HasLowerBound()) { using ir::Select; - Expr sign = b->min_value >= make_zero(b->min_value.dtype().element_of()); + Expr sign = b->min_value >= make_zero(b->min_value.type().element_of()); Expr e1 = a->min_value * b->min_value; Expr e2 = a->max_value * b->min_value; return IntervalSet(Select::make(sign, e1, e2), Select::make(sign, e2, e1)); @@ -212,7 +212,7 @@ inline IntervalSet Combine(Analyzer* analyzer, return IntervalSet(min_value, max_value); } else if (a->HasUpperBound() && a->HasLowerBound()) { using ir::Select; - Expr sign = b->min_value >= make_zero(b->min_value.dtype().element_of()); + Expr sign = b->min_value >= make_zero(b->min_value.type().element_of()); Expr e1 = a->min_value / b->min_value; Expr e2 = a->max_value / b->min_value; return IntervalSet(Select::make(sign, e1, e2), Select::make(sign, e2, e1)); @@ -242,7 +242,7 @@ inline IntervalSet Combine(Analyzer* analyzer, // is the case of our application. // TODO(tqchen): add bound constraints for a. if (analyzer->CanProveGreaterEqual(divisor, 0)) { - return IntervalSet(make_zero(divisor.dtype()), divisor - 1); + return IntervalSet(make_zero(divisor.type()), divisor - 1); } else { Expr bound = abs(divisor) - 1; return IntervalSet(-bound, bound); @@ -278,7 +278,7 @@ inline IntervalSet Combine(Analyzer* analyzer, return IntervalSet(min_value, max_value); } else if (a->HasUpperBound() && a->HasLowerBound()) { using ir::Select; - Expr sign = b->min_value >= make_zero(b->min_value.dtype().element_of()); + Expr sign = b->min_value >= make_zero(b->min_value.type().element_of()); Expr e1 = floordiv(a->min_value, b->min_value); Expr e2 = floordiv(a->max_value, b->min_value); return IntervalSet(Select::make(sign, e1, e2), Select::make(sign, e2, e1)); @@ -304,7 +304,7 @@ inline IntervalSet Combine(Analyzer* analyzer, LOG(FATAL) << "Modular by zero in CombineInterval Mod"; } if (analyzer->CanProveGreaterEqual(divisor, 0)) { - return IntervalSet(make_zero(divisor.dtype()), divisor - 1); + return IntervalSet(make_zero(divisor.type()), divisor - 1); } else { Expr bound = abs(divisor) - 1; return IntervalSet(-bound, bound); @@ -476,7 +476,7 @@ class IntervalSetEvaluator : IntervalSet base = Eval(op->base); PVar stride; if (stride.Match(op->stride)) { - DataType t = op->base.dtype(); + Type t = op->base.type(); int64_t vstride = stride.Eval()->value; if (vstride> 0) { return Combine( diff --git a/src/arithmetic/ir_mutator_with_analyzer.cc b/src/arithmetic/ir_mutator_with_analyzer.cc index 0d4b8f26b18b..cda9d585ace1 100644 --- a/src/arithmetic/ir_mutator_with_analyzer.cc +++ b/src/arithmetic/ir_mutator_with_analyzer.cc @@ -140,7 +140,7 @@ Mutate_(const Call* op, const Expr& self) { false_value.same_as(op->args[2])) { return self; } else { - return Call::make(op->dtype, op->name, + return Call::make(op->type, op->name, {cond, true_value, false_value}, op->call_type); } diff --git a/src/arithmetic/pattern_match.h 
b/src/arithmetic/pattern_match.h index fd07a377e955..f7d5483cf6de 100644 --- a/src/arithmetic/pattern_match.h +++ b/src/arithmetic/pattern_match.h @@ -291,7 +291,7 @@ class PConstWithTypeLike : } Expr Eval() const { - return make_const(ref_.Eval().dtype(), value_); + return make_const(ref_.Eval().type(), value_); } private: @@ -474,7 +474,7 @@ class PCastExpr : bool Match_(const NodeRef& node) const { if (const ir::Cast* ptr = node.as()) { - if (!dtype_.Match_(ptr->dtype)) return false; + if (!dtype_.Match_(ptr->type)) return false; if (!value_.Match_(ptr->value)) return false; return true; } else { @@ -730,7 +730,7 @@ class PCallExpr : #define TVM_PATTERN_BINARY_INTRIN(FuncName, OpName, IntrinStr) \ struct OpName { \ static Expr Eval(Array args) { \ - return ir::Call::make(args[0].dtype(), kName, args, \ + return ir::Call::make(args[0].type(), kName, args, \ ir::Call::PureIntrinsic); \ } \ static constexpr const char* kName = IntrinStr; \ @@ -751,7 +751,7 @@ TVM_PATTERN_BINARY_INTRIN(operator^, PBitwiseXorOp, "bitwise_xor"); #define TVM_PATTERN_UNARY_INTRIN(FuncName, OpName, IntrinStr) \ struct OpName { \ static Expr Eval(Array args) { \ - return ir::Call::make(args[0].dtype(), kName, args, \ + return ir::Call::make(args[0].type(), kName, args, \ ir::Call::PureIntrinsic); \ } \ static constexpr const char* kName = IntrinStr; \ @@ -768,7 +768,7 @@ TVM_PATTERN_UNARY_INTRIN(operator~, PBitwiseNotOp, "bitwise_not"); struct PIfThenElseOp { static Expr Eval(Array args) { return ir::Call::make( - args[1].dtype(), kName, args, + args[1].type(), kName, args, ir::Call::PureIntrinsic); } static constexpr const char* kName = "tvm_if_then_else"; diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc index 235306cc7bf8..b26f8335055a 100644 --- a/src/arithmetic/rewrite_simplify.cc +++ b/src/arithmetic/rewrite_simplify.cc @@ -129,7 +129,7 @@ Mutate_(const Add* op, const Expr& self) { // Pattern var for lanes in broadcast and ramp PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(ramp(b1, s1, lanes) + ramp(b2, s2, lanes), ramp(b1 + b2, s1 + s2, lanes)); TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), @@ -140,7 +140,7 @@ Mutate_(const Add* op, const Expr& self) { broadcast(x + y, lanes)); } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Index rules // cancelation rules TVM_TRY_REWRITE((x - y) + y, x); @@ -244,7 +244,7 @@ Mutate_(const Sub* op, const Expr& self) { // Pattern var for lanes in broadcast and ramp PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), ramp(b1 - b2, s1 - s2, lanes)); TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), @@ -255,7 +255,7 @@ Mutate_(const Sub* op, const Expr& self) { broadcast(x - y, lanes)); } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Index rules // cancelation rules TVM_TRY_REWRITE((x + y) - y, x); @@ -443,7 +443,7 @@ Mutate_(const Mul* op, const Expr& self) { // Pattern var for lanes in broadcast and ramp PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), broadcast(x * y, lanes)); TVM_TRY_REWRITE(ramp(b1, s1, lanes) * broadcast(x, lanes), @@ -452,7 +452,7 @@ Mutate_(const Mul* op, const Expr& self) { ramp(b1 * x, s1 * x, lanes)); } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // constant simplification rule 
TVM_TRY_REWRITE((x + c1) * c2, x * c2 + c1 * c2); TVM_TRY_REWRITE((x * c1) * c2, x * (c1 * c2)); @@ -484,12 +484,12 @@ Mutate_(const Div* op, const Expr& self) { // x / 2.0 = x * 0.5 if (const FloatImm* ptr = op->b.as()) { - CHECK(op->dtype.is_float()); - return op->a * make_const(op->b.dtype(), 1.0 / ptr->value); + CHECK(op->type.is_float()); + return op->a * make_const(op->b.type(), 1.0 / ptr->value); } // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { // NOTE: use div as the pattern also works for float. TVM_TRY_REWRITE(div(broadcast(x, lanes), broadcast(y, lanes)), broadcast(div(x, y), lanes)); @@ -512,7 +512,7 @@ Mutate_(const Div* op, const Expr& self) { } } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Be aware of the division rules: // We adopt the C default: division uses truncation instead of floordiv. // This means most rules need to check non-negativeness of the operands. @@ -524,7 +524,7 @@ Mutate_(const Div* op, const Expr& self) { if (truncdiv(c1, c2).Match(ret)) { int64_t c1val = c1.Eval()->value; int64_t c2val = c2.Eval()->value; - return make_const(op->dtype, truncdiv(c1val, c2val)); + return make_const(op->type, truncdiv(c1val, c2val)); } // while it is always true for trunc div @@ -706,7 +706,7 @@ Mutate_(const Mod* op, const Expr& self) { PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(truncmod(broadcast(x, lanes), broadcast(y, lanes)), broadcast(truncmod(x, y), lanes)); @@ -734,7 +734,7 @@ Mutate_(const Mod* op, const Expr& self) { } } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Be aware of the division rules: // We adopt the C default: division uses truncation instead of floordiv. // This means most rules need to check non-negativeness of the operands. @@ -762,10 +762,9 @@ Mutate_(const Mod* op, const Expr& self) { // canonicalization: x % c == x % (-c) for truncated division // NOTE: trunc div required - TVM_TRY_RECURSIVE_REWRITE_IF( - truncmod(x, c1), - truncmod(x, PConst(make_const(op->dtype, -c1.Eval()->value))), - c1.Eval()->value < 0); + TVM_TRY_RECURSIVE_REWRITE_IF(truncmod(x, c1), + truncmod(x, PConst(make_const(op->type, -c1.Eval()->value))), + c1.Eval()->value < 0); // try modular analysis if (truncmod(x, c1).Match(ret)) { @@ -795,7 +794,7 @@ Mutate_(const FloorDiv* op, const Expr& self) { PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(floordiv(broadcast(x, lanes), broadcast(y, lanes)), broadcast(floordiv(x, y), lanes)); // ramp // bcast @@ -815,7 +814,7 @@ Mutate_(const FloorDiv* op, const Expr& self) { } } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Be aware of the division rules: this is floor division.
TVM_TRY_REWRITE_IF(floordiv(floordiv(x, c1), c2), floordiv(x, c1 * c2), c1.Eval()->value > 0 && c2.Eval()->value > 0); @@ -940,7 +939,7 @@ Mutate_(const FloorMod* op, const Expr& self) { PVar lanes; // Vector rules - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(floormod(broadcast(x, lanes), broadcast(y, lanes)), broadcast(floormod(x, y), lanes)); @@ -965,7 +964,7 @@ Mutate_(const FloorMod* op, const Expr& self) { } } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { // Be-aware of the division rules: we use floordiv/floormod here TVM_TRY_REWRITE_IF(floormod(x * c1, c2), ZeroWithTypeLike(x), c2.Eval()->value != 0 && @@ -1009,13 +1008,13 @@ Mutate_(const Min* op, const Expr& self) { PVar lanes; // vector rule - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(min(broadcast(x, lanes), broadcast(y, lanes)), broadcast(min(x, y), lanes)); TVM_TRY_REWRITE(min(min(x, broadcast(y, lanes)), broadcast(z, lanes)), min(x, broadcast(min(y, z), lanes))); } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { TVM_TRY_REWRITE(min(x, x), x); // constant int bound @@ -1194,13 +1193,13 @@ Mutate_(const Max* op, const Expr& self) { PVar lanes; // vector rule - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(max(broadcast(x, lanes), broadcast(y, lanes)), broadcast(max(x, y), lanes)); TVM_TRY_REWRITE(max(max(x, broadcast(y, lanes)), broadcast(z, lanes)), max(x, broadcast(max(y, z), lanes))); } - if (IsIndexType(op->dtype)) { + if (IsIndexType(op->type)) { TVM_TRY_REWRITE(max(x, x), x); // constant int bound @@ -1367,17 +1366,17 @@ Mutate_(const EQ* op, const Expr& self) { PVar lanes; // vector rule - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(broadcast(x, lanes) == broadcast(y, lanes), broadcast(x == y, lanes)); } - if (IsIndexType(op->a.dtype())) { + if (IsIndexType(op->a.type())) { CompareResult result = TryCompare(op->a - op->b, 0); if (result == kEQ) { - return make_const(op->dtype, true); + return make_const(op->type, true); } else if (result == kNE || result == kGT || result == kLT) { - return make_const(op->dtype, false); + return make_const(op->type, false); } TVM_TRY_REWRITE(x - c1 == 0, x == c1); TVM_TRY_REWRITE(c1 - x == 0, x == c1); @@ -1421,20 +1420,20 @@ Mutate_(const LT* op, const Expr& self) { PVar lanes; // vector rule - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(broadcast(x, lanes) < broadcast(y, lanes), broadcast(x < y, lanes)); TVM_TRY_REWRITE(ramp(x, s1, lanes) < ramp(y, s1, lanes), broadcast(x < y, lanes)); } - if (IsIndexType(op->a.dtype())) { + if (IsIndexType(op->a.type())) { CompareResult result = TryCompare(op->a - op->b, 0); if (result == kLT) { - return make_const(op->dtype, true); + return make_const(op->type, true); } if (result == kEQ || result == kGT || result == kGE) { - return make_const(op->dtype, false); + return make_const(op->type, false); } TVM_TRY_REWRITE(x + y < x + z, y < z); @@ -1572,7 +1571,7 @@ Mutate_(const Not* op, const Expr& self) { // Pattern var to match any expression PVar x, y; PVar lanes; - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(!broadcast(x, lanes), broadcast(!x, lanes)); } @@ -1601,12 +1600,12 @@ Mutate_(const And* op, const Expr& self) { PVar c1, c2; PVar lanes; - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(broadcast(x, lanes) && broadcast(y, lanes), broadcast(x && y, lanes)); } - auto cfalse = 
PConst(make_const(op->dtype, false)); + auto cfalse = PConst(make_const(op->type, false)); TVM_TRY_REWRITE(x == y && x != y, cfalse); TVM_TRY_REWRITE(x != y && x == y, cfalse); TVM_TRY_REWRITE(x && !x, cfalse); @@ -1650,12 +1649,12 @@ Mutate_(const Or* op, const Expr& self) { PVar c1, c2; PVar lanes; - if (op->dtype.lanes() != 1) { + if (op->type.lanes() != 1) { TVM_TRY_REWRITE(broadcast(x, lanes) || broadcast(y, lanes), broadcast(x || y, lanes)); } - auto ctrue = PConst(make_const(op->dtype, true)); + auto ctrue = PConst(make_const(op->type, true)); TVM_TRY_REWRITE(x == y || x != y, ctrue); TVM_TRY_REWRITE(x != y || x == y, ctrue); @@ -1721,7 +1720,7 @@ Mutate_(const Call* op, const Expr& self) { for (const auto& constraint : literal_constraints_) { // Cases such as for (i, 0, bound) {if (likely(iter_var < bound)) { .. } } if (Equal(constraint, op->args[0])) { - return make_const(op->dtype, true); + return make_const(op->type, true); } } } @@ -1742,7 +1741,7 @@ Expr RewriteSimplifier::Impl:: Mutate_(const Cast* op, const Expr& self) { Expr ret = IRMutator::Mutate_(op, self); op = ret.as(); - return cast(op->dtype, op->value); + return cast(op->type, op->value); } Expr RewriteSimplifier::Impl:: diff --git a/src/autotvm/touch_extractor.cc b/src/autotvm/touch_extractor.cc index f66a724595c6..101d8f1aa57f 100644 --- a/src/autotvm/touch_extractor.cc +++ b/src/autotvm/touch_extractor.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -255,10 +255,10 @@ void GetItervarFeature(Stmt stmt, bool take_log, Array > > *re feature_row.push_back(Array{std::string("_itervar_"), var}); Array attr{std::string("_attr_"), - FloatImm::make(DataType::Float(32), trans(fea.length)), - IntImm::make(DataType::Int(32), fea.nest_level), - FloatImm::make(DataType::Float(32), trans(fea.topdown_product)), - FloatImm::make(DataType::Float(32), trans(fea.bottomup_product)), + FloatImm::make(Float(32), trans(fea.length)), + IntImm::make(Int(32), fea.nest_level), + FloatImm::make(Float(32), trans(fea.topdown_product)), + FloatImm::make(Float(32), trans(fea.bottomup_product)), }; // one hot annotation for (int i = 0; i < kNum; i++) { @@ -268,9 +268,9 @@ void GetItervarFeature(Stmt stmt, bool take_log, Array > > *re // arithmetic feature_row.push_back(Array{std::string("_arith_"), - FloatImm::make(DataType::Float(32), trans(fea.add_ct)), - FloatImm::make(DataType::Float(32), trans(fea.mul_ct)), - FloatImm::make(DataType::Float(32), trans(fea.div_ct)), + FloatImm::make(Float(32), trans(fea.add_ct)), + FloatImm::make(Float(32), trans(fea.mul_ct)), + FloatImm::make(Float(32), trans(fea.div_ct)), }); // touch map @@ -282,12 +282,12 @@ void GetItervarFeature(Stmt stmt, bool take_log, Array > > *re for (auto k : bufs) { TouchPattern &v = fea.touch_feature[k]; feature_row.push_back(Array{k, - FloatImm::make(DataType::Float(32), trans(v.stride)), - FloatImm::make(DataType::Float(32), trans(v.mod)), - FloatImm::make(DataType::Float(32), trans(v.count)), - FloatImm::make(DataType::Float(32), trans(v.reuse)), - FloatImm::make(DataType::Float(32), trans(v.thread_count)), - FloatImm::make(DataType::Float(32), trans(v.thread_reuse)), + FloatImm::make(Float(32), 
trans(v.stride)), + FloatImm::make(Float(32), trans(v.mod)), + FloatImm::make(Float(32), trans(v.count)), + FloatImm::make(Float(32), trans(v.reuse)), + FloatImm::make(Float(32), trans(v.thread_count)), + FloatImm::make(Float(32), trans(v.thread_reuse)), }); } diff --git a/src/autotvm/touch_extractor.h b/src/autotvm/touch_extractor.h index 1028b0144e12..e6690641edc6 100644 --- a/src/autotvm/touch_extractor.h +++ b/src/autotvm/touch_extractor.h @@ -91,31 +91,31 @@ class TouchExtractor : public FeatureVisitor { // arithmetic stats void Visit_(const Add *op) { - if (op->dtype.is_float()) + if (op->type.is_float()) itervar_map[itervar_stack_.back()].add_ct++; IRVisitor::Visit_(op); } void Visit_(const Sub *op) { - if (op->dtype.is_float()) + if (op->type.is_float()) itervar_map[itervar_stack_.back()].add_ct++; IRVisitor::Visit_(op); } void Visit_(const Mul *op) { - if (op->dtype.is_float()) + if (op->type.is_float()) itervar_map[itervar_stack_.back()].mul_ct++; IRVisitor::Visit_(op); } void Visit_(const Div *op) { - if (op->dtype.is_float()) + if (op->type.is_float()) itervar_map[itervar_stack_.back()].div_ct++; IRVisitor::Visit_(op); } void Visit_(const Mod *op) { - if (op->dtype.is_float()) + if (op->type.is_float()) itervar_map[itervar_stack_.back()].div_ct++; IRVisitor::Visit_(op); } diff --git a/src/codegen/build_common.h b/src/codegen/build_common.h index b2c895348a46..8a21aeea7eee 100644 --- a/src/codegen/build_common.h +++ b/src/codegen/build_common.h @@ -39,7 +39,7 @@ ExtractFuncInfo(const Array& funcs) { for (LoweredFunc f : funcs) { runtime::FunctionInfo info; for (size_t i = 0; i < f->args.size(); ++i) { - info.arg_types.push_back(f->args[i].dtype()); + info.arg_types.push_back(Type2TVMType(f->args[i].type())); } for (size_t i = 0; i < f->thread_axis.size(); ++i) { info.thread_axis_tags.push_back(f->thread_axis[i]->thread_tag); diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index ca25731cafef..80fd57af66f9 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -309,10 +309,6 @@ Target intel_graphics(const std::vector& options) { Target stackvm(const std::vector& options) { return CreateTarget("stackvm", options); } - -Target ext_dev(const std::vector& options) { - return CreateTarget("ext_dev", options); -} } // namespace target bool LLVMEnabled() { @@ -334,12 +330,12 @@ Target DefaultTargetHost(Target target) { } Buffer BufferWithOffsetAlignment(Array shape, - DataType dtype, + Type dtype, std::string name, int data_alignment, int offset_factor, bool compact) { - auto data = Var(name, DataType::Handle()); + auto data = Var(name, Handle()); bool has_any = false; if (!compact) { for (const auto& it : shape) { @@ -353,7 +349,7 @@ Buffer BufferWithOffsetAlignment(Array shape, Expr elem_offset; if (offset_factor != 0) { - elem_offset = Var(name + "_elem_offset", shape[0].dtype()); + elem_offset = Var(name + "_elem_offset", shape[0].type()); } else { elem_offset = Expr(); } diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc index 60b12dc6e553..ded8fcebf57c 100644 --- a/src/codegen/codegen.cc +++ b/src/codegen/codegen.cc @@ -28,10 +28,7 @@ #include #include #include -#include -#include -#include -#include +#include namespace tvm { namespace codegen { @@ -61,111 +58,19 @@ runtime::Module Build(const Array& funcs, return m; } -/*! 
\brief Helper class to serialize module */ -class ModuleSerializer { - public: - explicit ModuleSerializer(runtime::Module mod) : mod_(mod) { - Init(); - } - - void SerializeModule(dmlc::Stream* stream) { - // Only have one DSO module and it is in the root, then - // we will not produce import_tree_. - bool has_import_tree = true; - if (DSOExportable(mod_.operator->()) && mod_->imports().empty()) { - has_import_tree = false; - } - uint64_t sz = 0; - if (has_import_tree) { - // we will append one key for _import_tree - // The layout is the same as before: binary_size, key, logic, key, logic... - sz = mod_vec_.size() + 1; - } else { - // Keep the old behaviour - sz = mod_->imports().size(); - } - stream->Write(sz); - - for (auto m : mod_vec_) { - std::string mod_type_key = m->type_key(); - if (!DSOExportable(m)) { - stream->Write(mod_type_key); - m->SaveToBinary(stream); - } else if (has_import_tree) { - mod_type_key = "_lib"; - stream->Write(mod_type_key); - } - } - - // Write _import_tree key if we have - if (has_import_tree) { - std::string import_key = "_import_tree"; - stream->Write(import_key); - stream->Write(import_tree_row_ptr_); - stream->Write(import_tree_child_indices_); - } - } - - private: - void Init() { - CreateModuleIndex(); - CreateImportTree(); - } - - // invariance: root module is always at location 0. - // The module order is collected via DFS - void CreateModuleIndex() { - std::unordered_set visited {mod_.operator->()}; - std::vector stack {mod_.operator->()}; - uint64_t module_index = 0; - - while (!stack.empty()) { - runtime::ModuleNode* n = stack.back(); - stack.pop_back(); - mod2index_[n] = module_index++; - mod_vec_.emplace_back(n); - for (runtime::Module m : n->imports()) { - runtime::ModuleNode* next = m.operator->(); - if (visited.count(next) == 0) { - visited.insert(next); - stack.push_back(next); - } - } - } - } - - void CreateImportTree() { - for (auto m : mod_vec_) { - for (runtime::Module im : m->imports()) { - uint64_t mod_index = mod2index_[im.operator->()]; - import_tree_child_indices_.push_back(mod_index); - } - import_tree_row_ptr_.push_back(import_tree_child_indices_.size()); - } - } - - bool DSOExportable(const runtime::ModuleNode* mod) { - return !std::strcmp(mod->type_key(), "llvm") || - !std::strcmp(mod->type_key(), "c"); - } - - runtime::Module mod_; - // construct module to index - std::unordered_map mod2index_; - // index -> module - std::vector mod_vec_; - std::vector import_tree_row_ptr_ {0}; - std::vector import_tree_child_indices_; -}; - std::string PackImportsToC(const runtime::Module& mod, bool system_lib) { std::string bin; dmlc::MemoryStringStream ms(&bin); dmlc::Stream* stream = &ms; - - ModuleSerializer module_serializer(mod); - module_serializer.SerializeModule(stream); - + uint64_t sz = static_cast(mod->imports().size()); + stream->Write(sz); + for (runtime::Module im : mod->imports()) { + CHECK_EQ(im->imports().size(), 0U) + << "Only support simply one-level hierarchy"; + std::string tkey = im->type_key(); + stream->Write(tkey); + im->SaveToBinary(stream); + } // translate to C program std::ostringstream os; os << "#ifdef _WIN32\n" diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index 4b95e2caf1aa..eab542dd3e08 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -79,7 +79,7 @@ void CodeGenC::AddFunction(LoweredFunc f) { ReserveKeywordsAsUnique(); // add to alloc buffer type. 
for (const auto & kv : f->handle_data_type) { - RegisterHandleType(kv.first.get(), kv.second.dtype()); + RegisterHandleType(kv.first.get(), kv.second.type()); } this->stream << "void " << f->name << "("; @@ -87,7 +87,7 @@ void CodeGenC::AddFunction(LoweredFunc f) { Var v = f->args[i]; std::string vid = AllocVarID(v.get()); if (i != 0) stream << ", "; - if (v.dtype().is_handle()) { + if (v.type().is_handle()) { auto it = alloc_storage_scope_.find(v.get()); if (it != alloc_storage_scope_.end()) PrintStorageScope(it->second, stream); @@ -104,7 +104,7 @@ void CodeGenC::AddFunction(LoweredFunc f) { stream << ' ' << restrict_keyword_; } } else { - PrintType(v.dtype(), stream); + PrintType(v.type(), stream); } stream << ' ' << vid; } @@ -125,14 +125,14 @@ void CodeGenC::PrintExpr(const Expr& n, std::ostream& os) { // NOLINT(*) if (print_ssa_form_) { std::ostringstream temp; VisitExpr(n, temp); - os << SSAGetID(temp.str(), n.dtype()); + os << SSAGetID(temp.str(), n.type()); } else { VisitExpr(n, os); } } void CodeGenC::PrintSSAAssign( - const std::string& target, const std::string& src, DataType t) { + const std::string& target, const std::string& src, Type t) { PrintType(t, stream); stream << ' ' << target << " = "; if (src.length() > 3 && @@ -146,7 +146,7 @@ void CodeGenC::PrintSSAAssign( // Print a reference expression to a buffer. std::string CodeGenC::GetBufferRef( - DataType t, const Variable* buffer, Expr index) { + Type t, const Variable* buffer, Expr index) { std::ostringstream os; std::string vid = GetVarID(buffer); std::string scope; @@ -213,7 +213,7 @@ std::string CodeGenC::GetBufferRef( // Print a reference expression to a buffer. std::string CodeGenC::GetStructRef( - DataType t, const Expr& buffer, const Expr& index, int kind) { + Type t, const Expr& buffer, const Expr& index, int kind) { if (kind < intrinsic::kArrKindBound_) { std::ostringstream os; os << "(((TVMArray*)"; @@ -265,13 +265,13 @@ std::string CodeGenC::GetStructRef( } -bool CodeGenC::HandleTypeMatch(const Variable* buf_var, DataType t) const { +bool CodeGenC::HandleTypeMatch(const Variable* buf_var, Type t) const { auto it = handle_data_type_.find(buf_var); if (it == handle_data_type_.end()) return false; return it->second == t; } -void CodeGenC::RegisterHandleType(const Variable* buf_var, DataType t) { +void CodeGenC::RegisterHandleType(const Variable* buf_var, Type t) { auto it = handle_data_type_.find(buf_var); if (it == handle_data_type_.end()) { handle_data_type_[buf_var] = t; @@ -282,13 +282,13 @@ void CodeGenC::RegisterHandleType(const Variable* buf_var, DataType t) { } void CodeGenC::PrintVecElemLoad(const std::string& vec, - DataType t, int i, + Type t, int i, std::ostream& os) { // NOLINT(*) os << vec << ".s" << std::hex << i << std::dec; } void CodeGenC::PrintVecElemStore(const std::string& vec, - DataType t, int i, + Type t, int i, const std::string& value) { this->PrintIndent(); stream << vec << ".s" << std::hex << i @@ -296,19 +296,19 @@ void CodeGenC::PrintVecElemStore(const std::string& vec, } std::string CodeGenC::GetVecLoad( - DataType t, const Variable* buffer, Expr base) { + Type t, const Variable* buffer, Expr base) { return GetBufferRef(t, buffer, base); } void CodeGenC::PrintVecStore(const Variable* buffer, - DataType t, Expr base, + Type t, Expr base, const std::string& value) { std::string ref = GetBufferRef(t, buffer, base); this->PrintIndent(); stream << ref << " = " << value << ";\n"; } -std::string CodeGenC::CastFromTo(std::string value, DataType from, DataType target) { +std::string 
   if (from == target) return value;
   std::ostringstream os;
   os << "((";
@@ -328,7 +328,7 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { // NOLINT(*)
   CHECK_EQ(scope, "global");
 }
 
-void CodeGenC::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenC::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   CHECK_EQ(t.lanes(), 1)
       << "do not yet support vector types";
   if (t.is_handle()) {
@@ -360,48 +360,48 @@ void CodeGenC::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 
 inline void PrintConst(const IntImm* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  if (op->dtype == DataType::Int(32)) {
+  if (op->type == Int(32)) {
     std::ostringstream temp;
     temp << op->value;
     p->MarkConst(temp.str());
     os << temp.str();
   } else {
     os << "(";
-    p->PrintType(op->dtype, os);
+    p->PrintType(op->type, os);
     os << ")" << op->value;
   }
 }
 
 inline void PrintConst(const UIntImm* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  if (op->dtype == DataType::UInt(32)) {
+  if (op->type == UInt(32)) {
     std::ostringstream temp;
     temp << op->value << "U";
     p->MarkConst(temp.str());
     os << temp.str();
   } else {
     os << "(";
-    p->PrintType(op->dtype, os);
+    p->PrintType(op->type, os);
     os << ")" << op->value;
   }
 }
 
 inline void PrintConst(const FloatImm* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  switch (op->dtype.bits()) {
+  switch (op->type.bits()) {
     case 64: case 32: {
       std::ostringstream temp;
       temp << std::scientific << op->value;
-      if (op->dtype.bits() == 32) temp << 'f';
+      if (op->type.bits() == 32) temp << 'f';
       p->MarkConst(temp.str());
       os << temp.str();
       break;
     }
     case 16: {
       os << '(';
-      p->PrintType(op->dtype, os);
+      p->PrintType(op->type, os);
       os << ')' << std::scientific << op->value << 'f';
       break;
     }
-    default: LOG(FATAL) << "Bad bit-width for float: " << op->dtype << "\n";
+    default: LOG(FATAL) << "Bad bit-width for float: " << op->type << "\n";
   }
 }
 
@@ -423,7 +423,7 @@ inline void PrintBinaryExpr(const T* op,
                             const char *opstr,
                             std::ostream& os,  // NOLINT(*)
                             CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->type.lanes() == 1) {
     if (isalpha(opstr[0])) {
       os << opstr << '(';
       p->PrintExpr(op->a, os);
@@ -438,7 +438,7 @@ inline void PrintBinaryExpr(const T* op,
       os << ')';
     }
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+    p->PrintVecBinaryOp(opstr, op->type, op->a, op->b, os);
   }
 }
 
@@ -446,7 +446,7 @@ inline void PrintBinaryIntrinsic(const Call* op,
                                  const char *opstr,
                                  std::ostream& os,  // NOLINT(*)
                                  CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->type.lanes() == 1) {
     CHECK_EQ(op->args.size(), 2U);
     os << '(';
     p->PrintExpr(op->args[0], os);
@@ -454,13 +454,13 @@ inline void PrintBinaryIntrinsic(const Call* op,
     p->PrintExpr(op->args[1], os);
     os << ')';
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->args[0], op->args[1], os);
+    p->PrintVecBinaryOp(opstr, op->type, op->args[0], op->args[1], os);
   }
 }
 
 void CodeGenC::VisitExpr_(const Cast *op, std::ostream& os) {  // NOLINT(*)
   std::stringstream value;
   this->PrintExpr(op->value, value);
-  os << CastFromTo(value.str(), op->value.dtype(), op->dtype);
+  os << CastFromTo(value.str(), op->value.type(), op->type);
 }
 void CodeGenC::VisitExpr_(const Variable *op, std::ostream& os) {  // NOLINT(*)
   os << GetVarID(op);
@@ -553,7 +553,7 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
     const Load *l = op->args[0].as<Load>();
     CHECK(op->args.size() == 1 && l);
     os << "((";
-    this->PrintType(l->dtype.element_of(), os);
+    this->PrintType(l->type.element_of(), os);
     os << " *)" << this->GetVarID(l->buffer_var.get())
        << " + ";
     this->PrintExpr(l->index, os);
@@ -561,7 +561,7 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
   } else if (op->is_intrinsic(intrinsic::tvm_struct_get)) {
     CHECK_EQ(op->args.size(), 3U);
     os << GetStructRef(
-        op->dtype, op->args[0], op->args[1],
+        op->type, op->args[0], op->args[1],
         op->args[2].as<IntImm>()->value);
   } else if (op->is_intrinsic(intrinsic::tvm_handle_is_null)) {
     CHECK_EQ(op->args.size(), 1U);
@@ -571,7 +571,7 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
   } else if (op->is_intrinsic(Call::reinterpret)) {
     // generate (*( TYPE *)(&(ARG)))
     os << "(*(";
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->type, os);
     os << " *)(&(";
     this->PrintExpr(op->args[0], os);
     os << ")))";
@@ -585,7 +585,7 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
     if (op->call_type == Call::Intrinsic ||
         op->call_type == Call::PureIntrinsic) {
       LOG(FATAL) << "Unresolved intrinsic " << op->name
-                 << " with return type " << op->dtype;
+                 << " with return type " << op->type;
     } else {
       LOG(FATAL) << "Unresolved call type " << op->call_type;
     }
@@ -593,7 +593,7 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
 }
 
 void CodeGenC::PrintVecBinaryOp(
-    const std::string& op, DataType t,
+    const std::string& op, Type t,
     Expr lhs, Expr rhs, std::ostream& os) {  // NOLINT(*)
   if (isalpha(op[0])) {
     os << op << "(";
@@ -611,17 +611,17 @@ void CodeGenC::PrintVecBinaryOp(
 }
 
 void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) {  // NOLINT(*)
-  int lanes = op->dtype.lanes();
+  int lanes = op->type.lanes();
   // delcare type.
-  if (op->dtype.lanes() == 1) {
-    std::string ref = GetBufferRef(op->dtype, op->buffer_var.get(), op->index);
+  if (op->type.lanes() == 1) {
+    std::string ref = GetBufferRef(op->type, op->buffer_var.get(), op->index);
     os << ref;
   } else {
     CHECK(is_one(op->predicate))
        << "predicated load is not supported";
     Expr base;
-    if (GetRamp1Base(op->index, op->dtype.lanes(), &base)) {
-      std::string ref = GetVecLoad(op->dtype, op->buffer_var.get(), base);
+    if (GetRamp1Base(op->index, op->type.lanes(), &base)) {
+      std::string ref = GetVecLoad(op->type, op->buffer_var.get(), base);
       os << ref;
     } else {
       // The assignment below introduces side-effect, and the resulting value cannot
@@ -631,16 +631,16 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) {  // NOLINT(*)
       // load seperately.
       std::string svalue = GetUniqueName("_");
       this->PrintIndent();
-      this->PrintType(op->dtype, stream);
+      this->PrintType(op->type, stream);
       stream << ' ' << svalue << ";\n";
-      std::string sindex = SSAGetID(PrintExpr(op->index), op->index.dtype());
+      std::string sindex = SSAGetID(PrintExpr(op->index), op->index.type());
       std::string vid = GetVarID(op->buffer_var.get());
-      DataType elem_type = op->dtype.element_of();
+      Type elem_type = op->type.element_of();
       for (int i = 0; i < lanes; ++i) {
         std::ostringstream value_temp;
         if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) {
           value_temp << "((";
-          if (op->buffer_var.get()->dtype.is_handle()) {
+          if (op->buffer_var.get()->type.is_handle()) {
             auto it = alloc_storage_scope_.find(op->buffer_var.get());
             if (it != alloc_storage_scope_.end()) {
               PrintStorageScope(it->second, value_temp);
@@ -653,9 +653,9 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) {  // NOLINT(*)
           value_temp << vid;
         }
         value_temp << '[';
-        PrintVecElemLoad(sindex, op->index.dtype(), i, value_temp);
+        PrintVecElemLoad(sindex, op->index.type(), i, value_temp);
         value_temp << ']';
-        PrintVecElemStore(svalue, op->dtype, i, value_temp.str());
+        PrintVecElemStore(svalue, op->type, i, value_temp.str());
       }
       os << svalue;
       EndScope(vec_scope);
@@ -664,7 +664,7 @@ void CodeGenC::VisitExpr_(const Load* op, std::ostream& os) {  // NOLINT(*)
 }
 
 void CodeGenC::VisitStmt_(const Store* op) {
-  DataType t = op->value.dtype();
+  Type t = op->value.type();
   if (t.lanes() == 1) {
     std::string value = this->PrintExpr(op->value);
     std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index);
@@ -683,15 +683,15 @@ void CodeGenC::VisitStmt_(const Store* op) {
     int vec_scope = BeginScope();
 
     // store elements seperately
-    std::string index = SSAGetID(PrintExpr(op->index), op->index.dtype());
-    std::string value = SSAGetID(PrintExpr(op->value), op->value.dtype());
+    std::string index = SSAGetID(PrintExpr(op->index), op->index.type());
+    std::string value = SSAGetID(PrintExpr(op->value), op->value.type());
    std::string vid = GetVarID(op->buffer_var.get());
     for (int i = 0; i < t.lanes(); ++i) {
       this->PrintIndent();
-      DataType elem_type = t.element_of();
+      Type elem_type = t.element_of();
       if (!HandleTypeMatch(op->buffer_var.get(), elem_type)) {
         stream << "((";
-        if (op->buffer_var.get()->dtype.is_handle()) {
+        if (op->buffer_var.get()->type.is_handle()) {
           auto it = alloc_storage_scope_.find(op->buffer_var.get());
           if (it != alloc_storage_scope_.end()) {
             PrintStorageScope(it->second, stream);
@@ -704,9 +704,9 @@ void CodeGenC::VisitStmt_(const Store* op) {
         stream << vid;
       }
       stream << '[';
-      PrintVecElemLoad(index, op->index.dtype(), i, stream);
+      PrintVecElemLoad(index, op->index.type(), i, stream);
       stream << "] = ";
-      PrintVecElemLoad(value, op->value.dtype(), i, stream);
+      PrintVecElemLoad(value, op->value.type(), i, stream);
       stream << ";\n";
     }
     EndScope(vec_scope);
@@ -723,7 +723,7 @@ void CodeGenC::VisitExpr_(const Let* op, std::ostream& os) {  // NOLINT(*)
 
 void CodeGenC::VisitExpr_(const Ramp* op, std::ostream& os) {  // NOLINT(*)
   // constraint of current logic
-  CHECK_EQ(op->base.dtype(), DataType::Int(32));
+  CHECK_EQ(op->base.type(), Int(32));
   os << "((int" << op->lanes << ")(";
   for (int i = 0; i < op->lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")" << "+(" << PrintExpr(op->stride) << "*" << i <<")";
@@ -758,7 +758,7 @@ void CodeGenC::VisitStmt_(const LetStmt* op) {
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.dtype() == DataType::Handle() &&
+    if (op->var.type() == Handle() &&
         handle_data_type_.count(op->var.get())) {
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "* "
@@ -767,7 +767,7 @@ void CodeGenC::VisitStmt_(const LetStmt* op) {
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "*)" << value << ";\n";
     } else {
-      PrintType(op->var.dtype(), this->stream);
+      PrintType(op->var.type(), this->stream);
       this->stream << ' '
                    << AllocVarID(op->var.get())
                    << " = " << value << ";\n";
@@ -784,7 +784,7 @@ void CodeGenC::VisitStmt_(const Allocate* op) {
     CHECK_EQ(op->free_function, "nop");
     std::string new_data = PrintExpr(op->new_expr);
     this->PrintIndent();
-    PrintType(op->dtype, stream);
+    PrintType(op->type, stream);
     stream << "* "<< vid << '=' << new_data << ";\n";
   } else {
     this->PrintIndent();
@@ -795,11 +795,11 @@ void CodeGenC::VisitStmt_(const Allocate* op) {
       std::string scope = alloc_storage_scope_.at(buffer);
       PrintStorageScope(scope, stream);
       stream << ' ';
-      PrintType(op->dtype, stream);
+      PrintType(op->type, stream);
       stream << ' '<< vid << '['
              << constant_size << "];\n";
     }
-    RegisterHandleType(op->buffer_var.get(), op->dtype);
+    RegisterHandleType(op->buffer_var.get(), op->type);
     this->PrintStmt(op->body);
 }
 
@@ -841,7 +841,7 @@ void CodeGenC::VisitStmt_(const For* op) {
   std::string vid = AllocVarID(op->loop_var.get());
   CHECK(is_zero(op->min));
   stream << "for (";
-  PrintType(op->loop_var.dtype(), stream);
+  PrintType(op->loop_var.type(), stream);
   stream << ' ' << vid << " = 0; "
          << vid << " < " << extent
          << "; ++" << vid << ") {\n";
@@ -890,7 +890,7 @@ void CodeGenC::VisitStmt_(const Evaluate *op) {
     CHECK_EQ(call->args.size(), 4);
     std::string value = PrintExpr(call->args[3]);
     std::string ref = GetStructRef(
-        call->args[3].dtype(),
+        call->args[3].type(),
         call->args[0], call->args[1],
         call->args[2].as<IntImm>()->value);
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index b8d357051998..8701cda1e14c 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -147,7 +147,7 @@ class CodeGenC :
    * \param t The type representation.
    * \param os The stream to print the ctype into
    */
-  virtual void PrintType(DataType t, std::ostream& os);  // NOLINT(*)
+  virtual void PrintType(Type t, std::ostream& os);  // NOLINT(*)
   /*!
    * \brief Print expr representing the thread tag
    * \param IterVar iv The thread index to be binded;
@@ -157,51 +157,51 @@ class CodeGenC :
   virtual void PrintStorageSync(const Call* op);  // NOLINT(*)
   // Binary vector op.
   virtual void PrintVecBinaryOp(
-      const std::string&op, DataType op_type,
+      const std::string&op, Type op_type,
       Expr lhs, Expr rhs, std::ostream& os);  // NOLINT(*)
   // print vector load
-  virtual std::string GetVecLoad(DataType t, const Variable* buffer, Expr base);
+  virtual std::string GetVecLoad(Type t, const Variable* buffer, Expr base);
   // print vector store
   virtual void PrintVecStore(const Variable* buffer,
-                             DataType t, Expr base,
+                             Type t, Expr base,
                              const std::string& value);  // NOLINT(*)
   // print load of single element
   virtual void PrintVecElemLoad(
-      const std::string& vec, DataType t, int i, std::ostream& os);  // NOLINT(*)
+      const std::string& vec, Type t, int i, std::ostream& os);  // NOLINT(*)
   // print store of single element.
   virtual void PrintVecElemStore(
-      const std::string& vec, DataType t, int i, const std::string& value);
+      const std::string& vec, Type t, int i, const std::string& value);
   // Get a cast type from to
-  virtual std::string CastFromTo(std::string value, DataType from, DataType target);
+  virtual std::string CastFromTo(std::string value, Type from, Type target);
 
  protected:
   // Print reference to struct location
   std::string GetStructRef(
-      DataType t, const Expr& buffer, const Expr& index, int kind);
+      Type t, const Expr& buffer, const Expr& index, int kind);
   // print reference to a buffer as type t in index.
   virtual std::string GetBufferRef(
-      DataType t, const Variable* buffer, Expr index);
+      Type t, const Variable* buffer, Expr index);
   /*!
    * \brief If buffer is allocated as type t.
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
    */
-  bool HandleTypeMatch(const Variable* buf_var, DataType t) const;
+  bool HandleTypeMatch(const Variable* buf_var, Type t) const;
   /*!
    * \brief Register the data type of buf_var
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
   */
-  void RegisterHandleType(const Variable* buf_var, DataType t);
+  void RegisterHandleType(const Variable* buf_var, Type t);
   // override
   void PrintSSAAssign(
-      const std::string& target, const std::string& src, DataType t) final;
+      const std::string& target, const std::string& src, Type t) final;
   /*! \brief restrict keyword */
   std::string restrict_keyword_{""};
   /*! \brief the storage scope of allocation */
   std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
-  std::unordered_map<const Variable*, DataType> handle_data_type_;
+  std::unordered_map<const Variable*, Type> handle_data_type_;
 
   /*! \brief reserves common C keywords */
   void ReserveKeywordsAsUnique();
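// [Editor's illustration, not part of the patch] Every hunk in these codegen
// files swaps tvm::DataType back to the older Type alias; both carry the same
// (code, bits, lanes) triple, only the constructor spelling changes (Float(32)
// versus DataType::Float(32)). A self-contained sketch of that triple;
// MiniType and the free constructors are hypothetical shorthand, not TVM API.
#include <cassert>
#include <cstdint>

struct MiniType {
  uint8_t code;    // 0 = int, 1 = uint, 2 = float (mirrors DLDataTypeCode)
  uint8_t bits;    // width of one lane
  uint16_t lanes;  // vector width, 1 for scalars
};

// Old-style free constructors, as in the pre-rename API surface.
inline MiniType Int(int bits, int lanes = 1) {
  return {0, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)};
}
inline MiniType Float(int bits, int lanes = 1) {
  return {2, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)};
}

int main() {
  MiniType t = Float(32, 4);  // post-rename spelling: DataType::Float(32, 4)
  assert(t.bits == 32 && t.lanes == 4);
  MiniType i = Int(32);
  assert(i.code == 0 && i.lanes == 1);
  return 0;
}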
diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc
index f2c54c2700c9..9c099a425fd6 100644
--- a/src/codegen/codegen_c_host.cc
+++ b/src/codegen/codegen_c_host.cc
@@ -48,7 +48,7 @@ void CodeGenCHost::AddFunction(LoweredFunc f) {
   ReserveKeywordsAsUnique();
   // add to alloc buffer type.
   for (const auto & kv : f->handle_data_type) {
-    RegisterHandleType(kv.first.get(), kv.second.dtype());
+    RegisterHandleType(kv.first.get(), kv.second.type());
   }
 
   this->stream << "#ifdef __cplusplus\n";
@@ -59,7 +59,7 @@ void CodeGenCHost::AddFunction(LoweredFunc f) {
     Var v = f->args[i];
     std::string vid = AllocVarID(v.get());
     if (i != 0) stream << ", ";
-    if (v.dtype().is_handle()) {
+    if (v.type().is_handle()) {
       auto it = alloc_storage_scope_.find(v.get());
       if (it != alloc_storage_scope_.end()) {
         PrintStorageScope(it->second, stream);
@@ -77,7 +77,7 @@ void CodeGenCHost::AddFunction(LoweredFunc f) {
         stream << ' ' << restrict_keyword_;
       }
     } else {
-      PrintType(v.dtype(), stream);
+      PrintType(v.type(), stream);
     }
     stream << ' ' << vid;
   }
@@ -96,14 +96,14 @@ std::string CodeGenCHost::Finish() {
   return CodeGenC::Finish();
 }
 
-void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenCHost::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   int lanes = t.lanes();
   if (t.is_handle()) {
     CHECK_EQ(lanes, 1)
        << "does not support vector types";
     os << "void*"; return;
   }
-  if (t == DataType::Bool()) {
+  if (t == Bool()) {
     os << "bool"; return;
   }
   bool fail = false;
@@ -145,7 +145,7 @@ void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 
 void CodeGenCHost::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
   std::string v = PrintExpr(op->value);
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->type, os);
   os << ")(";
   for (int i = 0; i < op->lanes; ++i) {
     if (i != 0) os << ", ";
@@ -268,10 +268,10 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op,
                                                std::ostream& os) {  // NOLINT(*)
   std::ostringstream temp_a;
   VisitExpr(op->a, temp_a);
-  std::string a_id = SSAGetID(temp_a.str(), op->a.dtype());
+  std::string a_id = SSAGetID(temp_a.str(), op->a.type());
   std::ostringstream temp_b;
   VisitExpr(op->b, temp_b);
-  std::string b_id = SSAGetID(temp_b.str(), op->b.dtype());
+  std::string b_id = SSAGetID(temp_b.str(), op->b.type());
 
   os << "((" << a_id << ") " << compare << " (" << b_id << ") "
      << "? (" << a_id << ") : (" << b_id << "))";
(" << a_id << ") : (" << b_id << "))"; diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 44f838536627..80e359c33ce0 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -39,7 +39,7 @@ class CodeGenCHost final : public CodeGenC { void AddFunction(LoweredFunc f); std::string Finish(); - void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) // overload visitor functions void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 06b542a66323..6656fa07740d 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -105,10 +105,10 @@ void CodeGenCUDA::VisitStmt_(const ir::For* op) { void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { CHECK(!var_idmap_.count(iv->var.get())); var_idmap_[iv->var.get()] = - CastFromTo(iv->thread_tag, DataType::UInt(32), iv->var.dtype()); + CastFromTo(iv->thread_tag, UInt(32), iv->var.type()); } -void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) +void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) @@ -137,7 +137,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; } - } else if (t == DataType::Bool()) { + } else if (t == Bool()) { os << "bool"; return; } else if (t.is_uint() || t.is_int()) { if (t.is_uint()) { @@ -199,7 +199,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } void CodeGenCUDA::PrintVecBinaryOp( - const std::string&op, DataType t, + const std::string&op, Type t, Expr lhs, Expr rhs, std::ostream& os) { // NOLINT(*) // unpacking operations. int lanes = t.lanes(); @@ -210,8 +210,8 @@ void CodeGenCUDA::PrintVecBinaryOp( int vec_scope = BeginScope(); // default: unpack into individual ops. - std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype()); - std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype()); + std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.type()); + std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.type()); std::string sret = GetUniqueName("_"); { // delcare type. 
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 06b542a66323..6656fa07740d 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -105,10 +105,10 @@ void CodeGenCUDA::VisitStmt_(const ir::For* op) {
 void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
   CHECK(!var_idmap_.count(iv->var.get()));
   var_idmap_[iv->var.get()] =
-      CastFromTo(iv->thread_tag, DataType::UInt(32), iv->var.dtype());
+      CastFromTo(iv->thread_tag, UInt(32), iv->var.type());
 }
 
-void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   int lanes = t.lanes();
   if (t.is_handle()) {
     CHECK_EQ(lanes, 1)
@@ -137,7 +137,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       if (!fail && (lanes >= 2 && lanes <= 4)) {
         os << lanes; return;
       }
-  } else if (t == DataType::Bool()) {
+  } else if (t == Bool()) {
     os << "bool"; return;
   } else if (t.is_uint() || t.is_int()) {
     if (t.is_uint()) {
@@ -199,7 +199,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 }
 
 void CodeGenCUDA::PrintVecBinaryOp(
-    const std::string&op, DataType t,
+    const std::string&op, Type t,
     Expr lhs, Expr rhs, std::ostream& os) {  // NOLINT(*)
   // unpacking operations.
   int lanes = t.lanes();
@@ -210,8 +210,8 @@ void CodeGenCUDA::PrintVecBinaryOp(
     int vec_scope = BeginScope();
 
     // default: unpack into individual ops.
-    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
-    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
+    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.type());
+    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.type());
     std::string sret = GetUniqueName("_");
     {
       // delcare type.
@@ -223,15 +223,15 @@ void CodeGenCUDA::PrintVecBinaryOp(
       std::ostringstream value_temp;
       if (isalpha(op[0])) {
         value_temp << op << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.type(), i, value_temp);
         value_temp << ", ";
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.type(), i, value_temp);
         value_temp << ")";
       } else {
         value_temp << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.type(), i, value_temp);
         value_temp << op;
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.type(), i, value_temp);
         value_temp << ")";
       }
       PrintVecElemStore(sret, t, i, value_temp.str());
@@ -242,7 +242,7 @@ void CodeGenCUDA::PrintVecBinaryOp(
 }
 
 void CodeGenCUDA::PrintVecElemLoad(
-    const std::string& vec, DataType t, int i, std::ostream& os) {  // NOLINT(*)
+    const std::string& vec, Type t, int i, std::ostream& os) {  // NOLINT(*)
   static const char access[] = {'x', 'y', 'z', 'w'};
   CHECK(i >= 0 && i < 4);
   if (t.is_int() && t.bits() == 8) {
@@ -253,7 +253,7 @@ void CodeGenCUDA::PrintVecElemLoad(
 }
 
 void CodeGenCUDA::PrintVecElemStore(
-    const std::string& vec, DataType t, int i, const std::string& value) {
+    const std::string& vec, Type t, int i, const std::string& value) {
   this->PrintIndent();
   static const char access[] = {'x', 'y', 'z', 'w'};
   CHECK(i >= 0 && i < 4);
@@ -390,7 +390,7 @@ void CodeGenCUDA::VisitStmt_(const Allocate* op) {
     CHECK_EQ(op->free_function, "nop");
     std::string new_data = PrintExpr(op->new_expr);
     this->PrintIndent();
-    PrintType(op->dtype, stream);
+    PrintType(op->type, stream);
     stream << "* "<< vid << '=' << new_data << ";\n";
   } else {
     this->PrintIndent();
@@ -401,27 +401,23 @@ void CodeGenCUDA::VisitStmt_(const Allocate* op) {
     std::string scope = alloc_storage_scope_.at(buffer);
     if (scope.find("wmma.") == 0) {
       if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
-        CHECK(op->dtype == DataType::Float(16) ||
-              op->dtype == DataType::Int(8) ||
-              op->dtype == DataType::UInt(8))
+        CHECK(op->type == Float(16) || op->type == Int(8) || op->type == UInt(8))
           << "Matrix_a and matrix_b only support half or char or unsigned char type for now";
       } else {
-        CHECK(op->dtype == DataType::Float(16) ||
-              op->dtype == DataType::Float(32) ||
-              op->dtype == DataType::Int(32))
+        CHECK(op->type == Float(16) || op->type == Float(32) || op->type == Int(32))
           << "Accumulator only support half, float and int type for now";
       }
       constant_size = GetWmmaFragmentSize(scope, buffer, constant_size);
-      PrintWmmaScope(scope, op->dtype, buffer, stream);
+      PrintWmmaScope(scope, op->type, buffer, stream);
     } else {
       PrintStorageScope(scope, stream);
       stream << ' ';
-      PrintType(op->dtype, stream);
+      PrintType(op->type, stream);
     }
     stream << ' '<< vid << '['
           << constant_size << "];\n";
  }
-  RegisterHandleType(op->buffer_var.get(), op->dtype);
+  RegisterHandleType(op->buffer_var.get(), op->type);
   this->PrintStmt(op->body);
 }
 
@@ -453,7 +449,7 @@ void CodeGenCUDA::VisitExpr_(const Ramp* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
-  if (op->dtype.is_int() && op->dtype.bits() == 8 && op->lanes == 4) {
+  if (op->type.is_int() && op->type.bits() == 8 && op->lanes == 4) {
     // make_int8x4
     const int64_t *p = as_const_int(op->value);
     CHECK(p);
@@ -465,7 +461,7 @@ void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
 
   std::string v = PrintExpr(op->value);
   os << "make_";
-  PrintType(op->dtype, os);
+  PrintType(op->type, os);
   os << '(';
   for (int i = 0; i < op->lanes; ++i) {
     if (i != 0) os << ", ";
@@ -477,11 +473,11 @@ void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
 void CodeGenCUDA::VisitExpr_(const Shuffle* op, std::ostream &os) {
   std::vector<std::string> to_shuffle(op->vectors.size());
   for (int i = 0, e = op->vectors.size(); i < e; ++i) {
-    CHECK(op->vectors[i].dtype().lanes() == 1) << "Only scalars can be shuffled in CUDA!";
+    CHECK(op->vectors[i].type().lanes() == 1) << "Only scalars can be shuffled in CUDA!";
     to_shuffle[i] = PrintExpr(op->vectors[i]);
   }
   os << "make_";
-  PrintType(op->dtype, os);
+  PrintType(op->type, os);
   os << '(';
   for (int i = 0, e = op->indices.size(); i < e; ++i) {
     const int64_t *val = as_const_int(op->indices[i]);
@@ -493,21 +489,21 @@ void CodeGenCUDA::VisitExpr_(const Shuffle* op, std::ostream &os) {
 }
 
 inline void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p) {  // NOLINT(*)
-  switch (op->dtype.bits()) {
+  switch (op->type.bits()) {
     case 64: case 32: {
      std::ostringstream temp;
       if (std::isinf(op->value)) {
         if (op->value < 0) {
           temp << "-";
         }
-        temp << ((op->dtype.bits() == 32) ? "CUDART_INF_F" : "CUDART_INF");
+        temp << ((op->type.bits() == 32) ? "CUDART_INF_F" : "CUDART_INF");
         p->need_math_constants_h_ = true;
       } else if (std::isnan(op->value)) {
-        temp << ((op->dtype.bits() == 32) ? "CUDART_NAN_F" : "CUDART_NAN");
+        temp << ((op->type.bits() == 32) ? "CUDART_NAN_F" : "CUDART_NAN");
         p->need_math_constants_h_ = true;
       } else {
         temp << std::scientific << op->value;
-        if (op->dtype.bits() == 32) temp << 'f';
+        if (op->type.bits() == 32) temp << 'f';
       }
       p->MarkConst(temp.str());
       os << temp.str();
@@ -518,7 +514,7 @@ inline void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p) {  // NOLINT(*)
       os << '(' << std::scientific << op->value << 'f' << ')';
       break;
     }
-    default: LOG(FATAL) << "Bad bit-width for float: " << op->dtype << "\n";
+    default: LOG(FATAL) << "Bad bit-width for float: " << op->type << "\n";
   }
 }
 
@@ -527,7 +523,7 @@ void CodeGenCUDA::VisitExpr_(const FloatImm *op, std::ostream& os) {  // NOLINT(*)
   PrintConst(op, os, this);
 }
 
-void CodeGenCUDA::PrintWmmaScope(const std::string &scope, DataType t,
+void CodeGenCUDA::PrintWmmaScope(const std::string &scope, Type t,
                                  const Variable* variable, std::ostream &os) {
   std::stringstream type;
   PrintType(t, type);
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 74d6fba35fc7..efb300415b56 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- *
+ * 
  *   http://www.apache.org/licenses/LICENSE-2.0
- *
+ * 
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -47,13 +47,13 @@ class CodeGenCUDA final : public CodeGenC {
   void PrintStorageSync(const Call* op) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintVecBinaryOp(
-      const std::string&op, DataType t,
+      const std::string&op, Type t,
       Expr lhs, Expr rhs, std::ostream& os) final;  // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
+  void PrintType(Type t, std::ostream& os) final;  // NOLINT(*)
   void PrintVecElemLoad(
-      const std::string& vec, DataType t, int i, std::ostream& os) final;  // NOLINT(*)
+      const std::string& vec, Type t, int i, std::ostream& os) final;  // NOLINT(*)
   void PrintVecElemStore(
-      const std::string& vec, DataType t, int i, const std::string& value) final;
+      const std::string& vec, Type t, int i, const std::string& value) final;
   void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
   // overload visitor
   void VisitExpr_(const Ramp* op, std::ostream& os) final;  // NOLINT(*)
@@ -84,10 +84,8 @@ class CodeGenCUDA final : public CodeGenC {
   std::unordered_map<const Variable*, std::string> fragment_shapes;
   std::unordered_map<const Variable*, std::string> fragment_layouts;
   friend void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p);
-  void PrintWmmaScope(
-      const std::string& scope, DataType t, const Variable* variable, std::ostream& os);
-  int32_t GetWmmaFragmentSize(
-      const std::string &scope, const Variable* variable, int32_t size);
+  void PrintWmmaScope(const std::string& scope, Type t, const Variable* variable, std::ostream& os);
+  int32_t GetWmmaFragmentSize(const std::string &scope, const Variable* variable, int32_t size);
 };
 
 }  // namespace codegen
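// [Editor's illustration, not part of the patch] CodeGenCUDA::PrintVecBinaryOp
// above lowers a vector operation into per-lane scalar ops through the
// .x/.y/.z/.w accessors that PrintVecElemLoad/Store spell out. A standalone
// sketch of that unpacking over a hypothetical float4-like struct.
#include <iostream>

struct F4 { float x, y, z, w; };

static F4 VecAdd(F4 a, F4 b) {
  F4 r;
  r.x = a.x + b.x;  // lane 0
  r.y = a.y + b.y;  // lane 1
  r.z = a.z + b.z;  // lane 2
  r.w = a.w + b.w;  // lane 3
  return r;
}

int main() {
  F4 r = VecAdd({1, 2, 3, 4}, {10, 20, 30, 40});
  std::cout << r.x << " " << r.y << " " << r.z << " " << r.w << "\n";
  return 0;
}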
this->stream << "kernel void " << f->name << "(\n"; @@ -65,7 +65,7 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { size_t num_buffer = 0; for (size_t i = 0; i < f->args.size(); ++i, ++num_buffer) { Var v = f->args[i]; - if (!v.dtype().is_handle()) break; + if (!v.type().is_handle()) break; stream << " "; std::string vid = AllocVarID(v.get()); auto it = alloc_storage_scope_.find(v.get()); @@ -76,7 +76,7 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { PrintType(handle_data_type_.at(v.get()), stream); stream << "*"; } else { - PrintType(v.dtype(), stream); + PrintType(v.type(), stream); } stream << ' ' << vid << " [[ buffer(" << i << ") ]],\n"; @@ -92,19 +92,19 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { decl_stream << "struct " << arg_buf_type << " {\n"; for (size_t i = num_buffer; i < f->args.size(); ++i) { Var v = f->args[i]; - CHECK(!v.dtype().is_handle()); + CHECK(!v.type().is_handle()); std::string vid = AllocVarID(v.get()); std::ostringstream vref; - if (v.dtype().bits() == 32) { + if (v.type().bits() == 32) { decl_stream << " "; - PrintType(v.dtype(), decl_stream); + PrintType(v.type(), decl_stream); decl_stream << " " << vid << ";\n"; vref << varg << "." << vid; } else { // For non 32bit type, ref through arg union. decl_stream << " __TVMArgUnion " << vid << ";\n"; vref << varg << "." << vid << ".v_"; - PrintType(v.dtype(), vref); + PrintType(v.type(), vref); } var_idmap_[v.get()] = vref.str(); } @@ -121,10 +121,10 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { if (work_dim != 0) { // use ushort by default for now stream << " "; - PrintType(DataType::UInt(thread_index_bits_, work_dim), stream); + PrintType(UInt(thread_index_bits_, work_dim), stream); stream << " blockIdx [[threadgroup_position_in_grid]],\n"; stream << " "; - PrintType(DataType::UInt(thread_index_bits_, work_dim), stream); + PrintType(UInt(thread_index_bits_, work_dim), stream); stream << " threadIdx [[thread_position_in_threadgroup]]\n"; } // bind thread axis @@ -135,7 +135,7 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { vname = vname.substr(0, iv->thread_tag.length() - 2); } var_idmap_[iv->var.get()] = - CastFromTo(vname, DataType::UInt(thread_index_bits_), iv->var.dtype()); + CastFromTo(vname, UInt(thread_index_bits_), iv->var.type()); } // the function scope. 
stream << ") {\n"; @@ -149,17 +149,17 @@ void CodeGenMetal::AddFunction(LoweredFunc f) { void CodeGenMetal::BindThreadIndex(const IterVar& iv) { CHECK(!var_idmap_.count(iv->var.get())); var_idmap_[iv->var.get()] = - CastFromTo(iv->thread_tag, DataType::UInt(thread_index_bits_), iv->var.dtype()); + CastFromTo(iv->thread_tag, UInt(thread_index_bits_), iv->var.type()); } -void CodeGenMetal::PrintType(DataType t, std::ostream& os) { // NOLINT(*) +void CodeGenMetal::PrintType(Type t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { CHECK_EQ(lanes, 1) << "do not yet support vector types"; os << "void*"; return; } - if (t == DataType::Bool()) { + if (t == Bool()) { os << "bool"; return; } bool fail = false; @@ -210,13 +210,13 @@ void CodeGenMetal::PrintStorageSync(const Call* op) { } void CodeGenMetal::PrintVecElemLoad(const std::string& vec, - DataType t, int i, + Type t, int i, std::ostream& os) { // NOLINT(*) os << vec << "[" << i << "]"; } void CodeGenMetal::PrintVecElemStore(const std::string& vec, - DataType t, int i, + Type t, int i, const std::string& value) { this->PrintIndent(); stream << vec << "[" << i << "]" @@ -236,7 +236,7 @@ void CodeGenMetal::PrintStorageScope( void CodeGenMetal::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) std::string v = PrintExpr(op->value); - PrintType(op->dtype, os); + PrintType(op->type, os); os << "("; for (int i = 0; i < op->lanes; ++i) { if (i != 0) os << ", "; @@ -249,7 +249,7 @@ void CodeGenMetal::VisitExpr_(const Call* op, std::ostream& os) { // NOLINT(*) if (op->is_intrinsic(Call::reinterpret)) { // generate as_type(ARG) os << "(as_type<"; - this->PrintType(op->dtype, os); + this->PrintType(op->type, os); os << ">("; this->PrintExpr(op->args[0], os); os << "))"; diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h index 728e3e07a916..c009cd1e9169 100644 --- a/src/codegen/codegen_metal.h +++ b/src/codegen/codegen_metal.h @@ -41,14 +41,14 @@ class CodeGenMetal final : public CodeGenC { void InitFuncState(LoweredFunc f) final; void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintStorageSync(const Call* op) final; // NOLINT(*) - void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) + void PrintType(Type t, std::ostream& os) final; // NOLINT(*) void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) // print load of single element void PrintVecElemLoad( - const std::string& vec, DataType t, int i, std::ostream& os) final; // NOLINT(*) + const std::string& vec, Type t, int i, std::ostream& os) final; // NOLINT(*) // print store of single element. void PrintVecElemStore( - const std::string& vec, DataType t, int i, const std::string& value) final; + const std::string& vec, Type t, int i, const std::string& value) final; // overload visitor void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc index ae434197400f..49dccb173ed3 100644 --- a/src/codegen/codegen_opencl.cc +++ b/src/codegen/codegen_opencl.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- *
+ * 
  *   http://www.apache.org/licenses/LICENSE-2.0
- *
+ * 
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -39,7 +39,7 @@ CodeGenOpenCL::CodeGenOpenCL() {
 void CodeGenOpenCL::InitFuncState(LoweredFunc f) {
   CodeGenC::InitFuncState(f);
   for (Var arg : f->args) {
-    if (arg.dtype().is_handle()) {
+    if (arg.type().is_handle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -89,17 +89,17 @@ void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) {
     os << "get_group_id(" << ts.dim_index << ")";
   }
   var_idmap_[iv->var.get()] =
-      CastFromTo(os.str(), DataType::UInt(64), iv->var.dtype());
+      CastFromTo(os.str(), UInt(64), iv->var.type());
 }
 
-void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenOpenCL::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   int lanes = t.lanes();
   if (t.is_handle()) {
     CHECK_EQ(lanes, 1)
         << "do not yet support vector types";
     os << "void*"; return;
   }
-  if (t == DataType::Bool()) {
+  if (t == Bool()) {
     os << "bool"; return;
   }
   bool fail = false;
@@ -144,7 +144,7 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
   LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type";
 }
 
-void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, DataType t,
+void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t,
                                  Expr base, std::ostream& os) {  // NOLINT(*)
   if (!HandleTypeMatch(buffer, t.element_of())) {
     os << '(';
@@ -160,7 +160,7 @@ void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, DataType t,
   PrintExpr(base, os);
 }
 std::string CodeGenOpenCL::GetVecLoad(
-    DataType t, const Variable* buffer, Expr base) {
+    Type t, const Variable* buffer, Expr base) {
   std::ostringstream os;
   os << "vload" << t.lanes() << "(0, ";
   PrintVecAddr(buffer, t, base, os);
@@ -169,7 +169,7 @@ std::string CodeGenOpenCL::GetVecLoad(
 }
 
 void CodeGenOpenCL::PrintVecStore(const Variable* buffer,
-                                  DataType t, Expr base,
+                                  Type t, Expr base,
                                   const std::string& value) {
   this->PrintIndent();
   stream << "vstore" << t.lanes() << "(" << value << ", 0, ";
@@ -199,7 +199,7 @@ void CodeGenOpenCL::PrintStorageScope(
   }
 }
 
-std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) {
   if (from == target) return value;
   std::ostringstream os;
   if (target.lanes() == 1) {
@@ -218,7 +218,7 @@ std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) {
 void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
   std::string v = PrintExpr(op->value);
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->type, os);
   os << ")(";
   for (int i = 0; i < op->lanes; ++i) {
     if (i != 0) os << ", ";
@@ -232,7 +232,7 @@ void CodeGenOpenCL::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
    * add a cast */
  if (op->is_intrinsic(intrinsic::tvm_if_then_else)) {
     os << "(";
-    PrintType(op->args[2].dtype(), os);
+    PrintType(op->args[2].type(), os);
     os << ")";
   }
   CodeGenC::VisitExpr_(op, os);
@@ -242,7 +242,7 @@ void CodeGenOpenCL::VisitExpr_(const Select* op, std::ostream& os) {  // NOLINT(*)
   /* Return type of ternary expression is not always same as its sub-expressions,
    * add a cast */
   os << "(";
-  PrintType(op->true_value.dtype(), os);
+  PrintType(op->true_value.type(), os);
   os << ")";
   CodeGenC::VisitExpr_(op, os);
 }
diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h
index 36324eb431ae..32f4501276e7 100644
--- a/src/codegen/codegen_opencl.h
+++ b/src/codegen/codegen_opencl.h
@@ -43,16 +43,16 @@ class CodeGenOpenCL final : public CodeGenC {
   void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const Call* op) final;  // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  std::string GetVecLoad(DataType t, const Variable* buffer,
+  void PrintType(Type t, std::ostream& os) final;  // NOLINT(*)
+  std::string GetVecLoad(Type t, const Variable* buffer,
                          Expr base) final;
   void PrintVecStore(const Variable* buffer,
-                     DataType t, Expr base,
+                     Type t, Expr base,
                      const std::string& value) final;  // NOLINT(*)
   // the address of load/store
-  void PrintVecAddr(const Variable* buffer, DataType t,
+  void PrintVecAddr(const Variable* buffer, Type t,
                     Expr base, std::ostream& os);  // NOLINT(*)
-  std::string CastFromTo(std::string value, DataType from, DataType target);  // NOLINT(*)
+  std::string CastFromTo(std::string value, Type from, Type target);  // NOLINT(*)
   // overload visitor
   void VisitExpr_(const Broadcast* op, std::ostream& os) final;  // NOLINT(*)
diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc
index db14be3b395e..52e04db12480 100644
--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
@@ -59,7 +59,7 @@ void CodeGenOpenGL::AddFunction(LoweredFunc f) {
   GetUniqueName("_");
   // add to alloc buffer type.
   for (const auto& kv : f->handle_data_type) {
-    RegisterHandleType(kv.first.get(), kv.second.dtype());
+    RegisterHandleType(kv.first.get(), kv.second.type());
   }
 
   // Allocate argument names. Store in `var_idmap_`.
@@ -93,7 +93,7 @@ void CodeGenOpenGL::AddFunction(LoweredFunc f) {
       auto type_it = this->handle_data_type_.find(arg.get());
       CHECK(type_it != this->handle_data_type_.cend()) << "Cannot find type.";
-      DLDataType type = type_it->second;
+      auto type = Type2TVMType(type_it->second);
       CHECK_EQ(type.lanes, 1) << "Vector type not supported.";
 
       switch (type.code) {
@@ -129,7 +129,7 @@ void CodeGenOpenGL::AddFunction(LoweredFunc f) {
       // Format: "uniform {type} {name};"
       auto arg_name = GetVarID(arg.get());
-      auto type = arg.get()->dtype;
+      auto type = arg.get()->type;
 
       this->decl_stream << "uniform ";
       PrintType(type, this->decl_stream);
@@ -207,7 +207,7 @@ std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) {
 
 // Print a reference expression to a buffer.
 // Format: texelFetch(buffer, index, 0).r
 std::string CodeGenOpenGL::GetBufferRef(
-    DataType t, const Variable* buffer, Expr index) {
+    Type t, const Variable* buffer, Expr index) {
   CHECK_EQ(t.lanes(), 1) << "Vector type not supported.";
   CHECK(HandleTypeMatch(buffer, t)) << "Type mismatch not supported.";
@@ -221,7 +221,7 @@ std::string CodeGenOpenGL::GetBufferRef(
   }
 }
 
-void CodeGenOpenGL::PrintType(DataType t, std::ostream& os) {
+void CodeGenOpenGL::PrintType(Type t, std::ostream& os) {
   switch (t.code()) {
     case kDLInt:
       CHECK_EQ(t.bits(), 32) << "Only support 32-bit int.";
@@ -243,17 +243,17 @@ void CodeGenOpenGL::PrintType(DataType t, std::ostream& os) {
 
 // Codegen for immediate values
 void CodeGenOpenGL::VisitExpr_(const IntImm* op, std::ostream& os) {
-  CHECK_EQ(op->dtype, DataType::Int(32)) << "GLSL 3.0 only supports 32-bit ints.";
+  CHECK_EQ(op->type, Int(32)) << "GLSL 3.0 only supports 32-bit ints.";
   CodeGenC::VisitExpr_(op, os);
 }
 
 void CodeGenOpenGL::VisitExpr_(const UIntImm* op, std::ostream& os) {
-  CHECK_EQ(op->dtype, DataType::UInt(32)) << "GLSL 3.0 only supports 32-bit uints.";
+  CHECK_EQ(op->type, UInt(32)) << "GLSL 3.0 only supports 32-bit uints.";
   CodeGenC::VisitExpr_(op, os);
 }
 
 void CodeGenOpenGL::VisitExpr_(const FloatImm* op, std::ostream& os) {
-  CHECK_EQ(op->dtype, DataType::Float(32)) << "GLSL 3.0 only supports 32-bit floats.";
+  CHECK_EQ(op->type, Float(32)) << "GLSL 3.0 only supports 32-bit floats.";
   CodeGenC::VisitExpr_(op, os);
 }
 
@@ -273,7 +273,7 @@ void CodeGenOpenGL::VisitStmt_(const Evaluate* op) {
   auto value = call->args[1];
 
   // Doesn't support store to vector.
-  auto type = value.dtype();
+  auto type = value.type();
   CHECK_EQ(type.lanes(), 1)
     << "Vectorized store not implemented, type = " << type;
diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h
index 46e87a8165c1..d18052f5f46c 100644
--- a/src/codegen/codegen_opengl.h
+++ b/src/codegen/codegen_opengl.h
@@ -45,8 +45,8 @@ class CodeGenOpenGL final : public CodeGenC {
   void BindThreadIndex(const IterVar& iv) final;
   void VisitStmt_(const Store* op) final;
   std::string TexelFetch(const Variable* buffer, Expr index);
-  std::string GetBufferRef(DataType t, const Variable* buffer, Expr index) final;
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
+  std::string GetBufferRef(Type t, const Variable* buffer, Expr index) final;
+  void PrintType(Type t, std::ostream& os) final;  // NOLINT(*)
 
   // Codegen for immediate values
   void VisitExpr_(const IntImm* op, std::ostream& os) final;  // NOLINT(*)
diff --git a/src/codegen/codegen_source_base.cc b/src/codegen/codegen_source_base.cc
index 7c4ed5b91c8b..9a9f525d40f1 100644
--- a/src/codegen/codegen_source_base.cc
+++ b/src/codegen/codegen_source_base.cc
@@ -52,7 +52,7 @@ std::string CodeGenSourceBase::GetUniqueName(std::string prefix) {
   return prefix;
 }
 
-std::string CodeGenSourceBase::SSAGetID(std::string src, DataType t) {
+std::string CodeGenSourceBase::SSAGetID(std::string src, Type t) {
   if (name_alloc_map_.count(src)) return src;
   auto it = ssa_assign_map_.find(src);
   if (it != ssa_assign_map_.end()) {
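// [Editor's illustration, not part of the patch] SSAGetID above hands back one
// id per distinct source expression, so repeated subexpressions are emitted
// once. A minimal sketch of that memoisation; the single map and the "_N"
// naming are simplified stand-ins for CodeGenSourceBase's bookkeeping.
#include <iostream>
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, std::string> ssa_assign_map;
static int id_counter = 0;

static std::string SSAGetID(const std::string& src) {
  auto it = ssa_assign_map.find(src);
  if (it != ssa_assign_map.end()) return it->second;  // already bound
  std::string id = "_" + std::to_string(id_counter++);
  ssa_assign_map[src] = id;
  return id;
}

int main() {
  std::cout << SSAGetID("a + b") << "\n";  // _0
  std::cout << SSAGetID("a + b") << "\n";  // _0 again: shared subexpression
  std::cout << SSAGetID("a * b") << "\n";  // _1
  return 0;
}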
diff --git a/src/codegen/codegen_source_base.h b/src/codegen/codegen_source_base.h
index 7fd0eef98a90..e0608c6afbde 100644
--- a/src/codegen/codegen_source_base.h
+++ b/src/codegen/codegen_source_base.h
@@ -79,7 +79,7 @@ class CodeGenSourceBase {
    * \param src The source expression
    * \param t The type of the expression.
    */
-  std::string SSAGetID(std::string src, DataType t);
+  std::string SSAGetID(std::string src, Type t);
   /*!
    * \brief get a unique name with the corresponding prefix
    * \param prefix The prefix of the name
@@ -103,7 +103,7 @@ class CodeGenSourceBase {
    * \param t The type of target.
    */
   virtual void PrintSSAAssign(
-      const std::string& target, const std::string& src, DataType t) = 0;
+      const std::string& target, const std::string& src, Type t) = 0;
 
   /*! \brief the declaration stream */
   std::ostringstream decl_stream;
diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc
index 40550d9f9916..84329f90ddfc 100644
--- a/src/codegen/codegen_vhls.cc
+++ b/src/codegen/codegen_vhls.cc
@@ -37,7 +37,7 @@ void CodeGenVivadoHLS::Init(bool output_ssa) {
   this->stream << "#include <ap_int.h>\n\n";
 }
 
-void CodeGenVivadoHLS::PrintType(DataType t, std::ostream& os) {
+void CodeGenVivadoHLS::PrintType(Type t, std::ostream& os) {
   if (t.is_uint()) {
     switch (t.bits()) {
      case 8:
@@ -78,7 +78,7 @@ void CodeGenVivadoHLS::PreFunctionBody(LoweredFunc f) {
   for (size_t i = 0; i < f->args.size(); ++i) {
     Var v = f->args[i];
     std::string vid = GetVarID(v.get());
-    if (v.dtype().is_handle()) {
+    if (v.type().is_handle()) {
       this->stream << "#pragma HLS INTERFACE m_axi port=" << vid << " offset=slave bundle=gmem\n";
     }
     this->stream << "#pragma HLS INTERFACE s_axilite port=" << vid << " bundle=control\n";
@@ -100,8 +100,8 @@ inline void PrintBinaryExpr(const T* op,
 
 void CodeGenVivadoHLS::VisitExpr_(const Min *op, std::ostream& os) {  // NOLINT(*)
   const char *opstr = "std::min";
-  if (op->dtype.is_float()) {
-    switch (op->dtype.bits()) {
+  if (op->type.is_float()) {
+    switch (op->type.bits()) {
       case 32:
         opstr = "fminf"; break;
       case 64:
@@ -114,8 +114,8 @@ void CodeGenVivadoHLS::VisitExpr_(const Min *op, std::ostream& os) {  // NOLINT(*)
 
 void CodeGenVivadoHLS::VisitExpr_(const Max *op, std::ostream& os) {  // NOLINT(*)
   const char *opstr = "std::max";
-  if (op->dtype.is_float()) {
-    switch (op->dtype.bits()) {
+  if (op->type.is_float()) {
+    switch (op->type.bits()) {
       case 32:
         opstr = "fmaxf"; break;
       case 64:
diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h
index e678edb05198..4ec7b105385d 100644
--- a/src/codegen/codegen_vhls.h
+++ b/src/codegen/codegen_vhls.h
@@ -35,7 +35,7 @@ namespace codegen {
 class CodeGenVivadoHLS final : public CodeGenC {
  public:
  void Init(bool output_ssa);
-  void PrintType(DataType t, std::ostream& os);
+  void PrintType(Type t, std::ostream& os);
   void AddFunction(LoweredFunc f);
   void PreFunctionBody(LoweredFunc f);
   void VisitExpr_(const Min *op, std::ostream& os);
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc
index 219b485387d5..f765c0095ce1 100644
--- a/src/codegen/intrin_rule.cc
+++ b/src/codegen/intrin_rule.cc
@@ -57,7 +57,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.rsqrt")
     const Call* call = e.as<Call>();
     CHECK(call != nullptr);
 
-    auto one = make_const(call->args[0].dtype(), 1);
+    auto one = make_const(call->args[0].type(), 1);
     *rv = one / sqrt(call->args[0]);
   });
 
@@ -70,7 +70,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sigmoid")
     const Call* call = e.as<Call>();
     CHECK(call != nullptr);
 
-    auto one = make_const(call->args[0].dtype(), 1);
+    auto one = make_const(call->args[0].type(), 1);
     *rv = one / (one + exp(-call->args[0]));
   });
 
diff --git a/src/codegen/intrin_rule.h b/src/codegen/intrin_rule.h
index 581387da69cf..9f3bd793dd39 100644
--- a/src/codegen/intrin_rule.h
+++ b/src/codegen/intrin_rule.h
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- *
+ * 
  *   http://www.apache.org/licenses/LICENSE-2.0
- *
+ * 
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -37,10 +37,10 @@ using namespace ir;
 
 // Add float suffix to the intrinsics
 struct FloatSuffix {
-  std::string operator()(DataType t, std::string name) const {
-    if (t == DataType::Float(32)) {
+  std::string operator()(Type t, std::string name) const {
+    if (t == Float(32)) {
       return name + 'f';
-    } else if (t == DataType::Float(64)) {
+    } else if (t == Float(64)) {
       return name;
     } else {
       return "";
@@ -50,7 +50,7 @@ struct FloatSuffix {
 
 // Return the intrinsic name
 struct Direct {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     return name;
   }
 };
@@ -61,10 +61,10 @@ inline void DispatchExtern(const TVMArgs& args, TVMRetValue* rv) {
   Expr e = args[0];
   const Call* call = e.as<Call>();
   CHECK(call != nullptr);
-  std::string name = T()(call->dtype, call->name);
+  std::string name = T()(call->type, call->name);
   if (name.length() != 0) {
     *rv = Call::make(
-        call->dtype, name, call->args, Call::PureExtern);
+        call->type, name, call->args, Call::PureExtern);
   } else {
     *rv = e;
   }
diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc
index 3f6bc7ba1d06..4fed20fce51d 100644
--- a/src/codegen/intrin_rule_cuda.cc
+++ b/src/codegen/intrin_rule_cuda.cc
@@ -28,7 +28,7 @@ namespace codegen {
 namespace intrin {
 // Add float suffix to the intrinsics, CUDA fast math.
 struct CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     if (t.lanes() == 1) {
       if (t.is_float()) {
         switch (t.bits()) {
@@ -44,7 +44,7 @@ struct CUDAMath {
 };
 
 struct CUDAFastMath : public CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
@@ -55,7 +55,7 @@ struct CUDAFastMath : public CUDAMath {
 };
 
 struct CUDAPopcount {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     if (t.lanes() == 1 && t.is_uint()) {
       switch (t.bits()) {
         case 32: return "__popc";
@@ -68,7 +68,7 @@ struct CUDAPopcount {
 };
 
 struct CUDAShuffle {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     return "__shfl";
   }
 };
diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc
index 4b1d4033c16f..246747cc361d 100644
--- a/src/codegen/intrin_rule_opencl.cc
+++ b/src/codegen/intrin_rule_opencl.cc
@@ -66,7 +66,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.fmod")
 // There is no warp shuffle instruction in standard OpenCL
 // When shuffle is used, we assume it is intel's shuffle extension
 struct IntelShuffle {
-  std::string operator()(DataType t, std::string name) const {
+  std::string operator()(Type t, std::string name) const {
     return "intel_sub_group_shuffle";
   }
 };
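// [Editor's illustration, not part of the patch] The intrinsic tables above
// all follow one pattern: a small functor maps (type, name) to an extern
// symbol, and an empty string means "no rule, keep the original call". A
// standalone sketch with a FloatSuffix-like policy; MiniType and Dispatch are
// hypothetical shorthand for the TVM types involved.
#include <iostream>
#include <string>

struct MiniType { int bits; bool is_float; };

struct FloatSuffixPolicy {
  std::string operator()(MiniType t, const std::string& name) const {
    if (t.is_float && t.bits == 32) return name + "f";  // expf, sqrtf, ...
    if (t.is_float && t.bits == 64) return name;        // exp, sqrt, ...
    return "";                                          // no rule applies
  }
};

template <typename Policy>
std::string Dispatch(MiniType t, const std::string& name) {
  std::string s = Policy()(t, name);
  return s.empty() ? name + " (unlowered)" : s;
}

int main() {
  std::cout << Dispatch<FloatSuffixPolicy>({32, true}, "exp") << "\n";   // expf
  std::cout << Dispatch<FloatSuffixPolicy>({64, true}, "exp") << "\n";   // exp
  std::cout << Dispatch<FloatSuffixPolicy>({32, false}, "exp") << "\n";  // unlowered
  return 0;
}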
diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index f57a3ca869ef..491a304983c6 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -82,7 +82,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
           << "Can only handle constant size stack allocation in GPU";
       StorageInfo& info = alloc_storage_info_[op->buffer_var.get()];
       if (constant_size % 4 == 0 && info.alignment == 0) {
-        info.alignment = GetTempAllocaAlignment(op->dtype, constant_size);
+        info.alignment = GetTempAllocaAlignment(op->type, constant_size);
       }
       // maximum necessary alignment in the AMD devices
       if (info.alignment > 16) {
@@ -93,7 +93,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       // TODO(tqchen): for higher version of LLVM, local address space can be set.
       llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
           return builder_->CreateAlloca(
-              LLVMType(op->dtype), ConstInt32(constant_size));
+              LLVMType(op->type), ConstInt32(constant_size));
         });
       if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
 #if TVM_LLVM_VERSION >= 100
@@ -108,7 +108,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
           << "Can only allocate shared or local memory inside kernel";
       // Shared memory: address space  == 3
       const unsigned shared_address_space = 3;
-      llvm::Type* type = llvm::ArrayType::get(LLVMType(op->dtype), constant_size);
+      llvm::Type* type = llvm::ArrayType::get(LLVMType(op->type), constant_size);
      // Allocate shared memory in global, address_space = 3
       llvm::GlobalVariable *global = new llvm::GlobalVariable(
           *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared",
@@ -122,7 +122,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     }
   }
   buf = builder_->CreatePointerCast(
-      buf, LLVMType(op->dtype)->getPointerTo(
+      buf, LLVMType(op->type)->getPointerTo(
           buf->getType()->getPointerAddressSpace()));
   CHECK(!var_map_.count(op->buffer_var.get()));
   var_map_[op->buffer_var.get()] = buf;
diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc
index 4c092dfe377a..9b21455605c3 100644
--- a/src/codegen/llvm/codegen_arm.cc
+++ b/src/codegen/llvm/codegen_arm.cc
@@ -6,9 +6,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
- *
+ * 
  *   http://www.apache.org/licenses/LICENSE-2.0
- *
+ * 
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -61,14 +61,14 @@ Expr CodeGenARM::ARMPopcount(const Call *call) {
   ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu;
 
   // Fallback to default llvm lowering rule if input type not a full vector or half vector length
-  int total_size = call->dtype.bits() * call->dtype.lanes();
-  if (!call->dtype.is_vector() || call->dtype.bits() == 8 ||
+  int total_size = call->type.bits() * call->type.lanes();
+  if (!call->type.is_vector() || call->type.bits() == 8 ||
       (total_size != 128 && total_size != 64)) {
     Array<Expr> vcnt_args;
-    vcnt_args.push_back(ir::UIntImm::make(DataType::UInt(32), ctpop_id));
-    vcnt_args.push_back(ir::UIntImm::make(DataType::UInt(32), 1));
+    vcnt_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id));
+    vcnt_args.push_back(ir::UIntImm::make(UInt(32), 1));
     vcnt_args.push_back(e);
-    return ir::Call::make(call->dtype, "llvm_intrin", vcnt_args, Call::PureIntrinsic);
+    return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic);
   }
 
   // Popcount lowering rule:
@@ -77,12 +77,9 @@ Expr CodeGenARM::ARMPopcount(const Call *call) {
   // to return back to original input type
 
   // Dvisions are always divisible (number of bits = 64 or 128)
-  DataType uint8_type = DataType(
-      e.dtype().code(), 8, e.dtype().bits() * e.dtype().lanes() / 8);
-  DataType uint16_type = DataType(
-      uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
-  DataType uint32_type = DataType(
-      uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
+  Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8);
+  Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
+  Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
 
   // Interpret input as vector of 8bit values
   Expr input8 = reinterpret(uint8_type, e);
@@ -90,37 +87,37 @@ Expr CodeGenARM::ARMPopcount(const Call *call) {
   const Call* c0 = input8.as<Call>();
   CHECK(c0 != nullptr);
   Array<Expr> vcnt8_args;
-  vcnt8_args.push_back(ir::UIntImm::make(DataType::UInt(32), ctpop_id));
-  vcnt8_args.push_back(ir::UIntImm::make(DataType::UInt(32), 1));
+  vcnt8_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id));
+  vcnt8_args.push_back(ir::UIntImm::make(UInt(32), 1));
   vcnt8_args.push_back(input8);
   Expr vcnt8 = ir::Call::make(uint8_type, "llvm_intrin", vcnt8_args, Call::PureIntrinsic);
 
   // Accumulation 8->16bit
   Array<Expr> vcnt16_args;
-  vcnt16_args.push_back(ir::UIntImm::make(DataType::UInt(32), vpaddlu_id));
-  vcnt16_args.push_back(ir::UIntImm::make(DataType::UInt(32), 1));
+  vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id));
+  vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1));
   vcnt16_args.push_back(vcnt8);
   Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic);
-  if (call->dtype.bits() == 16) {
+  if (call->type.bits() == 16) {
     return vcnt16;
   }
 
   // Accumulation 16->32bit
   Array<Expr> vcnt32_args;
-  vcnt32_args.push_back(ir::UIntImm::make(DataType::UInt(32), vpaddlu_id));
-  vcnt32_args.push_back(ir::UIntImm::make(DataType::UInt(32), 1));
+  vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id));
+  vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1));
   vcnt32_args.push_back(vcnt16);
   Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic);
-  if (call->dtype.bits() == 32) {
+  if (call->type.bits() == 32) {
     return vcnt32;
   }
 
   // Accumulation 32->64bit
   Array<Expr> vcnt64_args;
-  vcnt64_args.push_back(ir::UIntImm::make(DataType::UInt(32), vpaddlu_id));
-  vcnt64_args.push_back(ir::UIntImm::make(DataType::UInt(32), 1));
+  vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id));
+  vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1));
   vcnt64_args.push_back(vcnt32);
-  return ir::Call::make(call->dtype, "llvm_intrin", vcnt64_args, Call::PureIntrinsic);
+  return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic);
 }
 
 TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_arm")
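// [Editor's illustration, not part of the patch] ARMPopcount above lowers
// popcount as per-byte counts (NEON vcnt) followed by pairwise widening adds
// (vpaddlu), accumulating 8->16->32->64 bits. A scalar re-enactment of that
// chain on one uint64_t, so the widening steps can be followed in isolation.
#include <cstdint>
#include <iostream>

int main() {
  uint64_t x = 0xF0F0F0F0F0F0F0F0ULL;  // 4 set bits per byte, 32 total
  // Step 1: byte-wise popcounts, the vcnt analogue.
  uint64_t c = 0;
  for (int i = 0; i < 8; ++i) {
    uint8_t b = (x >> (8 * i)) & 0xFF;
    uint8_t n = 0;
    while (b) { n += b & 1; b >>= 1; }
    c |= static_cast<uint64_t>(n) << (8 * i);
  }
  // Steps 2-4: pairwise widening accumulation, the vpaddlu analogue.
  c = (c & 0x00FF00FF00FF00FFULL) + ((c >> 8)  & 0x00FF00FF00FF00FFULL);  // 8->16
  c = (c & 0x0000FFFF0000FFFFULL) + ((c >> 16) & 0x0000FFFF0000FFFFULL);  // 16->32
  c = (c & 0x00000000FFFFFFFFULL) + (c >> 32);                            // 32->64
  std::cout << "popcount = " << c << "\n";  // prints 32
  return 0;
}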
#if TVM_LLVM_VERSION >= 50 fcompute->addParamAttr(idx, llvm::Attribute::NoAlias); @@ -532,8 +532,8 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { UnpackClosureData(cdata, vfields, &new_vmap); // setup parallel env ParallelEnv par_env; - par_env.task_id = Var("task_id", DataType::Int(32)); - par_env.num_task = Var("num_task", DataType::Int(32)); + par_env.task_id = Var("task_id", Int(32)); + par_env.num_task = Var("num_task", Int(32)); new_vmap[par_env.task_id.get()] = task_id; new_vmap[par_env.num_task.get()] = builder_->CreateLoad( builder_->CreateInBoundsGEP( @@ -670,7 +670,7 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) { llvm::BasicBlock * CodeGenCPU::MakeCallPacked(const Array &args, llvm::Value **rvalue, - llvm::Value **ret_tcode, const DataType &r_type, + llvm::Value **ret_tcode, const Type &r_type, const int64_t begin, const int64_t end) { using llvm::BasicBlock; std::string func_name = args[0].as()->value; @@ -684,15 +684,15 @@ CodeGenCPU::MakeCallPacked(const Array &args, llvm::Value **rvalue, builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), ConstInt32(begin)); llvm::Value *arg_tcode = - CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(begin)); + CreateBufferPtr(Int(32), stack_tcode, ConstInt32(begin)); llvm::Value *ret_value = builder_->CreateInBoundsGEP( builder_->CreatePointerCast(stack_value, t_tvm_value_->getPointerTo()), ConstInt32(end)); - *ret_tcode = CreateBufferPtr(DataType::Int(32), stack_tcode, ConstInt32(end)); + *ret_tcode = CreateBufferPtr(Int(32), stack_tcode, ConstInt32(end)); BasicBlock *end_block = CheckCallSuccess(builder_->CreateCall( RuntimeTVMFuncCall(), {handle, arg_value, arg_tcode, ConstInt32(nargs), ret_value, *ret_tcode})); - DataType r_api_type = ir::APIType(r_type); + Type r_api_type = ir::APIType(r_type); *rvalue = builder_->CreateAlignedLoad( builder_->CreatePointerCast(ret_value, LLVMType(r_api_type)->getPointerTo()), @@ -705,7 +705,7 @@ llvm::Value *CodeGenCPU::CreateCallPacked(const Call *op) { CHECK_EQ(op->args.size(), 5U); llvm::Value *rvalue = nullptr; llvm::Value *ret_tcode = nullptr; - MakeCallPacked(op->args, &rvalue, &ret_tcode, op->dtype, + MakeCallPacked(op->args, &rvalue, &ret_tcode, op->type, op->args[3].as()->value, op->args[4].as()->value); return rvalue; @@ -717,7 +717,7 @@ llvm::Value *CodeGenCPU::CreateCallTracePacked(const Call *op) { llvm::Value *rvalue = nullptr; llvm::Value *ret_tcode = nullptr; BasicBlock *end_block = MakeCallPacked( - op->args, &rvalue, &ret_tcode, op->dtype, op->args[3].as()->value, + op->args, &rvalue, &ret_tcode, op->type, op->args[3].as()->value, op->args[4].as()->value); // Get traced value. 
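// ---------------------------------------------------------------------------
// For reference, the sequence MakeCallPacked emits corresponds to this
// host-side C++ against the public TVM C API (tvm/runtime/c_runtime_api.h).
// A minimal sketch assuming a hypothetical packed function taking and
// returning float64; the begin/end indices in the codegen select the same
// value/type-code stack slots that are filled in here.
#include <tvm/runtime/c_runtime_api.h>

double call_packed_f64(TVMFunctionHandle f, double a, double b) {
  TVMValue values[2];
  int tcodes[2];
  values[0].v_float64 = a; tcodes[0] = kDLFloat;  // arg slots [begin, end)
  values[1].v_float64 = b; tcodes[1] = kDLFloat;
  TVMValue ret;
  int ret_tcode;
  // CheckCallSuccess in the codegen corresponds to this nonzero check.
  if (TVMFuncCall(f, values, tcodes, 2, &ret, &ret_tcode) != 0) return 0.0;
  return ret.v_float64;  // the aligned load through APIType(r_type)
}
// ---------------------------------------------------------------------------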
llvm::Value *traced_value = MakeValue(op->args[5]); @@ -800,7 +800,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) { CHECK_EQ(op->args.size(), 3U); int kind = op->args[2].as()->value; llvm::Value* ref = this->CreateStructRefPtr( - op->dtype, MakeValue(op->args[0]), + op->type, MakeValue(op->args[0]), MakeValue(op->args[1]), kind); if (kind == intrinsic::kArrAddr) { return builder_->CreatePointerCast(ref, t_void_p_); @@ -812,7 +812,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) { int kind = op->args[2].as()->value; llvm::Value* value = MakeValue(op->args[3]); llvm::Value* ref = this->CreateStructRefPtr( - op->args[3].dtype(), MakeValue(op->args[0]), + op->args[3].type(), MakeValue(op->args[0]), MakeValue(op->args[1]), kind); CHECK(kind != intrinsic::kArrAddr); if (value->getType()->isPointerTy()) { @@ -922,7 +922,7 @@ void CodeGenCPU::VisitStmt_(const For* op) { CHECK(parallel_env_.task_id.defined()); CHECK(parallel_env_.num_task.defined()); CHECK(parallel_env_.penv != nullptr); - DataType t = op->extent.dtype(); + Type t = op->extent.type(); Expr num_task = cast(t, parallel_env_.num_task); Expr task_id = cast(t, parallel_env_.task_id); CHECK(!parallel_env_.in_parallel_loop) diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h index b9e127557e1a..52e6f6c6ef90 100644 --- a/src/codegen/llvm/codegen_cpu.h +++ b/src/codegen/llvm/codegen_cpu.h @@ -96,14 +96,14 @@ class CodeGenCPU : public CodeGenLLVM { llvm::Value* CreateStaticHandle(); llvm::Value* GetPackedFuncHandle(const std::string& str); llvm::Value* PackClosureData(const Array& fields, uint64_t *num_bytes); - llvm::Value* CreateStructRefPtr(DataType t, llvm::Value* buffer, llvm::Value* index, int kind); + llvm::Value* CreateStructRefPtr(Type t, llvm::Value* buffer, llvm::Value* index, int kind); void UnpackClosureData(llvm::Value*cdata, const Array& fields, std::unordered_map* vmap); // Make packed call. llvm::BasicBlock *MakeCallPacked(const Array &args, llvm::Value **rvalue, - llvm::Value **ret_tcode, const DataType &r_type, + llvm::Value **ret_tcode, const Type &r_type, const int64_t begin, const int64_t end); // create call into tvm packed function. 
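// ---------------------------------------------------------------------------
// The CreateStructRefPtr changes above resolve tvm_struct_get/set "kind"
// codes into field addresses of the runtime array struct. A plain C++
// rendering of the idea (a sketch only: the real kind codes live in the
// intrinsic:: namespace, and this local enum is hypothetical):
#include <dlpack/dlpack.h>

enum MiniArrKind { kData, kNDim, kShape, kByteOffset };

void* struct_field_ptr(DLTensor* buf, long index, int kind) {
  DLTensor* t = buf + index;           // buf[index], as the emitted GEP does
  switch (kind) {
    case kData:       return &t->data;
    case kNDim:       return &t->ndim;
    case kShape:      return &t->shape;
    case kByteOffset: return &t->byte_offset;
    default:          return nullptr;  // remaining kinds elided
  }
}
// ---------------------------------------------------------------------------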
llvm::Value* CreateCallPacked(const Call* op); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 94ad8b76c9c9..2cff88b0bbf4 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -115,11 +115,11 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { std::vector arg_types; is_restricted_ = f->is_restricted; for (Var arg : f->args) { - DataType t = arg.dtype(); + Type t = arg.type(); if (t.is_handle()) { auto it = f->handle_data_type.find(arg); if (it != f->handle_data_type.end()) { - arg_types.push_back(LLVMType((*it).second.dtype()) + arg_types.push_back(LLVMType((*it).second.type()) ->getPointerTo(GetGlobalAddressSpace())); } else { arg_types.push_back(t_int8_->getPointerTo(GetGlobalAddressSpace())); @@ -128,7 +128,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { alias_var_set_.insert(arg.get()); } } else { - arg_types.push_back(LLVMType(arg.dtype())); + arg_types.push_back(LLVMType(arg.type())); } } llvm::FunctionType* ftype = llvm::FunctionType::get( @@ -147,7 +147,7 @@ void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { const Var& var = f->args[i]; var_map_[var.get()] = v; if (is_restricted_) { - if (var.dtype().is_handle() && !alias_var_set_.count(var.get())) { + if (var.type().is_handle() && !alias_var_set_.count(var.get())) { // set non alias. #if TVM_LLVM_VERSION >= 50 function_->addParamAttr(i, llvm::Attribute::NoAlias); @@ -302,7 +302,7 @@ unsigned CodeGenLLVM::GetGlobalAddressSpace() { return 0; } -llvm::Type* CodeGenLLVM::LLVMType(const DataType& t) const { +llvm::Type* CodeGenLLVM::LLVMType(const Type& t) const { if (t.is_handle()) { CHECK_EQ(t.lanes(), 1); return t_void_p_; @@ -335,7 +335,7 @@ llvm::Type* CodeGenLLVM::LLVMType(const DataType& t) const { void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const Variable* buffer, Expr index, - DataType type) { + Type type) { if (alias_var_set_.count(buffer) != 0) { // Mark all possibly aliased pointer as same type. 
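// ---------------------------------------------------------------------------
// Context for the alias handling above: once TBAA metadata (or the NoAlias
// parameter attribute) tells LLVM two pointers cannot overlap, loads and
// stores may be reordered and vectorized. The same contract written directly
// in C++ -- a sketch using the GCC/Clang restrict spelling, unrelated to any
// concrete TVM kernel:
void scale(float* __restrict__ out, const float* __restrict__ in, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = 2.0f * in[i];  // no out/in overlap to re-check per iteration
}
// ---------------------------------------------------------------------------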
    llvm::MDNode* meta = md_tbaa_alias_set_;
@@ -387,7 +387,7 @@ void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst,
       md_builder_->createTBAAStructTagNode(meta, meta, 0));
 }
 
-void CodeGenLLVM::GetAlignment(DataType t,
+void CodeGenLLVM::GetAlignment(Type t,
                                const Variable* buf_var,
                                const Expr& index,
                                int* p_alignment,
@@ -474,7 +474,7 @@ llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) {
 }
 
 llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) {
-  llvm::Value* mask = llvm::UndefValue::get(LLVMType(DataType::Int(32, target_lanes)));
+  llvm::Value* mask = llvm::UndefValue::get(LLVMType(Int(32, target_lanes)));
   int num_elems = static_cast<int>(vec->getType()->getVectorNumElements());
   if (num_elems == target_lanes) return vec;
   CHECK_LT(num_elems, target_lanes);
@@ -542,19 +542,19 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin,
   loop_value->addIncoming(begin, pre_block);
   CHECK(!var_map_.count(loop_var.get()));
   var_map_[loop_var.get()] = loop_value;
-  builder_->CreateCondBr(CreateLT(loop_var.dtype(), loop_value, end),
+  builder_->CreateCondBr(CreateLT(loop_var.type(), loop_value, end),
                          for_body, for_end, md_very_likely_branch_);
   builder_->SetInsertPoint(for_body);
   this->VisitStmt(body);
   var_map_.erase(loop_var.get());
-  llvm::Value* loop_next = CreateAdd(loop_var.dtype(), loop_value, stride);
+  llvm::Value* loop_next = CreateAdd(loop_var.type(), loop_value, stride);
   loop_value->addIncoming(loop_next, builder_->GetInsertBlock());
   builder_->CreateBr(for_begin);
   builder_->SetInsertPoint(for_end);
 }
 
 // cast operator
-llvm::Value* CodeGenLLVM::CreateCast(DataType from, DataType to, llvm::Value* value) {
+llvm::Value* CodeGenLLVM::CreateCast(Type from, Type to, llvm::Value* value) {
   llvm::Type * target = LLVMType(to);
   if (value->getType() == target) return value;
   if (to.is_handle()) {
@@ -609,7 +609,7 @@ llvm::Value* CodeGenLLVM::GetConstString(const std::string& str) {
 }
 
 llvm::Value* CodeGenLLVM::CreateBufferPtr(
-    DataType t, llvm::Value* buffer, llvm::Value* index) {
+    Type t, llvm::Value* buffer, llvm::Value* index) {
   CHECK_EQ(t.lanes(), 1);
   llvm::PointerType* btype = llvm::dyn_cast<llvm::PointerType>(buffer->getType());
   CHECK(btype != nullptr);
@@ -622,7 +622,7 @@ llvm::Value* CodeGenLLVM::CreateBufferPtr(
 }
 
 llvm::Value* CodeGenLLVM::CreateBufferVecPtr(
-    DataType t, llvm::Value* buffer, llvm::Value* index) {
+    Type t, llvm::Value* buffer, llvm::Value* index) {
   CHECK_GT(t.lanes(), 1);
   llvm::PointerType* btype = llvm::dyn_cast<llvm::PointerType>(buffer->getType());
   CHECK(btype != nullptr);
@@ -647,7 +647,7 @@ llvm::Value* CodeGenLLVM::CreateCallExtern(const Call* op) {
     arg_type.push_back(arg_value.back()->getType());
   }
   llvm::FunctionType* ftype = llvm::FunctionType::get(
-      LLVMType(op->dtype), arg_type, false);
+      LLVMType(op->type), arg_type, false);
   llvm::Function* f = module_->getFunction(op->name);
   if (f == nullptr) {
     f = llvm::Function::Create(
@@ -674,7 +674,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) {
       sig_type.push_back(arg_value.back()->getType());
     }
   }
-  llvm::Type *return_type = LLVMType(op->dtype);
+  llvm::Type *return_type = LLVMType(op->type);
   if (sig_type.size() > 0 && return_type != sig_type[0]) {
     sig_type.insert(sig_type.begin(), return_type);
   }
@@ -692,7 +692,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) {
   } else if (op->is_intrinsic(Call::shift_left)) {
     return builder_->CreateShl(MakeValue(op->args[0]), MakeValue(op->args[1]));
   } else if (op->is_intrinsic(Call::shift_right)) {
-    if (op->args[0].dtype().is_int()) {
+    if (op->args[0].type().is_int()) {
return builder_->CreateAShr(MakeValue(op->args[0]), MakeValue(op->args[1])); } else { return builder_->CreateLShr(MakeValue(op->args[0]), MakeValue(op->args[1])); @@ -707,13 +707,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { unsigned addrspace; if (!r) { ptr = CreateBufferPtr( - l->dtype, MakeValue(l->buffer_var), MakeValue(l->index)); + l->type, MakeValue(l->buffer_var), MakeValue(l->index)); addrspace = llvm::dyn_cast( ptr->getType())->getAddressSpace(); } else { - Expr index = r->base / make_const(DataType::Int(32), r->lanes); + Expr index = r->base / make_const(Int(32), r->lanes); ptr = CreateBufferVecPtr( - l->dtype, MakeValue(l->buffer_var), MakeValue(index)); + l->type, MakeValue(l->buffer_var), MakeValue(index)); addrspace = llvm::dyn_cast( ptr->getType())->getAddressSpace(); } @@ -723,7 +723,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { } else if (op->is_intrinsic(intrinsic::tvm_handle_is_null)) { return builder_->CreateIsNull(MakeValue(op->args[0])); } else if (op->is_intrinsic(intrinsic::tvm_if_then_else)) { - CHECK_EQ(op->args[0].dtype().lanes(), 1) + CHECK_EQ(op->args[0].type().lanes(), 1) << "if_then_else can only take scalar condition"; using llvm::BasicBlock; BasicBlock* then_block = BasicBlock::Create( @@ -747,7 +747,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { value->addIncoming(else_value, else_value_block); return value; } else if (op->is_intrinsic(Call::reinterpret)) { - llvm::Type * target = LLVMType(op->dtype); + llvm::Type * target = LLVMType(op->type); return builder_->CreateBitCast(MakeValue(op->args[0]), target); } else if (op->is_intrinsic(Call::isnan)) { // TODO(hgt312): set fast math flag @@ -779,13 +779,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { void CodeGenLLVM::Scalarize(const Expr& e, std::function f) { if (const Ramp* ramp = e.as()) { - for (int i = 0; i < ramp->dtype.lanes(); ++i) { + for (int i = 0; i < ramp->type.lanes(); ++i) { Expr offset = ramp->base + (ramp->stride * i); f(i, MakeValue(offset)); } } else { llvm::Value* value = MakeValue(e); - for (int i = 0; i < e.dtype().lanes(); ++i) { + for (int i = 0; i < e.type().lanes(); ++i) { f(i, builder_->CreateExtractElement(value, i)); } } @@ -798,18 +798,18 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Variable* op) { } llvm::Value* CodeGenLLVM::VisitExpr_(const Cast* op) { - return CreateCast(op->value.dtype(), op->dtype, MakeValue(op->value)); + return CreateCast(op->value.type(), op->type, MakeValue(op->value)); } llvm::Value* CodeGenLLVM::VisitExpr_(const IntImm* op) { - return llvm::ConstantInt::getSigned(LLVMType(op->dtype), op->value); + return llvm::ConstantInt::getSigned(LLVMType(op->type), op->value); } llvm::Value* CodeGenLLVM::VisitExpr_(const UIntImm* op) { - return llvm::ConstantInt::get(LLVMType(op->dtype), op->value); + return llvm::ConstantInt::get(LLVMType(op->type), op->value); } llvm::Value* CodeGenLLVM::VisitExpr_(const FloatImm* op) { - return llvm::ConstantFP::get(LLVMType(op->dtype), op->value); + return llvm::ConstantFP::get(LLVMType(op->type), op->value); } llvm::Value* CodeGenLLVM::VisitExpr_(const StringImm* op) { @@ -818,7 +818,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const StringImm* op) { #define DEFINE_CODEGEN_BINARY_OP(Op) \ llvm::Value* CodeGenLLVM::Create ## Op( \ - DataType t, llvm::Value* a, llvm::Value *b) { \ + Type t, llvm::Value* a, llvm::Value *b) { \ if (t.is_int()) { \ if (t.bits() >= 32) { \ return builder_->CreateNSW ## Op (a, b); \ @@ -837,7 +837,7 @@ llvm::Value* 
CodeGenLLVM::VisitExpr_(const StringImm* op) { } \ } \ llvm::Value* CodeGenLLVM::VisitExpr_(const Op* op) { \ - return Create ## Op(op->dtype, MakeValue(op->a), MakeValue(op->b)); \ + return Create ## Op(op->type, MakeValue(op->a), MakeValue(op->b)); \ } DEFINE_CODEGEN_BINARY_OP(Add); @@ -846,7 +846,7 @@ DEFINE_CODEGEN_BINARY_OP(Mul); #define DEFINE_CODEGEN_CMP_OP(Op) \ llvm::Value* CodeGenLLVM::Create ## Op( \ - DataType t, llvm::Value* a, llvm::Value* b) { \ + Type t, llvm::Value* a, llvm::Value* b) { \ if (t.is_int()) { \ return builder_->CreateICmpS ## Op (a, b); \ } else if (t.is_uint()) { \ @@ -857,7 +857,7 @@ DEFINE_CODEGEN_BINARY_OP(Mul); } \ } \ llvm::Value* CodeGenLLVM::VisitExpr_(const Op* op) { \ - return Create ## Op(op->a.dtype(), MakeValue(op->a), MakeValue(op->b)); \ + return Create ## Op(op->a.type(), MakeValue(op->a), MakeValue(op->b)); \ } DEFINE_CODEGEN_CMP_OP(LT); @@ -868,12 +868,12 @@ DEFINE_CODEGEN_CMP_OP(GE); llvm::Value* CodeGenLLVM::VisitExpr_(const Div* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - if (op->dtype.is_int()) { + if (op->type.is_int()) { return builder_->CreateSDiv(a, b); - } else if (op->dtype.is_uint()) { + } else if (op->type.is_uint()) { return builder_->CreateUDiv(a, b); } else { - CHECK(op->dtype.is_float()); + CHECK(op->type.is_float()); return builder_->CreateFDiv(a, b); } } @@ -881,12 +881,12 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Div* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const Mod* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - if (op->dtype.is_int()) { + if (op->type.is_int()) { return builder_->CreateSRem(a, b); - } else if (op->dtype.is_uint()) { + } else if (op->type.is_uint()) { return builder_->CreateURem(a, b); } else { - CHECK(op->dtype.is_float()); + CHECK(op->type.is_float()); return builder_->CreateFRem(a, b); } } @@ -894,19 +894,19 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Mod* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const Min* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - return builder_->CreateSelect(CreateLT(op->a.dtype(), a, b), a, b); + return builder_->CreateSelect(CreateLT(op->a.type(), a, b), a, b); } llvm::Value* CodeGenLLVM::VisitExpr_(const Max* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - return builder_->CreateSelect(CreateGT(op->a.dtype(), a, b), a, b); + return builder_->CreateSelect(CreateGT(op->a.type(), a, b), a, b); } llvm::Value* CodeGenLLVM::VisitExpr_(const EQ* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - if (op->a.dtype().is_int() || op->a.dtype().is_uint()) { + if (op->a.type().is_int() || op->a.type().is_uint()) { return builder_->CreateICmpEQ(a, b); } else { return builder_->CreateFCmpOEQ(a, b); @@ -916,7 +916,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const EQ* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const NE* op) { llvm::Value* a = MakeValue(op->a); llvm::Value* b = MakeValue(op->b); - if (op->a.dtype().is_int() || op->a.dtype().is_uint()) { + if (op->a.type().is_int() || op->a.type().is_uint()) { return builder_->CreateICmpNE(a, b); } else { return builder_->CreateFCmpONE(a, b); @@ -950,7 +950,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Let* op) { } llvm::Value* CodeGenLLVM::VisitExpr_(const Load* op) { - DataType t = op->dtype; + Type t = op->type; bool is_volatile = volatile_buf_.count(op->buffer_var.get()); llvm::Value* buffer = MakeValue(op->buffer_var); llvm::Value* index = MakeValue(op->index); @@ -1010,10 
+1010,10 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Call* op) { } llvm::Value* CodeGenLLVM::VisitExpr_(const Ramp* op) { - llvm::Value* vec = llvm::UndefValue::get(LLVMType(op->dtype)); + llvm::Value* vec = llvm::UndefValue::get(LLVMType(op->type)); for (int i = 0; i < op->lanes; ++i) { vec = builder_->CreateInsertElement( - vec, MakeValue(op->base + op->stride * make_const(op->stride.dtype(), i)), + vec, MakeValue(op->base + op->stride * make_const(op->stride.type(), i)), ConstInt32(i)); } return vec; @@ -1024,7 +1024,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Shuffle* op) { int total_lanes = 0; for (int i = 0, e = op->vectors.size(); i < e; ++i) { vecs[i] = VisitExpr(op->vectors[i]); - total_lanes += op->vectors[i].dtype().lanes(); + total_lanes += op->vectors[i].type().lanes(); } llvm::Value* v0 = CreateVecConcat(vecs); std::vector idx(op->indices.size()); @@ -1045,7 +1045,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Broadcast* op) { void CodeGenLLVM::VisitStmt_(const Store* op) { CHECK(is_one(op->predicate)); - DataType t = op->value.dtype(); + Type t = op->value.type(); bool is_volatile = volatile_buf_.count(op->buffer_var.get()); llvm::Value* buffer = MakeValue(op->buffer_var); llvm::Value* index = MakeValue(op->index); @@ -1056,7 +1056,7 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { GetAlignment(t, op->buffer_var.get(), op->index, &alignment, &native_bits); llvm::Value* ptr = CreateBufferPtr(t, buffer, index); llvm::StoreInst* store = builder_->CreateAlignedStore(value, ptr, alignment, is_volatile); - AddAliasInfo(store, op->buffer_var.get(), op->index, op->value.dtype()); + AddAliasInfo(store, op->buffer_var.get(), op->index, op->value.type()); return; } else { // vector store @@ -1071,7 +1071,7 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { t.element_of(), buffer, MakeValue(ramp->base)); ptr = builder_->CreatePointerCast(ptr, LLVMType(t)->getPointerTo(addrspace)); llvm::StoreInst* store = builder_->CreateAlignedStore(value, ptr, alignment, is_volatile); - AddAliasInfo(store, op->buffer_var.get(), op->index, op->value.dtype()); + AddAliasInfo(store, op->buffer_var.get(), op->index, op->value.type()); return; } } @@ -1084,7 +1084,7 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { llvm::StoreInst* store = builder_->CreateAlignedStore( builder_->CreateExtractElement(value, i), ptr, basic_align, is_volatile); - AddAliasInfo(store, op->buffer_var.get(), Expr(), op->value.dtype()); + AddAliasInfo(store, op->buffer_var.get(), Expr(), op->value.type()); }; this->Scalarize(op->index, f); } @@ -1142,7 +1142,7 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) { << "Can only handle constant size stack allocation"; StorageInfo& info = alloc_storage_info_[op->buffer_var.get()]; if (constant_size % 4 == 0 && info.alignment == 0) { - info.alignment = GetTempAllocaAlignment(op->dtype, constant_size); + info.alignment = GetTempAllocaAlignment(op->type, constant_size); } // maximum necessary alignment in the NV devices if (info.alignment > 16) { @@ -1150,7 +1150,7 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) { } llvm::AllocaInst* alloca = WithFunctionEntry([&]() { return builder_->CreateAlloca( - LLVMType(op->dtype), ConstInt32(constant_size)); + LLVMType(op->type), ConstInt32(constant_size)); }); if (alloca->getAlignment() < static_cast(info.alignment)) { #if TVM_LLVM_VERSION >= 100 @@ -1163,7 +1163,7 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) { buf = alloca; } buf = builder_->CreatePointerCast( - buf, LLVMType(op->dtype)->getPointerTo( + buf, 
LLVMType(op->type)->getPointerTo(
           buf->getType()->getPointerAddressSpace()));
   CHECK(!var_map_.count(op->buffer_var.get()));
   var_map_[op->buffer_var.get()] = buf;
@@ -1204,7 +1204,7 @@ void CodeGenLLVM::VisitStmt_(const AssertStmt* op) {
 
 void CodeGenLLVM::VisitStmt_(const LetStmt* op) {
   CHECK(!var_map_.count(op->var.get()));
-  if (op->var.dtype().is_handle()) {
+  if (op->var.type().is_handle()) {
     if (!is_restricted_) {
       alias_var_set_.insert(op->var.get());
     }
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index 08c836adf9d0..b7d091b3921b 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -206,12 +206,12 @@ class CodeGenLLVM :
    * \param t The original type.
    * \return LLVM type of t
    */
-  llvm::Type* LLVMType(const DataType& t) const;
+  llvm::Type* LLVMType(const Type& t) const;
   // initialize the function state.
   void InitFuncState();
   // Get alignment given index.
   void GetAlignment(
-      DataType t, const Variable* buf_var, const Expr& index,
+      Type t, const Variable* buf_var, const Expr& index,
       int* p_alignment, int* p_native_bits);
   // Get constant string
   llvm::Value* GetConstString(const std::string& str);
@@ -221,19 +221,19 @@ class CodeGenLLVM :
   // handle module import
   void HandleImport(const std::string& code);
   // cast operator
-  llvm::Value* CreateCast(DataType from, DataType to, llvm::Value* value);
+  llvm::Value* CreateCast(Type from, Type to, llvm::Value* value);
   // comparison op
   llvm::Value* GetVarValue(const Variable* v) const;
-  llvm::Value* CreateLT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateLE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateAdd(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateSub(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateMul(DataType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateLT(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateLE(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGT(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGE(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateAdd(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateSub(Type t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateMul(Type t, llvm::Value* a, llvm::Value* b);
   llvm::Value* CreateBroadcast(llvm::Value* value, int lanes);
-  llvm::Value* CreateBufferPtr(DataType t, llvm::Value* buffer, llvm::Value* index);
-  llvm::Value* CreateBufferVecPtr(DataType t, llvm::Value* buffer, llvm::Value* index);
+  llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index);
+  llvm::Value* CreateBufferVecPtr(Type t, llvm::Value* buffer, llvm::Value* index);
   // Vector concatenation.
   llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
   llvm::Value* CreateVecFlip(llvm::Value* vec);
@@ -245,7 +245,7 @@ class CodeGenLLVM :
                        llvm::Value* stride, const VarExpr& loop_var, const Stmt& body);
   // add alias information.
-  void AddAliasInfo(llvm::Instruction* load, const Variable* buffer, Expr index, DataType type);
+  void AddAliasInfo(llvm::Instruction* load, const Variable* buffer, Expr index, Type type);
   // The IRBuilder.
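// ---------------------------------------------------------------------------
// The CreateCast helper declared above chooses an LLVM cast opcode from the
// source/destination scalar types. A decision-table sketch (the enum, the
// free function, and its signature are illustrative, not the member's real
// interface):
enum MiniCastOp { kTrunc, kSExt, kZExt, kFPTrunc, kFPExt,
                  kSIToFP, kUIToFP, kFPToSI, kFPToUI, kNoOp };

MiniCastOp pick_cast(bool from_float, bool from_signed, int from_bits,
                     bool to_float, bool to_signed, int to_bits) {
  if (from_float && to_float) {
    if (from_bits == to_bits) return kNoOp;
    return from_bits > to_bits ? kFPTrunc : kFPExt;
  }
  if (from_float)           return to_signed ? kFPToSI : kFPToUI;
  if (to_float)             return from_signed ? kSIToFP : kUIToFP;
  if (from_bits == to_bits) return kNoOp;              // same-width integers
  if (from_bits > to_bits)  return kTrunc;             // narrowing
  return from_signed ? kSExt : kZExt;                  // widening, by sign
}
// ---------------------------------------------------------------------------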
using IRBuilder = llvm::IRBuilder; // The current function diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index 372408c5e666..b6bc6ef952fd 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -58,7 +58,7 @@ class CodeGenNVPTX : public CodeGenLLVM { << "Can only handle constant size stack allocation in GPU"; StorageInfo& info = alloc_storage_info_[op->buffer_var.get()]; if (constant_size % 4 == 0 && info.alignment == 0) { - info.alignment = GetTempAllocaAlignment(op->dtype, constant_size); + info.alignment = GetTempAllocaAlignment(op->type, constant_size); } // maximum necessary alignment in the NV devices if (info.alignment > 16) { @@ -69,7 +69,7 @@ class CodeGenNVPTX : public CodeGenLLVM { // TODO(tqchen): for higher version of LLVM, local address space can be set. llvm::AllocaInst* alloca = WithFunctionEntry([&]() { return builder_->CreateAlloca( - LLVMType(op->dtype), ConstInt32(constant_size)); + LLVMType(op->type), ConstInt32(constant_size)); }); if (alloca->getAlignment() < static_cast(info.alignment)) { #if TVM_LLVM_VERSION >= 100 @@ -84,7 +84,7 @@ class CodeGenNVPTX : public CodeGenLLVM { << "Can only allocate shared or local memory inside kernel"; // Shared memory: address space == 3 const unsigned shared_address_space = 3; - llvm::Type* type = llvm::ArrayType::get(LLVMType(op->dtype), constant_size); + llvm::Type* type = llvm::ArrayType::get(LLVMType(op->type), constant_size); // Allocate shared memory in global, address_space = 3 llvm::GlobalVariable *global = new llvm::GlobalVariable( *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", @@ -98,7 +98,7 @@ class CodeGenNVPTX : public CodeGenLLVM { } } buf = builder_->CreatePointerCast( - buf, LLVMType(op->dtype)->getPointerTo( + buf, LLVMType(op->type)->getPointerTo( buf->getType()->getPointerAddressSpace())); CHECK(!var_map_.count(op->buffer_var.get())); var_map_[op->buffer_var.get()] = buf; diff --git a/src/codegen/llvm/codegen_x86_64.cc b/src/codegen/llvm/codegen_x86_64.cc index 5d72b56df376..804d9b2f1b37 100644 --- a/src/codegen/llvm/codegen_x86_64.cc +++ b/src/codegen/llvm/codegen_x86_64.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -74,8 +74,8 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const Cast* op) { // LLVM does not automatically generate the correct instruction sequences for // half -> float conversion (i.e. using AVX2/AVX-512 vectorized variants of // vcvtph2ps), so we explicitly generate them ourselves. 
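// ---------------------------------------------------------------------------
// Continuing the comment above: for the 8-lane case the sequence the x86-64
// backend generates amounts to the F16C intrinsic below. A sketch only
// (compile with -mf16c; the wrapper name is ours):
#include <immintrin.h>

static inline __m256 half8_to_float8(__m128i half_bits) {
  return _mm256_cvtph_ps(half_bits);  // vcvtph2ps: 8 x fp16 -> 8 x fp32
}
// ---------------------------------------------------------------------------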
- const auto from = op->value.dtype(); - const auto to = op->dtype; + const auto from = op->value.type(); + const auto to = op->type; if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) { CHECK_EQ(from.lanes(), to.lanes()); CHECK_NOTNULL(target_machine_); @@ -85,25 +85,21 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const Cast* op) { if (from.lanes() >= 16 && has_avx512) { return CallVectorIntrin( - ::llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16, - LLVMType(DataType::Float(32, from.lanes())), + ::llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16, LLVMType(Float(32, from.lanes())), { - MakeValue(ir::Call::make( - DataType::Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, - ir::Call::PureIntrinsic)), - MakeValue( - ir::Broadcast::make(ir::FloatImm::make(DataType::Float(32), 0), from.lanes())), - /*mask=*/MakeValue(ir::IntImm::make(DataType::Int(16), -1)), - /*rounding-mode=*/MakeValue(ir::IntImm::make(DataType::Int(32), 4)), + MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, + ir::Call::PureIntrinsic)), + MakeValue(ir::Broadcast::make(ir::FloatImm::make(Float(32), 0), from.lanes())), + /*mask=*/MakeValue(ir::IntImm::make(Int(16), -1)), + /*rounding-mode=*/MakeValue(ir::IntImm::make(Int(32), 4)), }); } if (from.lanes() >= 8 && has_f16c) { return CallVectorIntrin( - ::llvm::Intrinsic::x86_vcvtph2ps_256, 8, LLVMType(DataType::Float(32, from.lanes())), - {MakeValue(ir::Call::make( - DataType::Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, - ir::Call::PureIntrinsic))}); + ::llvm::Intrinsic::x86_vcvtph2ps_256, 8, LLVMType(Float(32, from.lanes())), + {MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, + ir::Call::PureIntrinsic))}); } } diff --git a/src/codegen/llvm/intrin_rule_llvm.cc b/src/codegen/llvm/intrin_rule_llvm.cc index da07ff324b20..fd28d7e4594a 100644 --- a/src/codegen/llvm/intrin_rule_llvm.cc +++ b/src/codegen/llvm/intrin_rule_llvm.cc @@ -67,19 +67,19 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.tanh") const ir::Call* call = e.as(); CHECK(call != nullptr); const Expr& x = call->args[0]; - Expr one = make_const(x.dtype(), 1); - Expr two = make_const(x.dtype(), 2); - Expr neg_two = make_const(x.dtype(), -2); + Expr one = make_const(x.type(), 1); + Expr two = make_const(x.type(), 2); + Expr neg_two = make_const(x.type(), -2); Expr exp_neg2x = ir::Call::make( - x.dtype(), "exp", {neg_two * x}, ir::Call::PureIntrinsic); + x.type(), "exp", {neg_two * x}, ir::Call::PureIntrinsic); Expr exp_pos2x = ir::Call::make( - x.dtype(), "exp", {two * x}, ir::Call::PureIntrinsic); + x.type(), "exp", {two * x}, ir::Call::PureIntrinsic); Expr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x); Expr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one); *rv = ir::Select::make( - x >= make_zero(x.dtype()), tanh_pos, tanh_neg); + x >= make_zero(x.type()), tanh_pos, tanh_neg); }); TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.pow") diff --git a/src/codegen/llvm/intrin_rule_llvm.h b/src/codegen/llvm/intrin_rule_llvm.h index 7863a3dd7a96..c0b5241e8876 100644 --- a/src/codegen/llvm/intrin_rule_llvm.h +++ b/src/codegen/llvm/intrin_rule_llvm.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -41,14 +41,14 @@ inline void DispatchLLVMPureIntrin(const TVMArgs& targs, TVMRetValue* rv) { CHECK(call != nullptr); Array cargs; // intrin id. - cargs.push_back(ir::UIntImm::make(DataType::UInt(32), id)); - cargs.push_back(ir::UIntImm::make(DataType::UInt(32), num_signature)); + cargs.push_back(ir::UIntImm::make(UInt(32), id)); + cargs.push_back(ir::UIntImm::make(UInt(32), num_signature)); for (Expr arg : call->args) { cargs.push_back(arg); } *rv = ir::Call::make( - call->dtype, "llvm_intrin", cargs, ir::Call::PureIntrinsic); + call->type, "llvm_intrin", cargs, ir::Call::PureIntrinsic); } template @@ -58,13 +58,13 @@ inline void DispatchLLVMIntrin(const TVMArgs& targs, TVMRetValue* rv) { CHECK(call != nullptr); Array cargs; // intrin id. - cargs.push_back(ir::UIntImm::make(DataType::UInt(32), id)); - cargs.push_back(ir::UIntImm::make(DataType::UInt(32), num_signature)); + cargs.push_back(ir::UIntImm::make(UInt(32), id)); + cargs.push_back(ir::UIntImm::make(UInt(32), num_signature)); for (Expr arg : call->args) { cargs.push_back(arg); } *rv = ir::Call::make( - call->dtype, "llvm_intrin", cargs, ir::Call::Intrinsic); + call->type, "llvm_intrin", cargs, ir::Call::Intrinsic); } } // namespace codegen diff --git a/src/codegen/llvm/intrin_rule_nvptx.cc b/src/codegen/llvm/intrin_rule_nvptx.cc index 862d06b73a5f..4718cf78062e 100644 --- a/src/codegen/llvm/intrin_rule_nvptx.cc +++ b/src/codegen/llvm/intrin_rule_nvptx.cc @@ -35,11 +35,11 @@ inline void DispatchExternLibDevice(const TVMArgs& args, TVMRetValue* rv) { using namespace ir; const Call* call = e.as(); CHECK(call != nullptr); - CHECK(call->dtype.bits() == 32 || call->dtype.bits() == 64) << "Only support float32 or float64."; + CHECK(call->type.bits() == 32 || call->type.bits() == 64) << "Only support float32 or float64."; std::ostringstream intrinsic_name; intrinsic_name << "__nv_" << call->name; - if (call->dtype.bits() == 32) intrinsic_name << "f"; - *rv = Call::make(call->dtype, intrinsic_name.str(), call->args, + if (call->type.bits() == 32) intrinsic_name << "f"; + *rv = Call::make(call->type, intrinsic_name.str(), call->args, Call::PureExtern); } diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc index 22b324545825..5ad5261c81bf 100644 --- a/src/codegen/llvm/intrin_rule_rocm.cc +++ b/src/codegen/llvm/intrin_rule_rocm.cc @@ -36,8 +36,8 @@ inline void DispatchExternOCML(const TVMArgs& args, TVMRetValue* rv) { const Call* call = e.as(); CHECK(call != nullptr); std::ostringstream intrinsic_name; - intrinsic_name << "__ocml_" << call->name << "_f" << call->dtype.bits(); - *rv = Call::make(call->dtype, intrinsic_name.str(), call->args, + intrinsic_name << "__ocml_" << call->name << "_f" << call->type.bits(); + *rv = Call::make(call->type, intrinsic_name.str(), call->args, Call::PureExtern); } diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h index 5ec8bb3f2a9c..c16229f5f10a 100644 --- a/src/codegen/llvm/llvm_common.h +++ b/src/codegen/llvm/llvm_common.h @@ -33,12 +33,6 @@ #include #include -#if TVM_LLVM_VERSION >= 100 -#include -#include -#include -#include -#endif #include #include #include @@ -50,6 +44,7 @@ #include #include #include +#include #include #include diff --git 
a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc index 7800e47319e0..be2b6cc668eb 100644 --- a/src/codegen/spirv/codegen_spirv.cc +++ b/src/codegen/spirv/codegen_spirv.cc @@ -37,11 +37,11 @@ std::vector CodeGenSPIRV::BuildFunction(const LoweredFunc& f) { std::vector pod_args; uint32_t num_buffer = 0; for (Var arg : f->args) { - DataType t = arg.dtype(); + Type t = arg.type(); if (t.is_handle()) { auto it = f->handle_data_type.find(arg); if (it != f->handle_data_type.end()) { - DataType value_type = (*it).second.dtype(); + Type value_type = (*it).second.type(); spirv::Value arg_value = builder_->BufferArgument( builder_->GetSType(value_type), 0, num_buffer); storage_info_[arg.get()].UpdateContentType(value_type); @@ -61,7 +61,7 @@ std::vector CodeGenSPIRV::BuildFunction(const LoweredFunc& f) { if (pod_args.size() != 0) { std::vector value_types; for (size_t i = 0; i < pod_args.size(); ++i) { - value_types.push_back(builder_->GetSType(pod_args[i].dtype())); + value_types.push_back(builder_->GetSType(pod_args[i].type())); } spirv::Value ptr = builder_->DeclarePushConstant(value_types); for (size_t i = 0; i < pod_args.size(); ++i) { @@ -103,7 +103,7 @@ spirv::Value CodeGenSPIRV::GetThreadIndex( } else { v = builder_->GetWorkgroupID(ts.dim_index); } - return builder_->Cast(builder_->GetSType(iv->var.dtype()), v); + return builder_->Cast(builder_->GetSType(iv->var.type()), v); } spirv::Value CodeGenSPIRV::CreateStorageSync(const Call* op) { @@ -112,7 +112,7 @@ spirv::Value CodeGenSPIRV::CreateStorageSync(const Call* op) { if (sync == "warp") { return value; } else if (sync == "shared") { - auto type_int = builder_->GetSType(DataType::Int(32)); + auto type_int = builder_->GetSType(Int(32)); builder_->MakeInst( spv::OpControlBarrier, builder_->IntImm(type_int, static_cast(spv::ScopeWorkgroup)), @@ -133,15 +133,15 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Variable* op) { } spirv::Value CodeGenSPIRV::VisitExpr_(const IntImm* op) { - return builder_->IntImm(builder_->GetSType(op->dtype), op->value); + return builder_->IntImm(builder_->GetSType(op->type), op->value); } spirv::Value CodeGenSPIRV::VisitExpr_(const UIntImm* op) { - return builder_->UIntImm(builder_->GetSType(op->dtype), op->value); + return builder_->UIntImm(builder_->GetSType(op->type), op->value); } spirv::Value CodeGenSPIRV::VisitExpr_(const FloatImm* op) { - return builder_->FloatImm(builder_->GetSType(op->dtype), op->value); + return builder_->FloatImm(builder_->GetSType(op->type), op->value); } spirv::Value CodeGenSPIRV::VisitExpr_(const StringImm* op) { @@ -150,7 +150,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const StringImm* op) { } spirv::Value CodeGenSPIRV::VisitExpr_(const Cast* op) { - return builder_->Cast(builder_->GetSType(op->dtype), MakeValue(op->value)); + return builder_->Cast(builder_->GetSType(op->type), MakeValue(op->value)); } spirv::Value CodeGenSPIRV::VisitExpr_(const Add* op) { @@ -248,7 +248,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Call* op) { values.push_back(MakeValue(op->args[i])); } return builder_->CallGLSL450( - builder_->GetSType(op->dtype), inst_id, values); + builder_->GetSType(op->type), inst_id, values); } else if (op->is_intrinsic(Call::bitwise_and)) { CHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); @@ -277,13 +277,13 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Call* op) { CHECK_EQ(op->args.size(), 2U); spirv::Value a = MakeValue(op->args[0]); spirv::Value b = MakeValue(op->args[1]); - if (op->args[0].dtype().is_int()) { + if 
(op->args[0].type().is_int()) { return builder_->MakeValue(spv::OpShiftRightArithmetic, a.stype, a, b); } else { return builder_->MakeValue(spv::OpShiftRightLogical, a.stype, a, b); } } else if (op->is_intrinsic(Call::reinterpret)) { - return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(op->dtype), + return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(op->type), MakeValue(op->args[0])); } else if (op->is_intrinsic(intrinsic::tvm_storage_sync)) { return this->CreateStorageSync(op); @@ -316,17 +316,17 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Call* op) { } else if (op->is_intrinsic("popcount")) { return builder_->MakeValue( spv::OpBitCount, - builder_->GetSType(op->dtype), + builder_->GetSType(op->type), MakeValue(op->args[0])); } else { if (op->call_type == Call::Intrinsic || op->call_type == Call::PureIntrinsic) { LOG(FATAL) << "Unresolved intrinsic " << op->name - << " with return type " << op->dtype; + << " with return type " << op->type; } else if (op->call_type == Call::Extern || op->call_type == Call::PureExtern) { LOG(FATAL) << "Unresolved extern " << op->name - << " with return type " << op->dtype; + << " with return type " << op->type; } else { LOG(FATAL) << "Unresolved call type " << op->call_type; } @@ -341,7 +341,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Ramp* op) { spirv::Value v = base; if (i != 0) { spirv::Value offset = MakeValue( - make_const(op->stride.dtype(), i) * op->stride); + make_const(op->stride.type(), i) * op->stride); v = builder_->Add(v, offset); } values.push_back(v); @@ -364,7 +364,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { CHECK(it != storage_info_.end()); StorageInfo& info = it->second; if (!info.content_fixed) { - info.UpdateContentType(op->dtype); + info.UpdateContentType(op->type); } spirv::SType content_type = builder_->GetSType(info.content_type); @@ -376,15 +376,15 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { if (info.is_volatile) { mask |= spv::MemoryAccessVolatileMask; } - if (op->dtype.lanes() == 1) { - CHECK_EQ(info.content_type, op->dtype) + if (op->type.lanes() == 1) { + CHECK_EQ(info.content_type, op->type) << "Vulkan only allow one type access to the same buffer"; spirv::Value index = MakeValue(op->index); spirv::Value ptr = builder_->StructArrayAccess( ptr_type, buffer, index); return builder_->MakeValue(spv::OpLoad, content_type, ptr, mask); } else { - if (op->dtype.element_of() == info.content_type) { + if (op->type.element_of() == info.content_type) { // because content type is element type, we can only do scalarize load. 
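// ---------------------------------------------------------------------------
// Shape of the scalarize path that follows: when the buffer's declared
// content type is the element type, a vector load becomes one indexed load
// per lane plus an OpCompositeConstruct. A plain C++ model (a sketch; float
// and the helper name are illustrative):
#include <vector>

std::vector<float> scalarized_load(const float* buffer,
                                   const std::vector<long>& lane_index) {
  std::vector<float> lanes;
  lanes.reserve(lane_index.size());
  for (long idx : lane_index)   // one OpLoad per lane, via Scalarize()
    lanes.push_back(buffer[idx]);
  return lanes;                 // stands in for OpCompositeConstruct
}
// ---------------------------------------------------------------------------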
std::vector values; auto f = [&](int i, spirv::Value index) { @@ -398,13 +398,13 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { } else { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { - CHECK_EQ(ramp->lanes, op->dtype.lanes()); + CHECK_EQ(ramp->lanes, op->type.lanes()); arith::ModularSet me = analyzer_->modular_set(ramp->base); CHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( - ramp->base / make_const(ramp->base.dtype(), ramp->lanes)); + ramp->base / make_const(ramp->base.type(), ramp->lanes)); spirv::Value ptr = builder_->StructArrayAccess( ptr_type, buffer, MakeValue(vec_index)); return builder_->MakeValue(spv::OpLoad, content_type, ptr, mask); @@ -420,14 +420,14 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { void CodeGenSPIRV::Scalarize(const Expr& e, std::function f) { if (const Ramp* ramp = e.as()) { - for (int i = 0; i < ramp->dtype.lanes(); ++i) { + for (int i = 0; i < ramp->type.lanes(); ++i) { Expr offset = ramp->base + ramp->stride * i; f(i, MakeValue(offset)); } } else { - spirv::SType etype = builder_->GetSType(e.dtype().element_of()); + spirv::SType etype = builder_->GetSType(e.type().element_of()); spirv::Value value = MakeValue(e); - for (int i = 0; i < e.dtype().lanes(); ++i) { + for (int i = 0; i < e.type().lanes(); ++i) { f(i, builder_->MakeValue( spv::OpCompositeExtract, etype, value, i)); } @@ -441,7 +441,7 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { StorageInfo& info = it->second; if (!info.content_fixed) { - info.UpdateContentType(op->value.dtype()); + info.UpdateContentType(op->value.type()); } spirv::SType content_type = builder_->GetSType(info.content_type); @@ -455,15 +455,15 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { mask |= spv::MemoryAccessVolatileMask; } - if (op->value.dtype().lanes() == 1) { - CHECK_EQ(info.content_type, op->value.dtype()) + if (op->value.type().lanes() == 1) { + CHECK_EQ(info.content_type, op->value.type()) << "Vulkan only allow one type access to the same buffer"; spirv::Value index = MakeValue(op->index); spirv::Value ptr = builder_->StructArrayAccess( ptr_type, buffer, index); builder_->MakeInst(spv::OpStore, ptr, value, mask); } else { - if (op->value.dtype().element_of() == info.content_type) { + if (op->value.type().element_of() == info.content_type) { // because content type is element type, we can only do scalarize load. auto f = [&](int i, spirv::Value index) { spirv::Value elem = builder_->MakeValue( @@ -476,13 +476,13 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { } else { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { - CHECK_EQ(ramp->lanes, op->value.dtype().lanes()); + CHECK_EQ(ramp->lanes, op->value.type().lanes()); arith::ModularSet me = analyzer_->modular_set(ramp->base); CHECK((me->coeff % ramp->lanes) == 0 && (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( - ramp->base / make_const(ramp->base.dtype(), ramp->lanes)); + ramp->base / make_const(ramp->base.type(), ramp->lanes)); spirv::Value ptr = builder_->StructArrayAccess( ptr_type, buffer, MakeValue(vec_index)); builder_->MakeInst(spv::OpStore, ptr, value, mask); @@ -530,7 +530,7 @@ void CodeGenSPIRV::VisitStmt_(const For* op) { // loop continue builder_->StartLabel(continue_label); spirv::Value one = - op->loop_var.dtype().is_int() ? + op->loop_var.type().is_int() ? 
builder_->IntImm(loop_var.stype, 1) :
       builder_->UIntImm(loop_var.stype, 1);
   spirv::Value next_value = builder_->Add(loop_var, one);
@@ -576,13 +576,13 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElse* op) {
 void CodeGenSPIRV::VisitStmt_(const Allocate* op) {
   CHECK(!is_zero(op->condition));
   CHECK(!op->new_expr.defined());
-  CHECK(!op->dtype.is_handle());
+  CHECK(!op->type.is_handle());
   int32_t constant_size = op->constant_allocation_size();
   CHECK_GT(constant_size, 0)
       << "Can only handle constant size stack allocation in GPU";
   spirv::Value buf;
   StorageInfo& info = storage_info_[op->buffer_var.get()];
-  spirv::SType etype = builder_->GetSType(op->dtype);
+  spirv::SType etype = builder_->GetSType(op->type);
   if (info.scope.rank == runtime::StorageRank::kLocal) {
     buf = builder_->Allocate(
         etype, static_cast<uint32_t>(constant_size),
@@ -597,7 +597,7 @@ void CodeGenSPIRV::VisitStmt_(const Allocate* op) {
         spv::StorageClassWorkgroup);
   }
   CHECK(!info.content_fixed);
-  info.UpdateContentType(op->dtype);
+  info.UpdateContentType(op->type);
   CHECK(!var_map_.count(op->buffer_var.get()));
   var_map_[op->buffer_var.get()] = buf;
   this->VisitStmt(op->body);
@@ -632,7 +632,7 @@ void CodeGenSPIRV::VisitStmt_(const AssertStmt* op) {
 void CodeGenSPIRV::VisitStmt_(const LetStmt* op) {
   CHECK(!var_map_.count(op->var.get()));
-  CHECK(!op->var.dtype().is_handle());
+  CHECK(!op->var.type().is_handle());
   var_map_[op->var.get()] = MakeValue(op->value);
   analyzer_->Bind(op->var, op->value);
   this->VisitStmt(op->body);
 }
diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h
index 3d16377271c4..eca361493e80 100644
--- a/src/codegen/spirv/codegen_spirv.h
+++ b/src/codegen/spirv/codegen_spirv.h
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -112,10 +112,10 @@ class CodeGenSPIRV:
     /*! \brief Whether it is volatile */
     bool content_fixed{false};
     /*! \brief Current content type */
-    DataType content_type{DataType::Handle()};
+    Type content_type{Handle()};
     // Update content type if it hasn't been updated.
-    void UpdateContentType(DataType type) {
+    void UpdateContentType(Type type) {
       if (content_fixed) {
         CHECK_EQ(type, content_type)
             << "Cannot use two different content type in GLSL model";
diff --git a/src/codegen/spirv/intrin_rule_spirv.cc b/src/codegen/spirv/intrin_rule_spirv.cc
index 7a347e5e8dbc..fca9aa203f80 100644
--- a/src/codegen/spirv/intrin_rule_spirv.cc
+++ b/src/codegen/spirv/intrin_rule_spirv.cc
@@ -39,13 +39,13 @@ inline void DispatchGLSLPureIntrin(const TVMArgs& targs, TVMRetValue* rv) {
   CHECK(call != nullptr);
   Array<Expr> cargs;
   // intrin id.
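// ---------------------------------------------------------------------------
// What DispatchGLSLPureIntrin (beginning above) does to a call: it rebuilds
// e.g. sqrt(x) as spirv_glsl450(id, x), where id is the GLSL.std.450 opcode
// (31 for Sqrt per that spec), so CodeGenSPIRV only has to read args[0] to
// pick the extended instruction. A sketch with hypothetical mini-IR types
// standing in for tvm::Expr / ir::Call:
#include <string>
#include <vector>

struct MiniCall { std::string name; std::vector<int> operands; };

MiniCall dispatch_glsl450(unsigned id, const MiniCall& call) {
  MiniCall out{"spirv_glsl450", {static_cast<int>(id)}};
  out.operands.insert(out.operands.end(),
                      call.operands.begin(), call.operands.end());
  return out;
}
// ---------------------------------------------------------------------------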
- cargs.push_back(ir::UIntImm::make(DataType::UInt(32), id)); + cargs.push_back(ir::UIntImm::make(UInt(32), id)); for (Expr arg : call->args) { cargs.push_back(arg); } *rv = ir::Call::make( - call->dtype, "spirv_glsl450", cargs, ir::Call::PureIntrinsic); + call->type, "spirv_glsl450", cargs, ir::Call::PureIntrinsic); } TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.floor") diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc index 6f8d96e148c1..35d57d7cc3f8 100644 --- a/src/codegen/spirv/ir_builder.cc +++ b/src/codegen/spirv/ir_builder.cc @@ -53,10 +53,10 @@ void IRBuilder::InitHeader() { void IRBuilder::InitPreDefs() { ext_glsl450_ = ExtInstImport("GLSL.std.450"); - t_int32_ = DeclareType(DataType::Int(32)); - t_uint32_ = DeclareType(DataType::UInt(32)); - t_bool_ = DeclareType(DataType::UInt(1)); - t_fp32_ = DeclareType(DataType::Float(32)); + t_int32_ = DeclareType(Int(32)); + t_uint32_ = DeclareType(UInt(32)); + t_bool_ = DeclareType(UInt(1)); + t_fp32_ = DeclareType(Float(32)); const_i32_zero_ = IntImm(t_int32_, 0); // declare void, and void functions t_void_.id = id_counter_++; @@ -66,14 +66,14 @@ void IRBuilder::InitPreDefs() { .AddSeq(t_void_func_, t_void_).Commit(&global_); } -SType IRBuilder::GetSType(const DataType& dtype) { - if (dtype == DataType::Int(32)) { +SType IRBuilder::GetSType(const Type& dtype) { + if (dtype == Int(32)) { return t_int32_; - } else if (dtype == DataType::UInt(1)) { + } else if (dtype == UInt(1)) { return t_bool_; - } else if (dtype == DataType::Float(32)) { + } else if (dtype == Float(32)) { return t_fp32_; - } else if (dtype == DataType::UInt(32)) { + } else if (dtype == UInt(32)) { return t_uint32_; } uint32_t type_key; @@ -99,7 +99,7 @@ SType IRBuilder::GetPointerType(const SType& value_type, } SType t; t.id = id_counter_++; - t.type = DataType::Handle(); + t.type = Handle(); t.element_type_id = value_type.id; t.storage_class = storage_class; ib_.Begin(spv::OpTypePointer) @@ -118,11 +118,11 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, SType arr_type; arr_type.id = id_counter_++; - arr_type.type = DataType::Handle(); + arr_type.type = Handle(); arr_type.element_type_id = value_type.id; if (num_elems != 0) { - Value length = UIntImm(GetSType(DataType::UInt(32)), num_elems); + Value length = UIntImm(GetSType(UInt(32)), num_elems); ib_.Begin(spv::OpTypeArray) .AddSeq(arr_type, value_type, length).Commit(&global_); } else { @@ -138,7 +138,7 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, // declare struct of array SType struct_type; struct_type.id = id_counter_++; - struct_type.type = DataType::Handle(); + struct_type.type = Handle(); struct_type.element_type_id = value_type.id; ib_.Begin(spv::OpTypeStruct) .AddSeq(struct_type, arr_type).Commit(&global_); @@ -183,7 +183,7 @@ Value IRBuilder::FloatImm(const SType& dtype, double value) { } else { CHECK_EQ(dtype.type.bits(), 16); return Cast(dtype, - FloatImm(GetSType(DataType::Float(32)), value)); + FloatImm(GetSType(Float(32)), value)); } } @@ -206,7 +206,7 @@ Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { CHECK_EQ(push_const_.id, 0); SType struct_type; struct_type.id = id_counter_++; - struct_type.type = DataType::Handle(); + struct_type.type = Handle(); ib_.Begin(spv::OpTypeStruct).Add(struct_type); for (const SType& vtype : value_types) { ib_.Add(vtype); @@ -218,7 +218,7 @@ Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { ib_.Begin(spv::OpMemberDecorate) .AddSeq(struct_type, i, spv::DecorationOffset, 
offset) .Commit(&decorate_); - DataType t = value_types[i].type; + Type t = value_types[i].type; uint32_t nbits = t.bits() * t.lanes(); CHECK_EQ(nbits % 8 , 0); offset += nbits / 8; @@ -296,7 +296,7 @@ Value IRBuilder::Allocate(const SType& value_type, Value IRBuilder::GetWorkgroupID(uint32_t dim_index) { if (workgroup_id_.id == 0) { - SType vec3_type = this->GetSType(DataType::Int(32).with_lanes(3)); + SType vec3_type = this->GetSType(Int(32).with_lanes(3)); SType ptr_type = this->GetPointerType( vec3_type, spv::StorageClassInput); workgroup_id_ = NewValue(ptr_type, kVectorPtr); @@ -315,7 +315,7 @@ Value IRBuilder::GetWorkgroupID(uint32_t dim_index) { Value IRBuilder::GetLocalID(uint32_t dim_index) { if (local_id_.id == 0) { - SType vec3_type = this->GetSType(DataType::Int(32).with_lanes(3)); + SType vec3_type = this->GetSType(Int(32).with_lanes(3)); SType ptr_type = this->GetPointerType(vec3_type, spv::StorageClassInput); local_id_ = NewValue(ptr_type, kVectorPtr); ib_.Begin(spv::OpVariable) @@ -339,7 +339,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) { } CHECK_LE(dtype.type.bits(), 64); Value ret = NewValue(dtype, kConstant); - if (dtype.type == DataType::UInt(1)) { + if (dtype.type == UInt(1)) { // bool types. if (*pvalue) { ib_.Begin(spv::OpConstantTrue).AddSeq(ret); @@ -367,7 +367,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) { return ret; } -SType IRBuilder::DeclareType(const DataType& dtype) { +SType IRBuilder::DeclareType(const Type& dtype) { if (dtype.lanes() == 1) { SType t; t.id = id_counter_++; @@ -426,7 +426,7 @@ Value IRBuilder::CallGLSL450(const SType& ret_type, Value IRBuilder::Concat(const std::vector& vec) { bool is_const = vec[0].flag == kConstant; - DataType etype = vec[0].stype.type; + Type etype = vec[0].stype.type; int lanes = etype.lanes(); for (size_t i = 1; i < vec.size(); ++i) { CHECK_EQ(etype, vec[i].stype.type.element_of()) @@ -456,10 +456,10 @@ Value IRBuilder::Concat(const std::vector& vec) { Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) { CHECK_NE(value.stype.id, 0U); if (value.stype.id == dst_type.id) return value; - const tvm::DataType& from = value.stype.type; - const tvm::DataType& to = dst_type.type; + const tvm::Type& from = value.stype.type; + const tvm::Type& to = dst_type.type; CHECK_EQ(from.lanes(), to.lanes()); - if (from == DataType::Bool()) { + if (from == Bool()) { if (to.is_int()) { return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0)); } else if (to.is_uint()) { @@ -471,7 +471,7 @@ Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) { LOG(FATAL) << "cannot cast from " << from << " to " << to; return Value(); } - } else if (to == DataType::Bool()) { + } else if (to == Bool()) { if (from.is_int()) { return NE(value, IntImm(value.stype, 0)); } else if (to.is_uint()) { @@ -558,7 +558,7 @@ Value IRBuilder::Mod(Value a, Value b) { Value IRBuilder::_OpName(Value a, Value b) { \ CHECK_EQ(a.stype.id, b.stype.id); \ CHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes()); \ - const auto& bool_type = this->GetSType(DataType::UInt(1).with_lanes(a.stype.type.lanes())); \ + const auto& bool_type = this->GetSType(UInt(1).with_lanes(a.stype.type.lanes())); \ if (a.stype.type.is_int()) { \ return MakeValue(spv::OpS##_Op, bool_type, a, b); \ } else if (a.stype.type.is_uint()) { \ @@ -578,7 +578,7 @@ DEFINE_BUILDER_CMP_OP(GE, GreaterThanEqual); Value IRBuilder::_OpName(Value a, Value b) { \ CHECK_EQ(a.stype.id, b.stype.id); \ CHECK_EQ(a.stype.type.lanes(), 
b.stype.type.lanes()); \ - const auto& bool_type = this->GetSType(DataType::UInt(1).with_lanes(a.stype.type.lanes())); \ + const auto& bool_type = this->GetSType(UInt(1).with_lanes(a.stype.type.lanes())); \ if (a.stype.type.is_int() || a.stype.type.is_uint()) { \ return MakeValue(spv::OpI##_Op, bool_type, a, b); \ } else { \ @@ -592,7 +592,7 @@ DEFINE_BUILDER_CMP_UOP(NE, NotEqual); Value IRBuilder::Select(Value cond, Value a, Value b) { CHECK_EQ(a.stype.id, b.stype.id); - CHECK_EQ(cond.stype.type.element_of(), DataType::UInt(1)); + CHECK_EQ(cond.stype.type.element_of(), UInt(1)); return MakeValue(spv::OpSelect, a.stype, cond, a, b); } diff --git a/src/codegen/spirv/ir_builder.h b/src/codegen/spirv/ir_builder.h index 3843cbb3c6a9..c04af743fbb8 100644 --- a/src/codegen/spirv/ir_builder.h +++ b/src/codegen/spirv/ir_builder.h @@ -45,7 +45,7 @@ struct SType { /*! \brief The Id to represent type */ uint32_t id{0}; /*! \brief corresponding TVM type */ - tvm::DataType type; + tvm::Type type; /*! \brief content type id if it is a pointer/struct-array class */ uint32_t element_type_id{0}; /*! \brief The storage class, if it is a pointer */ @@ -424,7 +424,7 @@ class IRBuilder { * \param dtype The data type. * \return The corresponding spirv type. */ - SType GetSType(const tvm::DataType& dtype); + SType GetSType(const tvm::Type& dtype); /*! * \brief Get the pointer type that points to value_type * \param value_type. @@ -575,7 +575,7 @@ class IRBuilder { // get constant given value encoded in uint64_t Value GetConst_(const SType& dtype, const uint64_t* pvalue); // declare type - SType DeclareType(const DataType& dtype); + SType DeclareType(const Type& dtype); /*! \brief internal instruction builder */ InstrBuilder ib_; /*! \brief Current label */ diff --git a/src/codegen/stackvm/codegen_stackvm.cc b/src/codegen/stackvm/codegen_stackvm.cc index 52cabaf0b6eb..fd2a5f764ff6 100644 --- a/src/codegen/stackvm/codegen_stackvm.cc +++ b/src/codegen/stackvm/codegen_stackvm.cc @@ -100,12 +100,12 @@ int CodeGenStackVM::GetVarID(const Variable* v) const { void CodeGenStackVM::VisitExpr_(const Load* op) { this->Push(op->buffer_var); - StackVM::OpCode code = StackVM::GetLoad(op->dtype); + StackVM::OpCode code = StackVM::GetLoad(Type2TVMType(op->type)); if (const IntImm* index = op->index.as()) { this->PushOp(code, index->value); } else { this->Push(op->index); - this->PushOp(StackVM::PUSH_I64, op->dtype.element_of().bytes()); + this->PushOp(StackVM::PUSH_I64, op->type.element_of().bytes()); this->PushOp(StackVM::MUL_I64); this->PushOp(StackVM::ADDR_ADD); this->PushOp(code, 0); @@ -114,13 +114,13 @@ void CodeGenStackVM::VisitExpr_(const Load* op) { void CodeGenStackVM::VisitStmt_(const Store* op) { this->Push(op->buffer_var); - StackVM::OpCode code = StackVM::GetStore(op->value.dtype()); + StackVM::OpCode code = StackVM::GetStore(Type2TVMType(op->value.type())); if (const IntImm* index = op->index.as()) { this->Push(op->value); this->PushOp(code, index->value); } else { this->Push(op->index); - this->PushOp(StackVM::PUSH_I64, op->value.dtype().element_of().bytes()); + this->PushOp(StackVM::PUSH_I64, op->value.type().element_of().bytes()); this->PushOp(StackVM::MUL_I64); this->PushOp(StackVM::ADDR_ADD); this->Push(op->value); @@ -147,7 +147,7 @@ void CodeGenStackVM::VisitExpr_(const Call* op) { CHECK(op->args.size() == 1 && l); this->PushOp(StackVM::LOAD_HEAP, GetVarID(l->buffer_var.get())); this->Push(l->index); - this->PushOp(StackVM::PUSH_I64, l->dtype.element_of().bytes()); + this->PushOp(StackVM::PUSH_I64, 
l->type.element_of().bytes()); this->PushOp(StackVM::MUL_I64); this->PushOp(StackVM::ADDR_ADD); } else if (op->is_intrinsic(Call::reinterpret)) { @@ -248,7 +248,7 @@ void CodeGenStackVM::PushBinary(StackVM::OpCode op_int64, const Expr& b) { this->Push(a); this->Push(b); - DataType t = a.dtype(); + Type t = a.type(); if (t.is_int()) { this->PushOp(op_int64); } else if (t.is_uint()) { @@ -258,7 +258,7 @@ void CodeGenStackVM::PushBinary(StackVM::OpCode op_int64, } } -void CodeGenStackVM::PushCast(DataType dst, DataType src) { +void CodeGenStackVM::PushCast(Type dst, Type src) { if (dst.is_int()) { if (src.is_int() || src.is_uint()) return; } else if (dst.is_uint()) { @@ -297,7 +297,7 @@ void CodeGenStackVM::VisitExpr_(const Variable *op) { void CodeGenStackVM::VisitExpr_(const Cast *op) { this->Push(op->value); - PushCast(op->dtype, op->value.dtype()); + PushCast(op->type, op->value.type()); } void CodeGenStackVM::VisitExpr_(const Add *op) { diff --git a/src/codegen/stackvm/codegen_stackvm.h b/src/codegen/stackvm/codegen_stackvm.h index dcae072c102d..1e6dd64476aa 100644 --- a/src/codegen/stackvm/codegen_stackvm.h +++ b/src/codegen/stackvm/codegen_stackvm.h @@ -108,7 +108,7 @@ class CodeGenStackVM const Expr& a, const Expr& b); // push cast; - void PushCast(DataType dst, DataType src); + void PushCast(Type dst, Type src); // overloadable functions // expression void VisitExpr_(const Variable* op) final; diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc index 2bb86093e2f8..9e55d9be13d5 100644 --- a/src/contrib/hybrid/codegen_hybrid.cc +++ b/src/contrib/hybrid/codegen_hybrid.cc @@ -57,7 +57,7 @@ std::string CodeGenHybrid::Finish() { return stream.str(); } -void CodeGenHybrid::PrintType(DataType t, std::ostream &os) { +void CodeGenHybrid::PrintType(Type t, std::ostream &os) { if (t.is_float()) { os << "float"; CHECK(t.bits() == 16 || t.bits() == 32 || t.bits() == 64); @@ -76,11 +76,11 @@ void CodeGenHybrid::VisitExpr_(const IntImm *op, std::ostream& os) { // NOLINT( os << op->value; } void CodeGenHybrid::VisitExpr_(const UIntImm *op, std::ostream& os) { // NOLINT(*) - PrintType(op->dtype, os); + PrintType(op->type, os); os << "(" << op->value << ")"; } void CodeGenHybrid::VisitExpr_(const FloatImm *op, std::ostream& os) { // NOLINT(*) - PrintType(op->dtype, os); + PrintType(op->type, os); os << "(" << std::setprecision(20) << op->value << ")"; } void CodeGenHybrid::VisitExpr_(const StringImm *op, std::ostream& os) { // NOLINT(*) @@ -92,7 +92,7 @@ inline void PrintBinaryExpr(const T* op, const char *opstr, std::ostream& os, // NOLINT(*) CodeGenHybrid* p) { - CHECK(op->dtype.lanes() == 1) << "vec bin op not implemented"; + CHECK(op->type.lanes() == 1) << "vec bin op not implemented"; if (isalpha(opstr[0])) { os << opstr << '('; p->PrintExpr(op->a, os); @@ -114,7 +114,7 @@ inline void PrintBinaryIntrinsitc(const Call* op, const char *opstr, std::ostream& os, // NOLINT(*) CodeGenHybrid* p) { - CHECK(op->dtype.lanes() == 1) << "vec bin intrin not implemented"; + CHECK(op->type.lanes() == 1) << "vec bin intrin not implemented"; CHECK_EQ(op->args.size(), 2U); os << '('; p->PrintExpr(op->args[0], os); @@ -124,10 +124,10 @@ inline void PrintBinaryIntrinsitc(const Call* op, } void CodeGenHybrid::VisitExpr_(const Cast *op, std::ostream& os) { // NOLINT(*) - if (op->dtype == op->value.dtype()) { + if (op->type == op->value.type()) { PrintExpr(op->value, stream); } else { - PrintType(op->dtype, os); + PrintType(op->type, os); os << "("; PrintExpr(op->value, os); os << 
")"; @@ -148,14 +148,14 @@ void CodeGenHybrid::VisitExpr_(const Mul *op, std::ostream& os) { // NOLINT(*) } void CodeGenHybrid::VisitExpr_(const Div *op, std::ostream& os) { // NOLINT(*) - if (op->dtype.is_int()) + if (op->type.is_int()) PrintBinaryExpr(op, "//", os, this); else PrintBinaryExpr(op, "/", os, this); } void CodeGenHybrid::VisitExpr_(const FloorDiv *op, std::ostream& os) { // NOLINT(*) - if (op->dtype.is_int()) + if (op->type.is_int()) PrintBinaryExpr(op, "//", os, this); else PrintBinaryExpr(op, "/", os, this); @@ -320,7 +320,7 @@ void CodeGenHybrid::VisitStmt_(const Realize *op) { } if (op->bounds.size() == 1) stream << ", "; stream << "), '"; - PrintType(op->dtype, stream); + PrintType(op->type, stream); stream << "', '"; stream << alloc_storage_scope_[op->func] << "')\n"; } diff --git a/src/contrib/hybrid/codegen_hybrid.h b/src/contrib/hybrid/codegen_hybrid.h index 2c719b0b3ecf..866756996f8d 100644 --- a/src/contrib/hybrid/codegen_hybrid.h +++ b/src/contrib/hybrid/codegen_hybrid.h @@ -138,7 +138,7 @@ class CodeGenHybrid : * \param t The type representation. * \param os The stream to print the ctype into */ - virtual void PrintType(DataType t, std::ostream& os); // NOLINT(*) + virtual void PrintType(Type t, std::ostream& os); // NOLINT(*) private: /*! \brief The current indent of the code dump. */ diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc index b83734beacb3..007a68b1e629 100644 --- a/src/lang/attrs.cc +++ b/src/lang/attrs.cc @@ -177,7 +177,7 @@ bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const ObjectRef& other) { bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const ObjectRef& other) { if (const auto* rhs = other.as()) { - if (lhs->dtype != rhs->dtype) return false; + if (lhs->type != rhs->type) return false; return Equal(lhs->value, rhs->value); } else { return false; @@ -188,7 +188,7 @@ bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const ObjectRef& other) { if (const auto* rhs = other.as()) { return lhs->name == rhs->name && - lhs->dtype == rhs->dtype && + lhs->type == rhs->type && lhs->call_type == rhs->call_type && Equal(lhs->args, rhs->args); } else { @@ -290,7 +290,7 @@ size_t AttrsHashHandler::VisitAttr_(const Cast* op) { static size_t key = std::hash()(Cast::_type_key); AttrsHash hasher; size_t res = key; - res = Combine(res, hasher(op->dtype)); + res = Combine(res, hasher(op->type)); res = Combine(res, Hash(op->value)); return res; } @@ -300,7 +300,7 @@ size_t AttrsHashHandler::VisitAttr_(const Call* op) { AttrsHash hasher; size_t res = key; res = Combine(res, hasher(op->name)); - res = Combine(res, hasher(op->dtype)); + res = Combine(res, hasher(op->type)); res = Combine(res, Hash(op->args)); return res; } diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc index eb5d87efbbfa..77e741086a59 100644 --- a/src/lang/buffer.cc +++ b/src/lang/buffer.cc @@ -42,10 +42,10 @@ Array SimplifyArray(Array array) { } Buffer decl_buffer(Array shape, - DataType dtype, + Type dtype, std::string name) { return BufferNode::make( - Var(name, DataType::Handle()), + Var(name, Handle()), dtype, shape, Array(), @@ -279,30 +279,30 @@ inline Expr ElemOffset(const BufferNode* n, Array index) { return base; } -inline Expr BufferOffset(const BufferNode* n, Array index, DataType dtype) { +inline Expr BufferOffset(const BufferNode* n, Array index, Type dtype) { Expr offset = ElemOffset(n, index); if (n->dtype.lanes() != 1) { - offset = offset * make_const(offset.dtype(), dtype.lanes()); + offset = offset * make_const(offset.type(), dtype.lanes()); } if (dtype.lanes() != 
1) { - return ir::Ramp::make(offset, make_const(offset.dtype(), 1), dtype.lanes()); + return ir::Ramp::make(offset, make_const(offset.type(), 1), dtype.lanes()); } else { return offset; } } -Expr Buffer::vload(Array begin, DataType dtype) const { - // specially handle bool, stored asDataType::Int(8) +Expr Buffer::vload(Array begin, Type dtype) const { + // specially handle bool, stored as Int(8) const BufferNode* n = operator->(); CHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot load " << dtype << " from buffer of " << n->dtype; - if (dtype == DataType::Bool()) { + if (dtype == Bool()) { return ir::Cast::make( - DataType::Bool(), + Bool(), ir::Load::make( - DataType::Int(8), n->data, BufferOffset(n, begin, DataType::Int(8)), + Int(8), n->data, BufferOffset(n, begin, Int(8)), const_true())); } else { return ir::Load::make( @@ -312,17 +312,17 @@ Expr Buffer::vload(Array begin, DataType dtype) const { } Stmt Buffer::vstore(Array begin, Expr value) const { - // specially handle bool, stored asDataType::Int(8) + // specially handle bool, stored as Int(8) const BufferNode* n = operator->(); - DataType dtype = value.dtype(); + Type dtype = value.type(); CHECK(dtype.element_of() == n->dtype.element_of() && dtype.lanes() % n->dtype.lanes() == 0) << "Cannot load " << dtype << " from buffer of " << n->dtype; - if (value.dtype() == DataType::Bool()) { + if (value.type() == Bool()) { return ir::Store::make(n->data, - ir::Cast::make(DataType::Int(8), value), - BufferOffset(n, begin, DataType::Int(8)), + ir::Cast::make(Int(8), value), + BufferOffset(n, begin, Int(8)), const_true()); } else { return ir::Store::make(n->data, value, BufferOffset(n, begin, dtype), @@ -381,7 +381,7 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const { n->buffer_type); } -Expr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lanes, Expr offset) const { +Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, Expr offset) const { const BufferNode* self = operator->(); Expr e_dtype; Expr extent; @@ -396,21 +396,21 @@ Expr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lanes, E Expr elem_offset = self->elem_offset + offset; if (content_lanes > 1) { e_dtype = ir::TypeAnnotation(self->dtype.with_lanes(content_lanes)); - extent = extent / make_const(self->elem_offset.dtype(), content_lanes); - elem_offset = self->elem_offset / make_const(self->elem_offset.dtype(), + extent = extent / make_const(self->elem_offset.type(), content_lanes); + elem_offset = self->elem_offset / make_const(self->elem_offset.type(), content_lanes); } else { e_dtype = ir::TypeAnnotation(self->dtype); } Array acc_args{ e_dtype, self->data, elem_offset, - extent, make_const(DataType::Int(32), access_mask)}; + extent, make_const(Int(32), access_mask)}; return ir::Call::make( ptr_type, ir::intrinsic::tvm_access_ptr, acc_args, ir::Call::Intrinsic); } Buffer BufferNode::make(Var data, - DataType dtype, + Type dtype, Array shape, Array strides, Expr elem_offset, diff --git a/src/lang/channel.cc b/src/lang/channel.cc new file mode 100644 index 000000000000..cb3e2f566c77 --- /dev/null +++ b/src/lang/channel.cc @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file channel.cc + */ +#include <tvm/channel.h> + +namespace tvm { + +Channel ChannelNode::make(Var handle_var, Type dtype) { + auto n = make_node<ChannelNode>(); + n->handle_var = handle_var; + n->dtype = dtype; + return Channel(n); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch<ChannelNode>([](const ObjectRef& node, IRPrinter *p) { + auto* op = static_cast<const ChannelNode*>(node.get()); + p->stream << "channel(" << op->handle_var << ", " << op->dtype << ")"; +}); + +TVM_REGISTER_NODE_TYPE(ChannelNode); +} // namespace tvm diff --git a/src/lang/expr.cc b/src/lang/expr.cc index 997c15177546..6a69fdaa20c4 100644 --- a/src/lang/expr.cc +++ b/src/lang/expr.cc @@ -29,11 +29,70 @@ namespace tvm { +// maximum and min values +Expr DataType::max() const { + using namespace ir; + CHECK_EQ(lanes(), 1); + if (is_int()) { + if (bits() == 64) { + return IntImm::make(*this, std::numeric_limits<int64_t>::max()); + } else if (bits() < 64) { + int64_t val = 1; + val = (val << (bits() - 1)) - 1; + return IntImm::make(*this, val); + } + } else if (is_uint()) { + if (bits() == 64) { + return UIntImm::make(*this, std::numeric_limits<uint64_t>::max()); + } else if (bits() < 64) { + uint64_t val = 1; + val = (val << static_cast<uint64_t>(bits())) - 1; + return UIntImm::make(*this, val); + } + } else if (is_float()) { + if (bits() == 64) { + return FloatImm::make(*this, std::numeric_limits<double>::max()); + } else if (bits() == 32) { + return FloatImm::make(*this, std::numeric_limits<float>::max()); + } else if (bits() == 16) { + return FloatImm::make(*this, 65504.0); + } + } + LOG(FATAL) << "Cannot decide max_value for type" << *this; + return Expr(); +} + +Expr DataType::min() const { + using namespace ir; + CHECK_EQ(lanes(), 1); + if (is_int()) { + if (bits() == 64) { + return IntImm::make(*this, std::numeric_limits<int64_t>::lowest()); + } else if (bits() < 64) { + int64_t val = 1; + val = -(val << (bits() - 1)); + return IntImm::make(*this, val); + } + } else if (is_uint()) { + return UIntImm::make(*this, 0); + } else if (is_float()) { + if (bits() == 64) { + return FloatImm::make(*this, std::numeric_limits<double>::lowest()); + } else if (bits() == 32) { + return FloatImm::make(*this, std::numeric_limits<float>::lowest()); + } else if (bits() == 16) { + return FloatImm::make(*this, -65504.0); + } + } + LOG(FATAL) << "Cannot decide min_value for type" << *this; + return Expr(); +} + Expr::Expr(int32_t value) - : Expr(IntImm::make(DataType::Int(32), value)) {} + : Expr(IntImm::make(Int(32), value)) {} Expr::Expr(float value) - : Expr(ir::FloatImm::make(DataType::Float(32), value)) {} + : Expr(ir::FloatImm::make(Float(32), value)) {} Expr::Expr(std::string str) : Expr(ir::StringImm::make(str)) {} @@ -43,7 +102,7 @@ Var::Var(std::string name_hint, DataType t) Var Variable::make(DataType t, std::string name_hint) { NodePtr<Variable> node = make_node<Variable>(); - node->dtype = t; + node->type = t; node->name_hint = std::move(name_hint); return Var(node); } @@ -54,11 +113,11 @@ Range::Range(Expr begin, Expr end) is_zero(begin) ?
end : (end - begin))) { } -Integer IntImm::make(DataType t, int64_t value) { +Integer IntImm::make(Type t, int64_t value) { CHECK(t.is_int() && t.is_scalar()) << "ValueError: IntImm can only take scalar."; NodePtr<IntImm> node = make_node<IntImm>(); - node->dtype = t; + node->type = t; node->value = value; return Integer(node); } @@ -93,7 +152,7 @@ void Dump(const NodeRef& n) { std::cerr << n << "\n"; } -Var var(std::string name_hint, DataType t) { +Var var(std::string name_hint, Type t) { return Var(name_hint, t); } @@ -125,10 +184,10 @@ IRPrinter::FType& IRPrinter::vtable() { TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch<IntImm>([](const ObjectRef& node, IRPrinter* p) { auto* op = static_cast<const IntImm*>(node.get()); - if (op->dtype == DataType::Int(32)) { + if (op->type == Int(32)) { p->stream << op->value; } else { - p->stream << "(" << op->dtype << ")" << op->value; + p->stream << "(" << op->type << ")" << op->value; } }); diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc index 1166e7eef976..220d4378cc97 100644 --- a/src/lang/expr_operator.cc +++ b/src/lang/expr_operator.cc @@ -30,16 +30,16 @@ namespace tvm { // simple cast that only checks if type matches and cast -inline Expr SimpleCast(const DataType& t, Expr value) { - if (value.dtype() == t) return value; +inline Expr SimpleCast(const Type& t, Expr value) { + if (value.type() == t) return value; return ir::Cast::make(t, value); } // The public function with a quick checking path. void BinaryOpMatchTypes(Expr& lhs, Expr& rhs) { // NOLINT(*) - if (lhs.dtype() == rhs.dtype()) return; - DataType ltype = lhs.dtype(); - DataType rtype = rhs.dtype(); + if (lhs.type() == rhs.type()) return; + Type ltype = lhs.type(); + Type rtype = rhs.type(); if (ltype.lanes() == 1 && rtype.lanes() != 1) { lhs = ir::Broadcast::make(lhs, rtype.lanes()); } else if (rtype.lanes() == 1 && ltype.lanes() != 1) { @@ -48,96 +48,37 @@ void BinaryOpMatchTypes(Expr& lhs, Expr& rhs) { // NOLINT(*) CHECK(ltype.lanes() == rtype.lanes()) << "Cannot match type " << ltype << " vs " << rtype; } - if (lhs.dtype() == rhs.dtype()) return; + if (lhs.type() == rhs.type()) return; // Only do very simple type coversion - // int->float, DataType::Int(32)->int(64) + // int->float, int(32)->int(64) // require the types to be relatively consistent // This will the reduce amount code generated by operators // and also help user to find potential type conversion problems.
- if (!lhs.dtype().is_float() && rhs.dtype().is_float()) { + if (!lhs.type().is_float() && rhs.type().is_float()) { // int->float - lhs = cast(rhs.dtype(), lhs); - } else if (lhs.dtype().is_float() && !rhs.dtype().is_float()) { + lhs = cast(rhs.type(), lhs); + } else if (lhs.type().is_float() && !rhs.type().is_float()) { // int->float - rhs = cast(lhs.dtype(), rhs); - } else if ((lhs.dtype().is_int() && rhs.dtype().is_int()) || - (lhs.dtype().is_uint() && rhs.dtype().is_uint())) { + rhs = cast(lhs.type(), rhs); + } else if ((lhs.type().is_int() && rhs.type().is_int()) || + (lhs.type().is_uint() && rhs.type().is_uint())) { // promote int to higher bits - if (lhs.dtype().bits() < rhs.dtype().bits()) { - lhs = cast(rhs.dtype(), lhs); + if (lhs.type().bits() < rhs.type().bits()) { + lhs = cast(rhs.type(), lhs); } else { - rhs = cast(lhs.dtype(), rhs); + rhs = cast(lhs.type(), rhs); } - } else if ((lhs.dtype().is_int() && rhs.dtype().is_uint()) || - (lhs.dtype().is_uint() && rhs.dtype().is_int())) { - int bits = std::max(lhs.dtype().bits(), rhs.dtype().bits()); - lhs = SimpleCast(DataType::Int(bits, lhs.dtype().lanes()), lhs); - rhs = SimpleCast(DataType::Int(bits, rhs.dtype().lanes()), rhs); + } else if ((lhs.type().is_int() && rhs.type().is_uint()) || + (lhs.type().is_uint() && rhs.type().is_int())) { + int bits = std::max(lhs.type().bits(), rhs.type().bits()); + lhs = SimpleCast(Int(bits, lhs.type().lanes()), lhs); + rhs = SimpleCast(Int(bits, rhs.type().lanes()), rhs); } else { LOG(FATAL) << "Cannot match type " << ltype << " vs " << rtype; } } -// maximum and min limits -Expr max_value(const DataType& dtype) { - using namespace ir; - CHECK_EQ(dtype.lanes(), 1); - if (dtype.is_int()) { - if (dtype.bits() == 64) { - return IntImm::make(dtype, std::numeric_limits<int64_t>::max()); - } else if (dtype.bits() < 64) { - int64_t val = 1; - val = (val << (dtype.bits() - 1)) - 1; - return IntImm::make(dtype, val); - } - } else if (dtype.is_uint()) { - if (dtype.bits() == 64) { - return UIntImm::make(dtype, std::numeric_limits<uint64_t>::max()); - } else if (dtype.bits() < 64) { - uint64_t val = 1; - val = (val << static_cast<uint64_t>(dtype.bits())) - 1; - return UIntImm::make(dtype, val); - } - } else if (dtype.is_float()) { - if (dtype.bits() == 64) { - return FloatImm::make(dtype, std::numeric_limits<double>::max()); - } else if (dtype.bits() == 32) { - return FloatImm::make(dtype, std::numeric_limits<float>::max()); - } else if (dtype.bits() == 16) { - return FloatImm::make(dtype, 65504.0); - } - } - LOG(FATAL) << "Cannot decide max_value for type" << dtype; - return Expr(); -} - -Expr min_value(const DataType& dtype) { - using namespace ir; - CHECK_EQ(dtype.lanes(), 1); - if (dtype.is_int()) { - if (dtype.bits() == 64) { - return IntImm::make(dtype, std::numeric_limits<int64_t>::lowest()); - } else if (dtype.bits() < 64) { - int64_t val = 1; - val = -(val << (dtype.bits() - 1)); - return IntImm::make(dtype, val); - } - } else if (dtype.is_uint()) { - return UIntImm::make(dtype, 0); - } else if (dtype.is_float()) { - if (dtype.bits() == 64) { - return FloatImm::make(dtype, std::numeric_limits<double>::lowest()); - } else if (dtype.bits() == 32) { - return FloatImm::make(dtype, std::numeric_limits<float>::lowest()); - } else if (dtype.bits() == 16) { - return FloatImm::make(dtype, -65504.0); - } - } - LOG(FATAL) << "Cannot decide min_value for type" << dtype; - return Expr(); -} - template<typename ValueType> inline bool ConstPowerHelper(ValueType val, int *shift) { if (val <= 0) return false; @@ -162,11 +103,11 @@ bool is_const_power_of_two_integer(const Expr& x, int* shift) { } }
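Note that the BinaryOpMatchTypes hunk above only swaps the dtype()/type() accessor spelling; the coercion rules themselves are untouched by this revert. A minimal self-contained sketch of those rules, with a toy DType struct standing in for tvm::Type (illustrative only, not TVM's API; unlike the real code, which rejects mismatched float widths with LOG(FATAL), this sketch simply widens them):

#include <algorithm>
#include <cassert>

// Toy stand-in for tvm::Type: a scalar type kind plus bit width (lanes omitted).
enum class Kind { Int, UInt, Float };
struct DType { Kind kind; int bits; };
inline bool operator==(DType a, DType b) { return a.kind == b.kind && a.bits == b.bits; }

// Mirrors the branch order above: int is cast toward float, equal-kind
// operands widen to the larger bit width, and a signed/unsigned mix
// promotes both sides to a signed int of the maximum width.
inline DType Promote(DType l, DType r) {
  if (l == r) return l;
  if (l.kind != Kind::Float && r.kind == Kind::Float) return r;  // int -> float
  if (l.kind == Kind::Float && r.kind != Kind::Float) return l;  // int -> float
  if (l.kind == r.kind) return l.bits >= r.bits ? l : r;         // widen int/uint
  return DType{Kind::Int, std::max(l.bits, r.bits)};             // int + uint -> int
}

int main() {
  assert((Promote({Kind::Int, 32}, {Kind::Float, 32}) == DType{Kind::Float, 32}));
  assert((Promote({Kind::Int, 32}, {Kind::Int, 64}) == DType{Kind::Int, 64}));
  assert((Promote({Kind::UInt, 16}, {Kind::Int, 32}) == DType{Kind::Int, 32}));
  return 0;
}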
-Expr cast(const DataType& t, Expr value) { +Expr cast(const Type& t, Expr value) { using ir::IntImm; using ir::UIntImm; using ir::FloatImm; - if (value.dtype() == t) return value; + if (value.type() == t) return value; // const fold IntImm as they are used in index computations if (t.lanes() == 1) { if (const IntImm* op = value.as()) { @@ -178,10 +119,10 @@ Expr cast(const DataType& t, Expr value) { } return ir::Cast::make(t, value); } else { - if (value.dtype().lanes() == 1) { + if (value.type().lanes() == 1) { // manually unroll cast - DataType vtype = t.element_of(); - if (value.dtype() != vtype) { + Type vtype = t.element_of(); + if (value.type() != vtype) { if (const IntImm* op = value.as()) { value = make_const(vtype, op->value); } else if (const UIntImm* op = value.as()) { @@ -194,14 +135,14 @@ Expr cast(const DataType& t, Expr value) { } return ir::Broadcast::make(value, t.lanes()); } else { - CHECK(value.dtype().lanes() == t.lanes()); + CHECK(value.type().lanes() == t.lanes()); return ir::Cast::make(t, value); } } } -Expr reinterpret(const DataType& t, Expr value) { - if (value.dtype() == t) return value; +Expr reinterpret(const Type& t, Expr value) { + if (value.type() == t) return value; return ir::Call::make(t, ir::Call::reinterpret, { value }, ir::Call::PureIntrinsic); } @@ -218,9 +159,9 @@ Expr operator-(Expr a) { using ir::FloatImm; const IntImm* pa = a.as(); const FloatImm* fa = a.as(); - if (pa) return ir::IntImm::make(a.dtype(), -pa->value); - if (fa) return ir::FloatImm::make(a.dtype(), -fa->value); - return make_zero(a.dtype()) - a; + if (pa) return ir::IntImm::make(a.type(), -pa->value); + if (fa) return ir::FloatImm::make(a.type(), -fa->value); + return make_zero(a.type()) - a; } Expr operator-(Expr a, Expr b) { @@ -245,8 +186,8 @@ Expr div(Expr a, Expr b) { } Expr truncdiv(Expr a, Expr b) { - CHECK(a.dtype().is_int() || a.dtype().is_uint()); - CHECK(b.dtype().is_int() || b.dtype().is_uint()); + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); return div(a, b); } @@ -275,8 +216,8 @@ Expr indexmod(Expr a, Expr b) { } Expr floordiv(Expr a, Expr b) { - CHECK(a.dtype().is_int() || a.dtype().is_uint()); - CHECK(b.dtype().is_int() || b.dtype().is_uint()); + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); BinaryOpMatchTypes(a, b); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; @@ -284,8 +225,8 @@ Expr floordiv(Expr a, Expr b) { } Expr floormod(Expr a, Expr b) { - CHECK(a.dtype().is_int() || a.dtype().is_uint()); - CHECK(b.dtype().is_int() || b.dtype().is_uint()); + CHECK(a.type().is_int() || a.type().is_uint()); + CHECK(b.type().is_int() || b.type().is_uint()); BinaryOpMatchTypes(a, b); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; @@ -323,7 +264,7 @@ Expr max(Expr a, Expr b) { Expr if_then_else(Expr cond, Expr true_value, Expr false_value) { using ir::IntImm; using ir::UIntImm; - CHECK(cond.dtype() == DataType::Bool(1)) + CHECK(cond.type() == Bool(1)) << "if_then_else only accept the condition to be boolean type."; BinaryOpMatchTypes(true_value, false_value); if (const UIntImm* op = cond.as()) { @@ -340,7 +281,7 @@ Expr if_then_else(Expr cond, Expr true_value, Expr false_value) { } } return ir::Call::make( - true_value.dtype(), + true_value.type(), ir::intrinsic::tvm_if_then_else, {cond, true_value, false_value}, ir::Call::PureIntrinsic); @@ -348,7 +289,7 @@ Expr if_then_else(Expr cond, Expr true_value, Expr false_value) { Expr 
likely(Expr cond) { if (is_const(cond)) return cond; - return ir::Call::make(cond.dtype(), ir::Call::likely, { cond }, ir::Call::PureIntrinsic); + return ir::Call::make(cond.type(), ir::Call::likely, { cond }, ir::Call::PureIntrinsic); } Expr operator>(Expr a, Expr b) { @@ -394,23 +335,23 @@ Expr operator!=(Expr a, Expr b) { } Expr operator&&(Expr a, Expr b) { - CHECK(a.dtype().is_bool()); - CHECK(b.dtype().is_bool()); + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; return ir::And::make(a, b); } Expr operator||(Expr a, Expr b) { - CHECK(a.dtype().is_bool()); - CHECK(b.dtype().is_bool()); + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); Expr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; return ir::Or::make(a, b); } Expr operator!(Expr a) { - CHECK(a.dtype().is_bool()); + CHECK(a.type().is_bool()); Expr ret = arith::TryConstFold(a); if (ret.defined()) return ret; return ir::Not::make(a); @@ -419,211 +360,211 @@ Expr operator!(Expr a) { Expr operator>>(Expr a, Expr b) { BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value >> pb->value)); if (pb) { if (pb->value == 0) return a; } }); - return ir::Call::make(a.dtype(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic); + return ir::Call::make(a.type(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic); } Expr operator<<(Expr a, Expr b) { BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value << pb->value)); if (pb) { if (pb->value == 0) return a; } }); - return ir::Call::make(a.dtype(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic); + return ir::Call::make(a.type(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic); } Expr operator&(Expr a, Expr b) { BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value & pb->value)); }); - return ir::Call::make(a.dtype(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic); + return ir::Call::make(a.type(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic); } Expr operator|(Expr a, Expr b) { BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value | pb->value)); }); - return ir::Call::make(a.dtype(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic); + return ir::Call::make(a.type(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic); } Expr operator^(Expr a, Expr b) { BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - const DataType& rtype = a.dtype(); + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value ^ pb->value)); }); - return ir::Call::make(a.dtype(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic); + return ir::Call::make(a.type(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic); } Expr operator~(Expr a) { - CHECK(a.dtype().is_int() || a.dtype().is_uint()); - return ir::Call::make(a.dtype(), ir::Call::bitwise_not, { a }, ir::Call::PureIntrinsic); + CHECK(a.type().is_int() || a.type().is_uint()); + return ir::Call::make(a.type(), ir::Call::bitwise_not, { a }, ir::Call::PureIntrinsic); } Expr pow(Expr x, 
Expr y) { BinaryOpMatchTypes(x, y); - CHECK(x.dtype().is_float()) << "power only applies to float"; - return ir::Call::make(x.dtype(), "pow", { x, y }, ir::Call::PureIntrinsic); + CHECK(x.type().is_float()) << "power only applies to float"; + return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic); } Expr abs(Expr x) { - if (x.dtype().is_int()) { + if (x.type().is_int()) { using ir::IntImm; const IntImm* px = x.as(); if (px) { - return ir::IntImm::make(x.dtype(), std::abs(px->value)); + return ir::IntImm::make(x.type(), std::abs(px->value)); } - return ir::Select::make(x >= make_zero(x.dtype()), x, -x); - } else if (x.dtype().is_float()) { + return ir::Select::make(x >= make_zero(x.type()), x, -x); + } else if (x.type().is_float()) { using ir::FloatImm; const FloatImm* fx = x.as(); if (fx) { - return ir::FloatImm::make(x.dtype(), std::fabs(fx->value)); + return ir::FloatImm::make(x.type(), std::fabs(fx->value)); } - return ir::Call::make(x.dtype(), "fabs", {x}, ir::Call::PureIntrinsic); - } else if (x.dtype().is_uint()) { + return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic); + } else if (x.type().is_uint()) { return x; } else { - LOG(FATAL) << "Data type " << x.dtype() + LOG(FATAL) << "Data type " << x.type() <<" not supported for absolute op. Skipping absolute op..."; return x; } } Expr isnan(Expr x) { - DataType t = DataType::Bool(x.dtype().lanes()); - if (x.dtype().is_int() || x.dtype().is_uint()) { + Type t = Bool(x.type().lanes()); + if (x.type().is_int() || x.type().is_uint()) { return make_const(t, false); - } else if (x.dtype().is_float()) { + } else if (x.type().is_float()) { using ir::FloatImm; const FloatImm* fx = x.as(); if (fx) { return make_const(t, std::isnan(fx->value)); } - if (x.dtype().bits() == 16) { + if (x.type().bits() == 16) { return ir::Call::make(t, ir::Call::isnan, - {cast(DataType::Float(32, t.lanes()), std::move(x))}, + {cast(Float(32, t.lanes()), std::move(x))}, ir::Call::PureIntrinsic); } else { return ir::Call::make(t, ir::Call::isnan, {x}, ir::Call::PureIntrinsic); } } else { - LOG(FATAL) << "Data type " << x.dtype() + LOG(FATAL) << "Data type " << x.type() <<" not supported for isnan op. 
Skipping isnan op..."; return x; } } Expr sum(Expr source, Array rdom) { - Var x("x", source.dtype()), y("y", source.dtype()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::Add::make(x, y); - Expr identity_element = make_zero(source.dtype()); + Expr identity_element = make_zero(source.type()); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr all(Expr source, Array rdom) { - CHECK(source.dtype().is_bool()); - Var x("x", source.dtype()), y("y", source.dtype()); + CHECK(source.type().is_bool()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::And::make(x, y); - Expr identity_element = make_const(source.dtype(), true); + Expr identity_element = make_const(source.type(), true); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr any(Expr source, Array rdom) { - CHECK(source.dtype().is_bool()); - Var x("x", source.dtype()), y("y", source.dtype()); + CHECK(source.type().is_bool()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::Or::make(x, y); - Expr identity_element = make_const(source.dtype(), false); + Expr identity_element = make_const(source.type(), false); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr max(Expr source, Array rdom) { - Var x("x", source.dtype()), y("y", source.dtype()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::Max::make(x, y); - Expr identity_element = min_value(source.dtype()); + Expr identity_element = source.type().min(); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr min(Expr source, Array rdom) { - Var x("x", source.dtype()), y("y", source.dtype()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::Min::make(x, y); - Expr identity_element = max_value(source.dtype()); + Expr identity_element = source.type().max(); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr prod(Expr source, Array rdom) { - Var x("x", source.dtype()), y("y", source.dtype()); + Var x("x", source.type()), y("y", source.type()); Expr result = ir::Mul::make(x, y); - Expr identity_element = make_const(source.dtype(), 1); + Expr identity_element = make_const(source.type(), 1); ir::CommReducer combiner = ir::CommReducerNode::make({x}, {y}, {result}, {identity_element}); - return ir::Reduce::make(combiner, {source}, rdom, make_const(DataType::Bool(1), true), 0); + return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0); } Expr fmod(Expr x, Expr y) { 
BinaryOpMatchTypes(x, y); - CHECK(x.dtype().is_float()) << "fmod only applies to float"; - return ir::Call::make(x.dtype(), "fmod", { x, y }, ir::Call::PureIntrinsic); + CHECK(x.type().is_float()) << "fmod only applies to float"; + return ir::Call::make(x.type(), "fmod", { x, y }, ir::Call::PureIntrinsic); } Expr floor(Expr x) { using ir::FloatImm; const FloatImm* fx = x.as(); - if (fx) return FloatImm::make(x.dtype(), std::floor(fx->value)); - return ir::Call::make(x.dtype(), "floor", {x}, ir::Call::PureIntrinsic); + if (fx) return FloatImm::make(x.type(), std::floor(fx->value)); + return ir::Call::make(x.type(), "floor", {x}, ir::Call::PureIntrinsic); } Expr ceil(Expr x) { using ir::FloatImm; const FloatImm* fx = x.as(); - if (fx) return FloatImm::make(x.dtype(), std::ceil(fx->value)); - return ir::Call::make(x.dtype(), "ceil", {x}, ir::Call::PureIntrinsic); + if (fx) return FloatImm::make(x.type(), std::ceil(fx->value)); + return ir::Call::make(x.type(), "ceil", {x}, ir::Call::PureIntrinsic); } Expr round(Expr x) { using ir::FloatImm; const FloatImm* fx = x.as(); - if (fx) return FloatImm::make(x.dtype(), std::nearbyint(fx->value)); - return ir::Call::make(x.dtype(), "round", {x}, ir::Call::PureIntrinsic); + if (fx) return FloatImm::make(x.type(), std::nearbyint(fx->value)); + return ir::Call::make(x.type(), "round", {x}, ir::Call::PureIntrinsic); } Expr nearbyint(Expr x) { using ir::FloatImm; const FloatImm* fx = x.as(); - if (fx) return FloatImm::make(x.dtype(), std::nearbyint(fx->value)); - return ir::Call::make(x.dtype(), "nearbyint", {x}, ir::Call::PureIntrinsic); + if (fx) return FloatImm::make(x.type(), std::nearbyint(fx->value)); + return ir::Call::make(x.type(), "nearbyint", {x}, ir::Call::PureIntrinsic); } Expr trunc(Expr x) { using ir::FloatImm; const FloatImm* fx = x.as(); if (fx) { - return FloatImm::make(x.dtype(), (fx->value < 0 ? std::ceil(fx->value) : + return FloatImm::make(x.type(), (fx->value < 0 ? 
std::ceil(fx->value) : std::floor(fx->value))); } - return ir::Call::make(x.dtype(), "trunc", {x}, ir::Call::PureIntrinsic); + return ir::Call::make(x.type(), "trunc", {x}, ir::Call::PureIntrinsic); } } // namespace tvm diff --git a/src/lang/ir.cc b/src/lang/ir.cc index 427e026bc728..bb8401dae843 100644 --- a/src/lang/ir.cc +++ b/src/lang/ir.cc @@ -35,7 +35,7 @@ Expr UIntImm::make(DataType t, uint64_t value) { CHECK(t.is_uint() && t.lanes() == 1) << "ValueError: UIntImm can only take scalar"; NodePtr node = make_node(); - node->dtype = t; + node->type = t; node->value = value; return Expr(node); } @@ -44,23 +44,23 @@ Expr FloatImm::make(DataType t, double value) { CHECK_EQ(t.lanes(), 1) << "ValueError: FloatImm can only take scalar"; NodePtr node = make_node(); - node->dtype = t; + node->type = t; node->value = value; return Expr(node); } Expr StringImm::make(std::string value) { NodePtr node = make_node(); - node->dtype = DataType::Handle(); + node->type = Handle(); node->value = std::move(value); return Expr(node); } Expr Cast::make(DataType t, Expr value) { CHECK(value.defined()); - CHECK_EQ(t.lanes(), value.dtype().lanes()); + CHECK_EQ(t.lanes(), value.type().lanes()); NodePtr node = make_node(); - node->dtype = t; + node->type = t; node->value = std::move(value); return Expr(node); } @@ -68,12 +68,12 @@ Expr Cast::make(DataType t, Expr value) { Expr And::make(Expr a, Expr b) { CHECK(a.defined()) << "ValueError: a is undefined"; CHECK(b.defined()) << "ValueError: b is undefined"; - CHECK(a.dtype().is_bool()); - CHECK(b.dtype().is_bool()); - CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types"; + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + CHECK(a.type() == b.type()) << "TypeError: mismatched types"; NodePtr node = make_node(); - node->dtype = DataType::Bool(a.dtype().lanes()); + node->type = Bool(a.type().lanes()); node->a = std::move(a); node->b = std::move(b); return Expr(node); @@ -82,12 +82,12 @@ Expr And::make(Expr a, Expr b) { Expr Or::make(Expr a, Expr b) { CHECK(a.defined()) << "ValueError: a is undefined"; CHECK(b.defined()) << "ValueError: b is undefined"; - CHECK(a.dtype().is_bool()); - CHECK(b.dtype().is_bool()); - CHECK(a.dtype() == b.dtype()) << "TypeError: mismatched types"; + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + CHECK(a.type() == b.type()) << "TypeError: mismatched types"; NodePtr node = make_node(); - node->dtype = DataType::Bool(a.dtype().lanes()); + node->type = Bool(a.type().lanes()); node->a = std::move(a); node->b = std::move(b); return Expr(node); @@ -95,10 +95,10 @@ Expr Or::make(Expr a, Expr b) { Expr Not::make(Expr a) { CHECK(a.defined()) << "ValueError: a is undefined"; - CHECK(a.dtype().is_bool()); + CHECK(a.type().is_bool()); NodePtr node = make_node(); - node->dtype = DataType::Bool(a.dtype().lanes()); + node->type = Bool(a.type().lanes()); node->a = std::move(a); return Expr(node); } @@ -107,27 +107,27 @@ Expr Select::make(Expr condition, Expr true_value, Expr false_value) { CHECK(condition.defined()) << "ValueError: condition is undefined"; CHECK(true_value.defined()) << "ValueError: true_value is undefined"; CHECK(false_value.defined()) << "ValueError: true_value is undefined"; - CHECK(condition.dtype().is_bool()); - CHECK_EQ(condition.dtype().lanes(), true_value.dtype().lanes()); - CHECK(false_value.dtype() == true_value.dtype()) << "TypeError: mismatched types"; + CHECK(condition.type().is_bool()); + CHECK_EQ(condition.type().lanes(), true_value.type().lanes()); + CHECK(false_value.type() == 
true_value.type()) << "TypeError: mismatched types"; NodePtr<Select> node = make_node<Select>(); - node->dtype = true_value.dtype(); + node->type = true_value.type(); node->condition = std::move(condition); node->true_value = std::move(true_value); node->false_value = std::move(false_value); return Expr(node); } -Expr Load::make(DataType dtype, Var buffer_var, Expr index, Expr predicate) { +Expr Load::make(DataType type, Var buffer_var, Expr index, Expr predicate) { CHECK(buffer_var.defined()); CHECK(predicate.defined()); CHECK(index.defined()); - CHECK_EQ(dtype.lanes(), index.dtype().lanes()); - CHECK_EQ(dtype.lanes(), predicate.dtype().lanes()); + CHECK_EQ(type.lanes(), index.type().lanes()); + CHECK_EQ(type.lanes(), predicate.type().lanes()); NodePtr<Load> node = make_node<Load>(); - node->dtype = dtype; + node->type = type; node->buffer_var = std::move(buffer_var); node->index = std::move(index); node->predicate = std::move(predicate); @@ -138,13 +138,13 @@ Expr Load::make(DataType dtype, Var buffer_var, Expr index, Expr predicate) { Expr Ramp::make(Expr base, Expr stride, int lanes) { CHECK(base.defined()); CHECK(stride.defined()); - CHECK(base.dtype().is_scalar()); - CHECK(stride.dtype().is_scalar()); + CHECK(base.type().is_scalar()); + CHECK(stride.type().is_scalar()); CHECK_GT(lanes, 1); - CHECK_EQ(stride.dtype(), base.dtype()); + CHECK_EQ(stride.type(), base.type()); NodePtr<Ramp> node = make_node<Ramp>(); - node->dtype = base.dtype().with_lanes(lanes); + node->type = base.type().with_lanes(lanes); node->base = base; node->stride = stride; node->lanes = lanes; @@ -153,11 +153,11 @@ Expr Ramp::make(Expr base, Expr stride, int lanes) { Expr Broadcast::make(Expr value, int lanes) { CHECK(value.defined()); - CHECK(value.dtype().is_scalar()); + CHECK(value.type().is_scalar()); CHECK_GT(lanes, 1); NodePtr<Broadcast> node = make_node<Broadcast>(); - node->dtype = value.dtype().with_lanes(lanes); + node->type = value.type().with_lanes(lanes); node->value = std::move(value); node->lanes = lanes; return Expr(node); @@ -166,10 +166,10 @@ Expr Broadcast::make(Expr value, int lanes) { Expr Let::make(Var var, Expr value, Expr body) { CHECK(value.defined()); CHECK(body.defined()); - CHECK_EQ(value.dtype(), var.dtype()); + CHECK_EQ(value.type(), var.type()); NodePtr<Let> node = make_node<Let>(); - node->dtype = body.dtype(); + node->type = body.type(); node->var = std::move(var); node->value = std::move(value); node->body = std::move(body); @@ -192,7 +192,7 @@ bool Call::is_vectorizable() const { return false; } -Expr Call::make(DataType dtype, +Expr Call::make(DataType type, std::string name, Array<Expr> args, CallType call_type, @@ -204,12 +204,12 @@ Expr Call::make(DataType dtype, if (call_type == Halide) { for (size_t i = 0; i < args.size(); ++i) { - CHECK(args[i].dtype().is_int()); + CHECK(args[i].type().is_int()); } } NodePtr<Call> node = make_node<Call>(); - node->dtype = dtype; + node->type = type; node->name = std::move(name); node->args = std::move(args); node->call_type = call_type; @@ -223,17 +223,17 @@ Expr Shuffle::make(Array<Expr> vectors, CHECK_NE(vectors.size(), 0U); CHECK_NE(indices.size(), 0U); - DataType base_type = vectors[0].dtype().element_of(); + Type base_type = vectors[0].type().element_of(); int total_lanes = 0; for (Expr val : vectors) { - CHECK(val.dtype().element_of() == base_type); - total_lanes += val.dtype().lanes(); + CHECK(val.type().element_of() == base_type); + total_lanes += val.type().lanes(); } CHECK_LE(indices.size(), static_cast<size_t>(total_lanes)); NodePtr<Shuffle> node = make_node<Shuffle>(); - node->dtype = base_type.with_lanes(static_cast<int>(indices.size())); + node->type =
base_type.with_lanes(static_cast(indices.size())); node->vectors = std::move(vectors); node->indices = std::move(indices); return Expr(node); @@ -247,8 +247,8 @@ Expr Shuffle::make_concat(Array vectors) { Array indices; int index = 0; for (const Expr& e : vectors) { - for (int i = 0; i < e.dtype().lanes(); ++i) { - indices.push_back(IntImm::make(DataType::Int(32), index++)); + for (int i = 0; i < e.type().lanes(); ++i) { + indices.push_back(IntImm::make(Int(32), index++)); } } return make(vectors, indices); @@ -298,7 +298,7 @@ Expr Reduce::make(CommReducer combiner, Array source, for (size_t i = 0; i < axis.size(); ++i) { CHECK(axis[i].defined()); } - n->dtype = source[value_index].dtype(); + n->type = source[value_index].type(); n->combiner = std::move(combiner); n->source = std::move(source); n->axis = std::move(axis); @@ -315,7 +315,7 @@ Expr Any::make() { Stmt LetStmt::make(Var var, Expr value, Stmt body) { CHECK(value.defined()); CHECK(body.defined()); - CHECK_EQ(value.dtype(), var.dtype()); + CHECK_EQ(value.type(), var.type()); NodePtr node = make_node(); node->var = std::move(var); @@ -338,7 +338,7 @@ Stmt AttrStmt::make(NodeRef node, Stmt AssertStmt::make(Expr condition, Expr message, Stmt body) { CHECK(condition.defined()); - CHECK(message.dtype() == DataType::Int(32) || + CHECK(message.type() == Int(32) || message.as()) << "TypeError: AssertStmt message must be an int or string:" << message << "\n"; @@ -368,9 +368,9 @@ Stmt For::make(Var loop_var, Stmt body) { CHECK(min.defined()); CHECK(extent.defined()); - CHECK(min.dtype().is_scalar()); - CHECK(extent.dtype().is_scalar()); - CHECK(loop_var.dtype().is_scalar()); + CHECK(min.type().is_scalar()); + CHECK(extent.type().is_scalar()); + CHECK(loop_var.type().is_scalar()); CHECK(body.defined()); NodePtr node = make_node(); @@ -387,8 +387,8 @@ Stmt Store::make(Var buffer_var, Expr value, Expr index, Expr predicate) { CHECK(value.defined()); CHECK(index.defined()); CHECK(predicate.defined()); - CHECK_EQ(value.dtype().lanes(), index.dtype().lanes()); - CHECK_EQ(value.dtype().lanes(), predicate.dtype().lanes()); + CHECK_EQ(value.type().lanes(), index.type().lanes()); + CHECK_EQ(value.type().lanes(), predicate.type().lanes()); NodePtr node = make_node(); node->buffer_var = std::move(buffer_var); @@ -416,7 +416,7 @@ Stmt Provide::make(FunctionRef func, int value_index, Expr value, Array ar } Stmt Allocate::make(Var buffer_var, - DataType dtype, + DataType type, Array extents, Expr condition, Stmt body, @@ -424,15 +424,15 @@ Stmt Allocate::make(Var buffer_var, std::string free_function) { for (size_t i = 0; i < extents.size(); ++i) { CHECK(extents[i].defined()); - CHECK(extents[i].dtype().is_scalar()); + CHECK(extents[i].type().is_scalar()); } CHECK(body.defined()); CHECK(condition.defined()); - CHECK(condition.dtype().is_bool()); + CHECK(condition.type().is_bool()); NodePtr node = make_node(); node->buffer_var = std::move(buffer_var); - node->dtype = dtype; + node->type = type; node->extents = std::move(extents); node->condition = std::move(condition); node->body = std::move(body); @@ -464,42 +464,42 @@ Stmt Free::make(Var buffer_var) { Stmt Realize::make(FunctionRef func, int value_index, - DataType dtype, + DataType type, Region bounds, Expr condition, Stmt body) { for (size_t i = 0; i < bounds.size(); ++i) { CHECK(bounds[i]->min.defined()); CHECK(bounds[i]->extent.defined()); - CHECK(bounds[i]->min.dtype().is_scalar()); - CHECK(bounds[i]->extent.dtype().is_scalar()); + CHECK(bounds[i]->min.type().is_scalar()); + 
CHECK(bounds[i]->extent.type().is_scalar()); } CHECK(body.defined()); CHECK(condition.defined()); - CHECK(condition.dtype().is_bool()); + CHECK(condition.type().is_bool()); NodePtr node = make_node(); node->func = std::move(func); node->value_index = value_index; - node->dtype = dtype; + node->type = type; node->bounds = std::move(bounds); node->condition = std::move(condition); node->body = std::move(body); return Stmt(node); } -Stmt Prefetch::make(FunctionRef func, int value_index, DataType dtype, Region bounds) { +Stmt Prefetch::make(FunctionRef func, int value_index, DataType type, Region bounds) { for (size_t i = 0; i < bounds.size(); ++i) { CHECK(bounds[i]->min.defined()); CHECK(bounds[i]->extent.defined()); - CHECK(bounds[i]->min.dtype().is_scalar()); - CHECK(bounds[i]->extent.dtype().is_scalar()); + CHECK(bounds[i]->min.type().is_scalar()); + CHECK(bounds[i]->extent.type().is_scalar()); } NodePtr node = make_node(); node->func = std::move(func); node->value_index = value_index; - node->dtype = dtype; + node->type = type; node->bounds = std::move(bounds); return Stmt(node); } @@ -555,14 +555,14 @@ Stmt Evaluate::make(Expr value) { TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const ObjectRef& node, IRPrinter* p) { auto* op = static_cast(node.get()); - p->stream << "(" << op->dtype << ")" << op->value; + p->stream << "(" << op->type << ")" << op->value; }); TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const ObjectRef& node, IRPrinter* p) { auto* op = static_cast(node.get()); auto& stream = p->stream; - switch (op->dtype.bits()) { + switch (op->type.bits()) { case 64: stream << op->value; break; @@ -573,7 +573,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) stream << op->value << 'h'; break; default: - LOG(FATAL) << "Unknown float type bits=" << op->dtype.bits(); + LOG(FATAL) << "Unknown float type bits=" << op->type.bits(); } }); @@ -616,7 +616,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const ObjectRef& node, IRPrinter* p) { auto* op = static_cast(node.get()); - p->stream << op->dtype << '('; + p->stream << op->type << '('; p->Print(op->value); p->stream << ')'; }) @@ -959,7 +959,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const ObjectRef& node, IRPrinter* p) { auto* op = static_cast(node.get()); p->PrintIndent(); - p->stream << "allocate " << op->buffer_var << "[" << op->dtype; + p->stream << "allocate " << op->buffer_var << "[" << op->type; for (size_t i = 0; i < op->extents.size(); ++i) { p->stream << " * "; p->Print(op->extents[i]); diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc index 1c110936b3ef..05ba6f7a08bd 100644 --- a/src/lang/tensor.cc +++ b/src/lang/tensor.cc @@ -56,7 +56,7 @@ Tensor Operation::output(size_t i) const { } Tensor TensorNode::make(Array shape, - DataType dtype, + Type dtype, Operation op, int value_index) { auto n = make_node(); diff --git a/src/node/reflection.cc b/src/node/reflection.cc index f53583723f24..e92ca92834a2 100644 --- a/src/node/reflection.cc +++ b/src/node/reflection.cc @@ -61,7 +61,7 @@ class AttrGetter : public AttrVisitor { void Visit(const char* key, void** value) final { if (skey == key) *ret = static_cast(value[0]); } - void Visit(const char* key, DataType* value) final { + void Visit(const char* key, Type* value) final { if (skey == key) *ret = value[0]; } void Visit(const char* key, std::string* value) final { @@ -135,7 +135,7 @@ class AttrDir : public AttrVisitor { void Visit(const char* key, void** value) final { 
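// A note on the pattern behind these reflection hunks: AttrVisitor is the
// double-dispatch interface used by node reflection. Every node class
// implements VisitAttrs and calls back v->Visit("field_name", &field) once
// per field, so AttrGetter can return a single field by key while AttrDir
// simply collects every key it is shown. Hypothetical node for illustration:
//
//   struct MyNode : Node {
//     int axis;
//     Type dtype;
//     void VisitAttrs(AttrVisitor* v) {
//       v->Visit("axis", &axis);    // AttrDir records the key "axis"
//       v->Visit("dtype", &dtype);  // AttrGetter matches on skey == "dtype"
//     }
//   };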
names->push_back(key); } - void Visit(const char* key, DataType* value) final { + void Visit(const char* key, Type* value) final { names->push_back(key); } void Visit(const char* key, std::string* value) final { diff --git a/src/node/serialization.cc b/src/node/serialization.cc index 5a991aa3ad1b..cb310eb2cda9 100644 --- a/src/node/serialization.cc +++ b/src/node/serialization.cc @@ -39,11 +39,11 @@ namespace tvm { inline std::string Type2String(const DataType& t) { - return runtime::TVMType2String(t); + return runtime::TVMType2String(Type2TVMType(t)); } -inline DataType String2Type(std::string s) { - return DataType(runtime::String2TVMType(s)); +inline Type String2Type(std::string s) { + return TVMType2Type(runtime::String2TVMType(s)); } // indexer to index all the nodes diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index bd129ac33058..5f5d2d4f475b 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -70,9 +70,9 @@ Array BaseComputeOpNode::root_iter_vars() const { return ret; } -DataType ComputeOpNode::output_dtype(size_t idx) const { +Type ComputeOpNode::output_dtype(size_t idx) const { CHECK_LT(idx, num_outputs()); - return body[idx].dtype(); + return body[idx].type(); } Array BaseComputeOpNode::output_shape(size_t idx) const { @@ -100,7 +100,7 @@ Tensor compute(Array shape, std::ostringstream os; os << "ax" << i; axis.emplace_back(IterVarNode::make( - Range(0, shape[i]), Var(os.str(), shape[i].dtype()), kDataPar)); + Range(0, shape[i]), Var(os.str(), shape[i].type()), kDataPar)); args.push_back(axis.back()->var); } @@ -122,7 +122,7 @@ Array compute(Array shape, std::ostringstream os; os << "ax" << i; axis.emplace_back(IterVarNode::make( - Range(0, shape[i]), Var(os.str(), shape[i].dtype()), kDataPar)); + Range(0, shape[i]), Var(os.str(), shape[i].type()), kDataPar)); args.push_back(axis.back()->var); } @@ -190,7 +190,7 @@ Operation ComputeOpNode::ReplaceInputs( for (size_t k = 0; k < this->body.size(); ++k) { auto n = make_node(*r); n->value_index = static_cast(k); - n->dtype = r->source[k].dtype(); + n->type = r->source[k].type(); arr.push_back(Expr(n)); } } else { @@ -229,7 +229,7 @@ void ComputeOpNode::PropBoundToInputs( IntSet arg_intset = EvalSet(call->args[i], dom_map); const arith::IntervalSetNode* arg_interval = arg_intset.as(); if (arg_interval) { - Expr shape_i_min_value = make_zero(t->shape[i].dtype()); + Expr shape_i_min_value = make_zero(t->shape[i].type()); Expr shape_i_max_value = t->shape[i] - 1; Expr min_value = arg_interval->min_value; Expr max_value = arg_interval->max_value; @@ -295,7 +295,7 @@ Stmt BaseComputeOpNode::BuildRealize( attr->dim_align_offset}; realize = ir::AttrStmt::make( t, ir::attr::buffer_dim_align, - Call::make(DataType::Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), + Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), realize); } } diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc index 4a3aa54ccc6d..818acb912f9c 100644 --- a/src/op/cross_thread_reduction.cc +++ b/src/op/cross_thread_reduction.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -57,14 +57,14 @@ Stmt MakeCrossThreadReduction( cond = cond && v; } Array freduce_args; - freduce_args.push_back(make_const(DataType::UInt(32), static_cast(size))); + freduce_args.push_back(make_const(UInt(32), static_cast(size))); for (size_t i = 0; i < size; ++i) { freduce_args.push_back(reduces[0]->source[i]); } freduce_args.push_back(cond); std::vector res_handles(size); for (size_t idx = 0; idx < size; ++idx) { - res_handles[idx] = Var("reduce_temp" + std::to_string(idx), DataType::Handle()); + res_handles[idx] = Var("reduce_temp" + std::to_string(idx), Handle()); freduce_args.push_back(res_handles[idx]); } @@ -85,17 +85,17 @@ Stmt MakeCrossThreadReduction( } Stmt reduce_body = Evaluate::make(Call::make( - DataType::Handle(), + Handle(), ir::intrinsic::tvm_thread_allreduce, freduce_args, Call::Intrinsic)); reduce_body = AttrStmt::make( reduces[0]->combiner, attr::reduce_scope, - make_zero(DataType::Handle()), + make_zero(Handle()), reduce_body); std::vector assigns(size); for (size_t idx = 0; idx < size; ++idx) { - DataType t = reduces[idx]->dtype; + Type t = reduces[idx]->type; assigns[idx] = Provide::make( stage->op, idx, Load::make(t, res_handles[idx], 0, const_true(t.lanes())), args); @@ -106,7 +106,7 @@ Stmt MakeCrossThreadReduction( Stmt body = Block::make(reduce_body, assign_body); for (size_t idx = size; idx != 0; --idx) { body = Allocate::make( - res_handles[idx - 1], reduces[idx - 1]->dtype, {1}, const_true(), body); + res_handles[idx - 1], reduces[idx - 1]->type, {1}, const_true(), body); body = AttrStmt::make( res_handles[idx - 1], attr::storage_scope, StringImm::make("local"), body); } diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc index 883ebdc4a0f7..35fe469fbe16 100644 --- a/src/op/extern_op.cc +++ b/src/op/extern_op.cc @@ -46,7 +46,7 @@ Array ExternOpNode::root_iter_vars() const { return {}; } -DataType ExternOpNode::output_dtype(size_t i) const { +Type ExternOpNode::output_dtype(size_t i) const { return output_placeholders[i]->dtype; } @@ -122,7 +122,7 @@ void ExternOpNode::PropBoundToInputs( for (size_t i = 0; i < t->shape.size(); ++i) { dom.data[i].emplace_back(IntSet::range( Range::make_by_min_extent( - make_const(t->shape[i].dtype(), 0), t->shape[i]))); + make_const(t->shape[i].type(), 0), t->shape[i]))); } } } @@ -145,7 +145,7 @@ Stmt ExternOpNode::BuildRealize( for (size_t i = 0; i < t->shape.size(); ++i) { bounds.push_back( Range::make_by_min_extent( - make_const(t->shape[i].dtype(), 0), t->shape[i])); + make_const(t->shape[i].type(), 0), t->shape[i])); } realize_body = ir::Realize::make( t->op, t->value_index, t->dtype, @@ -159,19 +159,19 @@ Stmt ExternOpNode::BuildProvide( const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); - Stmt ret = AttrStmt::make(make_zero(DataType::Int(32)), attr::extern_scope, 0, this->body); + Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) { Array bind_spec; Array tuple; bind_spec.push_back(buffer); bind_spec.push_back(tensor); for (size_t k = 0; k < buffer->shape.size(); ++k) { - tuple.push_back(make_const(buffer->shape[k].dtype(), 0)); + tuple.push_back(make_const(buffer->shape[k].type(), 0)); 
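// At this step f_push_bind packs, for each buffer dimension k, the pair
// (min, extent) = (0, shape[k]) into `tuple`, so the finished attribute
// carries a flat (min0, extent0, min1, extent1, ...) list wrapped in a
// tvm_tuple intrinsic call under attr::buffer_bind_scope. Sketch of the
// resulting IR for a hypothetical 2-D buffer B of shape (m, n):
//
//   // attr [bind_spec = [buffer(B), tensor(T)]] buffer_bind_scope =
//   //   tvm_tuple(0, m, 0, n)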
tuple.push_back(buffer->shape[k]); } ret = AttrStmt::make( bind_spec, attr::buffer_bind_scope, - Call::make(DataType::Handle(), intrinsic::tvm_tuple, tuple, Call::Intrinsic), ret); + Call::make(Handle(), intrinsic::tvm_tuple, tuple, Call::Intrinsic), ret); }; for (size_t i = output_placeholders.size(); i != 0; --i) { f_push_bind(output_placeholders[i - 1], stage->op.output(i - 1)); diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 1e1a81423b69..7a99ea10b74d 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -52,7 +52,7 @@ Array HybridOpNode::root_iter_vars() const { return this->axis; } -DataType HybridOpNode::output_dtype(size_t i) const { +Type HybridOpNode::output_dtype(size_t i) const { return outputs[i]->dtype; } @@ -138,7 +138,7 @@ void HybridOpNode::PropBoundToInputs( for (size_t i = 0; i < t->shape.size(); ++i) { dom.data[i].emplace_back(IntSet::range( Range::make_by_min_extent( - make_const(t->shape[i].dtype(), 0), t->shape[i]))); + make_const(t->shape[i].type(), 0), t->shape[i]))); } } } @@ -166,7 +166,7 @@ Stmt HybridOpNode::BuildRealize( for (size_t i = 0; i < t->shape.size(); ++i) { bounds.push_back( Range::make_by_min_extent( - make_const(t->shape[i].dtype(), 0), t->shape[i])); + make_const(t->shape[i].type(), 0), t->shape[i])); } realize_body = ir::Realize::make( t->op, t->value_index, t->dtype, @@ -180,7 +180,7 @@ Stmt HybridOpNode::BuildProvide( const std::unordered_map &dom_map, bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); - Stmt ret = AttrStmt::make(make_zero(DataType::Int(32)), attr::extern_scope, 0, this->body); + Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); std::unordered_map rmap; for (int i = 0; i < this->num_outputs(); ++i) { rmap[outputs[i]] = stage->op.output(i); diff --git a/src/op/op_util.cc b/src/op/op_util.cc index cd3b168d810b..691603157b1c 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -74,7 +74,7 @@ MakeLoopNest(const Stage& stage, if (bind_iv->thread_tag.length() == 0) { // Only generate new loop if we're not bound to a thread. 
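// The branch below is the unbound case: MakeLoopNest only materializes a
// For node when the IterVar carries no thread tag; thread-bound IterVars
// produce no loop (their extent is expressed through thread-extent
// attributes instead), and value_map records the Expr each IterVar's var
// is later substituted with in the body. Roughly:
//
//   if (bind_iv->thread_tag.length() == 0) {
//     nest[i + 1].emplace_back(For::make(var, dom->min, dom->extent,
//                                        for_type, DeviceAPI::None, no_op));
//     value_map[iv] = var;
//   }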
if (new_loop_var) { - var = Var(iv->var->name_hint + ".init", bind_iv->var.dtype()); + var = Var(iv->var->name_hint + ".init", bind_iv->var.type()); } ForType for_type = ForType::Serial; @@ -98,7 +98,7 @@ MakeLoopNest(const Stage& stage, const std::string& pkey = it_attr->pragma_keys[k].as()->value; Expr pvalue = it_attr->pragma_values[k]; if (!pvalue.defined()) { - pvalue = make_const(DataType::Int(32), 1); + pvalue = make_const(Int(32), 1); } nest[i + 1].emplace_back( AttrStmt::make(iv, ir::attr::pragma_scope_prefix + pkey, pvalue, no_op)); @@ -114,7 +114,7 @@ MakeLoopNest(const Stage& stage, for_type, DeviceAPI::None, no_op)); value_map[iv] = var; } else { - Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.dtype()); + Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.type()); nest[i + 1].emplace_back( For::make(idx, 0, dom->extent, for_type, DeviceAPI::None, no_op)); @@ -197,7 +197,7 @@ class TensorReplacer : public ir::IRMutator { auto it = vmap_.find(t); if (it != vmap_.end()) { Expr ret = ir::Call::make( - op->dtype, it->second->op->name, op->args, + op->type, it->second->op->name, op->args, op->call_type, it->second->op, it->second->value_index); found = true; return IRMutator::Mutate_(ret.as(), ret); diff --git a/src/op/placeholder_op.cc b/src/op/placeholder_op.cc index 6910f63b44d3..91b0589e3dd0 100644 --- a/src/op/placeholder_op.cc +++ b/src/op/placeholder_op.cc @@ -42,7 +42,7 @@ Array PlaceholderOpNode::root_iter_vars() const { return {}; } -DataType PlaceholderOpNode::output_dtype(size_t i) const { +Type PlaceholderOpNode::output_dtype(size_t i) const { CHECK_EQ(i, 0U); return dtype; } @@ -54,7 +54,7 @@ Array PlaceholderOpNode::output_shape(size_t i) const { Operation PlaceholderOpNode::make(std::string name, Array shape, - DataType dtype) { + Type dtype) { auto n = make_node(); n->name = name; n->shape = shape; @@ -62,7 +62,7 @@ Operation PlaceholderOpNode::make(std::string name, return Operation(n); } -Tensor placeholder(Array shape, DataType dtype, std::string name) { +Tensor placeholder(Array shape, Type dtype, std::string name) { return PlaceholderOpNode::make(name, shape, dtype).output(0); } diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc index e83a23194cf8..b02073b5357e 100644 --- a/src/op/scan_op.cc +++ b/src/op/scan_op.cc @@ -53,7 +53,7 @@ Array ScanOpNode::root_iter_vars() const { return ret; } -DataType ScanOpNode::output_dtype(size_t i) const { +Type ScanOpNode::output_dtype(size_t i) const { return update[i]->dtype; } diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc index e59f90f4948e..83cdd76c2b2a 100644 --- a/src/op/tensor_compute_op.cc +++ b/src/op/tensor_compute_op.cc @@ -46,7 +46,7 @@ int TensorComputeOpNode::num_outputs() const { return static_cast(this->intrin->buffers.size() - this->inputs.size()); } -DataType TensorComputeOpNode::output_dtype(size_t i) const { +Type TensorComputeOpNode::output_dtype(size_t i) const { return this->intrin->buffers[this->inputs.size() + i]->dtype; } @@ -155,7 +155,7 @@ Stmt TensorComputeOpNode::BuildProvide( } input_bind_nest.emplace_back(AttrStmt::make( bind_spec, ir::attr::buffer_bind_scope, - Call::make(DataType::Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); + Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); } // output binding @@ -179,7 +179,7 @@ Stmt TensorComputeOpNode::BuildProvide( output_bind_nest.emplace_back(AttrStmt::make( bind_spec, ir::attr::buffer_bind_scope, - Call::make(DataType::Handle(), ir::intrinsic::tvm_tuple, tuple, 
Call::Intrinsic), nop)); + Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); } // Check variable remap diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc index b7f32de8b5ad..c4abf0b04141 100644 --- a/src/op/tensorize.cc +++ b/src/op/tensorize.cc @@ -173,7 +173,7 @@ class TensorIntrinMatcher final : public IRMutator { args.push_back(op->args[i] - e.region[i]->min); } return Call::make( - op->dtype, e.tensor->op->name, args, + op->type, e.tensor->op->name, args, op->call_type, e.tensor->op, e.tensor->value_index); } } @@ -341,12 +341,12 @@ void VerifyTensorizeBody( lhs = CanonicalSimplify(lhs, compute_intrin_iter_space); Expr rhs = Simplify(intrin_compute->body[i], compute_intrin_iter_space); rhs = CanonicalSimplify(rhs, compute_intrin_iter_space); - if (lhs.dtype() != rhs.dtype()) { + if (lhs.type() != rhs.type()) { LOG(FATAL) << "Failed to match the data type with TensorIntrin " << intrin->name << "'s declaration " - << " provided=" << lhs.dtype() - << ", intrin=" << rhs.dtype(); + << " provided=" << lhs.type() + << ", intrin=" << rhs.type(); } CHECK(Equal(lhs, rhs)) << "Failed to match the compute with TensorIntrin " @@ -390,7 +390,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, } input_bind_nest.emplace_back(AttrStmt::make( bind_spec, ir::attr::buffer_bind_scope, - Call::make(DataType::Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); + Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); } // output binding const ComputeOpNode* intrin_compute = intrin->op.as(); @@ -410,7 +410,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, Array bind_spec{buffer, tensor}; output_bind_nest.emplace_back(AttrStmt::make( bind_spec, ir::attr::buffer_bind_scope, - Call::make(DataType::Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); + Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop)); } // Check variable remap std::unordered_map vmap; @@ -430,7 +430,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, IterVar target = intrin_compute->reduce_axis[i - start]; auto it = out_dom.find(iv); CHECK(it != out_dom.end()); - binder.Bind(target->dom->min, make_const(iv->dom->min.dtype(), 0), + binder.Bind(target->dom->min, make_const(iv->dom->min.type(), 0), "tensir_intrin.reduction.min"); binder.Bind(target->dom->extent, it->second->extent, "tensir_intrin.reduction.extent"); diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc index e4ff9cb457a5..f892b6b957f8 100644 --- a/src/pass/arg_binder.cc +++ b/src/pass/arg_binder.cc @@ -50,7 +50,7 @@ bool ArgBinder::Bind_(const Expr& arg, const Expr& value, const std::string& arg_name, bool with_lets) { - CHECK_EQ(arg.dtype(), value.dtype()); + CHECK_EQ(arg.type(), value.type()); if (const Variable* v = arg.as()) { auto it = def_map_->find(v); if (it == def_map_->end()) { @@ -118,8 +118,8 @@ void ArgBinder::BindBuffer(const Buffer& arg, if (Bind_(arg->elem_offset, value->elem_offset, arg_name + ".elem_offset", false)) { if (arg->offset_factor > 1) { Expr offset = value->elem_offset; - Expr factor = make_const(offset.dtype(), arg->offset_factor); - Expr zero = make_zero(offset.dtype()); + Expr factor = make_const(offset.type(), arg->offset_factor); + Expr zero = make_zero(offset.type()); BinderAddAssert(truncmod(offset, factor) == zero, arg_name + ".elem_offset", &asserts_); } @@ -153,7 +153,7 @@ void ArgBinder::BindBuffer(const Buffer& arg, } } -inline Expr TVMArrayGet(DataType t, Var arr, intrinsic::TVMStructFieldKind kind) { +inline Expr 
TVMArrayGet(Type t, Var arr, intrinsic::TVMStructFieldKind kind) { return TVMStructGet(t, arr, 0, kind); } @@ -162,8 +162,8 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, const Expr& device_id, const Var& handle, const std::string& arg_name) { - const DataType tvm_shape_type = DataType::ShapeIndex(); - const DataType tvm_ndim_type = DataType::Int(32); + const Type tvm_shape_type = TVMShapeIndexType(); + const Type tvm_ndim_type = Int(32); const Stmt nop = Evaluate::make(0); // dimension checks Expr v_ndim = TVMArrayGet(tvm_ndim_type, handle, intrinsic::kArrNDim); @@ -175,52 +175,52 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, << buffer->shape.size(); asserts_.emplace_back(AssertStmt::make(a_ndim == v_ndim, ndim_err_msg.str(), nop)); // type checks - DataType dtype = buffer->dtype; + Type dtype = buffer->dtype; std::ostringstream type_err_msg; type_err_msg << arg_name << ".dtype is expected to be " << dtype; - Expr cond = (TVMArrayGet(DataType::UInt(8), handle, intrinsic::kArrTypeCode) == - UIntImm::make(DataType::UInt(8), dtype.code()) && - TVMArrayGet(DataType::UInt(8), handle, intrinsic::kArrTypeBits) == - UIntImm::make(DataType::UInt(8), dtype.bits()) && - TVMArrayGet(DataType::UInt(16), handle, intrinsic::kArrTypeLanes) == - UIntImm::make(DataType::UInt(16), dtype.lanes())); + Expr cond = (TVMArrayGet(UInt(8), handle, intrinsic::kArrTypeCode) == + UIntImm::make(UInt(8), dtype.code()) && + TVMArrayGet(UInt(8), handle, intrinsic::kArrTypeBits) == + UIntImm::make(UInt(8), dtype.bits()) && + TVMArrayGet(UInt(16), handle, intrinsic::kArrTypeLanes) == + UIntImm::make(UInt(16), dtype.lanes())); asserts_.emplace_back(AssertStmt::make(cond, type_err_msg.str(), nop)); // data field - if (Bind_(buffer->data, TVMArrayGet(DataType::Handle(), handle, intrinsic::kArrData), + if (Bind_(buffer->data, TVMArrayGet(Handle(), handle, intrinsic::kArrData), arg_name + ".data", true)) { Var vptr(buffer->data); def_handle_dtype_.Set(vptr, ir::TypeAnnotation(buffer->dtype)); // mark alignment of external bufs init_nest_.emplace_back(AttrStmt::make( vptr, ir::attr::storage_alignment, - IntImm::make(DataType::Int(32), buffer->data_alignment), nop)); + IntImm::make(Int(32), buffer->data_alignment), nop)); } - Var v_shape(arg_name + ".shape", DataType::Handle()); + Var v_shape(arg_name + ".shape", Handle()); def_handle_dtype_.Set(v_shape, make_const(tvm_shape_type, 0)); init_nest_.emplace_back(LetStmt::make( - v_shape, TVMArrayGet(DataType::Handle(), handle, intrinsic::kArrShape), nop)); + v_shape, TVMArrayGet(Handle(), handle, intrinsic::kArrShape), nop)); for (size_t k = 0; k < buffer->shape.size(); ++k) { std::ostringstream field_name; field_name << v_shape->name_hint << '[' << k << ']'; Bind_(buffer->shape[k], - cast(buffer->shape[k].dtype(), + cast(buffer->shape[k].type(), Load::make(tvm_shape_type, v_shape, - IntImm::make(DataType::Int(32), k), const_true(1))), + IntImm::make(Int(32), k), const_true(1))), field_name.str(), true); } // strides field - Var v_strides(arg_name + ".strides", DataType::Handle()); + Var v_strides(arg_name + ".strides", Handle()); def_handle_dtype_.Set(v_strides, ir::TypeAnnotation(tvm_shape_type)); init_nest_.emplace_back(LetStmt::make( - v_strides, TVMArrayGet(DataType::Handle(), handle, intrinsic::kArrStrides), + v_strides, TVMArrayGet(Handle(), handle, intrinsic::kArrStrides), nop)); Expr is_null = Call::make( - DataType::Bool(1), intrinsic::tvm_handle_is_null, + Bool(1), intrinsic::tvm_handle_is_null, {v_strides}, Call::PureIntrinsic); if (buffer->strides.size() 
== 0) { // Assert the buffer is compact - DataType stype = buffer->DefaultIndexType(); + Type stype = buffer->DefaultIndexType(); Expr expect_stride = make_const(stype, 1); Array conds; for (size_t i = buffer->shape.size(); i != 0; --i) { @@ -228,7 +228,7 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, Expr svalue = cast( stype, Load::make(tvm_shape_type, v_strides, - IntImm::make(DataType::Int(32), k), const_true(1))); + IntImm::make(Int(32), k), const_true(1))); conds.push_back(expect_stride == svalue); expect_stride = expect_stride * buffer->shape[k]; } @@ -243,15 +243,15 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, asserts_.emplace_back(Block::make(check, Evaluate::make(0))); } } else if (buffer->buffer_type == kAutoBroadcast) { - DataType stype = buffer->DefaultIndexType(); + Type stype = buffer->DefaultIndexType(); Expr stride = make_const(stype, 1); for (size_t i = buffer->shape.size(); i != 0; --i) { size_t k = i - 1; std::ostringstream field_name; field_name << v_strides->name_hint << '[' << k << ']'; - Expr value = cast(buffer->shape[k].dtype(), + Expr value = cast(buffer->shape[k].type(), Load::make(tvm_shape_type, v_strides, - IntImm::make(DataType::Int(32), k), const_true(1))); + IntImm::make(Int(32), k), const_true(1))); value = tvm::if_then_else(is_null, stride, value); value = tvm::if_then_else(buffer->shape[k] == 1, 0, value); Bind_(buffer->strides[k], value, field_name.str(), true); @@ -266,9 +266,9 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, std::ostringstream field_name; field_name << v_strides->name_hint << '[' << k << ']'; Bind_(buffer->strides[k], - cast(buffer->shape[k].dtype(), + cast(buffer->shape[k].type(), Load::make(tvm_shape_type, v_strides, - IntImm::make(DataType::Int(32), k), const_true(1))), + IntImm::make(Int(32), k), const_true(1))), field_name.str(), true); } } @@ -276,29 +276,29 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, int data_bytes = GetVectorBytes(buffer->dtype); int64_t const_offset; if (arith::GetConst(buffer->elem_offset, &const_offset)) { - Bind_(make_const(DataType::UInt(64), const_offset * data_bytes), - TVMArrayGet(DataType::UInt(64), handle, intrinsic::kArrByteOffset), + Bind_(make_const(UInt(64), const_offset * data_bytes), + TVMArrayGet(UInt(64), handle, intrinsic::kArrByteOffset), arg_name + ".byte_offset", true); } else { if (Bind_(buffer->elem_offset, - cast(buffer->elem_offset.dtype(), - (TVMArrayGet(DataType::UInt(64), handle, intrinsic::kArrByteOffset) / - make_const(DataType::UInt(64), data_bytes))), + cast(buffer->elem_offset.type(), + (TVMArrayGet(UInt(64), handle, intrinsic::kArrByteOffset) / + make_const(UInt(64), data_bytes))), arg_name + ".elem_offset", true)) { if (buffer->offset_factor > 1) { Expr offset = buffer->elem_offset; - Expr factor = make_const(offset.dtype(), buffer->offset_factor); - Expr zero = make_zero(offset.dtype()); + Expr factor = make_const(offset.type(), buffer->offset_factor); + Expr zero = make_zero(offset.type()); BinderAddAssert(truncmod(offset, factor) == zero, arg_name + ".elem_offset", &asserts_); } } } // device info. 
Bind_(device_type, - TVMArrayGet(DataType::Int(32), handle, intrinsic::kArrDeviceType), + TVMArrayGet(Int(32), handle, intrinsic::kArrDeviceType), arg_name + ".device_type", true); Bind_(device_id, - TVMArrayGet(DataType::Int(32), handle, intrinsic::kArrDeviceId), + TVMArrayGet(Int(32), handle, intrinsic::kArrDeviceId), arg_name + ".device_id", true); } diff --git a/src/pass/bound_checker.cc b/src/pass/bound_checker.cc index 648302e9740a..55f98474994a 100644 --- a/src/pass/bound_checker.cc +++ b/src/pass/bound_checker.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -58,7 +58,7 @@ class BoundChecker : public IRMutator { Stmt Mutate_(const Allocate *op, const Stmt &s) final { // If the shape was updated we should update the hashtable. if (UpdateIsNeeded(op->buffer_var)) { - Update(op->buffer_var, op->extents, op->dtype); + Update(op->buffer_var, op->extents, op->type); } return IRMutator::Mutate_(op, s); } @@ -108,26 +108,26 @@ } void Update(const VarExpr &buffer_var, const Array<Expr> &new_shape, - const DataType &type) { + const Type &type) { // Sanity check first. if (!new_shape.size()) { return; } for (size_t i = 0; i < new_shape.size(); ++i) { - if (!new_shape[0].defined() || !new_shape[i].dtype().is_scalar() || + if (!new_shape[0].defined() || !new_shape[i].type().is_scalar() || is_negative_const(new_shape[i])) { return; } } // Scalarize the shape. - Expr shape = Mul::make(make_const(DataType::UInt(64), type.lanes()), - Cast::make(DataType::UInt(64), new_shape[0])); + Expr shape = Mul::make(make_const(UInt(64), type.lanes()), - Cast::make(UInt(64), new_shape[0])); for (size_t i = 1; i < new_shape.size(); ++i) { // Cast to unsigned first to avoid integer overflow. - shape = Mul::make(shape, Mul::make(make_const(DataType::UInt(64), type.lanes()), - Cast::make(DataType::UInt(64), new_shape[i]))); + shape = Mul::make(shape, Mul::make(make_const(UInt(64), type.lanes()), + Cast::make(UInt(64), new_shape[i]))); } mem_to_shape_[buffer_var.get()] = shape; } @@ -139,9 +139,9 @@ class BoundChecker : public IRMutator { if (const Ramp *ramp_index = index.as<Ramp>()) { return ramp_index->base.defined() && - ramp_index->base.dtype().is_scalar() && + ramp_index->base.type().is_scalar() && ramp_index->stride.defined() && - ramp_index->stride.dtype().is_scalar() && (ramp_index->lanes > 0); + ramp_index->stride.type().is_scalar() && (ramp_index->lanes > 0); } return true; } @@ -168,7 +168,7 @@ class BoundChecker : public IRMutator { // Non-inclusive range. index = Add::make( ramp_index->base, - Mul::make(ramp_index->stride, make_const(ramp_index->stride.dtype(), + Mul::make(ramp_index->stride, make_const(ramp_index->stride.type(), ramp_index->lanes - 1))); } @@ -177,11 +177,11 @@ upper_bound = ir::Simplify(upper_bound); // Cast to the same type - signed, to be able to check lower bound. - index = Cast::make(DataType::Int(64), index); - upper_bound = Cast::make(DataType::Int(64), upper_bound); + index = Cast::make(Int(64), index); + upper_bound = Cast::make(Int(64), upper_bound); // Looks like a lower bound should always be zero after normalization.
- Expr lower_bound = make_zero(DataType::Int(64)); + Expr lower_bound = make_zero(Int(64)); Expr current_condition = And::make(GE::make(index, lower_bound), LT::make(index, upper_bound)); diff --git a/src/pass/combine_context_call.cc b/src/pass/combine_context_call.cc index f1cb8fe10a4b..d7fb77961b4b 100644 --- a/src/pass/combine_context_call.cc +++ b/src/pass/combine_context_call.cc @@ -48,14 +48,14 @@ class ContextCallCombiner final : public IRMutator { if (it != ctx_map_.end()) { return it->second; } else { - CHECK(ctx.dtype().is_handle()); + CHECK(ctx.type().is_handle()); std::string name; if (const Call* call = ctx.as<Call>()) { name = call->name + "_cache"; } else { name = "ctx_cache_"; } - Var ctx_var(name, ctx.dtype()); + Var ctx_var(name, ctx.type()); ctx_map_[ctx] = ctx_var; return std::move(ctx_var); } diff --git a/src/pass/coproc_sync.cc b/src/pass/coproc_sync.cc index 4aa8879f679b..3dacb6d5bff7 100644 --- a/src/pass/coproc_sync.cc +++ b/src/pass/coproc_sync.cc @@ -198,7 +198,7 @@ class CoProcSyncPlanner : public StorageAccessVisitor { std::vector<Stmt> GetSync(std::string sync_name) { return {Evaluate::make(Call::make( - DataType::Int(32), + Int(32), sync_name, {}, Call::Intrinsic))}; } @@ -345,7 +345,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor { Expr min = r->min; Expr extent = r->extent; return Evaluate::make(Call::make( - DataType::Int(32), func, + Int(32), func, {wvec[0].buffer, wvec[0].dtype.bits(), r->min, r->extent}, Call::Intrinsic)); } // Write barrier name @@ -588,14 +588,14 @@ class CoProcInstDepDetector : public IRVisitor { Stmt MakePush(int from, int to) { return Evaluate::make(Call::make( - DataType::Int(32), sync_push_name_, - {make_const(DataType::Int(32), from), make_const(DataType::Int(32), to)}, + Int(32), sync_push_name_, + {make_const(Int(32), from), make_const(Int(32), to)}, Call::Intrinsic)); } Stmt MakePop(int from, int to) { return Evaluate::make(Call::make( - DataType::Int(32), sync_pop_name_, - {make_const(DataType::Int(32), from), make_const(DataType::Int(32), to)}, + Int(32), sync_pop_name_, + {make_const(Int(32), from), make_const(Int(32), to)}, Call::Intrinsic)); } // sync states.
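Every src/pass hunk in this diff applies the same mechanical rename: the scoped DataType spellings (DataType::Int(32), DataType::Handle(), the .dtype() accessor, DataType parameters) are swapped back to the older free-function spellings (Int(32), Handle(), the .type() accessor, Type parameters). A minimal side-by-side sketch of the two spellings, for orientation only (assumes the tvm namespace; the variable names are illustrative and not part of the diff):

    // Spelling removed by this diff (DataType-based):
    Expr c1 = make_const(DataType::Int(32), 1);
    Var  v1("buf", DataType::Handle());
    bool m1 = c1.dtype() == DataType::Int(32);

    // Spelling restored by this diff (Type-based):
    Expr c2 = make_const(Int(32), 1);
    Var  v2("buf", Handle());
    bool m2 = c2.type() == Int(32);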
diff --git a/src/pass/detect_device.cc b/src/pass/detect_device.cc index cd7c979171a6..92e368b62d20 100644 --- a/src/pass/detect_device.cc +++ b/src/pass/detect_device.cc @@ -28,7 +28,7 @@ namespace tvm { namespace ir { Stmt DecorateDeviceScope(Stmt stmt) { - Stmt body = AttrStmt::make(make_zero(DataType::Int(32)), + Stmt body = AttrStmt::make(make_zero(Int(32)), ir::attr::device_scope, 0, stmt); diff --git a/src/pass/inject_copy_intrin.cc b/src/pass/inject_copy_intrin.cc index 7b7c5df48236..3b148361fbfc 100644 --- a/src/pass/inject_copy_intrin.cc +++ b/src/pass/inject_copy_intrin.cc @@ -88,7 +88,7 @@ class CopyIntrinInjector : public IRMutator { load = cast->value.as(); } if (load == nullptr) return false; - if (load->dtype.lanes() != 1) return false; + if (load->type.lanes() != 1) return false; Array loop_vars; for (const For* op : loops) { loop_vars.push_back(op->loop_var); @@ -101,7 +101,7 @@ class CopyIntrinInjector : public IRMutator { Array dst_shape; const size_t loop_var_size = loop_vars.size(); if (loop_var_size == 0) { - dst_shape.push_back(make_const(DataType::Int(32), 1)); + dst_shape.push_back(make_const(Int(32), 1)); } else { for (const For* op : loops) { dst_shape.push_back(op->extent); @@ -121,7 +121,7 @@ class CopyIntrinInjector : public IRMutator { for (size_t i = 0; i < src_shape.size(); ++i) { Expr min_value = clip_bound[2 * i]; Expr max_value = clip_bound[2 * i + 1]; - DataType t = loop_vars[i].dtype(); + Type t = loop_vars[i].type(); Expr svalue = src_shape[i]; if (min_value.defined()) { Expr pbefore = Simplify(Max::make(min_value, make_zero(t))); @@ -148,12 +148,12 @@ class CopyIntrinInjector : public IRMutator { Array src_strides(load_strides.begin(), load_strides.begin() + loop_var_size); Array dst_strides(store_strides.begin(), store_strides.begin() + loop_var_size); if (loop_var_size == 0) { - src_strides.push_back(make_const(DataType::Int(32), 1)); - dst_strides.push_back(make_const(DataType::Int(32), 1)); + src_strides.push_back(make_const(Int(32), 1)); + dst_strides.push_back(make_const(Int(32), 1)); } Buffer dst = BufferNode::make( store->buffer_var, - store->value.dtype(), + store->value.type(), dst_shape, dst_strides, store_strides[loop_var_size], @@ -162,7 +162,7 @@ class CopyIntrinInjector : public IRMutator { 0, 0, kDefault); Buffer src = BufferNode::make( load->buffer_var, - load->dtype, + load->type, src_shape, src_strides, src_elem_offset, diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc index 78d3305d3e17..065bbd4e4db3 100644 --- a/src/pass/inject_double_buffer.cc +++ b/src/pass/inject_double_buffer.cc @@ -100,10 +100,10 @@ class DoubleBufferInjector : public IRMutator { auto it = dbuffer_info_.find(op->buffer_var.get()); if (it != dbuffer_info_.end()) { it->second.stride = arith::ComputeReduce( - op->extents, Expr()) * op->dtype.lanes(); + op->extents, Expr()) * op->type.lanes(); Stmt stmt = IRMutator::Mutate_(op, s); op = stmt.as(); - Array new_extents{make_const(op->extents[0].dtype(), 2)}; + Array new_extents{make_const(op->extents[0].type(), 2)}; for (Expr e : op->extents) { new_extents.push_back(e); } @@ -114,7 +114,7 @@ class DoubleBufferInjector : public IRMutator { StringImm::make(it->second.scope), Evaluate::make(0))); alloc_nest.emplace_back(Allocate::make( - op->buffer_var, op->dtype, new_extents, op->condition, + op->buffer_var, op->type, new_extents, op->condition, Evaluate::make(0))); return op->body; } else { @@ -135,15 +135,15 @@ class DoubleBufferInjector : public IRMutator { 
CHECK(is_zero(old_loop->min)); Expr zero = old_loop->min; Expr new_ext = - old_loop->extent - make_const(old_loop->loop_var.dtype(), 1); - Expr factor = make_const(new_ext.dtype(), split_loop_); + old_loop->extent - make_const(old_loop->loop_var.type(), 1); + Expr factor = make_const(new_ext.type(), split_loop_); Expr outer_ext = new_ext / factor; Expr tail_base = outer_ext * factor; - Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.dtype()); + Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.type()); std::unordered_map vmap; std::vector loop_seq; for (int32_t i = 0; i < split_loop_; ++i) { - vmap[old_loop->loop_var.get()] = outer_var * factor + make_const(factor.dtype(), i); + vmap[old_loop->loop_var.get()] = outer_var * factor + make_const(factor.type(), i); loop_seq.emplace_back(Substitute(old_loop->body, vmap)); } Stmt loop = For::make( @@ -153,7 +153,7 @@ class DoubleBufferInjector : public IRMutator { std::vector tail_seq; Stmt tail_body = StripDoubleBufferWrite().Mutate(old_loop->body); for (int32_t i = 0; i < split_loop_; ++i) { - Expr idx = tail_base + make_const(tail_base.dtype(), i); + Expr idx = tail_base + make_const(tail_base.type(), i); vmap[old_loop->loop_var.get()] = idx; tail_seq.emplace_back( IfThenElse::make(idx < old_loop->extent, @@ -196,7 +196,7 @@ class DoubleBufferInjector : public IRMutator { const StorageEntry& e = it->second; CHECK(e.stride.defined()); CHECK(e.switch_read_var.defined()); - return Load::make(op->dtype, + return Load::make(op->type, op->buffer_var, e.switch_read_var * e.stride + op->index, op->predicate); @@ -222,12 +222,12 @@ class DoubleBufferInjector : public IRMutator { } StorageEntry& e = it->second; e.loop = loop_nest_.back(); - Expr zero = make_const(e.loop->loop_var.dtype(), 0); - Expr one = make_const(e.loop->loop_var.dtype(), 1); - Expr two = make_const(e.loop->loop_var.dtype(), 2); + Expr zero = make_const(e.loop->loop_var.type(), 0); + Expr one = make_const(e.loop->loop_var.type(), 1); + Expr two = make_const(e.loop->loop_var.type(), 2); Expr loop_shift = e.loop->loop_var + one; e.switch_write_var = Var(e.loop->loop_var->name_hint + ".db", - e.loop->loop_var.dtype()); + e.loop->loop_var.type()); e.switch_read_var = indexmod(e.loop->loop_var, two); in_double_buffer_scope_ = true; Stmt body = Mutate(op->body); diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc index c80c7fcdaa8c..eafe5a928cd7 100644 --- a/src/pass/inject_virtual_thread.cc +++ b/src/pass/inject_virtual_thread.cc @@ -222,7 +222,7 @@ class VTInjector : public IRMutator { } auto it = alloc_remap_.find(op->buffer_var.get()); if (it != alloc_remap_.end()) { - return Load::make(op->dtype, op->buffer_var, + return Load::make(op->type, op->buffer_var, RewriteIndex(op->index, it->second), op->predicate); } else { @@ -233,7 +233,7 @@ class VTInjector : public IRMutator { Expr Mutate_(const Call* op, const Expr& e) final { if (op->is_intrinsic(intrinsic::tvm_access_ptr)) { CHECK_EQ(op->args.size(), 5U); - DataType dtype = op->args[0].dtype(); + Type dtype = op->args[0].type(); const Variable* buffer = op->args[1].as(); auto it = alloc_remap_.find(buffer); if (it == alloc_remap_.end()) return IRMutator::Mutate_(op, e); @@ -241,10 +241,10 @@ class VTInjector : public IRMutator { Expr offset = Mutate(op->args[2]); Expr extent = Mutate(op->args[3]); Expr stride = - it->second / make_const(offset.dtype(), dtype.lanes()); + it->second / make_const(offset.type(), dtype.lanes()); offset = stride * var_ + 
offset; return Call::make( - op->dtype, op->name, + op->type, op->name, {op->args[0], op->args[1], offset, extent, op->args[4]}, op->call_type); } else if (op->is_intrinsic(intrinsic::tvm_context_id)) { @@ -395,9 +395,9 @@ class VTInjector : public IRMutator { if (touched_var_.count(op->buffer_var.get()) || !allow_share_) { // place v on highest dimension. Expr stride = arith::ComputeReduce( - op->extents, Expr()) * op->dtype.lanes(); + op->extents, Expr()) * op->type.lanes(); Array other; - other.push_back(make_const(op->extents[0].dtype(), num_threads_)); + other.push_back(make_const(op->extents[0].type(), num_threads_)); for (Expr e : extents) { other.push_back(e); } @@ -417,7 +417,7 @@ class VTInjector : public IRMutator { return s; } else { return Allocate::make( - op->buffer_var, op->dtype, + op->buffer_var, op->type, extents, condition, body, op->new_expr, op->free_function); } @@ -439,19 +439,19 @@ class VTInjector : public IRMutator { // only unroll if number of vthreads are small if (max_loop_depth_ == 0 && num_threads_ < 16) { // do unrolling if it is inside innermost content. - Stmt blk = Substitute(stmt, {{var_, make_zero(var_.dtype())}}); + Stmt blk = Substitute(stmt, {{var_, make_zero(var_.type())}}); for (int i = 1; i < num_threads_; ++i) { blk = Block::make( - blk, Substitute(stmt, {{var_, make_const(var_.dtype(), i)}})); + blk, Substitute(stmt, {{var_, make_const(var_.type(), i)}})); } return blk; } else { // insert a for loop - Var idx(var_->name_hint + ".s", var_->dtype); + Var idx(var_->name_hint + ".s", var_->type); Map values{{var_, idx}}; stmt = Substitute(stmt, values); - return For::make(idx, make_zero(idx.dtype()), - make_const(idx.dtype(), num_threads_), + return For::make(idx, make_zero(idx.type()), + make_const(idx.type(), num_threads_), ForType::Serial, DeviceAPI::None, stmt); } } diff --git a/src/pass/ir_deep_compare.cc b/src/pass/ir_deep_compare.cc index e399e7f2c54f..cb859d07f07b 100644 --- a/src/pass/ir_deep_compare.cc +++ b/src/pass/ir_deep_compare.cc @@ -63,7 +63,7 @@ class IRDeepCompare : if (order_ != 0) return; if (n.same_as(other)) return; if (CompareValue(n->type_index(), other->type_index()) != 0) return; - if (CompareType(n.dtype(), other.dtype()) != 0) return; + if (CompareType(n.type(), other.type()) != 0) return; ExprComparator::VisitExpr(n, other); } @@ -119,7 +119,7 @@ class IRDeepCompare : } else { if (CompareExpr(op->buffer_var, rhs->buffer_var) != 0) return; } - if (CompareType(op->dtype, rhs->dtype) != 0) return; + if (CompareType(op->type, rhs->type) != 0) return; if (CompareArray(op->extents, rhs->extents) != 0) return; if (CompareExpr(op->condition, rhs->condition) != 0) return; if (CompareStmt(op->body, rhs->body) != 0) return; @@ -166,7 +166,7 @@ class IRDeepCompare : const Realize* rhs = other.as(); if (CompareNodeRef(op->func, rhs->func) != 0) return; if (CompareValue(op->value_index, rhs->value_index) != 0) return; - if (CompareType(op->dtype, rhs->dtype) != 0) return; + if (CompareType(op->type, rhs->type) != 0) return; if (CompareRegion(op->bounds, rhs->bounds) != 0) return; if (CompareStmt(op->body, rhs->body) != 0) return; } @@ -175,7 +175,7 @@ class IRDeepCompare : const Prefetch* rhs = other.as(); if (CompareNodeRef(op->func, rhs->func) != 0) return; if (CompareValue(op->value_index, rhs->value_index) != 0) return; - if (CompareType(op->dtype, rhs->dtype) != 0) return; + if (CompareType(op->type, rhs->type) != 0) return; if (CompareRegion(op->bounds, rhs->bounds) != 0) return; } @@ -369,7 +369,7 @@ class IRDeepCompare : 
return order_; } - int CompareType(const DataType& lhs, const DataType& rhs) { + int CompareType(const Type& lhs, const Type& rhs) { if (order_ != 0) return order_; if (lhs == rhs) return order_; if (CompareValue(lhs.code(), rhs.code()) != 0) return order_; diff --git a/src/pass/ir_mutator.cc b/src/pass/ir_mutator.cc index b300989dd2fd..f79a1ab8fe3b 100644 --- a/src/pass/ir_mutator.cc +++ b/src/pass/ir_mutator.cc @@ -45,7 +45,7 @@ class IRTransformer final : public IRMutator { } private: - template + template T MutateInternal(T node) { if (only_enable_.size() && !only_enable_.count(node->type_index())) { @@ -89,11 +89,11 @@ IRMutator::FMutateStmt& IRMutator::vtable_stmt() { // NOLINT(*) static FMutateStmt inst; return inst; } -inline Array MutateArray(Array arr, IRMutator* m) { - return UpdateArray(arr, [&m](const Expr& e) { return m->Mutate(e); }); +inline Array MutateArray(Array arr, IRMutator *m) { + return UpdateArray(arr, [&m] (const Expr& e) { return m->Mutate(e); }); } -inline Array MutateIterVarArr(Array rdom, IRMutator* m) { +inline Array MutateIterVarArr(Array rdom, IRMutator *m) { std::vector new_dom(rdom.size()); bool changed = false; for (size_t i = 0; i < rdom.size(); i++) { @@ -133,7 +133,7 @@ Stmt IRMutator::Mutate_(const AttrStmt* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const LetStmt* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const LetStmt *op, const Stmt& s) { Expr value = this->Mutate(op->value); Stmt body = this->Mutate(op->body); if (value.same_as(op->value) && @@ -144,7 +144,7 @@ Stmt IRMutator::Mutate_(const LetStmt* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const For* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const For *op, const Stmt& s) { Expr min = this->Mutate(op->min); Expr extent = this->Mutate(op->extent); Stmt body = this->Mutate(op->body); @@ -179,13 +179,13 @@ Stmt IRMutator::Mutate_(const Allocate* op, const Stmt& s) { return s; } else { return Allocate::make( - op->buffer_var, op->dtype, + op->buffer_var, op->type, new_extents, condition, body, new_expr, op->free_function); } } -Stmt IRMutator::Mutate_(const IfThenElse* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const IfThenElse *op, const Stmt& s) { Expr condition = this->Mutate(op->condition); Stmt then_case = this->Mutate(op->then_case); Stmt else_case; @@ -201,7 +201,7 @@ Stmt IRMutator::Mutate_(const IfThenElse* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const Store* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const Store *op, const Stmt& s) { Expr value = this->Mutate(op->value); Expr index = this->Mutate(op->index); Expr pred = this->Mutate(op->predicate); @@ -233,7 +233,7 @@ Stmt IRMutator::Mutate_(const Realize* op, const Stmt& s) { Expr old_extent = op->bounds[i]->extent; Expr new_min = m->Mutate(old_min); Expr new_extent = m->Mutate(old_extent); - if (!new_min.same_as(old_min)) bounds_changed = true; + if (!new_min.same_as(old_min)) bounds_changed = true; if (!new_extent.same_as(old_extent)) bounds_changed = true; new_bounds.push_back( Range::make_by_min_extent(new_min, new_extent)); @@ -247,7 +247,7 @@ Stmt IRMutator::Mutate_(const Realize* op, const Stmt& s) { return s; } else { return Realize::make(op->func, op->value_index, - op->dtype, new_bounds, + op->type, new_bounds, condition, body); } } @@ -263,7 +263,7 @@ Stmt IRMutator::Mutate_(const Prefetch* op, const Stmt& s) { Expr old_extent = op->bounds[i]->extent; Expr new_min = m->Mutate(old_min); Expr new_extent = m->Mutate(old_extent); - if (!new_min.same_as(old_min)) bounds_changed = true; + if 
(!new_min.same_as(old_min)) bounds_changed = true; if (!new_extent.same_as(old_extent)) bounds_changed = true; new_bounds.push_back( Range::make_by_min_extent(new_min, new_extent)); @@ -273,7 +273,7 @@ Stmt IRMutator::Mutate_(const Prefetch* op, const Stmt& s) { return s; } else { return Prefetch::make(op->func, op->value_index, - op->dtype, new_bounds); + op->type, new_bounds); } } @@ -288,7 +288,7 @@ Stmt IRMutator::Mutate_(const Block* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const AssertStmt* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const AssertStmt *op, const Stmt& s) { Expr condition = this->Mutate(op->condition); Expr message = this->Mutate(op->message); Stmt body = this->Mutate(op->body); @@ -302,7 +302,7 @@ Stmt IRMutator::Mutate_(const AssertStmt* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const ProducerConsumer* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const ProducerConsumer *op, const Stmt& s) { Stmt body = this->Mutate(op->body); if (body.same_as(op->body)) { return s; @@ -311,7 +311,7 @@ Stmt IRMutator::Mutate_(const ProducerConsumer* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const Evaluate* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const Evaluate *op, const Stmt& s) { Expr v = this->Mutate(op->value); if (v.same_as(op->value)) { return s; @@ -320,7 +320,7 @@ Stmt IRMutator::Mutate_(const Evaluate* op, const Stmt& s) { } } -Stmt IRMutator::Mutate_(const Free* op, const Stmt& s) { +Stmt IRMutator::Mutate_(const Free *op, const Stmt& s) { return s; } @@ -348,21 +348,21 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt) return m->Mutate_(static_cast(node.get()), e); \ }) -Expr IRMutator::Mutate_(const Variable* op, const Expr& e) { +Expr IRMutator::Mutate_(const Variable *op, const Expr& e) { return e; } -Expr IRMutator::Mutate_(const Load* op, const Expr& e) { +Expr IRMutator::Mutate_(const Load *op, const Expr& e) { Expr index = this->Mutate(op->index); Expr pred = this->Mutate(op->predicate); if (index.same_as(op->index) && pred.same_as(op->predicate)) { return e; } else { - return Load::make(op->dtype, op->buffer_var, index, pred); + return Load::make(op->type, op->buffer_var, index, pred); } } -Expr IRMutator::Mutate_(const Let* op, const Expr& e) { +Expr IRMutator::Mutate_(const Let *op, const Expr& e) { Expr value = this->Mutate(op->value); Expr body = this->Mutate(op->body); if (value.same_as(op->value) && @@ -378,7 +378,7 @@ Expr IRMutator::Mutate_(const Call* op, const Expr& e) { if (op->args.same_as(new_args)) { return e; } else { - return Call::make(op->dtype, op->name, new_args, op->call_type, + return Call::make(op->type, op->name, new_args, op->call_type, op->func, op->value_index); } } @@ -413,8 +413,8 @@ DEFINE_BIOP_EXPR_MUTATE_(GE) DEFINE_BIOP_EXPR_MUTATE_(And) DEFINE_BIOP_EXPR_MUTATE_(Or) -Expr IRMutator::Mutate_(const Reduce* op, const Expr& e) { - Array new_axis = MutateIterVarArr(op->axis, this); +Expr IRMutator::Mutate_(const Reduce *op, const Expr& e) { + Array new_axis = MutateIterVarArr(op->axis, this); Array new_source = MutateArray(op->source, this); Expr new_cond = this->Mutate(op->condition); if (op->axis.same_as(new_axis) && @@ -427,16 +427,16 @@ Expr IRMutator::Mutate_(const Reduce* op, const Expr& e) { } } -Expr IRMutator::Mutate_(const Cast* op, const Expr& e) { +Expr IRMutator::Mutate_(const Cast *op, const Expr& e) { Expr value = this->Mutate(op->value); if (value.same_as(op->value)) { return e; } else { - return Cast::make(op->dtype, value); + return Cast::make(op->type, value); } } -Expr 
IRMutator::Mutate_(const Not* op, const Expr& e) { +Expr IRMutator::Mutate_(const Not *op, const Expr& e) { Expr a = this->Mutate(op->a); if (a.same_as(op->a)) { return e; @@ -445,7 +445,7 @@ Expr IRMutator::Mutate_(const Not* op, const Expr& e) { } } -Expr IRMutator::Mutate_(const Select* op, const Expr& e) { +Expr IRMutator::Mutate_(const Select *op, const Expr& e) { Expr cond = this->Mutate(op->condition); Expr t = this->Mutate(op->true_value); Expr f = this->Mutate(op->false_value); @@ -458,7 +458,7 @@ Expr IRMutator::Mutate_(const Select* op, const Expr& e) { } } -Expr IRMutator::Mutate_(const Ramp* op, const Expr& e) { +Expr IRMutator::Mutate_(const Ramp *op, const Expr& e) { Expr base = this->Mutate(op->base); Expr stride = this->Mutate(op->stride); if (base.same_as(op->base) && @@ -469,7 +469,7 @@ Expr IRMutator::Mutate_(const Ramp* op, const Expr& e) { } } -Expr IRMutator::Mutate_(const Broadcast* op, const Expr& e) { +Expr IRMutator::Mutate_(const Broadcast *op, const Expr& e) { Expr value = this->Mutate(op->value); if (value.same_as(op->value)) { return e; @@ -478,7 +478,7 @@ Expr IRMutator::Mutate_(const Broadcast* op, const Expr& e) { } } -Expr IRMutator::Mutate_(const Shuffle* op, const Expr& e) { +Expr IRMutator::Mutate_(const Shuffle *op, const Expr& e) { auto new_vec = MutateArray(op->vectors, this); if (new_vec.same_as(op->vectors)) { return e; diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index 0f8bb990c2d3..690feca135ef 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -89,12 +89,12 @@ inline Array UpdateArray(Array arr, F fupdate) { * \return the get expression. */ inline Expr TVMStructGet( - DataType dtype, Var handle, int index, + Type dtype, Var handle, int index, intrinsic::TVMStructFieldKind kind) { Array args ={ handle, - make_const(DataType::Int(32), index), - make_const(DataType::Int(32), static_cast(kind))}; + make_const(Int(32), index), + make_const(Int(32), static_cast(kind))}; return Call::make(dtype, intrinsic::tvm_struct_get, args, Call::PureIntrinsic); } @@ -104,10 +104,10 @@ inline Expr TVMStructGet( * \param dtype The data type. * \param offset the offset index. */ -inline Expr AddressOffset(Var handle, DataType dtype, int offset) { +inline Expr AddressOffset(Var handle, Type dtype, int offset) { return Call::make( - DataType::Handle(), intrinsic::tvm_address_of, - {Load::make(dtype, handle, make_const(DataType::Int(32), offset * dtype.lanes()), + Handle(), intrinsic::tvm_address_of, + {Load::make(dtype, handle, make_const(Int(32), offset * dtype.lanes()), const_true(dtype.lanes()))}, Call::PureIntrinsic); } @@ -118,13 +118,13 @@ inline Expr AddressOffset(Var handle, DataType dtype, int offset) { * \param dtype The data type. * \param offset the offset index. 
*/ -inline Expr AddressOffset(Var handle, DataType dtype, Expr offset) { +inline Expr AddressOffset(Var handle, Type dtype, Expr offset) { if (dtype.lanes() != 1) { - offset = offset * make_const(offset.dtype(), dtype.lanes()); - offset = Ramp::make(offset, make_const(offset.dtype(), 1), dtype.lanes()); + offset = offset * make_const(offset.type(), dtype.lanes()); + offset = Ramp::make(offset, make_const(offset.type(), 1), dtype.lanes()); } return Call::make( - DataType::Handle(), intrinsic::tvm_address_of, + Handle(), intrinsic::tvm_address_of, {Load::make(dtype, handle, offset, const_true(dtype.lanes()))}, Call::PureIntrinsic); @@ -143,11 +143,11 @@ inline Stmt TVMStructSet( intrinsic::TVMStructFieldKind kind, Expr value) { Array args ={ handle, - make_const(DataType::Int(32), index), - make_const(DataType::Int(32), static_cast(kind)), + make_const(Int(32), index), + make_const(Int(32), static_cast(kind)), value}; return Evaluate::make( - Call::make(DataType::Int(32), intrinsic::tvm_struct_set, args, Call::Intrinsic)); + Call::make(Int(32), intrinsic::tvm_struct_set, args, Call::Intrinsic)); } /*! @@ -155,13 +155,13 @@ inline Stmt TVMStructSet( * \param t The original type. * \return The corresponding API type. */ -inline DataType APIType(DataType t) { +inline Type APIType(Type t) { if (t.is_handle()) return t; CHECK_EQ(t.lanes(), 1) << "Cannot pass vector type through packed API."; - if (t.is_uint() || t.is_int()) return DataType::Int(64); + if (t.is_uint() || t.is_int()) return Int(64); CHECK(t.is_float()); - return DataType::Float(64); + return Float(64); } /*! @@ -170,7 +170,7 @@ inline DataType APIType(DataType t) { * \param const_size The constant size of the array. * \return the alignment */ -inline int GetTempAllocaAlignment(DataType type, int32_t const_size) { +inline int GetTempAllocaAlignment(Type type, int32_t const_size) { int align = runtime::kTempAllocaAlignment; if (const_size > 0) { int64_t const_s = static_cast(const_size) * type.bits() * type.lanes() / 8; diff --git a/src/pass/ir_visitor.cc b/src/pass/ir_visitor.cc index d6f163ccedc6..204c0f75fe4a 100644 --- a/src/pass/ir_visitor.cc +++ b/src/pass/ir_visitor.cc @@ -43,6 +43,7 @@ class IRApplyVisit : public IRVisitor { std::unordered_set visited_; }; + void PostOrderVisit(const NodeRef& node, std::function fvisit) { IRApplyVisit(fvisit).Visit(node); } @@ -67,7 +68,7 @@ inline void VisitRDom(const Array& rdom, IRVisitor* v) { void IRVisitor::Visit_(const Variable* op) {} -void IRVisitor::Visit_(const LetStmt* op) { +void IRVisitor::Visit_(const LetStmt *op) { this->Visit(op->value); this->Visit(op->body); } @@ -77,14 +78,14 @@ void IRVisitor::Visit_(const AttrStmt* op) { this->Visit(op->body); } -void IRVisitor::Visit_(const For* op) { +void IRVisitor::Visit_(const For *op) { IRVisitor* v = this; v->Visit(op->min); v->Visit(op->extent); v->Visit(op->body); } -void IRVisitor::Visit_(const Allocate* op) { +void IRVisitor::Visit_(const Allocate *op) { IRVisitor* v = this; for (size_t i = 0; i < op->extents.size(); i++) { v->Visit(op->extents[i]); @@ -96,18 +97,18 @@ void IRVisitor::Visit_(const Allocate* op) { } } -void IRVisitor::Visit_(const Load* op) { +void IRVisitor::Visit_(const Load *op) { this->Visit(op->index); this->Visit(op->predicate); } -void IRVisitor::Visit_(const Store* op) { +void IRVisitor::Visit_(const Store *op) { this->Visit(op->value); this->Visit(op->index); this->Visit(op->predicate); } -void IRVisitor::Visit_(const IfThenElse* op) { +void IRVisitor::Visit_(const IfThenElse *op) { 
this->Visit(op->condition); this->Visit(op->then_case); if (op->else_case.defined()) { @@ -115,14 +116,14 @@ void IRVisitor::Visit_(const IfThenElse* op) { } } -void IRVisitor::Visit_(const Let* op) { +void IRVisitor::Visit_(const Let *op) { this->Visit(op->value); this->Visit(op->body); } void IRVisitor::Visit_(const Free* op) {} -void IRVisitor::Visit_(const Call* op) { +void IRVisitor::Visit_(const Call *op) { VisitArray(op->args, this); } @@ -170,38 +171,38 @@ void IRVisitor::Visit_(const Select* op) { this->Visit(op->false_value); } -void IRVisitor::Visit_(const Ramp* op) { +void IRVisitor::Visit_(const Ramp *op) { this->Visit(op->base); this->Visit(op->stride); } -void IRVisitor::Visit_(const Shuffle* op) { - for (const auto& elem : op->indices) +void IRVisitor::Visit_(const Shuffle *op) { + for (const auto &elem : op->indices) this->Visit(elem); - for (const auto& elem : op->vectors) + for (const auto &elem : op->vectors) this->Visit(elem); } -void IRVisitor::Visit_(const Broadcast* op) { +void IRVisitor::Visit_(const Broadcast *op) { this->Visit(op->value); } -void IRVisitor::Visit_(const AssertStmt* op) { +void IRVisitor::Visit_(const AssertStmt *op) { this->Visit(op->condition); this->Visit(op->message); this->Visit(op->body); } -void IRVisitor::Visit_(const ProducerConsumer* op) { +void IRVisitor::Visit_(const ProducerConsumer *op) { this->Visit(op->body); } -void IRVisitor::Visit_(const Provide* op) { +void IRVisitor::Visit_(const Provide *op) { VisitArray(op->args, this); this->Visit(op->value); } -void IRVisitor::Visit_(const Realize* op) { +void IRVisitor::Visit_(const Realize *op) { for (size_t i = 0; i < op->bounds.size(); i++) { this->Visit(op->bounds[i]->min); this->Visit(op->bounds[i]->extent); @@ -211,19 +212,19 @@ void IRVisitor::Visit_(const Realize* op) { this->Visit(op->condition); } -void IRVisitor::Visit_(const Prefetch* op) { +void IRVisitor::Visit_(const Prefetch *op) { for (size_t i = 0; i < op->bounds.size(); i++) { this->Visit(op->bounds[i]->min); this->Visit(op->bounds[i]->extent); } } -void IRVisitor::Visit_(const Block* op) { +void IRVisitor::Visit_(const Block *op) { this->Visit(op->first); this->Visit(op->rest); } -void IRVisitor::Visit_(const Evaluate* op) { +void IRVisitor::Visit_(const Evaluate *op) { this->Visit(op->value); } diff --git a/src/pass/lift_attr_scope.cc b/src/pass/lift_attr_scope.cc index cfc6e5a7fc68..adcaaebd6d6e 100644 --- a/src/pass/lift_attr_scope.cc +++ b/src/pass/lift_attr_scope.cc @@ -57,7 +57,7 @@ class AttrScopeLifter : public IRMutator { attr_node_ = NodeRef(); attr_value_ = Expr(); return Allocate::make( - op->buffer_var, op->dtype, + op->buffer_var, op->type, op->extents, op->condition, body, op->new_expr, op->free_function); } else { @@ -198,7 +198,7 @@ class AttrScopeLifter : public IRMutator { static bool ValueSame(const Expr& a, const Expr& b) { if (a.same_as(b)) return true; if (a->type_index() != b->type_index()) return false; - if (a.dtype() != b.dtype()) return false; + if (a.type() != b.type()) return false; if (const IntImm* op = a.as()) { return op->value == b.as()->value; } diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc index 1ac386767ae3..ef5cc9c4fa9f 100644 --- a/src/pass/loop_partition.cc +++ b/src/pass/loop_partition.cc @@ -181,7 +181,7 @@ class PartitionFinder : public IRVisitor { const IterVarNode* thread_axis = op->node.as(); CHECK(thread_axis); const Variable* var = thread_axis->var.get(); - IntSet dom = IntSet::range(Range(make_zero(op->value.dtype()), op->value)); + IntSet dom = 
IntSet::range(Range(make_zero(op->value.type()), op->value)); hint_map_.insert({var, dom}); relax_map_.insert({var, dom}); IRVisitor::Visit_(op); @@ -351,12 +351,12 @@ class LoopPartitioner : public IRMutator { if (scope.rank == 1) { // threadIdx should be put into relax map, in case of divergence. relax_map_.insert({var.get(), - IntSet::interval(make_zero(var.dtype()), op->value - 1)}); + IntSet::interval(make_zero(var.type()), op->value - 1)}); res = IRMutator::Mutate_(op, stmt); relax_map_.erase(var.get()); } else { hint_map_.insert({var.get(), - IntSet::interval(make_zero(var.dtype()), op->value - 1)}); + IntSet::interval(make_zero(var.type()), op->value - 1)}); res = IRMutator::Mutate_(op, stmt); hint_map_.erase(var.get()); } @@ -595,9 +595,9 @@ Stmt LoopPartitioner::TryPartition(const Node* node, inline Stmt LoopPartitioner::MakeFor(const Node *node, Expr extent, Stmt body) { const For *for_node = static_cast(node); CHECK(for_node); - if (analyzer_.CanProve(extent == make_const(DataType::Int(32), 1))) { + if (analyzer_.CanProve(extent == make_const(Int(32), 1))) { // If the loop extent is 1, do not create the loop anymore - return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}}); + return Substitute(body, {{Var{for_node->loop_var}, make_const(Int(32), 0)}}); } else { return For::make(for_node->loop_var, 0, extent, for_node->for_type, for_node->device_api, body); diff --git a/src/pass/lower_custom_datatypes.cc b/src/pass/lower_custom_datatypes.cc index e24cddd97f25..3e71868ce3bc 100644 --- a/src/pass/lower_custom_datatypes.cc +++ b/src/pass/lower_custom_datatypes.cc @@ -42,8 +42,8 @@ class CustomDatatypesLowerer : public IRMutator { explicit CustomDatatypesLowerer(const std::string& target) : target_(target) {} inline Expr Mutate_(const Cast* op, const Expr& e) final { - auto type_code = op->dtype.code(); - auto src_type_code = op->value.dtype().code(); + auto type_code = op->type.code(); + auto src_type_code = op->value.type().code(); // If either datatype is a registered custom datatype, we must lower. 
bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code) || datatype::Registry::Global()->GetTypeRegistered(src_type_code); @@ -60,7 +60,7 @@ class CustomDatatypesLowerer : public IRMutator { } inline Expr Mutate_(const FloatImm* imm, const Expr& e) final { - auto type_code = imm->dtype.code(); + auto type_code = imm->type.code(); if (datatype::Registry::Global()->GetTypeRegistered(type_code)) { auto lower = datatype::GetFloatImmLowerFunc(target_, type_code); CHECK(lower) << "FloatImm lowering function for target " << target_ << " type " @@ -71,12 +71,12 @@ class CustomDatatypesLowerer : public IRMutator { } inline Stmt Mutate_(const Allocate* allocate, const Stmt& s) final { - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(allocate->dtype.code()); + bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(allocate->type.code()); Stmt stmt = IRMutator::Mutate_(allocate, s); allocate = stmt.as(); if (toBeLowered) { - auto new_allocate_type = DataType::UInt(allocate->dtype.bits(), allocate->dtype.lanes()); + auto new_allocate_type = UInt(allocate->type.bits(), allocate->type.lanes()); return Allocate::make(allocate->buffer_var, new_allocate_type, allocate->extents, allocate->condition, allocate->body, allocate->new_expr, allocate->free_function); @@ -85,11 +85,11 @@ class CustomDatatypesLowerer : public IRMutator { } inline Expr Mutate_(const Load* load, const Expr& e) final { - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(load->dtype.code()); + bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(load->type.code()); Expr expr = IRMutator::Mutate_(load, e); load = expr.as(); if (toBeLowered) { - auto new_load_type = DataType::UInt(load->dtype.bits()); + auto new_load_type = UInt(load->type.bits()); return Load::make(new_load_type, load->buffer_var, load->index, load->predicate); } return expr; @@ -97,7 +97,7 @@ class CustomDatatypesLowerer : public IRMutator { #define DEFINE_MUTATE__(OP) \ inline Expr Mutate_(const OP* op, const Expr& e) final { \ - auto type_code = op->dtype.code(); \ + auto type_code = op->type.code(); \ bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ Expr expr = IRMutator::Mutate_(op, e); \ op = expr.as(); \ diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc index f0b0b3c36d42..c2a2fe6f5942 100644 --- a/src/pass/lower_intrin.cc +++ b/src/pass/lower_intrin.cc @@ -76,7 +76,7 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { op = ret.as(); if (op == nullptr) return ret; int shift; - const DataType& dtype = op->dtype; + const DataType& dtype = op->type; CHECK(dtype.is_int() || dtype.is_uint()); if (support_bitwise_op_ && @@ -97,7 +97,7 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { // condition on b >= 0. // truncmod(a, b) < 0 will implies ceildiv, // So we need to correct these cases. - if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) { + if ((dtype == Int(32) || dtype == Int(64)) && support_bitwise_op_) { // equivalent to rdiv + (rmod >= 0 ? 0: -1); return rdiv + (rmod >> make_const(dtype, dtype.bits() - 1)); } else { @@ -123,7 +123,7 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { if (op == nullptr) return ret; // Lower floordiv to native truncdiv. 
int shift; - const DataType& dtype = op->dtype; + const DataType& dtype = op->type; CHECK(dtype.is_int() || dtype.is_uint()); if (support_bitwise_op_ && @@ -144,7 +144,7 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { // mod(a, b) < 0 will imply we are doing ceildiv, // So we need to correct these cases. Expr rmod = truncmod(op->a, op->b); - if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) { + if ((dtype == Int(32) || dtype == Int(64)) && support_bitwise_op_) { // (rmod >> shift) & b // -> (rmod >= 0 ? 0: -1) & b // -> rmod >= 0 ? 0 : b @@ -207,23 +207,23 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { if (const Cast* cast = bcast->value.as()) { auto should_swap = [&]() { // Maintain behaviour (int8 -> int16, fp16 -> fp32). - if (cast->dtype.bits() == cast->value.dtype().bits() * 2) { + if (cast->type.bits() == cast->value.type().bits() * 2) { return true; } // Check both operands are integer-like. - if (!cast->dtype.is_uint() && !cast->dtype.is_int()) { + if (!cast->type.is_uint() && !cast->type.is_int()) { return false; } - if (!cast->value.dtype().is_uint() && !cast->value.dtype().is_int()) { + if (!cast->value.type().is_uint() && !cast->value.type().is_int()) { return false; } // If both are integer-like, swap if we have a widening cast. - return cast->dtype.bits() > cast->value.dtype().bits(); + return cast->type.bits() > cast->value.type().bits(); }; if (should_swap()) { Expr new_bcast = Broadcast::make(cast->value, bcast->lanes); - return Cast::make(bcast->dtype, new_bcast); + return Cast::make(bcast->type, new_bcast); } } } @@ -236,9 +236,9 @@ class IntrinInjecter : public arith::IRMutatorWithAnalyzer { Expr lhs = SwapBroadcastCast(a); Expr rhs = SwapBroadcastCast(b); - if (fma_ != nullptr && op->dtype.is_float()) { + if (fma_ != nullptr && op->type.is_float()) { Expr r = (*fma_)(Call::make( - op->dtype, "fma", {lhs, rhs, c}, Call::PureIntrinsic)); + op->type, "fma", {lhs, rhs, c}, Call::PureIntrinsic)); if (r.defined()) return this->Mutate(r); } else { if (!lhs.same_as(a) || !rhs.same_as(b)) { diff --git a/src/pass/lower_thread_allreduce.cc b/src/pass/lower_thread_allreduce.cc index 2a121180d695..e8ea52e886cc 100644 --- a/src/pass/lower_thread_allreduce.cc +++ b/src/pass/lower_thread_allreduce.cc @@ -83,7 +83,7 @@ class ThreadAllreduceBuilder final : public IRMutator { stmt = AttrStmt::make( repl->buffer_var, attr::volatile_scope, 1, op->body); stmt = Allocate::make( - repl->buffer_var, repl->dtype, + repl->buffer_var, repl->type, repl->extents, repl->condition, stmt); stmt = AttrStmt::make( repl->buffer_var, attr::storage_scope, @@ -125,14 +125,14 @@ class ThreadAllreduceBuilder final : public IRMutator { CHECK_EQ(size, size_of_args->value); Array inits = combiner->identity_element; std::vector values(size); - std::vector types(size); + std::vector types(size); Expr cond = call->args[size+1]; for (size_t idx = 0; idx < size; ++idx) { values[idx] = call->args[1+idx]; if (!is_one(cond)) { values[idx] = Select::make(cond, values[idx], inits[idx]); } - types[idx] = values[idx].dtype(); + types[idx] = values[idx].type(); } std::vector buffers(size); for (size_t idx = 0; idx < size; ++idx) { @@ -197,7 +197,7 @@ class ThreadAllreduceBuilder final : public IRMutator { // previous iteration on the same buffer. 
seq.emplace_back(SyncThread("shared")); for (size_t idx = 0; idx < size; ++idx) { - shared_bufs[idx] = Var("red_buf"+std::to_string(idx), DataType::Handle()); + shared_bufs[idx] = Var("red_buf"+std::to_string(idx), Handle()); Expr pred = const_true(types[idx].lanes()); seq.emplace_back(Store::make( shared_bufs[idx], values[idx], @@ -212,7 +212,7 @@ class ThreadAllreduceBuilder final : public IRMutator { Expr pred = const_true(types[idx].lanes()); load_remap_[buffers[idx]] = Load::make( types[idx], shared_bufs[idx], - BufIndex(make_zero(reduce_index.dtype()), group_index, reduce_extent), pred); + BufIndex(make_zero(reduce_index.type()), group_index, reduce_extent), pred); alloc_remap_[buffers[idx]] = Allocate::make( shared_bufs[idx], types[idx], {Expr(group_extent), Expr(reduce_extent)}, @@ -222,7 +222,7 @@ class ThreadAllreduceBuilder final : public IRMutator { } // make allreduce. Stmt MakeBufAllreduce(const CommReducerNode *combiner, - const std::vector& types, + const std::vector& types, const Array& shared_bufs, Expr reduce_index, Expr group_index, @@ -293,7 +293,7 @@ class ThreadAllreduceBuilder final : public IRMutator { int& total_extent = *out_total_extent; total_extent = 1; if (tvec.size() == 0) { - return make_zero(DataType::Int(32)); + return make_zero(Int(32)); } Expr ret; @@ -311,7 +311,7 @@ class ThreadAllreduceBuilder final : public IRMutator { // sync thread op. static Stmt SyncThread(const std::string& sync) { return Evaluate::make( - Call::make(DataType::Int(32), intrinsic::tvm_storage_sync, + Call::make(Int(32), intrinsic::tvm_storage_sync, {StringImm::make(sync)}, Call::Intrinsic)); } diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc index c8c8fa9c62d0..e73956cb3d62 100644 --- a/src/pass/lower_tvm_builtin.cc +++ b/src/pass/lower_tvm_builtin.cc @@ -33,12 +33,12 @@ namespace ir { inline Expr ConstInt32(size_t index) { CHECK_LE(index, std::numeric_limits::max()); - return make_const(DataType::Int(32), static_cast(index)); + return make_const(Int(32), static_cast(index)); } inline Expr StackAlloca(std::string type, size_t num) { Array args = {StringImm::make(type), ConstInt32(num)}; - return Call::make(DataType::Handle(), intrinsic::tvm_stack_alloca, args, Call::Intrinsic); + return Call::make(Handle(), intrinsic::tvm_stack_alloca, args, Call::Intrinsic); } // Calculate the statistics of packed function. @@ -46,10 +46,10 @@ inline Expr StackAlloca(std::string type, size_t num) { class BuiltinLower : public IRMutator { public: Stmt Build(Stmt stmt) { - stack_shape_ = Var("stack_shape", DataType::Handle()); - stack_array_ = Var("stack_array", DataType::Handle()); - stack_value_ = Var("stack_value", DataType::Handle()); - stack_tcode_ = Var("stack_tcode", DataType::Handle()); + stack_shape_ = Var("stack_shape", Handle()); + stack_array_ = Var("stack_array", Handle()); + stack_value_ = Var("stack_value", Handle()); + stack_tcode_ = Var("stack_tcode", Handle()); stmt = this->Mutate(stmt); if (max_shape_stack_ != 0) { stmt = LetStmt::make( @@ -86,7 +86,7 @@ class BuiltinLower : public IRMutator { if (op->new_expr.defined()) return stmt; // Get constant allocation bound. 
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index c8c8fa9c62d0..e73956cb3d62 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -33,12 +33,12 @@ namespace ir {

 inline Expr ConstInt32(size_t index) {
   CHECK_LE(index, std::numeric_limits<int>::max());
-  return make_const(DataType::Int(32), static_cast<int>(index));
+  return make_const(Int(32), static_cast<int>(index));
 }

 inline Expr StackAlloca(std::string type, size_t num) {
   Array<Expr> args = {StringImm::make(type), ConstInt32(num)};
-  return Call::make(DataType::Handle(), intrinsic::tvm_stack_alloca, args, Call::Intrinsic);
+  return Call::make(Handle(), intrinsic::tvm_stack_alloca, args, Call::Intrinsic);
 }

 // Calculate the statistics of packed function.
@@ -46,10 +46,10 @@ inline Expr StackAlloca(std::string type, size_t num) {
 class BuiltinLower : public IRMutator {
  public:
   Stmt Build(Stmt stmt) {
-    stack_shape_ = Var("stack_shape", DataType::Handle());
-    stack_array_ = Var("stack_array", DataType::Handle());
-    stack_value_ = Var("stack_value", DataType::Handle());
-    stack_tcode_ = Var("stack_tcode", DataType::Handle());
+    stack_shape_ = Var("stack_shape", Handle());
+    stack_array_ = Var("stack_array", Handle());
+    stack_value_ = Var("stack_value", Handle());
+    stack_tcode_ = Var("stack_tcode", Handle());
     stmt = this->Mutate(stmt);
     if (max_shape_stack_ != 0) {
       stmt = LetStmt::make(
@@ -86,7 +86,7 @@ class BuiltinLower : public IRMutator {
     if (op->new_expr.defined()) return stmt;
     // Get constant allocation bound.
     int64_t dev_type;
-    int64_t nbytes = GetVectorBytes(op->dtype);
+    int64_t nbytes = GetVectorBytes(op->type);
     if (device_type_.defined()) {
       if (arith::GetConst(device_type_, &dev_type)) {
         if (dev_type == kDLCPU) {
@@ -97,18 +97,18 @@ class BuiltinLower : public IRMutator {
         }
       }
     }
-    Expr total_bytes = make_const(op->extents[0].dtype(), nbytes);
+    Expr total_bytes = make_const(op->extents[0].type(), nbytes);
     for (size_t i = 0; i < op->extents.size(); ++i) {
       total_bytes = total_bytes * op->extents[i];
     }
     CHECK(device_type_.defined()) << "Unknown device type in current IR";
     CHECK(device_id_.defined()) << "Unknown device id in current IR";
-    Stmt throw_last_error = Evaluate::make(Call::make(DataType::Int(32),
+    Stmt throw_last_error = Evaluate::make(Call::make(Int(32),
                                            intrinsic::tvm_throw_last_error, {},
                                            Call::Intrinsic));
     Stmt body = Block::make(
-        IfThenElse::make(Call::make(DataType::Bool(1),
+        IfThenElse::make(Call::make(Bool(1),
                                     intrinsic::tvm_handle_is_null,
                                     {op->buffer_var}, Call::PureIntrinsic),
                          throw_last_error),
@@ -116,27 +116,27 @@ class BuiltinLower : public IRMutator {

     Stmt alloca = LetStmt::make(
         op->buffer_var,
-        Call::make(op->buffer_var.dtype(),
+        Call::make(op->buffer_var.type(),
                    "TVMBackendAllocWorkspace",
-                   {cast(DataType::Int(32), device_type_),
-                    cast(DataType::Int(32), device_id_),
-                    cast(DataType::UInt(64), total_bytes),
-                    IntImm::make(DataType::Int(32), op->dtype.code()),
-                    IntImm::make(DataType::Int(32), op->dtype.bits())},
+                   {cast(Int(32), device_type_),
+                    cast(Int(32), device_id_),
+                    cast(UInt(64), total_bytes),
+                    IntImm::make(Int(32), op->type.code()),
+                    IntImm::make(Int(32), op->type.bits())},
                    Call::Extern),
         body);

-    Expr free_op = Call::make(DataType::Int(32),
+    Expr free_op = Call::make(Int(32),
                               "TVMBackendFreeWorkspace",
-                              {cast(DataType::Int(32), device_type_),
-                               cast(DataType::Int(32), device_id_),
+                              {cast(Int(32), device_type_),
+                               cast(Int(32), device_id_),
                                op->buffer_var},
                               Call::Extern);
-    Stmt free_stmt = IfThenElse::make(free_op != make_zero(DataType::Int(32)), throw_last_error);
+    Stmt free_stmt = IfThenElse::make(free_op != make_zero(Int(32)), throw_last_error);
     body = Block::make(alloca, free_stmt);
     body = AttrStmt::make(
         op->buffer_var, attr::storage_alignment,
-        make_const(DataType::Int(32), runtime::kTempAllocaAlignment),
+        make_const(Int(32), runtime::kTempAllocaAlignment),
         body);
     return body;
   }
@@ -164,7 +164,7 @@ class BuiltinLower : public IRMutator {
     } else if (op->is_intrinsic(intrinsic::tvm_stack_make_array)) {
       return MakeArray(op, e);
     } else if (op->is_intrinsic(intrinsic::tvm_context_id)) {
-      return make_zero(op->dtype);
+      return make_zero(op->type);
     } else {
       return IRMutator::Mutate_(op, e);
     }
@@ -177,10 +177,10 @@ class BuiltinLower : public IRMutator {
     op = expr.as<Call>();
     for (size_t i = 0; i < op->args.size(); ++i) {
       prep_seq_.emplace_back(
-          Store::make(stack_shape_, cast(DataType::Int(64), op->args[i]),
+          Store::make(stack_shape_, cast(Int(64), op->args[i]),
                       ConstInt32(stack_begin +i), const_true(1)));
     }
-    return AddressOffset(stack_shape_, DataType::Int(64), stack_begin);
+    return AddressOffset(stack_shape_, Int(64), stack_begin);
   }
   // make array
   Expr MakeArray(const Call* op, const Expr& e) {
@@ -194,40 +194,40 @@ class BuiltinLower : public IRMutator {
         TVMStructSet(stack_array_, idx, intrinsic::kArrShape, op->args[1]));
     Expr strides = op->args[2];
     if (!strides.defined() || is_zero(strides)) {
-      strides = make_zero(DataType::Handle());
+      strides = make_zero(Handle());
     }
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx,
                      intrinsic::kArrStrides, strides));
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrNDim, op->args[3]));
-    DataType dtype = op->args[4].dtype();
+    Type dtype = op->args[4].type();
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrTypeCode,
-                     make_const(DataType::UInt(8), static_cast<int>(dtype.code()))));
+                     make_const(UInt(8), static_cast<int>(dtype.code()))));
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrTypeBits,
-                     make_const(DataType::UInt(8), dtype.bits())));
+                     make_const(UInt(8), dtype.bits())));
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrTypeLanes,
-                     make_const(DataType::UInt(16), dtype.lanes())));
+                     make_const(UInt(16), dtype.lanes())));
     // set byte offset
     int data_bytes = GetVectorBytes(dtype);
     Expr byte_offset = op->args[5];
     if (!is_zero(byte_offset)) {
-      byte_offset = byte_offset * make_const(byte_offset.dtype(), data_bytes);
+      byte_offset = byte_offset * make_const(byte_offset.type(), data_bytes);
     }
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrByteOffset,
-                     cast(DataType::UInt(64), byte_offset)));
+                     cast(UInt(64), byte_offset)));
     CHECK(device_type_.defined()) << "Unknown device type in current IR";
     CHECK(device_id_.defined()) << "Unknown device id in current IR";
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrDeviceId,
-                     cast(DataType::Int(32), device_id_)));
+                     cast(Int(32), device_id_)));
     prep_seq_.emplace_back(
         TVMStructSet(stack_array_, idx, intrinsic::kArrDeviceType,
-                     cast(DataType::Int(32), device_type_)));
-    return TVMStructGet(DataType::Handle(), stack_array_, idx, intrinsic::kArrAddr);
+                     cast(Int(32), device_type_)));
+    return TVMStructGet(Handle(), stack_array_, idx, intrinsic::kArrAddr);
   }
   // call packed.
   Expr MakeCallPacked(const Call* op, const Expr& e) {
@@ -241,8 +241,8 @@ class BuiltinLower : public IRMutator {
     for (size_t i = 1; i < op->args.size(); ++i) {
       Expr stack_index = ConstInt32(arg_stack_begin + i - 1);
       Expr arg = op->args[i];
-      DataType t = arg.dtype();
-      DataType api_type = APIType(t);
+      Type t = arg.type();
+      Type api_type = APIType(t);
       if (t != api_type) {
         arg = Cast::make(api_type, arg);
       }
@@ -274,7 +274,7 @@ class BuiltinLower : public IRMutator {
       ConstInt32(arg_stack_begin + op->args.size() - 1)
     };
     return Call::make(
-        DataType::Int(32), intrinsic::tvm_call_packed_lowered,
+        Int(32), intrinsic::tvm_call_packed_lowered,
         packed_args, Call::Intrinsic);
   }

@@ -290,8 +290,8 @@ class BuiltinLower : public IRMutator {
     for (size_t i = 1; i < op->args.size(); ++i) {
       Expr stack_index = ConstInt32(arg_stack_begin + i - 1);
       Expr arg = op->args[i];
-      DataType t = arg.dtype();
-      DataType api_type = APIType(t);
+      Type t = arg.type();
+      Type api_type = APIType(t);
       if (t != api_type) {
         arg = Cast::make(api_type, arg);
       }
@@ -324,7 +324,7 @@ class BuiltinLower : public IRMutator {
       op->args[args_size - 1]
     };
     return Call::make(
-        op->dtype, intrinsic::tvm_call_trace_packed_lowered,
+        op->type, intrinsic::tvm_call_trace_packed_lowered,
         packed_args, Call::Intrinsic);
   }
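The Allocate lowering above computes the request size as the vector element size multiplied by every extent, brackets the body with `TVMBackendAllocWorkspace` and `TVMBackendFreeWorkspace` extern calls, and raises the last TVM error when the returned handle is null. A standalone sketch of that shape, with `malloc`/`free` standing in for the workspace API (the helper name `TotalBytes` is illustrative):

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

// Mirrors the pass's size computation: total bytes = bytes per
// (possibly vectorized) element multiplied by every allocation extent.
int64_t TotalBytes(int64_t elem_bytes, const std::vector<int64_t>& extents) {
  int64_t total = elem_bytes;
  for (int64_t e : extents) total *= e;
  return total;
}

int main() {
  // e.g. a float32x4 buffer of shape (16, 128): 16 bytes per vector element.
  int64_t bytes = TotalBytes(4 * 4, {16, 128});
  std::cout << bytes << " bytes requested from the workspace\n";  // 32768
  void* ws = std::malloc(bytes);  // stand-in for TVMBackendAllocWorkspace
  if (ws == nullptr) return 1;    // stand-in for the tvm_handle_is_null check
  std::free(ws);                  // stand-in for TVMBackendFreeWorkspace
  return 0;
}
```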
diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc
index 0ed2b6232fc1..393605e85b8a 100644
--- a/src/pass/lower_warp_memory.cc
+++ b/src/pass/lower_warp_memory.cc
@@ -94,11 +94,11 @@ class WarpStoreCoeffFinder : private IRVisitor {
   /// Visitor implementation
   void Visit_(const Store *op) final {
     if (op->buffer_var.get() == buffer_) {
-      if (op->value.dtype().lanes() == 1) {
+      if (op->value.type().lanes() == 1) {
         UpdatePattern(op->index);
       } else {
         Expr base;
-        CHECK(GetRamp1Base(op->index, op->value.dtype().lanes(), &base))
+        CHECK(GetRamp1Base(op->index, op->value.type().lanes(), &base))
             << "LowerWarpMemory failed due to store index=" << op->index
             << ", can only handle continuous store";
         UpdatePattern(base);
@@ -196,7 +196,7 @@ class WarpAccessRewriter : protected IRMutator {
     int alloc_size = op->constant_allocation_size();
     CHECK_GT(alloc_size, 0)
         << "warp memory only support constant alloc size";
-    alloc_size *= op->dtype.lanes();
+    alloc_size *= op->type.lanes();
     warp_index_ = WarpIndexFinder(warp_size_).Find(op->body)->var;
     warp_coeff_ = WarpStoreCoeffFinder(
         buffer_, warp_index_, analyzer_).Find(op->body);
@@ -205,8 +205,8 @@ class WarpAccessRewriter : protected IRMutator {
     warp_group_ = alloc_size / (warp_size_ * warp_coeff_);
     return Allocate::make(
         op->buffer_var,
-        op->dtype,
-        {make_const(DataType::Int(32), alloc_size / warp_size_)},
+        op->type,
+        {make_const(Int(32), alloc_size / warp_size_)},
         op->condition,
         this->Mutate(op->body));
   }
@@ -237,8 +237,8 @@ class WarpAccessRewriter : protected IRMutator {
         << "LowerWarpMemory failed to rewrite load to shuffle for index "
         << op->index << " local_index=" << local_index;
     Expr load_value = Load::make(
-        op->dtype, op->buffer_var, local_index, op->predicate);
-    return Call::make(load_value.dtype(),
+        op->type, op->buffer_var, local_index, op->predicate);
+    return Call::make(load_value.type(),
                       intrinsic::tvm_warp_shuffle,
                       {load_value, group},
                       Call::Intrinsic);
@@ -252,15 +252,15 @@ class WarpAccessRewriter : protected IRMutator {
   // source index is the corresponding source index
   // in this access pattern.
   std::pair<Expr, Expr> SplitIndexByGroup(const Expr& index) {
-    if (index.dtype().lanes() != 1) {
+    if (index.type().lanes() != 1) {
       Expr base, local_index, group;
-      CHECK(GetRamp1Base(index, index.dtype().lanes(), &base));
+      CHECK(GetRamp1Base(index, index.type().lanes(), &base));
       std::tie(local_index, group) = SplitIndexByGroup(base);
       local_index =
-          Ramp::make(local_index, make_const(local_index.dtype(), 1), index.dtype().lanes());
+          Ramp::make(local_index, make_const(local_index.type(), 1), index.type().lanes());
       return std::make_pair(local_index, group);
     }
-    Expr m = make_const(index.dtype(), warp_coeff_);
+    Expr m = make_const(index.type(), warp_coeff_);

     // simple case, warp index is on the highest.
     if (warp_group_ == 1) {
@@ -269,9 +269,9 @@ class WarpAccessRewriter : protected IRMutator {
       return std::make_pair(x, z);
     } else {
       Expr x = analyzer_->canonical_simplify(indexmod(index, m));
-      Expr y = index / make_const(index.dtype(), warp_coeff_ * warp_size_);
+      Expr y = index / make_const(index.type(), warp_coeff_ * warp_size_);
       y = y * m + x;
-      Expr z = indexdiv(indexmod(index, make_const(index.dtype(), warp_coeff_ * warp_size_)),
+      Expr z = indexdiv(indexmod(index, make_const(index.type(), warp_coeff_ * warp_size_)),
                         m);
       return std::make_pair(analyzer_->canonical_simplify(y),
                             analyzer_->canonical_simplify(z));
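The `SplitIndexByGroup` rewrite above decomposes a warp-memory index as index = y * (warp_coeff * warp_size) + z * warp_coeff + x: the lane keeps `y * warp_coeff + x` as its local index, and `z` selects the source lane for the shuffle. A small self-contained check of that round-trip (the values of `m` and `warp_size` are illustrative):

```cpp
#include <cassert>

// Index split used by WarpAccessRewriter in the warp_group_ > 1 case,
// with m = warp_coeff. `local` addresses the lane-private buffer and
// `group` is the shuffle source lane.
struct Split { int local, group; };

Split SplitIndexByGroup(int index, int m, int warp_size) {
  int x = index % m;                    // position within one coefficient
  int y = index / (m * warp_size);      // which "row" of the warp layout
  int z = (index % (m * warp_size)) / m;  // which lane holds the element
  return {y * m + x, z};
}

int main() {
  const int m = 2, warp_size = 32;
  for (int index = 0; index < 4 * m * warp_size; ++index) {
    Split s = SplitIndexByGroup(index, m, warp_size);
    int y = s.local / m, x = s.local % m;
    // The decomposition loses no information: the original index
    // is recovered from (local, group).
    assert(index == y * (m * warp_size) + s.group * m + x);
  }
  return 0;
}
```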
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 74b8f891299a..4d9c92bb428e 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -51,9 +51,9 @@ LoweredFunc MakeAPI(Stmt body,
   int num_packed_args = num_args - num_unpacked_args;
   // Data field definitions
   // The packed fields
-  Var v_packed_args("args", DataType::Handle());
-  Var v_packed_arg_type_ids("arg_type_ids", DataType::Handle());
-  Var v_num_packed_args("num_args", DataType::Int(32));
+  Var v_packed_args("args", Handle());
+  Var v_packed_arg_type_ids("arg_type_ids", Handle());
+  Var v_num_packed_args("num_args", Int(32));
   // The arguments of the function.
   Array<Var> args;
   // The device context
@@ -66,12 +66,12 @@ LoweredFunc MakeAPI(Stmt body,
   // ---------------------------
   // local function definitions
   // load i-th argument as type t
-  auto f_arg_value = [&](DataType t, int i) {
+  auto f_arg_value = [&](Type t, int i) {
     Array<Expr> call_args{v_packed_args,
-                          IntImm::make(DataType::Int(32), i),
-                          IntImm::make(DataType::Int(32), intrinsic::kTVMValueContent)};
+                          IntImm::make(Int(32), i),
+                          IntImm::make(Int(32), intrinsic::kTVMValueContent)};
     // load 64 bit version
-    DataType api_type = APIType(t);
+    Type api_type = APIType(t);
     Expr res = Call::make(
         api_type, intrinsic::tvm_struct_get, call_args,
         Call::PureIntrinsic);
@@ -86,7 +86,7 @@ LoweredFunc MakeAPI(Stmt body,
     std::ostringstream os;
     os << "arg" << i;
     const Variable* v = api_args[i].as<Variable>();
-    return Var(os.str(), v ? v->dtype: DataType::Handle());
+    return Var(os.str(), v ? v->type: Handle());
   };
   // ---------------------------
   // start of logics
@@ -110,15 +110,14 @@ LoweredFunc MakeAPI(Stmt body,
     if (i < num_packed_args) {
       // Value loads
       seq_init.emplace_back(LetStmt::make(
-          v_arg, f_arg_value(v_arg.dtype(), i), nop));
+          v_arg, f_arg_value(v_arg.type(), i), nop));
       // type code checks
-      Var tcode(v_arg->name_hint + ".code", DataType::Int(32));
+      Var tcode(v_arg->name_hint + ".code", Int(32));
       seq_init.emplace_back(LetStmt::make(
           tcode, Load::make(
-              DataType::Int(32), v_packed_arg_type_ids,
-              IntImm::make(DataType::Int(32), i), const_true(1)),
+              Int(32), v_packed_arg_type_ids, IntImm::make(Int(32), i), const_true(1)),
           nop));
-      DataType t = v_arg.dtype();
+      Type t = v_arg.type();
       if (t.is_handle()) {
        std::ostringstream msg;
        msg << name << ": Expect arg[" << i << "] to be pointer";
@@ -175,7 +174,7 @@ LoweredFunc MakeAPI(Stmt body,
   n->is_packed_func = num_unpacked_args == 0;
   n->is_restricted = is_restricted;
   body = AttrStmt::make(
-      make_zero(DataType::Int(32)), attr::compute_scope,
+      make_zero(Int(32)), attr::compute_scope,
       StringImm::make(name + "_compute_"), body);
   // Set device context
   if (vmap.count(device_id.get())) {
@@ -187,7 +186,7 @@ LoweredFunc MakeAPI(Stmt body,
         node, attr::device_context_type, device_type, nop));
     Stmt set_device = IfThenElse::make(
         device_type != kDLCPU, Evaluate::make(Call::make(
-            DataType::Int(32), intrinsic::tvm_call_packed,
+            Int(32), intrinsic::tvm_call_packed,
             {StringImm::make(runtime::symbol::tvm_set_device),
              device_type, device_id}, Call::Intrinsic)));
     body = Block::make(set_device, body);
@@ -216,7 +215,7 @@ class DeviceTypeBinder: public IRMutator {
     if (op->attr_key == attr::device_context_type) {
       if (const Variable* var = op->value.as<Variable>()) {
         var_ = var;
-        Expr value = make_const(op->value.dtype(), device_type_);
+        Expr value = make_const(op->value.type(), device_type_);
         Stmt body = IRMutator::Mutate_(op, s);
         var_ = nullptr;
         std::ostringstream os;
@@ -246,14 +245,14 @@ class DeviceTypeBinder: public IRMutator {
     Expr res = IRMutator::Mutate_(op, e);
     op = res.as<NE>();
     if (ir::Equal(op->a, op->b)) {
-      return make_const(op->dtype, false);
+      return make_const(op->type, false);
     }
     return res;
   }

   Expr Mutate_(const Variable* op, const Expr& e) final {
     if (op == var_) {
-      return make_const(op->dtype, device_type_);
+      return make_const(op->type, device_type_);
     } else {
       return e;
     }
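MakeAPI above unpacks the packed calling convention: a value array plus a parallel array of type codes, with a type-code guard emitted before each value load. A deliberately simplified mirror of that layout, hedged as a sketch (the union, enum, and helper names here are illustrative stand-ins; the real definitions live in tvm/runtime/c_runtime_api.h):

```cpp
#include <cassert>
#include <cstdint>

// Simplified mirror of the packed ABI that MakeAPI unpacks: values and
// type codes travel in parallel arrays, and each load is preceded by a
// check of the declared code.
union Value { int64_t v_int64; double v_float64; void* v_handle; };
enum TypeCode { kInt = 0, kFloat = 2, kHandle = 3 };  // illustrative values

int64_t LoadIntArg(const Value* args, const int* type_codes, int i) {
  assert(type_codes[i] == kInt);  // mirrors the emitted type-code guard
  return args[i].v_int64;         // mirrors f_arg_value's struct_get load
}

int main() {
  Value args[2];
  int codes[2] = {kInt, kFloat};
  args[0].v_int64 = 42;
  args[1].v_float64 = 1.5;
  assert(LoadIntArg(args, codes, 0) == 42);
  return 0;
}
```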
diff --git a/src/pass/narrow_channel_access.cc b/src/pass/narrow_channel_access.cc
new file mode 100644
index 000000000000..13c4e5141e8d
--- /dev/null
+++ b/src/pass/narrow_channel_access.cc
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file narrow_channel_access.cc
+ * \brief Narrow channel access to a smaller range
+ *  when possible by bringing it to the internal loop.
+ */
+#include <tvm/ir.h>
+#include <tvm/expr.h>
+#include <tvm/ir_mutator.h>
+#include <tvm/ir_visitor.h>
+#include <tvm/ir_pass.h>
+#include <tvm/arithmetic.h>
+#include <tvm/channel.h>
+#include "ir_util.h"
+
+namespace tvm {
+namespace ir {
+using namespace arith;
+
+// Bound deducer for channel access.
+class ChannelAccessBound : public IRVisitor {
+ public:
+  ChannelAccessBound(const Variable* buf_var, bool read_access)
+      : buf_var_(buf_var), read_access_(read_access) {}
+
+  void Visit_(const Store* op) final {
+    if (!read_access_ && buf_var_ == op->buffer_var.get()) {
+      ret_.emplace_back(EvalSet(op->index, dom_map_));
+    }
+    IRVisitor::Visit_(op);
+  }
+  void Visit_(const For* op) final {
+    CHECK(is_zero(op->min));
+    // We know that the extent of the loop won't depend on relaxed scope.
+    // TODO(tqchen) have a verification pass.
+    dom_map_[op->loop_var.get()] = IntSet::interval(op->min, op->extent - 1);
+    IRVisitor::Visit_(op);
+  }
+  void Visit_(const Load* op) final {
+    if (read_access_ && buf_var_ == op->buffer_var.get()) {
+      ret_.emplace_back(EvalSet(op->index, dom_map_));
+    }
+    IRVisitor::Visit_(op);
+  }
+  void Visit_(const Let* op) final {
+    LOG(FATAL) << "cannot pass through let";
+  }
+  void Visit_(const LetStmt* op) final {
+    LOG(FATAL) << "cannot pass through let";
+  }
+  IntSet Eval(const Stmt& stmt) {
+    Visit(stmt);
+    return Union(ret_);
+  }
+
+ private:
+  // The buffer variable.
+  const Variable* buf_var_;
+  // read or write
+  bool read_access_{true};
+  // Box
+  std::vector<IntSet> ret_;
+  // Domain map.
+  std::unordered_map<const Variable*, IntSet> dom_map_;
+};
+
+class ChannelAccessIndexRewriter : public IRMutator {
+ public:
+  ChannelAccessIndexRewriter(const Variable* buf_var,
+                             Expr min,
+                             bool read_access)
+      : buf_var_(buf_var), min_(min), read_access_(read_access) {}
+  Expr Mutate_(const Load* op, const Expr& e) final {
+    Expr expr = IRMutator::Mutate_(op, e);
+    op = expr.as<Load>();
+    if (read_access_ && buf_var_ == op->buffer_var.get()) {
+      return Load::make(
+          op->type, op->buffer_var, ir::Simplify(op->index - min_),
+          op->predicate);
+    } else {
+      return expr;
+    }
+  }
+  Stmt Mutate_(const Store* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<Store>();
+    if (!read_access_ && buf_var_ == op->buffer_var.get()) {
+      return Store::make(
+          op->buffer_var, op->value, ir::Simplify(op->index - min_),
+          op->predicate);
+    } else {
+      return stmt;
+    }
+  }
+
+ private:
+  // The buffer variable.
+  const Variable* buf_var_;
+  // The min bound.
+  Expr min_;
+  // read or write
+  bool read_access_{true};
+};
+
+
+// Rewrite channel access pattern.
+class ChannelAccessRewriter : public IRMutator {
+ public:
+  Stmt Mutate_(const AttrStmt* op, const Stmt& s) final {
+    Stmt ret;
+    const AttrStmt* adv = op->body.as<AttrStmt>();
+    if ((op->attr_key == ir::attr::channel_read_scope &&
+         adv && adv->attr_key == ir::attr::channel_read_advance) ||
+        (op->attr_key == ir::attr::channel_write_scope &&
+         adv && adv->attr_key == ir::attr::channel_write_advance)) {
+      RewriteEntry e;
+      e.window = op;
+      e.advance = adv;
+      e.read_access = op->attr_key == ir::attr::channel_read_scope;
+      tasks_.push_back(e);
+      ret = IRMutator::Mutate_(op, s);
+      if (tasks_.back().rewrite_success) {
+        ret = ret.as<AttrStmt>()->body.as<AttrStmt>()->body;
+      }
+      tasks_.pop_back();
+      return ret;
+    } else {
+      return IRMutator::Mutate_(op, s);
+    }
+  }
+
+  Stmt Mutate_(const For* op, const Stmt& s) final {
+    std::vector<RewriteEntry> tasks;
+    std::swap(tasks_, tasks);
+    Stmt body = op->body;
+    std::vector<Stmt> nest;
+    for (RewriteEntry& e : tasks) {
+      body = RewriteAccess(op, body, &e, &nest);
+    }
+
+    if (!body.same_as(op->body)) {
+      body = Mutate(body);
+      body = For::make(
+          op->loop_var, op->min, op->extent,
+          op->for_type, op->device_api, body);
+      body = MergeNest(nest, body);
+    } else {
+      CHECK_EQ(nest.size(), 0U);
+      body = IRMutator::Mutate_(op, s);
+    }
+    std::swap(tasks_, tasks);
+    return body;
+  }
+
+ private:
+  struct RewriteEntry {
+    bool read_access;
+    const AttrStmt* window;
+    const AttrStmt* advance;
+    bool rewrite_success{false};
+  };
+
+  Stmt RewriteAccess(const For* for_op,
+                     Stmt body,
+                     RewriteEntry* e,
+                     std::vector<Stmt>* outer_nest) {
+    const AttrStmt* adv_op = e->advance;
+    const Expr& window = e->window->value;
+    bool read_access = e->read_access;
+    Var var(for_op->loop_var);
+    Channel ch = Downcast<Channel>(adv_op->node);
+    ChannelAccessBound acc(ch->handle_var.get(), read_access);
+    IntSet iset = acc.Eval(for_op->body);
+    Range r = iset.cover_range(Range::make_by_min_extent(0, window));
+    r = Range::make_by_min_extent(
+        ir::Simplify(r->min), ir::Simplify(r->extent));
+    if (ExprUseVar(r->extent, var)) return body;
+    Array<Expr> linear_eq = DetectLinearEquation(r->min, {var});
+    if (linear_eq.size() == 0) return body;
+    Expr coeff = linear_eq[0];
+    Expr base = linear_eq[1];
+    if (!is_zero(base)) return body;
+    Expr left = ir::Simplify(adv_op->value - coeff * for_op->extent);
+    if (!analyzer_.CanProve(left >= 0)) return body;
+    // rewrite access index.
+    ChannelAccessIndexRewriter rw(
+        ch->handle_var.get(), var * coeff, read_access);
+    body = rw.Mutate(body);
+
+    if (read_access) {
+      body = AttrStmt::make(
+          ch, ir::attr::channel_read_scope, r->extent,
+          AttrStmt::make(ch, ir::attr::channel_read_advance, coeff,
+                         body));
+    } else {
+      body = AttrStmt::make(
+          ch, ir::attr::channel_write_scope, r->extent,
+          AttrStmt::make(ch, ir::attr::channel_write_advance, coeff,
+                         body));
+    }
+
+    if (!is_zero(left)) {
+      Stmt no_op = Evaluate::make(0);
+      if (read_access) {
+        outer_nest->emplace_back(
+            AttrStmt::make(ch, ir::attr::channel_read_advance, left, no_op));
+      } else {
+        outer_nest->emplace_back(
+            AttrStmt::make(ch, ir::attr::channel_write_advance, left, no_op));
+      }
+    }
+
+    e->rewrite_success = true;
+    return body;
+  }
+
+  arith::Analyzer analyzer_;
+  std::vector<RewriteEntry> tasks_;
+};
+
+Stmt NarrowChannelAccess(Stmt stmt) {
+  return ChannelAccessRewriter().Mutate(stmt);
+}
+
+}  // namespace ir
+}  // namespace tvm
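The rewrite in `RewriteAccess` rests on a simple invariant: if each iteration's access window starts at `coeff * i`, then advancing the channel by `coeff` inside the loop, plus the leftover `left = advance - coeff * extent` hoisted outside, reproduces the original total advance. A sketch of that check with concrete numbers (all values illustrative):

```cpp
#include <cassert>

// Arithmetic behind ChannelAccessRewriter: split one large channel
// advance into a per-iteration slope plus a hoisted remainder.
int main() {
  const int loop_extent = 10;
  const int total_advance = 25;  // channel_read_advance before rewriting
  const int coeff = 2;           // slope of the access window's min index
  int left = total_advance - coeff * loop_extent;
  assert(left >= 0);             // the pass only rewrites when provable
  // After rewriting: advance by `coeff` inside the loop, `left` outside.
  assert(coeff * loop_extent + left == total_advance);
  return 0;
}
```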
diff --git a/src/pass/rewrite_unsafe_select.cc b/src/pass/rewrite_unsafe_select.cc
index 43e3005aef64..25ed03963524 100644
--- a/src/pass/rewrite_unsafe_select.cc
+++ b/src/pass/rewrite_unsafe_select.cc
@@ -115,12 +115,12 @@ class UnsafeSelectRewriter : public IRMutator {
     Expr expr = IRMutator::Mutate_(op, e);
     op = expr.as<Select>();