diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6558d9893b004..114b231c192a3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,10 +1,21 @@ TVM Contributors ================ -TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, +TVM adopts the Apache way and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines. +## Mentors + +TVM is now part of the Apache Incubator. +We are fortunate to have the following mentors. + +- Markus Weimer @markusweimer +- Sebastian Schelter @sscdotopen +- Byung-Gon Chun @bgchun +- Henry Saputra @hsaputra +- Timothy Chen @tnachen +- Furkan KAMACI @kamaci ## Committers diff --git a/Jenkinsfile b/Jenkinsfile index 57049004fdadf..5e745dedeb2ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ // ci_lint = "tvmai/ci-lint:v0.50" ci_gpu = "tvmai/ci-gpu:v0.51" -ci_cpu = "tvmai/ci-cpu:v0.41" +ci_cpu = "tvmai/ci-cpu:v0.50" ci_i386 = "tvmai/ci-i386:v0.50" // tvm libraries diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile index 7accb7dd64aef..39c5996233abf 100644 --- a/apps/howto_deploy/Makefile +++ b/apps/howto_deploy/Makefile @@ -31,4 +31,4 @@ lib/cpp_deploy_pack: cpp_deploy.cc lib/test_addone_sys.o lib/libtvm_runtime_pack # Deploy using pre-built libtvm_runtime.so lib/cpp_deploy_normal: cpp_deploy.cc lib/test_addone_sys.o @mkdir -p $(@D) - $(CXX) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) -ltvm_runtime + $(CXX) $(PKG_CFLAGS) -o $@ $^ -ltvm_runtime $(PKG_LDFLAGS) diff --git a/apps/howto_deploy/run_example.sh b/apps/howto_deploy/run_example.sh index 8c49e860c5389..899dc48b5f254 100755 --- a/apps/howto_deploy/run_example.sh +++ b/apps/howto_deploy/run_example.sh @@ -3,8 +3,8 @@ echo "Build the libraries.." mkdir -p lib make echo "Run the example" -export LD_LIBRARY_PATH=../../lib:${LD_LIBRARY_PATH} -export DYLD_LIBRARY_PATH=../../lib:${DYLD_LIBRARY_PATH} +export LD_LIBRARY_PATH=../../build:${LD_LIBRARY_PATH} +export DYLD_LIBRARY_PATH=../../build:${DYLD_LIBRARY_PATH} echo "Run the deployment with all in one packed library..." 
lib/cpp_deploy_pack diff --git a/cmake/config.cmake b/cmake/config.cmake index 411e74ffa14f7..a4899b40de298 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -127,7 +127,7 @@ set(USE_MPS OFF) set(USE_ROCBLAS OFF) # Whether use contrib sort -set(USE_SORT OFF) +set(USE_SORT ON) # Build ANTLR parser for Relay text format set(USE_ANTLR OFF) diff --git a/cmake/modules/SGX.cmake b/cmake/modules/SGX.cmake index 608d6ff5a4bd2..86e4c22f4331b 100644 --- a/cmake/modules/SGX.cmake +++ b/cmake/modules/SGX.cmake @@ -48,4 +48,6 @@ if(NOT USE_SGX STREQUAL "OFF") -L${USE_SGX}/lib64 -l${_urts_lib} -L${RUST_SGX_SDK}/sgx_ustdc -lsgx_ustdc) list(APPEND RUNTIME_SRCS ${RUNTIME_SGX_SRCS}) + + include_directories(${RUST_SGX_SDK}/edl ${RUST_SGX_SDK}/common) endif() diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake index 45269a20715db..09526ef38f6bc 100644 --- a/cmake/modules/contrib/BLAS.cmake +++ b/cmake/modules/contrib/BLAS.cmake @@ -10,7 +10,7 @@ elseif(USE_BLAS STREQUAL "mkl") if(NOT IS_DIRECTORY ${USE_MKL_PATH}) set(USE_MKL_PATH /opt/intel/mkl) endif() - find_library(BLAS_LIBRARY mkl_rt ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64) + find_library(BLAS_LIBRARY NAMES mkl_rt mklml_gnu HINTS ${USE_MKL_PATH}/lib/ ${USE_MKL_PATH}/lib/intel64) include_directories(${USE_MKL_PATH}/include) list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY}) list(APPEND RUNTIME_SRCS ${CBLAS_CONTRIB_SRC}) diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh index e41a3d63a1be7..f27ba7226179a 100755 --- a/docker/install/ubuntu_install_rust.sh +++ b/docker/install/ubuntu_install_rust.sh @@ -9,12 +9,10 @@ apt-get update && apt-get install -y --no-install-recommends curl export RUSTUP_HOME=/opt/rust export CARGO_HOME=/opt/rust # this rustc is one supported by the installed version of rust-sgx-sdk -curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 +curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-03-24 . $CARGO_HOME/env -rustup component add rust-src -cargo install sccache -cargo install rustfmt-nightly --version 1.0.1 --force -cargo install xargo +rustup component add rustfmt +cargo install sccache --no-default-features # make rust usable by all users chmod -R a+w /opt/rust diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh index ed8ea1deff3f9..fc5d0e95356c4 100755 --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -5,7 +5,7 @@ set -u set -o pipefail # Download, build and install flatbuffers -git clone --depth=1 --recursive https://github.com/google/flatbuffers.git +git clone --branch=v1.10.0 --depth=1 --recursive https://github.com/google/flatbuffers.git cd flatbuffers cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release make install -j8 diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 103359e3617cb..e4329b77b5ebf 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -155,6 +155,24 @@ struct Conv2DWinogradAttrs : public tvm::AttrsNode { } }; +/*! 
\brief Attributes used in winograd weight transformation operators */
+struct Conv2DWinogradNNPACKWeightTransformAttrs
+    : public tvm::AttrsNode<Conv2DWinogradNNPACKWeightTransformAttrs> {
+  int convolution_algorithm;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(Conv2DWinogradNNPACKWeightTransformAttrs,
+                    "relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") {
+    TVM_ATTR_FIELD(convolution_algorithm)
+        .describe(
+            "The convolution algorithm for Winograd NNPACK. "
+            "E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, "
+            "tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16");
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(NullValue<DataType>())
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
 
 /*! \brief Attributes used in softmax operators */
 struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
@@ -438,6 +456,67 @@ struct L2NormalizeAttrs : public tvm::AttrsNode<L2NormalizeAttrs> {
   }
 };
 
+
+/*! \brief Attributes for DeformableConv2D operator */
+struct DeformableConv2DAttrs : public tvm::AttrsNode<DeformableConv2DAttrs> {
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> dilation;
+  int deformable_groups;
+  int groups;
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  std::string data_layout;
+  std::string kernel_layout;
+  std::string out_layout;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(DeformableConv2DAttrs, "relay.attrs.DeformableConv2DAttrs") {
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+        .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                  "on both sides for padding number of points");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(deformable_groups).set_default(1)
+        .describe("Controls the connections between inputs and offsets."
+                  "Input channels are partitioned into multiple deformable groups. Offsets"
+                  "are shared across input channels in the same deformable group.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+        .describe("Controls the connections between inputs and outputs."
+                  "At groups=1, all inputs are convolved to all outputs."
+                  "At groups=2, the operation becomes equivalent to having two convolution"
+                  "layers side by side, each seeing half the input channels, and producing"
+                  "half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(channels)
+        .describe("The number of output channels in the convolution."
+                  " If it is not set, inferred by shape of the weight.")
+        .set_default(NullValue<IndexExpr>());
+    TVM_ATTR_FIELD(kernel_size)
+        .describe("Specifies the dimensions of the convolution window.")
+        .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Convolution is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(kernel_layout).set_default("OIHW")
+        .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                  "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                  "dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("")
+        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Default to be same as input layout.");
+
+    // use 0 bits to indicate none.
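+    // (when out_dtype is left unspecified, the output keeps the input data type)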
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(NullValue<DataType>())
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
+
 } // namespace relay
 } // namespace tvm
 #endif // TVM_RELAY_ATTRS_NN_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 4513022687f87..4769c7e9b8936 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -166,6 +166,26 @@ class VarNode : public ExprNode {
 RELAY_DEFINE_NODE_REF(Var, VarNode, Expr);
 
+/*! \brief Hash Var by its id.
+ * Different VarNodes might have the same vid, and in that case they are considered to be the same var.
+ * Use VarHash to hash Var by id.
+ */
+struct VarHash {
+  size_t operator()(const Var& v) const {
+    return v->vid.hash();
+  }
+};
+
+/*! \brief Compare Var by its id.
+ * Different VarNodes might have the same vid, and in that case they are considered to be the same var.
+ * Use VarEqual to compare Var by id.
+ */
+struct VarEqual {
+  bool operator()(const Var& l, const Var& r) const {
+    return l->vid.get() == r->vid.get();
+  }
+};
+
 /*!
  * \brief Global variable that leaves in the top-level module.
  * This is used to enable recursive calls between function.
@@ -503,7 +523,7 @@ RELAY_DEFINE_NODE_REF(RefWrite, RefWriteNode, Expr);
  * rewriting pass such as layout or type transformation.
  *
  * Subclass TempExprNode allows us to pattern match on
- * specific kind TempExpr and use them for expression rewriting.
+ * specific kind of TempExpr and use them for expression rewriting.
  *
  * TempExpr should only be used within a pass,
 */
@@ -521,6 +541,25 @@ class TempExprNode : public ExprNode {
 
 RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr);
 
+class Annotate;
+class AnnotateNode : public ExprNode {
+ public:
+  Expr expr;
+  NodeRef annotation;
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("expr", &expr);
+    v->Visit("annotation", &annotation);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Annotate make(Expr expr, NodeRef annotation);
+
+  static constexpr const char* _type_key = "relay.AnnotateNode";
+  TVM_DECLARE_NODE_TYPE_INFO(AnnotateNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Annotate, AnnotateNode, Expr);
+
 // implementataions
 inline const Type& ExprNode::checked_type() const {
   CHECK(checked_type_.defined()) << "internal error: the type checker has "
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
index 446e4eec78ee8..46cdb0a3f98e8 100644
--- a/include/tvm/relay/expr_functor.h
+++ b/include/tvm/relay/expr_functor.h
@@ -71,6 +71,7 @@ class ExprFunctor {
    * \return The result of the call
    */
   virtual R VisitExpr(const Expr& n, Args... args) {
+    CHECK(n.defined());
     static FType vtable = InitVTable();
     return vtable(n, this, std::forward<Args>(args)...);
   }
@@ -97,6 +98,7 @@ class ExprFunctor {
   virtual R VisitExpr_(const RefWriteNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const ConstructorNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const MatchNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const AnnotateNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Node* op, Args...)
{ throw Error(std::string("Do not have a default for ") + op->type_key()); } @@ -121,6 +123,7 @@ class ExprFunctor { RELAY_EXPR_FUNCTOR_DISPATCH(RefWriteNode); RELAY_EXPR_FUNCTOR_DISPATCH(ConstructorNode); RELAY_EXPR_FUNCTOR_DISPATCH(MatchNode); + RELAY_EXPR_FUNCTOR_DISPATCH(AnnotateNode); return vtable; } }; @@ -151,6 +154,7 @@ class ExprVisitor void VisitExpr_(const RefWriteNode* op) override; void VisitExpr_(const ConstructorNode* op) override; void VisitExpr_(const MatchNode* op) override; + void VisitExpr_(const AnnotateNode* op) override; virtual void VisitType(const Type& t); virtual void VisitClause(const Clause& c); virtual void VisitPattern(const Pattern& c); @@ -193,6 +197,7 @@ class ExprMutator Expr VisitExpr_(const RefWriteNode* op) override; Expr VisitExpr_(const ConstructorNode* op) override; Expr VisitExpr_(const MatchNode* op) override; + Expr VisitExpr_(const AnnotateNode* op) override; /*! * \brief Used to visit the types inside of expressions. diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index eb1084a0279c1..10b13616dac42 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -46,7 +46,7 @@ #include #include #include - +#include #include #include @@ -326,6 +326,17 @@ TVM_DLL bool WellFormed(const Expr& expr); */ TVM_DLL tvm::Array BoundVars(const Expr& expr); +/*! \brief Get all bound variables from pattern pat. + * + * Bound variables are all variables that got bound by the pat. + * They only have meaning inside that expr, and can only be used in it. + * + * \param pat the Pattern. + * + * \return List of bound vars, in the PostDFS order in the expression. + */ +TVM_DLL tvm::Array BoundVars(const Pattern& pat); + /*! \brief Get free type parameters from expression expr. * * Free variables are variables that are not bound by a @@ -413,12 +424,13 @@ TVM_DLL tvm::Array AllTypeVars(const Type& t, const Module& mod); /*! \brief Remove expressions which does not effect the program result. * - * It will remove let bindings which are not referenced, and branches that will - * not be entered. + * It will remove let bindings which are not referenced, + * and inline let bindings that are only used once. * - * For example, this pass should turn `let a = 1 in 2` into `2`, as the value of - * the expression does not depend on a. Another example is `if (true) then 1 - * else 2` will be optimized into 1. + * For example, this pass should turn `let a = 1 in 2` into `2`, + * as the value of the expression does not depend on a. + * + * As another example, `let a = 1 in a` will be optimized into 1. * * \param e the expression to optimize. * @@ -527,7 +539,7 @@ struct StructuralHash { * * \return expression in A-Normal Form */ -Expr ToANormalForm(const Expr& e, const Module& mod); +TVM_DLL Expr ToANormalForm(const Expr& e, const Module& mod); /*! \brief Remove let binding and directly share via pointer instead. * @@ -538,8 +550,14 @@ Expr ToANormalForm(const Expr& e, const Module& mod); * * \return the expression in graph normal form. */ -Expr ToGraphNormalForm(const Expr& e); +TVM_DLL Expr ToGraphNormalForm(const Expr& e); +/*! \brief Aggressive constant propagation/constant folding/inlining. + * It will do as much computation in compile time as possible. + * It has two benefit: remove runtime overhead, and allow more optimization (typically fusion). + * As a side effect, code size will explode. 
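+ * For example, `let x = 1 in x + x` can be reduced to `2` at compile time.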
+ */ +Expr PartialEval(const Expr& e, const Module& mod); } // namespace relay } // namespace tvm diff --git a/include/tvm/relay/pattern_functor.h b/include/tvm/relay/pattern_functor.h index 747ab197ce3c3..27d68527b7f67 100644 --- a/include/tvm/relay/pattern_functor.h +++ b/include/tvm/relay/pattern_functor.h @@ -71,6 +71,7 @@ class PatternFunctor { * \return The result of the call */ virtual R VisitPattern(const Pattern& n, Args... args) { + CHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 578f928c5b9f4..ed4e964383eb3 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -183,6 +183,26 @@ struct WinogradWeightTransformParam : public dmlc::Parameter { + int convolution_algorithm; + int out_dtype; + + DMLC_DECLARE_PARAMETER(WinogradNNPACKWeightTransformParam) { + DMLC_DECLARE_FIELD(convolution_algorithm) + .describe( + "The convolution algorithm for Winograd NNPACK. " + "E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, " + "tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16"); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + } + + static const constexpr int kWeight = 0; +}; + struct WinogradConv2DParam : public dmlc::Parameter { int channels; TShape kernel_size; diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py index 8211971a8c3c2..63b7913dd755a 100755 --- a/nnvm/python/nnvm/frontend/caffe2.py +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -3,7 +3,7 @@ from __future__ import absolute_import as _abs import tvm from nnvm import symbol as _sym -from nnvm.frontend.common import get_nnvm_op, Renamer, AttrConverter as AttrCvt +from .common import get_nnvm_op from .onnx_caffe2_utils import dimension_picker, dimension_constraint, infer_channels, revert_caffe2_pad from . 
import onnx
@@ -73,8 +73,8 @@ def get_converter(cls):
         if hasattr(cls, '_impl'):
             return getattr(cls, '_impl')
-        raise NotImplementedError('{} not implemented'.format(
-            cls.__name__))
+        raise tvm.error.OpNotImplemented(
+            'Operator {} is not implemented in frontend Caffe2.'.format(cls.__name__))
 
 
 _caffe2_internal_args = {
@@ -176,8 +176,7 @@ def _get_axis_from_order_str(order):
                 return 1
             if order == 'NHWC':
                 return 3
-            raise RuntimeError(
-                "Unsupported storage order: {} in caffe2".format(order))
+            raise tvm.error.OpAttributeInvalid('Value {} in attribute {} of operator {} is not valid.'.format(order, 'order', 'Concat'))
 
         return AttrCvt(
             op_name='concatenate',
@@ -427,8 +426,8 @@ def _convert_operator(self,
             # Add a sanitizing step to convert all byte strings in args to strings
             sym = convert_map[op_type](inputs, args, self._params)
         else:
-            raise NotImplementedError(
-                "Operator {} not implemented.".format(op_type))
+            raise tvm.error.OpNotImplemented(
+                'Operator {} is not supported in frontend Caffe2.'.format(op_type))
         return sym
 
diff --git a/nnvm/python/nnvm/frontend/common.py b/nnvm/python/nnvm/frontend/common.py
index 7b8c4621029d7..5a8defdb3d6ea 100644
--- a/nnvm/python/nnvm/frontend/common.py
+++ b/nnvm/python/nnvm/frontend/common.py
@@ -7,9 +7,25 @@
 def get_nnvm_op(op_name):
     op = getattr(_sym, op_name)
     if not op:
-        raise RuntimeError("Unable to map op_name {} to nnvm.sym".format(op_name))
+        raise tvm.error.OpNotImplemented(
+            'Operator {} is not supported.'.format(op_name))
     return op
 
+def required_attr(attr, key, op_name):
+    assert isinstance(attr, dict)
+    if key not in attr:
+        raise tvm.error.OpAttributeRequired(
+            'Required attribute {} not found in operator {}'.format(key, op_name))
+    return attr[key]
+
+def parse_tshape(tshape):
+    """Parse tshape in string."""
+    return [int(x.strip()) for x in tshape.strip('()').split(',')]
+
+def parse_bool_str(attr, key, default='False'):
+    """Parse bool string to boolean."""
+    return attr.get(key, default).strip().lower() in ['true', '1', 't', 'y', 'yes']
+
 class Renamer(object):
     """A simply renamer for operators.
 
diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index 77285efe7a769..1483e95cf6f05 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -2,11 +2,10 @@
 """CoreML frontend."""
 from __future__ import absolute_import as _abs
 import numpy as np
-
 import tvm
+from .common import SymbolTable
 from .. import symbol as _sym
 from .._base import string_types
-from .common import SymbolTable
 
 __all__ = ['from_coreml']
 
@@ -83,7 +82,8 @@ def BatchnormLayerParams(op, insym, symtab):
     """Get layer of batchnorm parameter"""
     # this changes the symbol
    if op.instanceNormalization:
-        raise NotImplementedError("instance normalization not implemented")
+        msg = 'Operator "instance normalization" is not supported in frontend CoreML.'
+ raise tvm.error.OpNotImplemented(msg) else: params = {'gamma':symtab.new_const(list(op.gamma.floatValue)), 'beta':symtab.new_const(list(op.beta.floatValue)), @@ -136,7 +136,8 @@ def ActivationParams(op, insym, symtab): betasym = symtab.new_const(beta) return _sym.broadcast_mul(_sym.log(_sym.broadcast_add( _sym.exp(insym), betasym)), alphasym) - raise NotImplementedError('%s not implemented' % whichActivation) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(whichActivation)) def ScaleLayerParams(op, insym, symtab): """Scale layer params.""" @@ -158,7 +159,8 @@ def PoolingLayerParams(op, insym, symtab): return _sym.global_max_pool2d(insym) if op.type == 1: return _sym.global_avg_pool2d(insym) - raise NotImplementedError("Only max and average pooling implemented") + raise tvm.error.OpNotImplemented( + 'Operator pooling (not max or average) is not supported in frontend CoreML.') else: params = {'pool_size':list(op.kernelSize), @@ -178,7 +180,8 @@ def PoolingLayerParams(op, insym, symtab): params['padding'] = padding params['ceil_mode'] = True else: - raise NotImplementedError("Other convolution padding not implemented") + msg = 'Value {} in attribute PoolingPaddingType of operator Pooling is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(op.WhichOneof('PoolingPaddingType'))) # consume padding layer if symtab.in_padding: @@ -190,7 +193,8 @@ def PoolingLayerParams(op, insym, symtab): return _sym.max_pool2d(insym, **params) if op.type == 1: return _sym.avg_pool2d(insym, **params) - raise NotImplementedError("Only max and average pooling implemented") + msg = 'Operator pooling (not max or average) is not supported in frontend CoreML.' + raise tvm.error.OpNotImplemented(msg) def SoftmaxLayerParams(op, insym, symtab): return _sym.softmax(_sym.flatten(insym)) @@ -229,7 +233,8 @@ def ConcatLayerParams(op, insyms, symtab): if not isinstance(insyms, list): insyms = [insyms] if op.sequenceConcat: - raise NotImplementedError("Sequence Concat not supported") + raise tvm.error.OpNotImplemented( + 'Operator Sequence Concat is not supported in frontend CoreML.') ret = _sym.concatenate(*insyms, axis=1) return ret @@ -243,14 +248,16 @@ def PaddingLayerParams(op, insym, symtab): if op.WhichOneof('PaddingType') == 'constant': constant = op.constant if constant.value != 0: - raise NotImplementedError("Padding value {} not supported.".format(constant.value)) + msg = 'Value {} in attribute "padding value" of operator Padding is not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(constant.value)) padding = [b.startEdgeSize for b in op.paddingAmounts.borderAmounts] padding2 = [b.endEdgeSize for b in op.paddingAmounts.borderAmounts] for i, j in zip(padding, padding2): assert i == j symtab.set_padding(padding) else: - raise NotImplementedError("Only constant padding is supported now.") + raise tvm.error.OpNotImplemented( + 'Operator "non-constant padding" is not supported in frontend CoreML.') return insym def PermuteLayerParams(op, insym, symtab): @@ -259,8 +266,8 @@ def PermuteLayerParams(op, insym, symtab): def UpsampleLayerParams(op, insym, symtab): if op.scalingFactor[0] != op.scalingFactor[1]: - raise NotImplementedError("Upsampling only supported with same \ - height and width scaling factor.") + raise tvm.error.OpAttributeInvalid( + 'Height and width scaling factors of Upsample operator must be equal.') interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR' return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode) @@ -341,7 +348,8 @@ def coreml_op_to_nnvm(op, inname, outname, symtab): """ classname = type(op).__name__ if classname not in _convert_map: - raise NotImplementedError("%s is not supported" % (classname)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(classname)) if isinstance(inname, string_types): insym = symtab.get_var(inname) else: diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 154c83c90ec60..bf5a832258fa5 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -6,6 +6,7 @@ import numpy as np import tvm from .. import symbol as _sym +from .common import get_nnvm_op, required_attr, parse_tshape, parse_bool_str class LAYERTYPE(object): """Darknet LAYERTYPE Class constant.""" @@ -57,45 +58,12 @@ class ACTIVATION(object): __all__ = ['from_darknet'] -def _darknet_get_nnvm_op(op_name): - """Get the nnvm operation from opname, raise error if not supported.""" - op = getattr(_sym, op_name) - if not op: - raise RuntimeError("Not to map op_name {} to nnvm.sym".format(op_name)) - return op - -def _darknet_required_attr(attr, key): - """Check the attribute exists and return if exists, if not return error.""" - assert isinstance(attr, dict) - if key not in attr: - raise AttributeError("Required attribute {} not found.".format(key)) - return attr[key] - -def _darknet_raise_not_supported(attr, op='nnvm'): - """Raise error if any operation is not supported.""" - err = "{} is not supported in {}.".format(attr, op) - raise NotImplementedError(err) - -def _darknet_warn_not_used(attr, op='nnvm'): - """Raise warning if any operation not supported.""" - import warnings - err = "{} is ignored in {}.".format(attr, op) - warnings.warn(err) - -def _darknet_parse_tshape(tshape): - """Parse tshape in string.""" - return [int(x.strip()) for x in tshape.strip('()').split(',')] - -def _darknet_parse_bool_str(attr, key, default='False'): - """Parse bool string to boolean.""" - return attr.get(key, default).strip().lower() in \ - ['true', '1', 't', 'y', 'yes'] - def _darknet_maxpooling(inputs, attrs): """Process the max pool 2d operation.""" - kernel = _darknet_parse_tshape(_darknet_required_attr(attrs, 'kernel')) + kernel = parse_tshape(required_attr(attrs, 'kernel', 'maxpool')) if len(kernel) != 1: - _darknet_raise_not_supported('non-2d kernel', 'pool_2d') + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels for Max Pooling are not supported in 
frontend Darknet.') op_name, new_attrs = 'max_pool2d', {} strides = int(attrs.get('stride', (1, 1))) @@ -107,13 +75,14 @@ def _darknet_maxpooling(inputs, attrs): if extra_pad_size: pad_width = ((0, 0), (0, 0), (0, extra_pad_size), (0, extra_pad_size)) inputs = _sym.pad(*inputs, pad_width=pad_width, pad_value=np.finfo(np.float32).min) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_avgpooling(inputs, attrs): """Process the average pool 2d operation.""" - kernel = _darknet_parse_tshape(_darknet_required_attr(attrs, 'kernel')) + kernel = parse_tshape(required_attr(attrs, 'kernel', 'avgpool')) if len(kernel) != 1: - _darknet_raise_not_supported('non-2d kernel', 'pool_2d') + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels for Average Pooling are not supported in frontend Darknet.') op_name, new_attrs = 'avg_pool2d', {} strides = int(attrs.get('stride', (1, 1))) @@ -122,7 +91,7 @@ def _darknet_avgpooling(inputs, attrs): new_attrs['strides'] = str((strides, strides)) new_attrs['padding'] = str((pads, pads)) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_batch_norm(inputs, attrs): """Process the batchnormalization operation.""" @@ -131,21 +100,23 @@ def _darknet_batch_norm(inputs, attrs): new_attrs['epsilon'] = attrs.get('eps', 0.000001) new_attrs['center'] = True new_attrs['scale'] = True - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_conv2d(inputs, attrs): """Process the convolution 2d operation.""" - kernel = _darknet_parse_tshape(_darknet_required_attr(attrs, 'kernel')) + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d')) if len(kernel) != 1: - _darknet_raise_not_supported('non 2d kernel', 'conv2d') + raise tvm.error.OpAttributeUnimplemented('Non-2D kernels for Conv2D are unsupported ' + 'in frontend Darknet.') layout = attrs.get('layout', 'NCHW') if layout not in ['NCHW', 'NHWC']: - _darknet_raise_not_supported('layout: ' + layout, 'conv2d') + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "layout" of operator Conv2D is not valid.'.format(layout)) strides = int(attrs.get('stride', (1, 1))) pads = int(attrs.get('pad', (0, 0))) op_name, new_attrs = 'conv2d', {} - new_attrs['channels'] = _darknet_required_attr(attrs, 'num_filter') + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d') new_attrs['kernel_size'] = [kernel[0], kernel[0]] new_attrs['strides'] = (strides, strides) new_attrs['padding'] = (pads, pads) @@ -157,13 +128,13 @@ def _darknet_conv2d(inputs, attrs): else: new_attrs['use_bias'] = True out_name = {} - sym = _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) out_name[0] = sym.list_output_names()[0].replace('_output', '') if attrs.get('use_batchNorm', False) is True: op_name, new_attrs = 'batch_norm', {} new_attrs['epsilon'] = 0.000001 - sym = _darknet_get_nnvm_op(op_name)(*sym, **new_attrs) + sym = get_nnvm_op(op_name)(*sym, **new_attrs) out_name[1] = sym.list_output_names()[0].replace('_output', '') if 'activation' in attrs: new_attrs = {} @@ -176,15 +147,18 @@ def _darknet_conv2d(inputs, attrs): def _darknet_conv2d_transpose(inputs, attrs): """Process the convolution 2d transpose operation.""" if 'target_shape' in attrs: - _darknet_raise_not_supported('target_shape', 'conv2d_transpose') - kernel = 
_darknet_parse_tshape(_darknet_required_attr(attrs, 'kernel')) + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "target_shape" is not supported in operator Conv2D-transpose.') + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d_transpose')) if len(kernel) != 2: - _darknet_raise_not_supported('non-2d kernel', 'conv2d_transpose') + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported in operator Conv2D-transpose.') layout = attrs.get('layout', 'NCHW') if layout not in ['NCHW', 'NHWC']: - _darknet_raise_not_supported('layout: ' + layout, 'conv2d_transpose') + msg = 'Value {} in attribute "layout" of operator Conv2D-transpose is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(layout)) op_name, new_attrs = 'conv2d_transpose', {} - new_attrs['channels'] = _darknet_required_attr(attrs, 'num_filter') + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d_transpose') new_attrs['kernel_size'] = kernel new_attrs['strides'] = attrs.get('stride', (1, 1)) new_attrs['output_padding'] = attrs.get('adj', (0, 0)) @@ -192,8 +166,8 @@ def _darknet_conv2d_transpose(inputs, attrs): new_attrs['dilation'] = attrs.get('dilate', (1, 1)) new_attrs['groups'] = attrs.get('num_group', 1) new_attrs['layout'] = layout - new_attrs['use_bias'] = not _darknet_parse_bool_str(attrs, 'no_bias') - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + new_attrs['use_bias'] = not parse_bool_str(attrs, 'no_bias') + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_shortcut(inputs, attrs): """Process the shortcut operation.""" @@ -219,7 +193,7 @@ def _darknet_shortcut(inputs, attrs): pad_value=0.) new_inputs = _as_list([input_0, input_1]) - sym = _darknet_get_nnvm_op(op_name)(*new_inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*new_inputs, **new_attrs) out_name = sym.list_output_names()[0].replace('_output', '') if 'activation' in attrs: new_attrs['activation'] = attrs['activation'] @@ -229,17 +203,17 @@ def _darknet_shortcut(inputs, attrs): def _darknet_dense(inputs, attrs): """Process the dense operation.""" op_name, new_attrs = 'dense', {} - new_attrs['units'] = _darknet_required_attr(attrs, 'num_hidden') + new_attrs['units'] = required_attr(attrs, 'num_hidden', 'dense') out_name = {} new_attrs['use_bias'] = attrs.get('use_bias', False) if attrs.get('use_flatten', False) is True: inputs[0] = _sym.flatten(inputs[0]) - sym = _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) out_name[0] = sym.list_output_names()[0].replace('_output', '') if 'use_batchNorm' in attrs: op_name, new_attrs = 'batch_norm', {} new_attrs['epsilon'] = 0.000001 - sym = _darknet_get_nnvm_op(op_name)(*sym, **new_attrs) + sym = get_nnvm_op(op_name)(*sym, **new_attrs) out_name[1] = sym.list_output_names()[0].replace('_output', '') if 'activation' in attrs: new_attrs = {} @@ -251,28 +225,29 @@ def _darknet_dropout(inputs, attrs): """Process the dropout operation, its a blank operation.""" op_name, new_attrs = 'dropout', {} new_attrs['rate'] = attrs.get('p', 0.5) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_reshape(inputs, attrs): """Process the reshape operation.""" - if _darknet_parse_bool_str(attrs, 'reverse'): - _darknet_raise_not_supported('reverse', 'reshape') + if parse_bool_str(attrs, 'reverse'): + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "reverse" is not supported in operator Reshape.') op_name, 
new_attrs = 'reshape', {} - new_attrs['shape'] = _darknet_required_attr(attrs, 'shape') - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + new_attrs['shape'] = required_attr(attrs, 'shape', 'reshape') + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_upsampling(inputs, attrs): """Process the upsampling operation.""" op_name, new_attrs = 'upsampling', {} new_attrs['scale'] = attrs.get('scale', 1) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_l2normalize(inputs, attrs): """Process the l2 normalization operation.""" op_name, new_attrs = 'l2_normalize', {} new_attrs['eps'] = attrs.get('eps', 0) new_attrs['axis'] = attrs.get('axis', 1) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_softmax_output(inputs, attrs): """Process the softmax operation.""" @@ -280,25 +255,25 @@ def _darknet_softmax_output(inputs, attrs): if temperature != 1: inputs[0] = inputs[0] / float(temperature) op_name, new_attrs = 'softmax', {} - if _darknet_parse_bool_str(attrs, 'multi_output'): + if parse_bool_str(attrs, 'multi_output'): new_attrs['axis'] = 1 if attrs.get('use_flatten', False) is True: inputs[0] = _sym.flatten(inputs[0]) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_route(inputs, attrs): """Process the route operation, which is equivalent to concat.""" op_name = 'concatenate' new_attrs = {'axis': attrs.get('dim', 1)} - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_reorg(inputs, attrs): """Process the reorg operation.""" op_name, new_attrs = 'yolo_reorg', {} if 'stride' in attrs: new_attrs = {'stride': attrs.get('stride', 1)} - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + return get_nnvm_op(op_name)(*inputs, **new_attrs), None def _darknet_region(inputs, attrs): """Process the region operation.""" @@ -344,7 +319,7 @@ def _darknet_yolo(inputs, attrs): def _darknet_activations(inputs, attrs): """Process the activation function.""" - act = _darknet_required_attr(attrs, 'activation') + act = required_attr(attrs, 'activation', 'activations') if ACTIVATION.LOGISTIC == act: act_type = 'sigmoid' elif ACTIVATION.RELU == act: @@ -358,22 +333,24 @@ def _darknet_activations(inputs, attrs): elif ACTIVATION.ELU == act: act_type = 'elu' else: - _darknet_raise_not_supported('act: ' + act) + raise tvm.error.OpNotImplemented( + 'Operator act: {} is not supported in framework Darknet.'.format(act)) if act_type in ['relu', 'tanh']: op_name, new_attrs = act_type, {} - sym = _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) elif act_type in ['leaky_relu']: op_name, new_attrs = act_type, {} new_attrs['alpha'] = attrs.get('slope', 0.1) - sym = _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) elif act_type in ['elu']: sym = -1 * _sym.relu(1 - _sym.exp(*inputs)) + _sym.relu(*inputs) elif act_type in ['sigmoid']: op_name, new_attrs = act_type, {} - sym = _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) else: - _darknet_raise_not_supported('act_type: ' + act_type) + raise tvm.error.OpNotImplemented( + 'Operator act: {} is not supported in framework 
Darknet.'.format(act)) return sym, None def _darknet_op_not_support(inputs, attrs): @@ -436,7 +413,8 @@ def _darknet_convert_symbol(op_name, inputs, attrs): if op_name in _DARKNET_CONVERT_MAP: sym, out_name = _DARKNET_CONVERT_MAP[op_name](inputs, attrs) else: - _darknet_raise_not_supported('Operator type ' + str(op_name)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Darknet.'.format(op_name)) if out_name is None: out_name = sym.list_output_names()[0].replace('_output', '') return out_name, sym @@ -482,8 +460,10 @@ def _get_convolution_weights(self, layer, opname): if layer.nweights == 0: return - if (layer.n * layer.c * layer.size * layer.size) != layer.nweights: - raise RuntimeError("layer weights size not matching with n c h w") + if layer.n * layer.c * layer.size * layer.size != layer.nweights: + msg = 'nweights ({}) != n * c * h * w ({}) in operator {}' + msg = msg.format(layer.nweights, layer.n * layer.c * layer.size ** 2, opname) + raise tvm.error.OpAttributeInvalid(msg) shape = (layer.n, layer.c, layer.size, layer.size) weights = self._read_memory_buffer(shape, layer.weights) @@ -663,8 +643,8 @@ def _get_darknet_attrs(self, layer, layer_num): pass else: - err = "Darknet layer type {} is not supported in nnvm.".format(layer.type) - raise NotImplementedError(err) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Darknet.'.format(layer.type)) return attr @@ -761,7 +741,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): op_name, new_attrs = 'elemwise_add', {} new_inputs = _as_list([sym, state]) - state = _darknet_get_nnvm_op(op_name)(*new_inputs, **new_attrs) + state = get_nnvm_op(op_name)(*new_inputs, **new_attrs) self._outs.append(state) output_layer = layer.output_layer @@ -786,7 +766,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): op_name, new_attrs = 'elemwise_add', {} new_inputs = _as_list([sym, state]) - state = _darknet_get_nnvm_op(op_name)(*new_inputs, **new_attrs) + state = get_nnvm_op(op_name)(*new_inputs, **new_attrs) self._outs.append(state) output_layer = layer.output_layer @@ -797,7 +777,8 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): elif LAYERTYPE.LSTM == layer.type: if layer.steps > 1: - raise NotImplementedError("Currently support only single step GRU") + raise tvm.error.OpAttributeInvalid( + 'Number of steps {} of RNN is not valid.'.format(layer.steps)) op_name_add = 'elemwise_add' op_name_mul = 'elemwise_mul' @@ -819,16 +800,16 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym) new_inputs = _as_list([sym_wf, sym_uf]) - add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_f = get_nnvm_op(op_name_add)(*new_inputs, **attrs) new_inputs = _as_list([sym_wi, sym_ui]) - add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_i = get_nnvm_op(op_name_add)(*new_inputs, **attrs) new_inputs = _as_list([sym_wg, sym_ug]) - add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_g = get_nnvm_op(op_name_add)(*new_inputs, **attrs) new_inputs = _as_list([sym_wo, sym_uo]) - add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_o = get_nnvm_op(op_name_add)(*new_inputs, **attrs) act_attr['activation'] = ACTIVATION.LOGISTIC act_f, _ = _darknet_activations(_as_list(add_f), act_attr) @@ -843,19 +824,19 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): act_o, _ = _darknet_activations(_as_list(add_o), act_attr) new_inputs = _as_list([act_i, act_g]) - mul_t = 
_darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + mul_t = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) new_inputs = _as_list([act_f, c_state]) - c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + c_state = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) new_inputs = _as_list([mul_t, c_state]) - c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + c_state = get_nnvm_op(op_name_add)(*new_inputs, **attrs) act_attr['activation'] = ACTIVATION.TANH h_state, _ = _darknet_activations(_as_list(c_state), act_attr) new_inputs = _as_list([act_o, h_state]) - h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + h_state = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) self._outs = self._outs + [c_state, h_state] sym = h_state self._sym_array[layer_num] = sym @@ -863,7 +844,8 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): elif LAYERTYPE.GRU == layer.type: if layer.steps > 1: - raise NotImplementedError("Currently support only single step GRU") + raise tvm.error.OpAttributeInvalid( + 'Number of steps {} is not valid in RNN.'.format(layer.steps)) op_name_add = 'elemwise_add' op_name_mul = 'elemwise_mul' @@ -881,10 +863,10 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym) new_inputs = _as_list([sym_uz, sym_wz]) - add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_z = get_nnvm_op(op_name_add)(*new_inputs, **attrs) new_inputs = _as_list([sym_ur, sym_wr]) - add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + add_r = get_nnvm_op(op_name_add)(*new_inputs, **attrs) act_attr['activation'] = ACTIVATION.LOGISTIC act_z, _ = _darknet_activations(_as_list(add_z), act_attr) @@ -893,12 +875,12 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): act_r, _ = _darknet_activations(_as_list(add_r), act_attr) new_inputs = _as_list([act_r, state]) - forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + forgot = get_nnvm_op(op_name_mul)(*new_inputs, **attrs) sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot) new_inputs = _as_list([sym_uh, sym_wh]) - h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + h_state = get_nnvm_op(op_name_add)(*new_inputs, **attrs) if layer.tanh == 1: act_attr['activation'] = ACTIVATION.TANH diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py index 56758ada5f462..63b4122a40603 100644 --- a/nnvm/python/nnvm/frontend/keras.py +++ b/nnvm/python/nnvm/frontend/keras.py @@ -74,7 +74,8 @@ def _convert_activation(insym, keras_layer, _): if act_type == 'hard_sigmoid': transformX = (0.2 * insym) + 0.5 return _sym.clip(transformX, a_min=0, a_max=1) - raise TypeError("Unsupported activation type : {}".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) def _convert_advanced_activation(insym, keras_layer, symtab): @@ -100,7 +101,8 @@ def _convert_advanced_activation(insym, keras_layer, symtab): theta = keras_layer.theta if hasattr(keras_layer, "theta") else 1.0 theta_tensor = _sym.full_like(insym[0], fill_value=float(theta)) return _sym.elemwise_mul(insym[0], _sym.greater(insym[0], theta_tensor, out_type="float32")) - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) def _convert_merge(insym, keras_layer, _): @@ -113,12 +115,9 @@ def _convert_merge(insym, keras_layer, _): ret = 
_sym.elemwise_sub(ret, insym[i]) elif merge_type == 'Multiply': ret = _sym.elemwise_mul(ret, insym[i]) - elif merge_type == 'Average': - raise NotImplementedError('Average merge not implemented') - elif merge_type == 'Maximum': - raise NotImplementedError('Maximum merge not implemented') else: - raise TypeError("Unsupported merge type : {}".format(merge_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} Merge is not supported in frontend Keras.'.format(merge_type)) return ret @@ -135,7 +134,8 @@ def _convert_dense(insym, keras_layer, symtab): if input_dim > 2: input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0]) if input_dim != 3 or input_shape[0] != 1 or input_shape[1] != 1: - raise ValueError("Cannot flatten the inputs with shape.", input_shape, " for dense.") + msg = 'Value {} in attribute "input_shape" of operator Dense is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(input_shape)) insym = _sym.squeeze(insym, axis=0) out = _sym.dense(data=insym, **params) # defuse activation @@ -199,7 +199,8 @@ def _convert_convolution(insym, keras_layer, symtab): else: insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + msg = 'Value {} in attribute "padding" of operator Convolution is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) if is_deconv: out = _sym.conv2d_transpose(data=insym, **params) else: @@ -240,7 +241,8 @@ def _convert_separable_convolution(insym, keras_layer, symtab): insym = _sym.pad(data=insym, pad_width=( (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + msg = 'Value {} in attribute "padding" of operator Separable Convolution is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) depthconv = _sym.conv2d(data=insym, **params0) # pointwise conv weight1 = weightList[1].transpose([3, 2, 0, 1]) @@ -294,13 +296,15 @@ def _convert_pooling(insym, keras_layer, symtab): pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + msg = 'Value {} in attribute "padding" of operator Pooling is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) if pool_type == 'MaxPooling2D': return _sym.max_pool2d(insym, **params) if pool_type == 'AveragePooling2D': # TODO: in keras, padded zeros are not calculated return _sym.avg_pool2d(insym, **params) - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + msg = 'Value {} in attribute "padding" of operator Pooling is not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(keras_layer.padding)) def _convert_upsample(insym, keras_layer, _): @@ -312,30 +316,30 @@ def _convert_upsample(insym, keras_layer, _): elif upsample_type == "UpSampling2D": h, w = keras_layer.size if h != w: - raise TypeError("Unsupported upsampling type with different axes size : {}" - .format(keras_layer.size)) + raise tvm.error.OpAttributeInvalid( + 'Upsample height ({}) must equal width ({})'.format(h, w)) params = {'scale': h} elif upsample_type == "UpSampling3D": h, w, d = keras_layer.size if h != w or w != d: - raise TypeError("Unsupported upsampling type with different axes size : {}" - .format(keras_layer.size)) + raise tvm.error.OpAttributeInvalid( + 'Upsample height ({}), width ({}), and depth ({}) must be equal.'.format(h, w, d)) params = {'scale': h} else: - raise TypeError("Unsupported upsampling type : {}".format(upsample_type)) + msg = 'Operator {} is not supported in frontend Keras.' + raise tvm.error.OpNotImplemented(msg.format(upsample_type)) return _sym.upsampling(insym, **params) def _convert_cropping(insym, keras_layer, _): _check_data_format(keras_layer) crop_type = type(keras_layer).__name__ - if crop_type == "Cropping1D": - raise NotImplementedError("Cropping1D not implemented") - elif crop_type == "Cropping2D": + if crop_type == "Cropping2D": (_, in_h, in_w, _) = keras_layer.input_shape ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping else: - raise TypeError("Unrecognized cropping type : {}".format(crop_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(crop_type)) int32_max = np.iinfo(np.int32).max return _sym.strided_slice(insym, begin=[0, 0, crop_t, crop_l], end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r]) @@ -379,13 +383,13 @@ def _convert_padding(insym, keras_layer, _): top, bottom = padding[0] left, right = padding[1] else: - raise ValueError("Unrecognized padding option: {}".format(str(padding))) + msg = 'Value {} in attribute "padding" of operator {} is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(str(padding), padding_type)) else: - raise ValueError("Unrecognized padding option: {}".format(str(padding))) - elif padding_type == 'ZeroPadding1D': - raise NotImplementedError("ZeroPadding1D not implemented") + msg = 'Value {} in attribute "padding" of operator {} is not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(str(padding), padding_type)) else: - raise ValueError("Unrecognized padding type: {}".format(padding_type)) + raise tvm.error.OpNotImplemented('Operator {} is not supported in frontend Keras.') return _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (top, bottom), (left, right))) @@ -592,8 +596,10 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument def _check_unsupported_layers(model): for layer in model.layers: - if type(layer).__name__ not in _convert_map: - raise ValueError("Keras layer {} not supported.".format(type(layer).__name__)) + op_name = type(layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(op_name)) def _as_list(arr): """Force being a list, ignore if already is.""" @@ -618,9 +624,11 @@ def keras_op_to_nnvm(insym, keras_layer, outname, symtab): symtab : nnvm.frontend.common.SymbolTable The global symbol table to be updated """ - if type(keras_layer).__name__ not in _convert_map: - raise NotImplementedError("{} is not supported".format((type(keras_layer).__name__))) - outs = _convert_map[type(keras_layer).__name__](insym, keras_layer, symtab) + op_name = type(keras_layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(op_name)) + outs = _convert_map[op_name](insym, keras_layer, symtab) outs = _as_list(outs) for t_idx, out in enumerate(outs): diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 47d7ede96e5fe..da5e154bce12b 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -4,51 +4,25 @@ import json import tvm from .. 
import symbol as _sym +from .common import get_nnvm_op, required_attr, parse_tshape, parse_bool_str __all__ = ['from_mxnet'] -def _get_nnvm_op(op_name): - op = getattr(_sym, op_name) - if not op: - raise RuntimeError("Unable to map op_name {} to nnvm.sym".format(op_name)) - return op - -def _required_attr(attr, key): - assert isinstance(attr, dict) - if key not in attr: - raise AttributeError("Required attribute {} not found.".format(key)) - return attr[key] - -def _raise_not_supported(attr, op='nnvm'): - err = "{} is not supported in {}.".format(attr, op) - raise NotImplementedError(err) - -def _warn_not_used(attr, op='nnvm'): - import warnings - err = "{} is ignored in {}.".format(attr, op) - warnings.warn(err) - -def _parse_tshape(tshape): - """Parse tshape in string.""" - return [int(x.strip()) for x in tshape.strip('()').split(',')] - -def _parse_bool_str(attr, key, default='False'): - """Parse bool string to boolean.""" - return attr.get(key, default).strip().lower() in ['true', '1', 't', 'y', 'yes'] - def _rename(new_name): def impl(inputs, attrs): - return _get_nnvm_op(new_name)(*inputs, **attrs) + return get_nnvm_op(new_name)(*inputs, **attrs) return impl def _pooling(inputs, attrs): - kernel = _parse_tshape(_required_attr(attrs, 'kernel')) + kernel = parse_tshape(required_attr(attrs, 'kernel', 'pooling')) if len(kernel) != 2: - _raise_not_supported('non-2d kernel', 'pool_2d') - global_pool = 'global' if _parse_bool_str(attrs, 'global_pool') else '' - pool_type = _required_attr(attrs, 'pool_type') + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported for Pool2D.') + global_pool = 'global' if parse_bool_str(attrs, 'global_pool') else '' + pool_type = required_attr(attrs, 'pool_type', 'pooling') if pool_type not in ['avg', 'max']: - _raise_not_supported('non-avg/max', 'pool2d') + raise tvm.error.OpNotImplemented( + 'Only max and average pooling are supported in frontend MXNet.') op_name, new_attrs = '_'.join([global_pool, pool_type, 'pool2d']).strip('_'), {} # new_attrs['layout'] = 'NCHW' if not global_pool: @@ -58,42 +32,47 @@ def _pooling(inputs, attrs): new_attrs['ceil_mode'] = (attrs.get('pooling_convention', 'valid') == 'full') if pool_type == 'avg': new_attrs['count_include_pad'] = attrs.get('count_include_pad', True) - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _batch_norm(inputs, attrs): - if _parse_bool_str(attrs, 'output_mean_var'): - _raise_not_supported('output_mean_var', 'batch_norm') - # if _parse_bool_str(attrs, 'fix_gamma'): + if parse_bool_str(attrs, 'output_mean_var'): + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "output_mean_var" is not supported in operator batch_norm.') + # if parse_bool_str(attrs, 'fix_gamma'): # _warn_not_used('fix_gamma', 'batch_norm') - if _parse_bool_str(attrs, 'use_global_stats'): - _warn_not_used('use_global_stats', 'batch_norm') - # if _parse_bool_str(attrs, 'momentum'): + if parse_bool_str(attrs, 'use_global_stats'): + from warnings import warn + warn( + 'Attribute "use_global_stats" is ignored in operator batch_norm.') + # if parse_bool_str(attrs, 'momentum'): # _warn_not_used('momentum', 'batch_norm') op_name, new_attrs = 'batch_norm', {} new_attrs['axis'] = attrs.get('axis', 1) new_attrs['epsilon'] = attrs.get('eps', 0.001) new_attrs['center'] = True - new_attrs['scale'] = not _parse_bool_str(attrs, 'fix_gamma', default="False") - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['scale'] = not parse_bool_str(attrs, 
'fix_gamma', default="False") + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _concat(inputs, attrs): op_name = 'concatenate' new_attrs = {'axis': attrs.get('dim', 1)} - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _conv2d(inputs, attrs): - kernel = _parse_tshape(_required_attr(attrs, 'kernel')) + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d')) if len(kernel) != 2: - _raise_not_supported('non 2d kernel', 'conv2d') + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported for operator Conv2D.') layout = attrs.get('layout', 'NCHW') if layout not in ['NCHW', 'NHWC']: - _raise_not_supported('layout: ' + layout, 'conv2d') + raise tvm.error.OpAttributeUnimplemented( + 'Layout {} is not supported in operator Conv2D.'.format(layout)) if 'kernel_layout' in attrs: kernel_layout = attrs['kernel_layout'] else: kernel_layout = 'HWIO' if layout == 'NHWC' else 'OIHW' op_name, new_attrs = 'conv2d', {} - new_attrs['channels'] = _required_attr(attrs, 'num_filter') + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d') new_attrs['kernel_size'] = kernel new_attrs['strides'] = attrs.get('stride', (1, 1)) new_attrs['padding'] = attrs.get('pad', (0, 0)) @@ -102,23 +81,26 @@ def _conv2d(inputs, attrs): new_attrs['layout'] = layout new_attrs['kernel_layout'] = kernel_layout new_attrs['use_bias'] = attrs.get('no_bias', 'False').strip() == 'False' - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _conv2d_transpose(inputs, attrs): if 'target_shape' in attrs: - _raise_not_supported('target_shape', 'conv2d_transpose') - kernel = _parse_tshape(_required_attr(attrs, 'kernel')) + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "target_shape" is not supported in operator Conv2D-transpose.') + kernel = parse_tshape(required_attr(attrs, 'kernel', 'conv2d_transpose')) if len(kernel) != 2: - _raise_not_supported('non-2d kernel', 'conv2d_transpose') + raise tvm.error.OpAttributeInvalid( + 'Non-2D kernels are not supported in Conv2D-transpose.') layout = attrs.get('layout', 'NCHW') if layout not in ['NCHW', 'NHWC']: - _raise_not_supported('layout: ' + layout, 'conv2d_transpose') + raise tvm.error.OpAttributeUnimplemented( + 'Layout {} is not supported in operator Conv2D-transpose.') if 'kernel_layout' in attrs: kernel_layout = attrs['kernel_layout'] else: kernel_layout = 'HWIO' if layout == 'NHWC' else 'OIHW' op_name, new_attrs = 'conv2d_transpose', {} - new_attrs['channels'] = _required_attr(attrs, 'num_filter') + new_attrs['channels'] = required_attr(attrs, 'num_filter', 'conv2d_transpose') new_attrs['kernel_size'] = kernel new_attrs['strides'] = attrs.get('stride', (1, 1)) new_attrs['output_padding'] = attrs.get('adj', (0, 0)) @@ -127,67 +109,70 @@ def _conv2d_transpose(inputs, attrs): new_attrs['groups'] = attrs.get('num_group', 1) new_attrs['layout'] = layout new_attrs['kernel_layout'] = kernel_layout - new_attrs['use_bias'] = not _parse_bool_str(attrs, 'no_bias') - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['use_bias'] = not parse_bool_str(attrs, 'no_bias') + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _dense(inputs, attrs): import mxnet as mx op_name, new_attrs = 'dense', {} - new_attrs['units'] = _required_attr(attrs, 'num_hidden') - new_attrs['use_bias'] = not _parse_bool_str(attrs, 'no_bias') + new_attrs['units'] = required_attr(attrs, 'num_hidden', 'dense') + new_attrs['use_bias'] = not 
parse_bool_str(attrs, 'no_bias') try: _ = mx.sym.FullyConnected(mx.sym.var('x'), num_hidden=1, flatten=True) has_flatten = True except mx.base.MXNetError: # no flatten attribute in old mxnet has_flatten = False - use_flatten = _parse_bool_str(attrs, 'flatten', 'True') + use_flatten = parse_bool_str(attrs, 'flatten', 'True') if has_flatten and use_flatten: inputs[0] = _sym.flatten(inputs[0]) - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _dropout(inputs, attrs): op_name, new_attrs = 'dropout', {} new_attrs['rate'] = attrs.get('p', 0.5) - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _leaky_relu(inputs, attrs): - act_type = _required_attr(attrs, 'act_type') + act_type = required_attr(attrs, 'act_type', 'leaky_relu') if act_type in ['leaky', 'prelu']: op_name, new_attrs = act_type, {} if act_type == 'leaky': new_attrs['alpha'] = attrs.get('slope', 0.25) - sym = _get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) elif act_type == 'elu': slope = attrs.get('slope', 0.25) sym = -slope * _sym.relu(1 - _sym.exp(*inputs)) + _sym.relu(*inputs) elif act_type == 'rrelu': - lower_bound = float(_required_attr(attrs, 'lower_bound')) - upper_bound = float(_required_attr(attrs, 'upper_bound')) + lower_bound = float(required_attr(attrs, 'lower_bound', 'leaky_relu')) + upper_bound = float(required_attr(attrs, 'upper_bound', 'leaky_relu')) slope = (lower_bound + upper_bound) / 2.0 op_name, new_attrs = 'leaky_relu', {'alpha': str(slope)} - sym = _get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) else: - _raise_not_supported('act_type: ' + act_type) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend MXNet.'.format(act_type)) return sym def _activations(inputs, attrs): - act_type = _required_attr(attrs, 'act_type') + act_type = required_attr(attrs, 'act_type', 'activations') if act_type in ['relu', 'sigmoid', 'tanh']: op_name, new_attrs = act_type, {} - sym = _get_nnvm_op(op_name)(*inputs, **new_attrs) + sym = get_nnvm_op(op_name)(*inputs, **new_attrs) elif act_type == 'softrelu': sym = _sym.log((1 + _sym.exp(*inputs))) else: - _raise_not_supported('act_type: ' + act_type) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend MXNet.'.format(act_type)) return sym def _reshape(inputs, attrs): - if _parse_bool_str(attrs, 'reverse'): - _raise_not_supported('reverse', 'reshape') + if parse_bool_str(attrs, 'reverse'): + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "reverse" is not supported in operator Reshape.') op_name, new_attrs = 'reshape', {} - new_attrs['shape'] = _required_attr(attrs, 'shape') - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['shape'] = required_attr(attrs, 'shape', 'reshape') + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _slice(inputs, attrs): begin = attrs.get('begin', None) @@ -200,60 +185,60 @@ def _slice(inputs, attrs): new_attrs = {'begin': begin, 'end': end} if stride is not None: new_attrs['stride'] = stride - return _get_nnvm_op('strided_slice')(inputs[0], **new_attrs) + return get_nnvm_op('strided_slice')(inputs[0], **new_attrs) def _split(inputs, attrs): op_name, new_attrs = 'split', {} axis = attrs.get('axis', 1) - new_attrs['indices_or_sections'] = _required_attr(attrs, 'num_outputs') + new_attrs['indices_or_sections'] = required_attr(attrs, 'num_outputs', 'split') new_attrs['axis'] = 
axis - outputs = _get_nnvm_op(op_name)(*inputs, **new_attrs) - if _parse_bool_str(attrs, 'squeeze_axis'): + outputs = get_nnvm_op(op_name)(*inputs, **new_attrs) + if parse_bool_str(attrs, 'squeeze_axis'): squeeze_attrs = {'axis': axis} - outputs = _sym.Group([_get_nnvm_op('squeeze')(o, **squeeze_attrs) for o in outputs]) + outputs = _sym.Group([get_nnvm_op('squeeze')(o, **squeeze_attrs) for o in outputs]) return outputs def _softmax_activation(inputs, attrs): op_name, new_attrs = 'softmax', {} mode = attrs.get('mode', 'instance') new_attrs['axis'] = 0 if mode == 'instance' else 1 - return _get_nnvm_op(op_name)(inputs[0], **new_attrs) + return get_nnvm_op(op_name)(inputs[0], **new_attrs) def _softmax_output(inputs, attrs): op_name, new_attrs = 'softmax', {} - if _parse_bool_str(attrs, 'multi_output'): + if parse_bool_str(attrs, 'multi_output'): new_attrs['axis'] = 1 - return _get_nnvm_op(op_name)(inputs[0], **new_attrs) + return get_nnvm_op(op_name)(inputs[0], **new_attrs) def _upsampling(inputs, attrs): scale = attrs.get('scale') new_attrs = {'scale':int(scale)} - return _get_nnvm_op('upsampling')(inputs[0], **new_attrs) + return get_nnvm_op('upsampling')(inputs[0], **new_attrs) def _clip(inputs, attrs): op_name, new_attrs = "clip", {} - new_attrs['a_min'] = _required_attr(attrs, 'a_min') - new_attrs['a_max'] = _required_attr(attrs, 'a_max') - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['a_min'] = required_attr(attrs, 'a_min', 'clip') + new_attrs['a_max'] = required_attr(attrs, 'a_max', 'clip') + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _contrib_multibox_detection(inputs, attrs): - clip = _parse_bool_str(attrs, 'clip', default='True') + clip = parse_bool_str(attrs, 'clip', default='True') threshold = attrs.get('threshold') or 0.01 nms_threshold = attrs.get('nms_threshold') or 0.5 - force_suppress = _parse_bool_str(attrs, 'force_suppress', default='False') + force_suppress = parse_bool_str(attrs, 'force_suppress', default='False') variances = tuple([float(x.strip()) for x in attrs.get('variances').strip('()').split(',')]) \ if attrs.get('variances') is not None else (0.1, 0.1, 0.2, 0.2) nms_topk = attrs.get('nms_topk') or -1 new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} new_attrs1 = {'return_indices': False, 'iou_threshold': float(nms_threshold), 'force_suppress': force_suppress, 'top_k': int(nms_topk)} - data, valid_count = _get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], - inputs[2], **new_attrs0) - return _get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) + data, valid_count = get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], + inputs[2], **new_attrs0) + return get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) def _elemwise_sum(inputs, _): new_attrs = {'num_args':len(inputs)} - return _get_nnvm_op('elemwise_sum')(*inputs, **new_attrs) + return get_nnvm_op('elemwise_sum')(*inputs, **new_attrs) def _crop_like(inputs, attrs): new_attrs = {} @@ -261,20 +246,22 @@ def _crop_like(inputs, attrs): tuple([float(x.strip()) for x in attrs.get('offsets').strip('()').split(',')]) \ if attrs.get('offsets') is not None else (0, 0) if offsets != (0, 0): - raise RuntimeError("Currently only supports offsets to be zero.") - center_crop = _parse_bool_str(attrs, 'center_crop', default="False") + raise tvm.error.OpAttributeInvalid( + 'crop_like offsets must equal (0,0).') + center_crop = parse_bool_str(attrs, 'center_crop', default="False") if center_crop: - raise 
RuntimeError("center crop is not supported.") + raise tvm.error.OpAttributeUnimplemented( + 'Center crop is not supported in operator crop_like.') if len(inputs) < 2: raise RuntimeError("Only support crop_like pattern.") new_attrs["axis"] = [2, 3] - return _get_nnvm_op('slice_like')(inputs[0], inputs[1], **new_attrs) + return get_nnvm_op('slice_like')(inputs[0], inputs[1], **new_attrs) def _expand_dims(inputs, attrs): op_name, new_attrs = 'expand_dims', {} - new_attrs['axis'] = _required_attr(attrs, 'axis') - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['axis'] = required_attr(attrs, 'axis', 'expand_dims') + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _lrn(inputs, attrs): op_name, new_attrs = 'lrn', {} @@ -283,36 +270,36 @@ def _lrn(inputs, attrs): new_attrs['bias'] = attrs.get('knorm', 2) # NCHW format and normalization along channel axis new_attrs['axis'] = 1 - new_attrs['size'] = _required_attr(attrs, 'nsize') - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['size'] = required_attr(attrs, 'nsize', 'lrn') + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _minimum(inputs, attrs): - return _get_nnvm_op('broadcast_min')(*inputs, **attrs) + return get_nnvm_op('broadcast_min')(*inputs, **attrs) def _maximum(inputs, attrs): - return _get_nnvm_op('broadcast_max')(*inputs, **attrs) + return get_nnvm_op('broadcast_max')(*inputs, **attrs) def _ones(_, attrs): op_name = 'ones' - return _get_nnvm_op(op_name)(**attrs) + return get_nnvm_op(op_name)(**attrs) def _zeros(_, attrs): op_name = 'zeros' - return _get_nnvm_op(op_name)(**attrs) + return get_nnvm_op(op_name)(**attrs) def _argmax(inputs, attrs): op_name, new_attrs = 'argmax', {} new_attrs['dtype'] = 'float32' new_attrs['axis'] = attrs.get('axis', 0) - new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False") - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['keepdims'] = parse_bool_str(attrs, 'keepdims', default="False") + return get_nnvm_op(op_name)(*inputs, **new_attrs) def _argmin(inputs, attrs): op_name, new_attrs = 'argmin', {} new_attrs['dtype'] = 'float32' new_attrs['axis'] = attrs.get('axis', 0) - new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False") - return _get_nnvm_op(op_name)(*inputs, **new_attrs) + new_attrs['keepdims'] = parse_bool_str(attrs, 'keepdims', default="False") + return get_nnvm_op(op_name)(*inputs, **new_attrs) _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__', '__div_symbol__', '__mul_scalar__', '__mul_symbol__', @@ -406,12 +393,13 @@ def _convert_symbol(op_name, inputs, attrs, identity_list = identity_list if identity_list else _identity_list convert_map = convert_map if convert_map else _convert_map if op_name in identity_list: - op = _get_nnvm_op(op_name) + op = get_nnvm_op(op_name) sym = op(*inputs, **attrs) elif op_name in convert_map: sym = convert_map[op_name](inputs, attrs) else: - _raise_not_supported('Operator: ' + op_name) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend MXNet.'.format(op_name)) return sym def _as_list(arr): diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index ad0acc31a5213..18eb213bab7b1 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -397,7 +397,8 @@ def _impl_v7(cls, inputs, attr, params): elif mode == b'linear': method = "BILINEAR" else: - raise ValueError("Invalid ONNX upsample mode: {}".format(mode)) + raise tvm.error.OpAttributeInvalid( + 'Value {} in 
attribute "mode" of operator Upsample is not valid.'.format(mode)) return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW') @@ -922,8 +923,8 @@ def _convert_operator(self, elif op_name in convert_map: sym = convert_map[op_name](inputs, attrs, self._params) else: - raise NotImplementedError( - "Operator {} not implemented.".format(op_name)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend ONNX.') return sym def _fix_outputs(self, op_name, outputs): diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index f4065cc544e16..f2ff60294489b 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -68,7 +68,8 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - raise NotImplementedError("Only 2d kernel supported.") + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported for operator {}.'.format(prefix)) return _impl def _dimension_constraint(): @@ -129,7 +130,8 @@ def _impl(inputs, attr, params): attr['kernel_shape'] = (attr['ksize'][2], attr['ksize'][3]) attr['strides'] = (attr['strides'][2], attr['strides'][3]) else: - raise TypeError("Unsupported data_format type : {}".format(attr['data_format'])) + msg = 'Value {} in attribute "data_format" of operator Pooling is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC": tmp_shape = attr['_input_shapes'][inputs[0]] @@ -158,7 +160,8 @@ def _impl(inputs, attr, params): attr['padding'] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]] else: - raise TypeError("Unsupported padding type : {}".format(attr['padding'])) + msg = 'Value {} in attribute "padding" of operator Pooling is not valid.' + raise tvm.error.OpAttributeUnimplemented(msg.format(attr['padding'])) if name == "avg_pool": attr['count_include_pad'] = False @@ -232,7 +235,8 @@ def _impl(inputs, attr, params): attr['dilations'] = (attr['dilations'][2], attr['dilations'][3]) attr['strides'] = (attr['strides'][2], attr['strides'][3]) else: - raise TypeError("Unsupported data format type : {}".format(attr['data_format'])) + msg = 'Value {} in attribute "data_format" of operator Conv is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) if opname == 'depthwise': @@ -276,7 +280,8 @@ def _impl(inputs, attr, params): attr['padding'] = [0, 0] else: - raise TypeError("Unsupported padding type : {}".format(attr['padding'])) + msg = 'Value {} in attribute "padding" of operator Conv is not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(attr['padding'])) if 'kernel_layout' not in attr: if opname == 'conv': @@ -432,7 +437,8 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'shape':tuple(params_new[0].asnumpy().flatten())}, ignores=['Tshape'])(inputs, attr) - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "dynamic shape" of operator Reshape is not supported.') return _impl def _bias_add(): @@ -736,7 +742,8 @@ def _impl(inputs, attr, params): if padlist_key in params: padlist = params.pop(padlist_key).asnumpy() else: - raise RuntimeError("Required parameter {} not fount.".format(padlist_key)) + raise tvm.error.OpAttributeRequired( + 'Required attribute "{}" not found in operator Pad.'.format(padlist_key)) paddings = tuple([tuple(l) for l in padlist]) attr['pad_width'] = paddings attr['pad_value'] = 0 @@ -1188,8 +1195,9 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): missing_operators = self._parse_import_prerequisites(graph) if missing_operators: - raise NotImplementedError( \ - "The following operators are not implemented: {}".format(missing_operators)) + msg = 'The following operators are not supported in frontend TensorFlow: {}' + ops = str(list(missing_operators)).strip('[,]') + raise tvm.error.OpNotImplemented(msg.format(ops)) for node in graph.node: if node.op == 'Placeholder': @@ -1529,7 +1537,8 @@ def _convert_operator(self, op_name, inputs, attrs, self._params, graph, convert_map_rnn) else: - raise NotImplementedError("Operator {} not implemented.".format(op_name)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend TensorFlow.'.format(op_name)) return sym def _fix_extranodes(self, op_name, attr, inputs): diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index ce2085da5a91a..2510c902774bc 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -161,6 +161,10 @@ def alter_conv2d_layout(attrs, inputs, tinfos): sym.contrib.conv2d_winograd_without_weight_transform sym.contrib_conv2d_winograd_weight_transform = \ sym.contrib.conv2d_winograd_weight_transform + sym.contrib_conv2d_winograd_nnpack_without_weight_transform = \ + sym.contrib.conv2d_winograd_nnpack_without_weight_transform + sym.contrib_conv2d_winograd_nnpack_weight_transform = \ + sym.contrib.conv2d_winograd_nnpack_weight_transform sym.nn = sym # map relay argument names to nnvm argument names @@ -274,6 +278,49 @@ def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, targe OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_compute("_contrib_conv2d_winograd_nnpack_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, _): + convolution_algorithm = attrs.get_int('convolution_algorithm') + out_dype = attrs.get_str('out_dtype') + return topi.nn.conv2d_winograd_nnpack_weight_transform( + inputs[0], convolution_algorithm, out_dype) + + +@reg.register_schedule("_contrib_conv2d_winograd_nnpack_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + +reg.register_pattern("_contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) + + +@reg.register_compute("_contrib_conv2d_winograd_nnpack_without_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, inputs, _): + 
padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.get_str("layout") + out_dtype = attrs.get_str("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( + inputs[0], inputs[1], inputs[2] if attrs.get_bool("use_bias") else None, + strides, padding, dilation, layout, out_dtype) + return out + +@reg.register_schedule("_contrib_conv2d_winograd_nnpack_without_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) + +reg.register_pattern("_contrib_conv2d_winograd_nnpack_without_weight_transform", + OpPattern.OPAQUE) + + # conv2d_transpose @reg.register_compute("conv2d_transpose") def compute_conv2d_transpose(attrs, inputs, _): diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index e6ff722396720..601e57ab325b9 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -130,13 +130,14 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs, return true; } +template inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs, std::vector* in_shape, std::vector* out_shape) { static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); - const WinogradConv2DParam& param = nnvm::get(attrs.parsed); + const Param& param = nnvm::get(attrs.parsed); const Layout in_layout(param.layout); const Layout kernel_layout(param.kernel_layout); @@ -403,7 +404,7 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform) .set_attr_parser(ParamParser) .set_attr("FGetAttrDict", ParamGetAttrDict) .set_attr("FListInputNames", UseBiasListInputNames) -.set_attr("FInferShape", WinogradConv2DInferShape) +.set_attr("FInferShape", WinogradConv2DInferShape) .set_attr("FInferType", Conv2DInferType) .set_attr("FCorrectLayout", Conv2DCorrectLayout) .set_num_outputs(1) @@ -412,6 +413,82 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform) DMLC_REGISTER_PARAMETER(WinogradConv2DParam); + +inline bool Conv2DWinogradNNPACKWTInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_type, + std::vector* out_type) { + const WinogradNNPACKWeightTransformParam& param = + nnvm::get(attrs.parsed); + + CHECK_EQ(in_type->size(), 1U) << "Input:[weight]"; + CHECK_EQ(out_type->size(), 1U); + + if (param.out_dtype != -1) { + NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_type, 0, param.out_dtype); + } else { + ElemwiseType<1, 1>(attrs, in_type, out_type); + } + return true; +} + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_weight_transform) +.describe(R"code(Weight transformation of winograd fast convolution algorithm. +Separate this into another nnvm symbol in order to enable Precompute Pass to compute the +weight transformation in advance. 
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +)code" NNVM_ADD_FILELINE) +.add_argument("weight", "4D Tensor", "Weight tensor.") +.add_arguments(WinogradNNPACKWeightTransformParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const TShape &wshape = (*in_shape)[0]; + CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor"; + TShape oshape({wshape[0], wshape[1], 8, 8}); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + return true; +}) +.set_attr("FCorrectLayout", [](const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + Layout layout("OIHW"); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout); + NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout); + return true; +}) +.set_attr("FInferType", Conv2DWinogradNNPACKWTInferType) +.set_num_outputs(1) +.set_num_inputs(1) +.set_support_level(5); + +DMLC_REGISTER_PARAMETER(WinogradNNPACKWeightTransformParam); + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_without_weight_transform) +.describe(R"code(Compute conv2d with winograd nnpack. +- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) +- **weight**: Any shape + We do not check shape for this input tensor. +- **bias**: (channels,) +- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_argument("weight", "4D Tensor", "Transformed weight tensor.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", WinogradConv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(5); + + NNVM_REGISTER_OP(_conv2d_grad) .describe(R"code(2D convolution grad. 
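# The C++ registrations above expose the NNPACK Winograd ops to the graph, while the
# nnvm/python/nnvm/top/nn.py hunk earlier in this diff hooks them to TOPI via
# reg.register_compute / reg.register_schedule / reg.register_pattern. A sketch of
# that Python-side pattern for a hypothetical "_contrib_my_conv2d" op, assuming the
# nnvm.top.registry module behind the reg.* calls above; plain topi.nn.conv2d and
# schedule_conv2d_nchw are stand-ins here, and a matching NNVM_REGISTER_OP entry on
# the C++ side would still be required before the op could actually be used.
import tvm
import topi
from nnvm.top import registry as reg
from nnvm.top.registry import OpPattern

@reg.register_compute("_contrib_my_conv2d")
def compute_my_conv2d(attrs, inputs, _):
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    layout = attrs.get_str("layout")
    out_dtype = attrs.get_str("out_dtype")
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
    return topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                          dilation, layout, out_dtype)

@reg.register_schedule("_contrib_my_conv2d")
def schedule_my_conv2d(attrs, outs, target):
    with tvm.target.create(target):
        return topi.generic.schedule_conv2d_nchw(outs)

# OPAQUE keeps the op out of operator fusion, matching the registrations above.
reg.register_pattern("_contrib_my_conv2d", OpPattern.OPAQUE)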
diff --git a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py index 99b23819ced46..aa28475776ef9 100644 --- a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py +++ b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py @@ -1,33 +1,24 @@ -from six.moves import urllib import os from PIL import Image import numpy as np - -def download(url, path, overwrite=False): - if os.path.exists(path) and not overwrite: - return - print('Downloading {} to {}.'.format(url, path)) - urllib.request.urlretrieve(url, path) +from tvm.contrib.download import download_testdata def get_mobilenet(): url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' dst = 'mobilenet.mlmodel' - real_dst = os.path.abspath(os.path.join(os.path.dirname(__file__), dst)) - download(url, real_dst) - return os.path.abspath(real_dst) + real_dst = download_testdata(url, dst, module='coreml') + return real_dst def get_resnet50(): url = 'https://docs-assets.developer.apple.com/coreml/models/Resnet50.mlmodel' dst = 'resnet50.mlmodel' - real_dst = os.path.abspath(os.path.join(os.path.dirname(__file__), dst)) - download(url, real_dst) - return os.path.abspath(real_dst) + real_dst = download_testdata(url, dst, module='coreml') + return real_dst def get_cat_image(): url = 'https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png' dst = 'cat.png' - real_dst = os.path.abspath(os.path.join(os.path.dirname(__file__), dst)) - download(url, real_dst) + real_dst = download_testdata(url, dst, module='data') img = Image.open(real_dst).resize((224, 224)) img = np.transpose(img, (2, 0, 1))[np.newaxis, :] return np.asarray(img) diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index 1f5e89c6e4d56..db7d58b1b60af 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -12,44 +12,16 @@ import numpy as np import tvm from tvm.contrib import graph_runtime +from tvm.contrib.download import download_testdata from nnvm import frontend from nnvm.testing.darknet import LAYERTYPE from nnvm.testing.darknet import __darknetffi__ import nnvm.compiler -if sys.version_info >= (3,): - import urllib.request as urllib2 -else: - import urllib2 - - -def _download(url, path, overwrite=False, sizecompare=False): - ''' Download from internet''' - if os.path.isfile(path) and not overwrite: - if sizecompare: - file_size = os.path.getsize(path) - res_head = requests.head(url) - res_get = requests.get(url, stream=True) - if 'Content-Length' not in res_head.headers: - res_get = urllib2.urlopen(url) - urlfile_size = int(res_get.headers['Content-Length']) - if urlfile_size != file_size: - print("exist file got corrupted, downloading", path, " file freshly") - _download(url, path, True, False) - return - print('File {} exists, skip.'.format(path)) - return - print('Downloading from url {} to {}'.format(url, path)) - try: - urllib.request.urlretrieve(url, path) - print('') - except: - urllib.urlretrieve(url, path) DARKNET_LIB = 'libdarknet2.0.so' DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \ + DARKNET_LIB + '?raw=true' -_download(DARKNETLIB_URL, DARKNET_LIB) -LIB = __darknetffi__.dlopen('./' + DARKNET_LIB) +LIB = __darknetffi__.dlopen(download_testdata(DARKNETLIB_URL, DARKNET_LIB, module='darknet')) def _read_memory_buffer(shape, data, dtype='float32'): 
length = 1 @@ -82,6 +54,12 @@ def _get_tvm_output(net, data, build_dtype='float32'): tvm_out.append(m.get_output(i).asnumpy()) return tvm_out +def _load_net(cfg_url, cfg_name, weights_url, weights_name): + cfg_path = download_testdata(cfg_url, cfg_name, module='darknet') + weights_path = download_testdata(weights_url, weights_name, module='darknet') + net = LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) + return net + def test_forward(net, build_dtype='float32'): '''Test network with given input image on both darknet and tvm''' def get_darknet_output(net, img): @@ -125,8 +103,8 @@ def get_darknet_output(net, img): test_image = 'dog.jpg' img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + test_image +'?raw=true' - _download(img_url, test_image) - img = LIB.letterbox_image(LIB.load_image_color(test_image.encode('utf-8'), 0, 0), net.w, net.h) + img_path = download_testdata(img_url, test_image, module='data') + img = LIB.letterbox_image(LIB.load_image_color(img_path.encode('utf-8'), 0, 0), net.w, net.h) darknet_output = get_darknet_output(net, img) batch_size = 1 data = np.empty([batch_size, img.c, img.h, img.w], dtype) @@ -167,9 +145,7 @@ def test_forward_extraction(): weights_name = model_name + '.weights' cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - _download(cfg_url, cfg_name) - _download(weights_url, weights_name) - net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0) + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) test_forward(net) LIB.free_network(net) @@ -180,9 +156,7 @@ def test_forward_alexnet(): weights_name = model_name + '.weights' cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - _download(cfg_url, cfg_name) - _download(weights_url, weights_name) - net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0) + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) test_forward(net) LIB.free_network(net) @@ -193,9 +167,7 @@ def test_forward_resnet50(): weights_name = model_name + '.weights' cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - _download(cfg_url, cfg_name) - _download(weights_url, weights_name) - net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0) + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) test_forward(net) LIB.free_network(net) @@ -206,9 +178,7 @@ def test_forward_yolov2(): weights_name = model_name + '.weights' cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - _download(cfg_url, cfg_name) - _download(weights_url, weights_name) - net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0) + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) build_dtype = {} test_forward(net, build_dtype) LIB.free_network(net) @@ -220,9 +190,7 @@ def test_forward_yolov3(): weights_name = model_name + '.weights' cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true' weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - _download(cfg_url, cfg_name) - 
_download(weights_url, weights_name) - net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0) + net = _load_net(cfg_url, cfg_name, weights_url, weights_name) build_dtype = {} test_forward(net, build_dtype) LIB.free_network(net) diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py b/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py index ee5093e07a97c..ffd39f5b9976a 100644 --- a/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py +++ b/nnvm/tests/python/frontend/onnx/model_zoo/__init__.py @@ -3,22 +3,7 @@ import os import logging from .super_resolution import get_super_resolution - -def _download(url, filename, overwrite=False): - if os.path.isfile(filename) and not overwrite: - logging.debug('File %s existed, skip.', filename) - return - logging.debug('Downloading from url %s to %s', url, filename) - try: - import urllib.request - urllib.request.urlretrieve(url, filename) - except: - import urllib - urllib.urlretrieve(url, filename) - -def _as_abs_path(fname): - cur_dir = os.path.abspath(os.path.dirname(__file__)) - return os.path.join(cur_dir, fname) +from tvm.contrib.download import download_testdata URLS = { @@ -30,9 +15,9 @@ def _as_abs_path(fname): # download and add paths for k, v in URLS.items(): name = k.split('.')[0] - path = _as_abs_path(k) - _download(v, path, False) - locals()[name] = path + relpath = os.path.join('onnx', k) + abspath = download_testdata(v, relpath, module='onnx') + locals()[name] = abspath # symbol for graph comparison super_resolution_sym = get_super_resolution() diff --git a/nnvm/tutorials/deploy_model_on_mali_gpu.py b/nnvm/tutorials/deploy_model_on_mali_gpu.py index 6e3962a6609fe..8a495961e4353 100644 --- a/nnvm/tutorials/deploy_model_on_mali_gpu.py +++ b/nnvm/tutorials/deploy_model_on_mali_gpu.py @@ -15,6 +15,7 @@ import nnvm.testing from tvm import rpc from tvm.contrib import util, graph_runtime as runtime +from tvm.contrib.download import download_testdata ###################################################################### # Build TVM Runtime on Device @@ -81,7 +82,6 @@ # You can found more details about this part at tutorial :ref:`tutorial-from-mxnet`. from mxnet.gluon.model_zoo.vision import get_model -from mxnet.gluon.utils import download from PIL import Image import numpy as np @@ -92,8 +92,9 @@ # In order to test our model, here we download an image of cat and # transform its format. 
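# The tutorials and tests below switch from per-script download helpers to
# tvm.contrib.download.download_testdata. Per the download.py hunk at the end of
# this diff, files are cached under ~/.tvm_test_data/<module>/<relpath> and the
# absolute cached path is returned, so repeated runs skip the download. A small
# sketch of the two forms used in this diff; the example.com URLs and file names
# are placeholders.
from tvm.contrib.download import download_testdata

img_path = download_testdata('https://example.com/cat.png', 'cat.png', module='data')
# img_path -> ~/.tvm_test_data/data/cat.png (downloaded once, then reused)
model_path = download_testdata('https://example.com/model.zip', 'model.zip',
                               module=['mxnet', 'ssd_model'])
# model_path -> ~/.tvm_test_data/mxnet/ssd_model/model.zip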
img_name = 'cat.png' -download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) -image = Image.open(img_name).resize((224, 224)) +img_path = download_testdata('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', + img_name, module='data') +image = Image.open(img_path).resize((224, 224)) def transform_image(image): image = np.array(image) - np.array([123., 117., 104.]) @@ -112,9 +113,9 @@ def transform_image(image): '596b27d23537e5a1b5751d2b0481ef172f58b539/', 'imagenet1000_clsid_to_human.txt']) -synset_name = 'synset.txt' -download(synset_url, synset_name) -with open(synset_name) as f: +synset_name = 'imagenet1000_clsid_to_human.txt' +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: synset = eval(f.read()) ###################################################################### diff --git a/nnvm/tutorials/deploy_model_on_rasp.py b/nnvm/tutorials/deploy_model_on_rasp.py index c110d7ffdc5c8..b2fad1fd9e0b3 100644 --- a/nnvm/tutorials/deploy_model_on_rasp.py +++ b/nnvm/tutorials/deploy_model_on_rasp.py @@ -14,6 +14,7 @@ import nnvm.testing from tvm import rpc from tvm.contrib import util, graph_runtime as runtime +from tvm.contrib.download import download_testdata ###################################################################### # .. _build-tvm-runtime-on-device: @@ -78,7 +79,6 @@ # You can found more details about this part at tutorial :ref:`tutorial-from-mxnet`. from mxnet.gluon.model_zoo.vision import get_model -from mxnet.gluon.utils import download from PIL import Image import numpy as np @@ -89,8 +89,9 @@ # In order to test our model, here we download an image of cat and # transform its format. img_name = 'cat.png' -download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) -image = Image.open(img_name).resize((224, 224)) +img_path = download_testdata('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', + img_name, module='data') +image = Image.open(img_path).resize((224, 224)) def transform_image(image): image = np.array(image) - np.array([123., 117., 104.]) @@ -108,9 +109,9 @@ def transform_image(image): '4d0b62f3d01426887599d4f7ede23ee5/raw/', '596b27d23537e5a1b5751d2b0481ef172f58b539/', 'imagenet1000_clsid_to_human.txt']) -synset_name = 'synset.txt' -download(synset_url, synset_name) -with open(synset_name) as f: +synset_name = 'imagenet1000_clsid_to_human.txt' +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: synset = eval(f.read()) ###################################################################### diff --git a/nnvm/tutorials/deploy_ssd_mxnet.py b/nnvm/tutorials/deploy_ssd_mxnet.py index 1a71c96eaa0c0..ccccd35f04a69 100644 --- a/nnvm/tutorials/deploy_ssd_mxnet.py +++ b/nnvm/tutorials/deploy_ssd_mxnet.py @@ -18,7 +18,7 @@ from nnvm import compiler from nnvm.frontend import from_mxnet from tvm import relay -from tvm.contrib.download import download +from tvm.contrib.download import download_testdata from tvm.contrib import graph_runtime from mxnet.model import load_checkpoint @@ -65,28 +65,24 @@ inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \ "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip" -dir = "ssd_model" -if not os.path.exists(dir): - os.makedirs(dir) -model_file_path = "%s/%s" % (dir, model_file) -test_image_path = "%s/%s" % (dir, test_image) -inference_symbol_path = "%s/inference_model.zip" % dir -download(model_url, 
model_file_path) -download(image_url, test_image_path) -download(inference_symbol_url, inference_symbol_path) +model_file_path = download_testdata(model_url, model_file, module=["mxnet", "ssd_model"]) +inference_symbol_path = download_testdata(inference_symbol_url, "inference_model.zip", + module=["mxnet", "ssd_model"]) +test_image_path = download_testdata(image_url, test_image, module="data") +model_dir = os.path.dirname(model_file_path) zip_ref = zipfile.ZipFile(model_file_path, 'r') -zip_ref.extractall(dir) +zip_ref.extractall(model_dir) zip_ref.close() zip_ref = zipfile.ZipFile(inference_symbol_path) -zip_ref.extractall(dir) +zip_ref.extractall(model_dir) zip_ref.close() ###################################################################### # Convert and compile model with NNVM or Relay for CPU. -sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder)) -_, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0) +sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (model_dir, inference_symbol_folder)) +_, arg_params, aux_params = load_checkpoint("%s/%s" % (model_dir, model_name), 0) import argparse parser = argparse.ArgumentParser() diff --git a/nnvm/tutorials/from_coreml.py b/nnvm/tutorials/from_coreml.py index 1c958746247bc..b25645a4d6a13 100644 --- a/nnvm/tutorials/from_coreml.py +++ b/nnvm/tutorials/from_coreml.py @@ -21,19 +21,7 @@ import coremltools as cm import numpy as np from PIL import Image - -def download(url, path, overwrite=False): - import os - if os.path.isfile(path) and not overwrite: - print('File {} existed, skip.'.format(path)) - return - print('Downloading from url {} to {}'.format(url, path)) - try: - import urllib.request - urllib.request.urlretrieve(url, path) - except: - import urllib - urllib.urlretrieve(url, path) +from tvm.contrib.download import download_testdata ###################################################################### # Load pretrained CoreML model @@ -42,9 +30,9 @@ def download(url, path, overwrite=False): # provided by apple in this example model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' model_file = 'mobilenet.mlmodel' -download(model_url, model_file) +model_path = download_testdata(model_url, model_file, module='coreml') # now you mobilenet.mlmodel on disk -mlmodel = cm.models.MLModel(model_file) +mlmodel = cm.models.MLModel(model_path) # we can load the graph as NNVM compatible model sym, params = nnvm.frontend.from_coreml(mlmodel) @@ -54,8 +42,8 @@ def download(url, path, overwrite=False): # A single cat dominates the examples! 
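# Several tutorials touched by this diff share the same input preprocessing once the
# test image has been fetched: resize to the network input size, reorder HWC -> CHW,
# and add a batch dimension (the MXNet examples additionally subtract a per-channel
# mean). A small consolidated sketch; the 224x224 size and the mean values are taken
# from the surrounding tutorial code, and the mean step does not apply to the CoreML
# example below.
import numpy as np
from PIL import Image

def preprocess(img_path, size=(224, 224), mean=(123., 117., 104.)):
    image = Image.open(img_path).resize(size)
    x = np.array(image).astype('float32')          # HWC, RGB
    if mean is not None:
        x = x - np.array(mean)                     # per-channel mean subtraction
    return x.transpose((2, 0, 1))[np.newaxis, :]   # -> NCHW, batch size 1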
from PIL import Image img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' -download(img_url, 'cat.png') -img = Image.open('cat.png').resize((224, 224)) +img_path = download_testdata(img_url, 'cat.png', module='data') +img = Image.open(img_path).resize((224, 224)) #x = np.transpose(img, (2, 0, 1))[np.newaxis, :] image = np.asarray(img) image = image.transpose((2, 0, 1)) @@ -95,8 +83,8 @@ def download(url, path, overwrite=False): '4d0b62f3d01426887599d4f7ede23ee5/raw/', '596b27d23537e5a1b5751d2b0481ef172f58b539/', 'imagenet1000_clsid_to_human.txt']) -synset_name = 'synset.txt' -download(synset_url, synset_name) -with open(synset_name) as f: +synset_name = 'imagenet1000_clsid_to_human.txt' +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: synset = eval(f.read()) print('Top-1 id', top1, 'class name', synset[top1]) diff --git a/nnvm/tutorials/from_darknet.py b/nnvm/tutorials/from_darknet.py index f0eec98c00ea1..29053ea793c71 100644 --- a/nnvm/tutorials/from_darknet.py +++ b/nnvm/tutorials/from_darknet.py @@ -25,7 +25,7 @@ import sys from ctypes import * -from tvm.contrib.download import download +from tvm.contrib.download import download_testdata from nnvm.testing.darknet import __darknetffi__ # Model name @@ -41,8 +41,8 @@ CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME -download(CFG_URL, CFG_NAME) -download(WEIGHTS_URL, WEIGHTS_NAME) +cfg_path = download_testdata(CFG_URL, CFG_NAME, module="darknet") +weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module="darknet") # Download and Load darknet library if sys.platform in ['linux', 'linux2']: @@ -55,12 +55,10 @@ err = "Darknet lib is not supported on {} platform".format(sys.platform) raise NotImplementedError(err) -download(DARKNET_URL, DARKNET_LIB) +lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module="darknet") -DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB) -cfg = "./" + str(CFG_NAME) -weights = "./" + str(WEIGHTS_NAME) -net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) +DARKNET_LIB = __darknetffi__.dlopen(lib_path) +net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) dtype = 'float32' batch_size = 1 @@ -88,9 +86,9 @@ print("Loading the test image...") img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \ test_image + '?raw=true' -download(img_url, test_image) +img_path = download_testdata(img_url, test_image, "data") -data = nnvm.testing.darknet.load_image(test_image, netw, neth) +data = nnvm.testing.darknet.load_image(img_path, netw, neth) ###################################################################### # Execute on TVM Runtime # ---------------------- @@ -150,10 +148,10 @@ coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true' font_name = 'arial.ttf' font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true' -download(coco_url, coco_name) -download(font_url, font_name) +coco_path = download_testdata(coco_url, coco_name, module='data') +font_path = download_testdata(font_url, font_name, module='data') -with open(coco_name) as f: +with open(coco_path) as f: content = f.readlines() names = [x.strip() for x in content] diff --git a/nnvm/tutorials/from_mxnet.py b/nnvm/tutorials/from_mxnet.py index b4c2c5b7dfbdf..bc8a23e3e898e 100644 --- a/nnvm/tutorials/from_mxnet.py +++ 
b/nnvm/tutorials/from_mxnet.py @@ -20,30 +20,31 @@ """ # some standard imports import mxnet as mx +import numpy as np import nnvm import tvm -import numpy as np +from tvm.contrib.download import download_testdata ###################################################################### # Download Resnet18 model from Gluon Model Zoo # --------------------------------------------- # In this section, we download a pretrained imagenet model and classify an image. from mxnet.gluon.model_zoo.vision import get_model -from mxnet.gluon.utils import download from PIL import Image from matplotlib import pyplot as plt block = get_model('resnet18_v1', pretrained=True) +img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' img_name = 'cat.png' synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', '4d0b62f3d01426887599d4f7ede23ee5/raw/', '596b27d23537e5a1b5751d2b0481ef172f58b539/', 'imagenet1000_clsid_to_human.txt']) -synset_name = 'synset.txt' -download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) -download(synset_url, synset_name) -with open(synset_name) as f: +synset_name = 'imagenet1000_clsid_to_human.txt' +img_path = download_testdata(img_url, img_name, module='data') +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: synset = eval(f.read()) -image = Image.open(img_name).resize((224, 224)) +image = Image.open(img_path).resize((224, 224)) plt.imshow(image) plt.show() diff --git a/nnvm/tutorials/from_mxnet_to_webgl.py b/nnvm/tutorials/from_mxnet_to_webgl.py index 4e7b57706de63..540585d3c7c64 100644 --- a/nnvm/tutorials/from_mxnet_to_webgl.py +++ b/nnvm/tutorials/from_mxnet_to_webgl.py @@ -33,6 +33,7 @@ import numpy as np import tvm +from tvm.contrib.download import download_testdata import nnvm.compiler import nnvm.testing @@ -106,16 +107,14 @@ def download_synset(): print("Downloading synset...") - from mxnet import gluon - url = "https://gist.githubusercontent.com/zhreshold/" + \ "4d0b62f3d01426887599d4f7ede23ee5/raw/" + \ "596b27d23537e5a1b5751d2b0481ef172f58b539/" + \ "imagenet1000_clsid_to_human.txt" - file_name = "synset.txt" + file_name = "imagenet1000_clsid_to_human.txt" - gluon.utils.download(url, file_name) - with open(file_name) as f: + file_path = download_testdata(url, file_name, module='data') + with open(file_path) as f: synset = eval(f.read()) print("- Synset downloaded!") @@ -144,14 +143,13 @@ def download_image(): print("Downloading cat image...") from matplotlib import pyplot as plt - from mxnet import gluon from PIL import Image url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true" img_name = "cat.png" - gluon.utils.download(url, img_name) - image = Image.open(img_name).resize((224, 224)) + img_path = download_testdata(url, img_name, module='data') + image = Image.open(img_path).resize((224, 224)) print("- Cat image downloaded!") diff --git a/nnvm/tutorials/from_onnx.py b/nnvm/tutorials/from_onnx.py index 0fdef8afa98cf..c5fe03fd1c969 100644 --- a/nnvm/tutorials/from_onnx.py +++ b/nnvm/tutorials/from_onnx.py @@ -18,22 +18,10 @@ """ import nnvm import tvm +from tvm.contrib.download import download_testdata import onnx import numpy as np -def download(url, path, overwrite=False): - import os - if os.path.isfile(path) and not overwrite: - print('File {} existed, skip.'.format(path)) - return - print('Downloading from url {} to {}'.format(url, path)) - try: - import urllib.request - urllib.request.urlretrieve(url, path) - except: - import 
urllib - urllib.urlretrieve(url, path) - ###################################################################### # Load pretrained ONNX model # --------------------------------------------- @@ -44,9 +32,9 @@ def download(url, path, overwrite=False): 'bcda4716699ac97ea44f791c24310193/raw/', '93672b029103648953c4e5ad3ac3aadf346a4cdc/', 'super_resolution_0.2.onnx']) -download(model_url, 'super_resolution.onnx', True) +model_path = download_testdata(model_url, 'super_resolution.onnx', module='onnx') # now you have super_resolution.onnx on disk -onnx_model = onnx.load_model('super_resolution.onnx') +onnx_model = onnx.load_model(model_path) # we can load the graph as NNVM compatible model sym, params = nnvm.frontend.from_onnx(onnx_model) @@ -56,8 +44,8 @@ def download(url, path, overwrite=False): # A single cat dominates the examples! from PIL import Image img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' -download(img_url, 'cat.png') -img = Image.open('cat.png').resize((224, 224)) +img_path = download_testdata(img_url, 'cat.png', module='data') +img = Image.open(img_path).resize((224, 224)) img_ycbcr = img.convert("YCbCr") # convert to YCbCr img_y, img_cb, img_cr = img_ycbcr.split() x = np.array(img_y)[np.newaxis, np.newaxis, :, :] diff --git a/nnvm/tutorials/from_tensorflow.py b/nnvm/tutorials/from_tensorflow.py index ac632b122e76e..ab9ba03232a46 100644 --- a/nnvm/tutorials/from_tensorflow.py +++ b/nnvm/tutorials/from_tensorflow.py @@ -52,8 +52,8 @@ map_proto_url = os.path.join(repo_base, map_proto) # Human readable text for labels -lable_map = 'imagenet_synset_to_human_label_map.txt' -lable_map_url = os.path.join(repo_base, lable_map) +label_map = 'imagenet_synset_to_human_label_map.txt' +label_map_url = os.path.join(repo_base, label_map) # Target settings # Use these commented settings to build for cuda. @@ -70,19 +70,19 @@ # Download required files # ----------------------- # Download files listed above. -from mxnet.gluon.utils import download +from tvm.contrib.download import download_testdata -download(image_url, img_name) -download(model_url, model_name) -download(map_proto_url, map_proto) -download(lable_map_url, lable_map) +img_path = download_testdata(image_url, img_name, module='data') +model_path = download_testdata(model_url, model_name, module=['tf', 'InceptionV1']) +map_proto_path = download_testdata(map_proto_url, map_proto, module='data') +label_path = download_testdata(label_map_url, label_map, module='data') ###################################################################### # Import model # ------------ # Creates tensorflow graph definition from protobuf file. -with tf.gfile.FastGFile(os.path.join("./", model_name), 'rb') as f: +with tf.gfile.FastGFile(model_path, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) graph = tf.import_graph_def(graph_def, name='') @@ -103,7 +103,7 @@ # from PIL import Image -image = Image.open(img_name).resize((299, 299)) +image = Image.open(img_path).resize((299, 299)) x = np.array(image) @@ -117,7 +117,7 @@ # params: params converted from tensorflow params (tensor protobuf). sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout) -print ("Tensorflow protobuf imported as nnvm graph") +print("Tensorflow protobuf imported as nnvm graph") ###################################################################### # NNVM Compilation # ---------------- @@ -157,8 +157,8 @@ predictions = np.squeeze(predictions) # Creates node ID --> English string lookup. 
-node_lookup = tf_testing.NodeLookup(label_lookup_path=os.path.join("./", map_proto), - uid_lookup_path=os.path.join("./", lable_map)) +node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, + uid_lookup_path=label_path) # Print top 5 predictions from TVM output. top_k = predictions.argsort()[-5:][::-1] @@ -175,7 +175,7 @@ def create_graph(): """Creates a graph from saved GraphDef file and returns a saver.""" # Creates graph from saved graph_def.pb. - with tf.gfile.FastGFile(model_name, 'rb') as f: + with tf.gfile.FastGFile(model_path, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) graph = tf.import_graph_def(graph_def, name='') @@ -209,8 +209,8 @@ def run_inference_on_image(image): predictions = np.squeeze(predictions) # Creates node ID --> English string lookup. - node_lookup = tf_testing.NodeLookup(label_lookup_path=os.path.join("./", map_proto), - uid_lookup_path=os.path.join("./", lable_map)) + node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, + uid_lookup_path=label_path) # Print top 5 predictions from tensorflow. top_k = predictions.argsort()[-5:][::-1] @@ -220,4 +220,4 @@ def run_inference_on_image(image): score = predictions[node_id] print('%s (score = %.5f)' % (human_string, score)) -run_inference_on_image (img_name) +run_inference_on_image(img_path) diff --git a/nnvm/tutorials/nlp/from_darknet_rnn.py b/nnvm/tutorials/nlp/from_darknet_rnn.py index 54013f04fca63..bbf70c724bbe4 100644 --- a/nnvm/tutorials/nlp/from_darknet_rnn.py +++ b/nnvm/tutorials/nlp/from_darknet_rnn.py @@ -22,9 +22,9 @@ """ import random import numpy as np -from mxnet.gluon.utils import download import tvm from tvm.contrib import graph_runtime +from tvm.contrib.download import download_testdata from nnvm.testing.darknet import __darknetffi__ import nnvm import nnvm.frontend.darknet @@ -49,17 +49,15 @@ CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true' -download(CFG_URL, CFG_NAME) -download(WEIGHTS_URL, WEIGHTS_NAME) +cfg_path = download_testdata(CFG_URL, CFG_NAME, module='darknet') +weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module='darknet') # Download and Load darknet library DARKNET_LIB = 'libdarknet.so' DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true' -download(DARKNET_URL, DARKNET_LIB) -DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB) -cfg = "./" + str(CFG_NAME) -weights = "./" + str(WEIGHTS_NAME) -net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) +lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module='darknet') +DARKNET_LIB = __darknetffi__.dlopen(lib_path) +net = DARKNET_LIB.load_network(cfg_path.encode('utf-8'), weights_path.encode('utf-8'), 0) dtype = 'float32' batch_size = 1 diff --git a/nnvm/tutorials/nlp/keras_s2s_translate.py b/nnvm/tutorials/nlp/keras_s2s_translate.py index 77c7f23902f4c..97c7a706a80fd 100644 --- a/nnvm/tutorials/nlp/keras_s2s_translate.py +++ b/nnvm/tutorials/nlp/keras_s2s_translate.py @@ -59,9 +59,9 @@ data_url = os.path.join(repo_base, data_file) # Download files listed below. -from mxnet.gluon.utils import download -download(model_url, model_file) -download(data_url, model_file) +from tvm.contrib.download import download_testdata +model_path = download_testdata(model_url, model_file, module='keras') +data_path = download_testdata(data_url, data_file, module='data') latent_dim = 256 # Latent dimensionality of the encoding space. 
test_samples = 10000 # Number of samples used for testing. @@ -76,7 +76,7 @@ target_texts = [] input_characters = set() target_characters = set() -with open(data_file, 'r', encoding='utf-8') as f: +with open(data_path, 'r', encoding='utf-8') as f: lines = f.read().split('\n') test_samples = min(test_samples, len(lines)) max_encoder_seq_length = 0 @@ -112,7 +112,7 @@ # Load Keras Model # ---------------- # Restore the model and construct the encoder and decoder. -model = load_model(model_file) +model = load_model(model_path) encoder_inputs = model.input[0] # input_1 encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output # lstm_1 diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f77a13bcd2ed5..7f65f2e88dac2 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -19,7 +19,7 @@ from ... import ir_pass, build, build_config, nd, TVMError, register_func, \ rpc as _rpc, target as _target -from ...contrib import nvcc, ndk +from ...contrib import nvcc, ndk, tar from ..util import get_const_tuple from ..env import AutotvmGlobalScope @@ -58,20 +58,20 @@ class LocalBuilder(Builder): build_func: callable or str If is 'default', use default build function If is 'ndk', use function for android ndk - If is callable, use it as custom build function + If is callable, use it as custom build function, expect lib_format field. """ def __init__(self, timeout=10, n_parallel=None, build_func='default'): super(LocalBuilder, self).__init__(timeout, n_parallel) if isinstance(build_func, str): if build_func == 'default': - build_func = default_build_func + build_func = tar.tar elif build_func == 'ndk': - build_func = android_ndk_build_func + build_func = ndk.create_shared else: raise ValueError("Invalid build_func" + build_func) - self.build_func = build_func + self.build_func = _wrap_build_func(build_func) self.executor = LocalExecutor(timeout=timeout) self.tmp_dir = tempfile.mkdtemp() @@ -349,46 +349,47 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) -def default_build_func(measure_input, tmp_dir, **kwargs): +def _wrap_build_func(build_func): """ - Default build func. This can work for cuda, opencl, llvm backend + Wrap build_func to a function that can be used in measure. Parameters ---------- - measure_input: MeasureInput - The input of measurement - tmp_dir: str - The path of temporary directory to export generated library - """ - tic = time.time() - try: - filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64)) - func, arg_info = _build_func_common(measure_input, **kwargs) - func.export_library(filename) - except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) - - -def android_ndk_build_func(measure_input, tmp_dir, **kwargs): - """ - Build function for android device using ndk. 
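# The LocalBuilder change in this hunk routes every build_func through
# _wrap_build_func, which requires the callable to carry an "output_format"
# attribute (tar.tar and ndk.create_shared already do) and passes the callable
# straight to export_library. A sketch of supplying a custom cross-compiling
# build function through tvm.contrib.cc.cross_compiler, which is added elsewhere
# in this diff; the toolchain name and flags below are only examples.
from tvm import autotvm
from tvm.contrib import cc

# cross_compiler attaches output_format ("so" by default) to the returned callable.
aarch64_build = cc.cross_compiler('aarch64-linux-gnu-g++', options=['-O2'])
builder = autotvm.LocalBuilder(build_func=aarch64_build)
# builder can then be used as usual, e.g. in autotvm.measure_option(builder=builder, ...).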
+ build_func : The compilation function + We expect fcompile to contain an attr "output_format" - Parameters - ---------- - measure_input: MeasureInput - The input of measurement - tmp_dir: str - The path of temporary directory to export generated library + Returns + ------- + wrapped_build_func : function + The wrapped build function """ - tic = time.time() - try: - filename = os.path.join(tmp_dir, "tmp_func_%0x.so" % getrandbits(64)) - func, arg_info = _build_func_common(measure_input, **kwargs) - func.export_library(filename, ndk.create_shared) - except Exception as e: # pylint: disable=broad-except - return BuildResult(None, None, e, time.time() - tic) - return BuildResult(filename, arg_info, None, time.time() - tic) + if not hasattr(build_func, "output_format"): + raise AttributeError("Expect build_func to have the attribute output_format.") + output_format = build_func.output_format + + def _wrapped(measure_input, tmp_dir, **kwargs): + """ + Wrapped build func. + + Parameters + ---------- + measure_input: MeasureInput + The input of measurement + + tmp_dir: str + The path of temporary directory to export generated library + """ + tic = time.time() + try: + filename = os.path.join(tmp_dir, "tmp_func_%0x.%s" % ( + getrandbits(64), output_format)) + # TODO(tvm-team) consider linline _build_func_common + func, arg_info = _build_func_common(measure_input, **kwargs) + func.export_library(filename, build_func) + except Exception as e: # pylint: disable=broad-except + return BuildResult(None, None, e, time.time() - tic) + return BuildResult(filename, arg_info, None, time.time() - tic) + return _wrapped def run_through_rpc(measure_input, build_result, diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 82350dcebea87..9b6e70e056ae3 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -53,6 +53,7 @@ def extract_from_program(func, params, ops, target, target_host=None): topi.nn.group_conv2d_nchw], tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], tvm.relay.op.nn.dense: [topi.nn.dense], + tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], } topi_funcs = [] @@ -126,6 +127,7 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): topi.nn.group_conv2d_nchw], tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], tvm.relay.op.nn.dense: [topi.nn.dense], + tvm.relay.op.nn.contrib_deformable_conv2d: [topi.nn.deformable_conv2d_nchw], } topi_funcs = [] diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 882f2df43a50f..c184c6b469989 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -68,6 +68,7 @@ def __init__(self): topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw", topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw", topi.nn.dense: "topi_nn_dense", + topi.nn.deformable_conv2d_nchw: "topi_nn_deformable_conv2d_nchw", } self.topi_to_schedule = { @@ -78,6 +79,7 @@ def __init__(self): topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw], topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw], topi.nn.dense: [topi.generic.schedule_dense], + topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], } self._register_tracing() @@ -172,6 +174,15 @@ def _topi_nn_dense(*args, **kwargs): return s, [data, weight, bias, C] return s, [data, 
weight, C] + @register("topi_nn_deformable_conv2d_nchw") + def _topi_nn_deformable_conv2d_nchw(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + A, Offset, W = args[:3] + C = topi.nn.deformable_conv2d_nchw(*args, **kwargs) + s = topi.generic.schedule_deformable_conv2d_nchw([C]) + return s, [A, Offset, W, C] + def reset(self, wanted_topi_funcs): """Reset task collections diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index ee84da8209028..09822e594b75d 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -29,7 +29,7 @@ def create_shared(output, cc : str, optional The compile string. """ - if sys.platform == "darwin" or sys.platform.startswith('linux'): + if sys.platform == "darwin" or sys.platform.startswith("linux"): _linux_shared(output, objects, options, cc) elif sys.platform == "win32": _windows_shared(output, objects, options) @@ -37,6 +37,38 @@ def create_shared(output, raise ValueError("Unsupported platform") +# assign so as default output format +create_shared.output_format = "so" if sys.platform != "win32" else "dll" + + +def cross_compiler(cc, options=None, output_format="so"): + """Create a cross compiler function. + + Parameters + ---------- + cc : str + The cross compiler name. + + options : list, optional + List of additional optional string. + + output_format : str, optional + Library output format. + + Returns + ------- + fcompile : function + A compilation function that can be passed to export_library. + """ + def _fcompile(outputs, objects, opts=None): + opts = opts if opts else [] + if options: + opts += options + _linux_shared(outputs, objects, opts, cc=cc) + _fcompile.output_format = output_format + return _fcompile + + def _linux_shared(output, objects, options, cc="g++"): cmd = [cc] cmd += ["-shared", "-fPIC"] diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 101af6887c47e..7e3e97e2cae1a 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -1,9 +1,18 @@ """Graph debug results dumping class.""" -import os +import collections import json +import os +import numpy as np import tvm GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json' +CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json" + +ChromeTraceEvent = collections.namedtuple( + 'ChromeTraceEvent', + ['ts', 'tid', 'pid', 'name', 'ph'] +) + class DebugResult(object): """Graph debug data module. @@ -127,6 +136,45 @@ def dump_output_tensor(self): with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f: param_f.write(save_tensors(output_tensors)) + def dump_chrome_trace(self): + """Dump the trace to the Chrome trace.json format. + """ + def s_to_us(t): + return t * 10 ** 6 + + starting_times = np.zeros(len(self._time_list) + 1) + starting_times[1:] = np.cumsum([times[0] for times in self._time_list]) + + def node_to_events(node, times, starting_time): + return [ + ChromeTraceEvent( + ts=s_to_us(starting_time), + tid=1, + pid=1, + ph='B', + name=node['name'], + ), + ChromeTraceEvent( + # Use start + duration instead of end to ensure precise timings. 
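# Hedged usage sketch for the cross_compiler helper added to tvm.contrib.cc
# above: it returns an fcompile-style closure whose "output_format" attribute
# lets export_library and the autotvm builder pick the right file suffix.
# The toolchain name and output path are examples only.
from tvm.contrib import cc

fcompile = cc.cross_compiler("aarch64-linux-gnu-g++", options=["-O2"])
assert fcompile.output_format == "so"
# lib.export_library("deploy.so", fcompile)   # `lib` would be a built tvm module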
+ ts=s_to_us(times[0] + starting_time), + tid=1, + pid=1, + ph='E', + name=node['name'], + ), + ] + events = [ + e for (node, times, starting_time) in zip( + self._nodes_list, self._time_list, starting_times) + for e in node_to_events(node, times, starting_time)] + result = dict( + displayTimeUnit='ns', + traceEvents=[e._asdict() for e in events] + ) + + with open(os.path.join(self._dump_path, CHROME_TRACE_FILE_NAME), "w") as trace_f: + json.dump(result, trace_f) + def dump_graph_json(self, graph): """Dump json formatted graph. diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 725f212fce007..5fcabcc6507e5 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -220,7 +220,9 @@ def run(self, **input_dict): self._run_debug() # Step 2. Dump the output tensors to the dump folder self.debug_datum.dump_output_tensor() - # Step 3. Display the collected information + # Step 3. Dump the Chrome trace to the dump folder + self.debug_datum.dump_chrome_trace() + # Step 4. Display the collected information self.debug_datum.display_debug_result() def run_individual(self, number, repeat=1, min_repeat_ms=0): diff --git a/python/tvm/contrib/download.py b/python/tvm/contrib/download.py index 0dcbb56ad663f..ed1740eb92e83 100644 --- a/python/tvm/contrib/download.py +++ b/python/tvm/contrib/download.py @@ -5,8 +5,10 @@ import os import sys import time +import uuid +import shutil -def download(url, path, overwrite=False, size_compare=False, verbose=1): +def download(url, path, overwrite=False, size_compare=False, verbose=1, retries=3): """Downloads the file from the internet. Set the input options correctly to overwrite or do the size comparison @@ -26,6 +28,9 @@ def download(url, path, overwrite=False, size_compare=False, verbose=1): verbose: int, optional Verbose level + + retries: int, optional + Number of time to retry download, default at 3. """ if sys.version_info >= (3,): import urllib.request as urllib2 @@ -53,6 +58,11 @@ def download(url, path, overwrite=False, size_compare=False, verbose=1): # Stateful start time start_time = time.time() + dirpath = os.path.dirname(path) + if not os.path.isdir(dirpath): + os.makedirs(dirpath) + random_uuid = str(uuid.uuid4()) + tempfile = os.path.join(dirpath, random_uuid) def _download_progress(count, block_size, total_size): #pylint: disable=unused-argument @@ -68,11 +78,62 @@ def _download_progress(count, block_size, total_size): (percent, progress_size / (1024.0 * 1024), speed, duration)) sys.stdout.flush() - if sys.version_info >= (3,): - urllib2.urlretrieve(url, path, reporthook=_download_progress) - print("") + while retries >= 0: + # Disable pyling too broad Exception + # pylint: disable=W0703 + try: + if sys.version_info >= (3,): + urllib2.urlretrieve(url, tempfile, reporthook=_download_progress) + print("") + else: + f = urllib2.urlopen(url) + data = f.read() + with open(tempfile, "wb") as code: + code.write(data) + shutil.move(tempfile, path) + break + except Exception as err: + retries -= 1 + if retries == 0: + os.remove(tempfile) + raise err + else: + print("download failed due to {}, retrying, {} attempt{} left" + .format(repr(err), retries, 's' if retries > 1 else '')) + + +TEST_DATA_ROOT_PATH = os.path.join(os.path.expanduser('~'), '.tvm_test_data') +if not os.path.exists(TEST_DATA_ROOT_PATH): + os.mkdir(TEST_DATA_ROOT_PATH) + +def download_testdata(url, relpath, module=None): + """Downloads the test data from the internet. 
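# The debug runtime now also writes CHROME_TRACE_FILE_NAME
# ("_tvmdbg_execution_trace.json") into its dump folder; the file is ordinary
# Chrome trace JSON that can be opened at chrome://tracing. A hand-written
# sample of the layout produced by dump_chrome_trace above (values invented):
import json
sample = {"displayTimeUnit": "ns",
          "traceEvents": [
              {"ts": 0.0,  "tid": 1, "pid": 1, "name": "conv0", "ph": "B"},
              {"ts": 42.0, "tid": 1, "pid": 1, "name": "conv0", "ph": "E"}]}
print(json.dumps(sample, indent=2))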
+ + Parameters + ---------- + url : str + Download url. + + relpath : str + Relative file path. + + module : Union[str, list, tuple], optional + Subdirectory paths under test data folder. + + Returns + ------- + abspath : str + Absolute file path of downloaded file + """ + global TEST_DATA_ROOT_PATH + if module is None: + module_path = '' + elif isinstance(module, str): + module_path = module + elif isinstance(module, (list, tuple)): + module_path = os.path.join(*module) else: - f = urllib2.urlopen(url) - data = f.read() - with open(path, "wb") as code: - code.write(data) + raise ValueError("Unsupported module: " + module) + abspath = os.path.join(TEST_DATA_ROOT_PATH, module_path, relpath) + download(url, abspath, overwrite=False, size_compare=True) + return abspath diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py index 9fd0e7ed2cbad..3aa874f3a65ca 100644 --- a/python/tvm/contrib/nnpack.py +++ b/python/tvm/contrib/nnpack.py @@ -149,11 +149,12 @@ def convolution_inference_without_weight_transform( ins[1], ins[2] if bias is not None else 0, outs[0], padding[0], padding[1], padding[2], padding[3], - stride[0], stride[1], nthreads, algorithm), name="C") + stride[0], stride[1], nthreads, algorithm), name="C", dtype='float32') def convolution_inference_weight_transform( kernel, nthreads=1, - algorithm=ConvolutionAlgorithm.AUTO): + algorithm=ConvolutionAlgorithm.AUTO, + dtype='float32'): """Create an extern op to do inference convolution of 3D tensor data and 4D tensor kernel and 1D tensor bias with nnpack. @@ -171,13 +172,14 @@ def convolution_inference_weight_transform( """ assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16) output_channels, input_channels, _, _ = kernel.shape - transform_tile_size = 8 + if not isinstance(dtype, str): + dtype = dtype.dtype return _api.extern( (output_channels, input_channels, transform_tile_size, transform_tile_size), [kernel], lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.convolution_inference_weight_transform", - ins[0], outs[0], nthreads, algorithm), name="transform_kernel") + ins[0], outs[0], nthreads, algorithm), name="transform_kernel", dtype=dtype) _init_api("tvm.contrib.nnpack") diff --git a/python/tvm/contrib/tar.py b/python/tvm/contrib/tar.py index 7e075d7a5697e..741a9140d741f 100644 --- a/python/tvm/contrib/tar.py +++ b/python/tvm/contrib/tar.py @@ -42,6 +42,9 @@ def tar(output, files): msg += py_str(out) raise RuntimeError(msg) +# assign output format +tar.output_format = "tar" + def untar(tar_file, directory): """Unpack all tar files into the directory diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index a43dc9ae2bfe3..99f5938635222 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -98,6 +98,9 @@ def create_dylib(output, objects, arch, sdk="macosx"): raise RuntimeError(msg) +# assign so as default output format +create_dylib.output_format = "dylib" + def compile_metal(code, path_target=None, sdk="macosx"): """Compile metal with CLI tool from env. 
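# Usage sketch for the download_testdata helper defined above: files land under
# ~/.tvm_test_data/<module...>/<relpath>, are fetched once (with the new retry
# logic in download), then reused. The URL and names are illustrative only.
from tvm.contrib.download import download_testdata

model_path = download_testdata("https://example.com/model.onnx",
                               "model.onnx", module=["onnx", "resnet"])
# -> ~/.tvm_test_data/onnx/resnet/model.onnx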
diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py index c5ab5cdbde5d5..cb4bcf53a2649 100644 --- a/python/tvm/relay/expr_functor.py +++ b/python/tvm/relay/expr_functor.py @@ -20,9 +20,8 @@ def __init__(self): # pylint: disable=no-else-return def visit(self, expr): """Apply the visitor to an expression.""" - found = self.memo_map.get(expr) - if found: - return found + if expr in self.memo_map: + return self.memo_map[expr] if isinstance(expr, Function): res = self.visit_function(expr) @@ -58,7 +57,6 @@ def visit(self, expr): raise Exception("warning unhandled case: {0}".format(type(expr))) self.memo_map[expr] = res - return res def visit_function(self, _): diff --git a/python/tvm/relay/frontend/caffe2.py b/python/tvm/relay/frontend/caffe2.py index 519dfc185add3..769740df0be3a 100755 --- a/python/tvm/relay/frontend/caffe2.py +++ b/python/tvm/relay/frontend/caffe2.py @@ -1,6 +1,7 @@ # pylint: disable=import-self, invalid-name, line-too-long, unused-argument """Caffe2 frontend""" from __future__ import absolute_import as _abs +import tvm from .. import ir_pass from .. import expr as _expr from .. import op as _op @@ -15,7 +16,8 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - raise NotImplementedError("Only 2d kernel supported.") + raise tvm.error.OpAttributeUnimplemented( + 'Non-2D kernels are not supported for operator {}2d'.format(prefix)) return _impl @@ -27,7 +29,8 @@ def revert_caffe2_pad(pads): elif len(pads) == 2: pass else: - raise ValueError("Invalid caffe2 type padding: {}".format(pads)) + raise tvm.error.OpAttributeInvalid( + 'Number of pads must equal 2 or 4.') return pads @@ -103,8 +106,8 @@ def get_converter(cls): if hasattr(cls, '_impl'): return getattr(cls, '_impl') - raise NotImplementedError('{} not implemented'.format( - cls.__name__)) + raise tvm.error.OpNotInplemented( + 'Operator {} is not supported in frontend Caffe2.'.format(cls.__name__)) _caffe2_internal_args = [ @@ -224,8 +227,8 @@ def _get_axis_from_order_str(order): return 1 if order == 'NHWC': return 3 - raise RuntimeError( - "Unsupported storage order: {} in caffe2".format(order)) + raise tvm.error.OpAttributeUnimplemented( + 'Order {} is not supported in operator Concat.'.format(order)) return AttrCvt( op_name='concatenate', @@ -517,8 +520,8 @@ def _convert_operator(self, # Add a sanitizing step to convert all byte strings in args to strings func = convert_map[op_type](inputs, args, self._params) else: - raise NotImplementedError( - "Operator {} not implemented.".format(op_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Caffe2.'.format(op_type)) return func diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index ef9f63f3cd958..d07f2af3e08bc 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -255,7 +255,8 @@ def get_expr(self, name): def set_expr(self, name, expr): assert isinstance(expr, _expr.Expr) - self.exprs[name] = expr + if name not in self.exprs: + self.exprs[name] = expr def set_padding(self, paddings): self.paddings = paddings @@ -320,6 +321,10 @@ def __call__(self, inputs, attrs, *args): else: assert callable(self._op_name), "op_name can either be string or callable" op_name = self._op_name(attrs) + + # ignore 'tvm_custom' always + self._ignores.append('tvm_custom') + # convert attributes new_attrs = {} for k in attrs.keys(): @@ -328,7 +333,8 @@ def __call__(self, inputs, attrs, *args): elif k in 
self._disables: logging.warning("Attribute %s is disabled in relay.sym.%s", k, op_name) elif k in self._ignores: - logging.debug("Attribute %s is ignored in relay.sym.%s", k, op_name) + if k != 'tvm_custom': + logging.warning("Attribute %s is ignored in relay.sym.%s", k, op_name) elif k in self._transforms: new_name, defaults, transform = self._parse_default(self._transforms[k]) if defaults is None: @@ -415,4 +421,6 @@ def __init__(self, new_name): self._new_name = new_name def __call__(self, inputs, attrs, *args): + if 'tvm_custom' in attrs: + attrs.pop('tvm_custom') return get_relay_op(self._new_name)(*inputs, **attrs) diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index a4f9b39b70e2c..963b21f382970 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -1,6 +1,7 @@ # pylint: disable=invalid-name, import-self, unused-argument, unused-variable, inconsistent-return-statements """CoreML frontend.""" from __future__ import absolute_import as _abs +import tvm import numpy as np from .. import ir_pass from .. import expr as _expr @@ -81,7 +82,8 @@ def _BatchnormLayerParams(op, inexpr, etab): """Get layer of batchnorm parameter""" # this changes the symbol if op.instanceNormalization: - raise NotImplementedError("instance normalization not implemented") + raise tvm.error.OpNotImplemented( + 'Operator "instance normalization" is not supported in frontend CoreML.') else: params = {'gamma':etab.new_const(list(op.gamma.floatValue)), 'beta':etab.new_const(list(op.beta.floatValue)), @@ -142,7 +144,8 @@ def _ActivationParams(op, inexpr, etab): alpha_expr = etab.new_const(alpha) beta_expr = etab.new_const(beta) return _op.multiply(_op.log(_op.add(_op.exp(inexpr), beta_expr)), alpha_expr) - raise NotImplementedError('%s not implemented' % whichActivation) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(whichActivation)) def _ScaleLayerParams(op, inexpr, etab): @@ -164,7 +167,8 @@ def _PoolingLayerParams(op, inexpr, etab): return _op.nn.global_max_pool2d(inexpr) if op.type == 1: return _op.nn.global_avg_pool2d(inexpr) - raise NotImplementedError("Only max and average pooling implemented") + raise tvm.error.OpNotImplemented( + 'Only Max and Average Pooling are supported in frontend CoreML.') else: params = {'pool_size':list(op.kernelSize), @@ -184,7 +188,9 @@ def _PoolingLayerParams(op, inexpr, etab): params['padding'] = padding params['ceil_mode'] = True else: - raise NotImplementedError("Other convolution padding not implemented") + msg = 'PoolingPaddingType {} is not supported in operator Pooling.' 
+ op_name = op.WhichOneof('PoolingPaddingType') + raise tvm.error.OpAttributeUnimplemented(msg.format(op_name)) # consume padding layer if etab.in_padding: @@ -196,7 +202,8 @@ def _PoolingLayerParams(op, inexpr, etab): return _op.nn.max_pool2d(inexpr, **params) if op.type == 1: return _op.nn.avg_pool2d(inexpr, **params) - raise NotImplementedError("Only max and average pooling implemented") + raise tvm.error.OpNotImplemented( + 'Only Max and Average Pooling are supported in CoreML.') def _SoftmaxLayerParams(op, inexpr, etab): @@ -239,7 +246,8 @@ def _ConcatLayerParams(op, inexpr, etab): if not isinstance(inexpr, list): inexpr = [inexpr] if op.sequenceConcat: - raise NotImplementedError("Sequence Concat not supported") + raise tvm.error.OpNotImplemented( + 'Operator Sequence Concat is not supported in frontend CoreML.') ret = _op.concatenate(inexpr, axis=1) return ret @@ -255,14 +263,16 @@ def _PaddingLayerParams(op, inexpr, etab): if op.WhichOneof('PaddingType') == 'constant': constant = op.constant if constant.value != 0: - raise NotImplementedError("Padding value {} not supported.".format(constant.value)) + raise tvm.error.OpAttributeUnimplemented( + '{} is not supported in operator Padding.'.format(constant.value)) padding = [b.startEdgeSize for b in op.paddingAmounts.borderAmounts] padding2 = [b.endEdgeSize for b in op.paddingAmounts.borderAmounts] for i, j in zip(padding, padding2): assert i == j etab.set_padding(padding) else: - raise NotImplementedError("Only constant padding is supported now.") + raise tvm.error.OpNotImplemented( + 'Non-constant padding is not supported in frontend CoreML.') return inexpr @@ -273,8 +283,8 @@ def _PermuteLayerParams(op, inexpr, etab): def _UpsampleLayerParams(op, inexpr, etab): if op.scalingFactor[0] != op.scalingFactor[1]: - raise NotImplementedError("Upsampling only supported with same \ - height and width scaling factor.") + raise tvm.error.OpAttributeUnimplemented( + 'Upsample height and width must be equal.') interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR' return _op.nn.upsampling(inexpr, scale=op.scalingFactor[0], method=interpolationMode) @@ -364,7 +374,8 @@ def coreml_op_to_relay(op, inname, outname, etab): """ classname = type(op).__name__ if classname not in _convert_map: - raise NotImplementedError("%s is not supported" % (classname)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend CoreML.'.format(classname)) if isinstance(inname, _base.string_types): insym = etab.get_expr(inname) else: diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index f6f2d99e2ea5b..bd7cb4f3b1103 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -2,12 +2,13 @@ """Keras frontend.""" from __future__ import absolute_import as _abs import sys +import tvm import numpy as np from .. import ir_pass from .. import expr as _expr from .. import op as _op from ... import nd as _nd -from .common import ExprTable +from .common import ExprTable, new_var __all__ = ['from_keras'] @@ -91,7 +92,8 @@ def _convert_activation(inexpr, keras_layer, _): x = (_expr.const(0.2, dtype='float32') * inexpr) + _expr.const(0.5, dtype='float32') return _op.clip(x, a_min=0., a_max=1.) 
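# Sketch of what the switch to the tvm.error hierarchy (OpNotImplemented,
# OpAttributeInvalid, OpAttributeUnimplemented, ...) buys callers: frontend
# failures can be caught uniformly instead of guessing between TypeError and
# NotImplementedError. The tiny Keras model is illustrative only, and the
# return values follow the from_keras defined in this file.
import keras
import tvm
from tvm import relay

toy = keras.models.Sequential([keras.layers.Dense(2, input_shape=(4,))])
try:
    func, params = relay.frontend.from_keras(toy, shape={toy.input_names[0]: (1, 4)})
except tvm.error.OpNotImplemented as err:
    print("model uses an unsupported layer:", err)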
- raise TypeError("Unsupported activation type : {}".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) def _convert_advanced_activation(inexpr, keras_layer, etab): @@ -118,7 +120,8 @@ def _convert_advanced_activation(inexpr, keras_layer, etab): return _op.multiply(inexpr, _op.greater(inexpr, \ _expr.const(theta, dtype='float32')).astype('float32')) - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(act_type)) def _convert_merge(inexpr, keras_layer, _): @@ -136,7 +139,8 @@ def _convert_merge(inexpr, keras_layer, _): ret = _op.add(ret, inexpr[i]) ret = ret / _expr.const(len(inexpr), dtype='float32') else: - raise TypeError("Unsupported merge type : {}".format(merge_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(merge_type)) return ret @@ -150,7 +154,8 @@ def _convert_dense(inexpr, keras_layer, etab): if input_dim > 2: input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0]) if input_dim != 3 or input_shape[0] != 1 or input_shape[1] != 1: - raise ValueError("Cannot flatten the inputs with shape.", input_shape, " for dense.") + raise tvm.error.OpAttributeInvalid( + 'Input shape {} is not valid for operator Dense.'.format(input_shape)) inexpr = _op.squeeze(inexpr, axis=0) out = _op.nn.dense(data=inexpr, **params) if keras_layer.use_bias: @@ -214,7 +219,9 @@ def _convert_convolution(inexpr, keras_layer, etab): inexpr = _op.nn.pad(data=inexpr, pad_width=( (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + msg = 'Padding with {} is not supported for operator Convolution ' \ + 'in frontend Keras.' + raise tvm.error.OpAttributeUnimplemented(msg.format(keras_layer.padding)) if is_deconv: out = _op.nn.conv2d_transpose(data=inexpr, **params) else: @@ -260,7 +267,10 @@ def _convert_separable_convolution(inexpr, keras_layer, etab): inexpr = _op.nn.pad(data=inexpr, pad_width=( (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + msg = 'Padding with {} is not supported for operator Separable ' \ + 'Convolution in frontend Keras.' 
+ raise tvm.error.OpAttributeUnimplemented(msg.format(keras_layer.padding)) + depthconv = _op.nn.conv2d(data=inexpr, **params0) # pointwise conv weight1 = weightList[1].transpose([3, 2, 0, 1]) @@ -313,13 +323,15 @@ def _convert_pooling(inexpr, keras_layer, etab): pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + raise tvm.error.OpAttributeUnimplemented( + 'Padding with {} is not supported in operator Pooling.'.format(keras_layer.padding)) if pool_type == 'MaxPooling2D': return _op.nn.max_pool2d(inexpr, **params) if pool_type == 'AveragePooling2D': params['count_include_pad'] = False return _op.nn.avg_pool2d(inexpr, **params) - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend Keras.'.format(keras_layer)) def _convert_upsample(inexpr, keras_layer, _): @@ -331,8 +343,8 @@ def _convert_upsample(inexpr, keras_layer, _): elif upsample_type == 'UpSampling2D': h, w = keras_layer.size if h != w: - raise TypeError("Unsupported upsampling type with different axes size : {}" - .format(keras_layer.size)) + raise tvm.error.OpAttributeInvalid( + 'Height must equal width for operator Upsample.') params = {'scale': h} if hasattr(keras_layer, 'interpolation'): @@ -345,24 +357,24 @@ def _convert_upsample(inexpr, keras_layer, _): elif upsample_type == 'UpSampling3D': h, w, d = keras_layer.size if h != w or w != d: - raise TypeError("Unsupported upsampling type with different axes size : {}" - .format(keras_layer.size)) + raise tvm.error.OpAttributeInvalid( + 'Height, width, and depth must all be equal for operator Upsample.') params = {'scale': h} else: - raise TypeError("Unsupported upsampling type : {}".format(upsample_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend Keras.'.format(upsample_type)) return _op.nn.upsampling(inexpr, **params) def _convert_cropping(inexpr, keras_layer, _): _check_data_format(keras_layer) crop_type = type(keras_layer).__name__ - if crop_type == 'Cropping1D': - raise NotImplementedError("Cropping1D not implemented") - elif crop_type == 'Cropping2D': + if crop_type == 'Cropping2D': (_, in_h, in_w, _) = keras_layer.input_shape ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping else: - raise TypeError("Unrecognized cropping type : {}".format(crop_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend Keras.'.format(crop_type)) int32_max = np.iinfo(np.int32).max return _op.strided_slice(inexpr, begin=[0, 0, crop_t, crop_l], \ end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r]) @@ -407,14 +419,18 @@ def _convert_padding(inexpr, keras_layer, _): top, bottom = padding[0] left, right = padding[1] else: - raise ValueError("Unrecognized padding option: {}".format(str(padding))) + msg = 'Value {} in attribute "padding" of operator Padding ' \ + 'is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(str(padding))) else: - raise ValueError("Unrecognized padding option: {}".format(str(padding))) - elif padding_type == 'ZeroPadding1D': - raise NotImplementedError("ZeroPadding1D not implemented") + msg = 'Value {} in attribute "padding" of operator Padding is ' \ + 'not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(str(padding))) else: - raise ValueError("Unrecognized padding type: {}".format(padding_type)) - return _op.nn.pad(data=inexpr, pad_width=((0, 0), (0, 0), (top, bottom), (left, right))) + msg = 'Operator {} is not supported in frontend Keras.' + raise tvm.error.OpNotImplemented(msg.format(padding_type)) + return _op.nn.pad(data=inexpr, + pad_width=((0, 0), (0, 0), (top, bottom), (left, right))) def _convert_concat(inexpr, keras_layer, _): @@ -601,8 +617,10 @@ def _default_skip(inexpr, keras_layer, _): # pylint: disable=unused-argument def _check_unsupported_layers(model): for layer in model.layers: - if type(layer).__name__ not in _convert_map: - raise ValueError("Keras layer {} not supported.".format(type(layer).__name__)) + op_name = type(layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend Keras.'.format(op_name)) def keras_op_to_relay(inexpr, keras_layer, outname, etab): @@ -622,9 +640,11 @@ def keras_op_to_relay(inexpr, keras_layer, outname, etab): etab : relay.frontend.common.ExprTable The global expression table to be updated. """ - if type(keras_layer).__name__ not in _convert_map: - raise NotImplementedError("{} is not supported".format((type(keras_layer).__name__))) - outs = _convert_map[type(keras_layer).__name__](inexpr, keras_layer, etab) + op_name = type(keras_layer).__name__ + if op_name not in _convert_map: + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend Keras.'.format(op_name)) + outs = _convert_map[op_name](inexpr, keras_layer, etab) outs = _as_list(outs) for t_idx, out in enumerate(outs): name = outname + ":" + str(t_idx) @@ -661,12 +681,15 @@ def from_keras(model, shape=None): raise ValueError("Keras frontend currently supports data_format = channels_last only.") _check_unsupported_layers(model) + def _convert_input_layer(keras_layer): + input_name = keras_layer.name + input_shape = shape[input_name] if shape is not None and input_name in shape else None + etab.set_expr(input_name, new_var(input_name, shape=input_shape)) + etab = ExprTable() for keras_layer in model.layers: if isinstance(keras_layer, keras.engine.InputLayer): - input_name = keras_layer.name - input_shape = shape[input_name] if shape is not None and input_name in shape else None - etab.set_expr(input_name, _expr.var(input_name, shape=input_shape)) + _convert_input_layer(keras_layer) else: inbound_nodes = keras_layer.inbound_nodes if hasattr(keras_layer, 'inbound_nodes') \ else keras_layer._inbound_nodes if hasattr(keras_layer, '_inbound_nodes') \ @@ -690,6 +713,7 @@ def from_keras(model, shape=None): for n_idx, t_idx, inbound_layer in zip_node: if isinstance(inbound_layer, keras.engine.InputLayer): expr_name = inbound_layer.name + _convert_input_layer(inbound_layer) else: expr_name = inbound_layer.name + ':' + str(n_idx) + ':' + str(t_idx) expr = etab.get_expr(expr_name) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 758793c980d68..ca24febc6aa3c 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -3,10 +3,13 @@ from __future__ import absolute_import as _abs import json +import tvm +from tvm.relay.ir_pass import infer_type from .. import ir_pass from .. import expr as _expr from .. import op as _op from ... 
import nd as _nd + from .common import StrAttrsDict from .nnvm_common import _rename, _binop_scalar, _rbinop_scalar, _reduce from .nnvm_common import _arg_reduce, _init_op, _softmax_op, _cast @@ -27,12 +30,13 @@ def _mx_fully_connected(inputs, attrs): # no flatten attribute in old mxnet has_flatten = False use_flatten = attrs.get_bool("flatten", True) + assert use_flatten == False if has_flatten and use_flatten: inputs[0] = _op.nn.batch_flatten(inputs[0]) res = _op.nn.dense(inputs[0], inputs[1], units=units) if use_bias: assert len(inputs) == 3 - res = _op.nn.bias_add(res, inputs[2]) + res = _op.nn.bias_add(res, inputs[2], axis=-1) return res @@ -41,7 +45,8 @@ def _get_channel_axis(layout, op_name): return 1 if layout == "NHWC": return 3 - raise RuntimeError("layout: {} is not supported in {}".format(layout, op_name)) + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "layout" of operator {} is not valid.'.format(layout, op_name)) def _mx_activations(inputs, attrs): @@ -61,7 +66,8 @@ def _stable_softrelu(x): return _op.add(_op.log(_op.add(one, exp_neg_abs_x)), _op.nn.relu(x)) return _stable_softrelu(inputs[0]) - raise RuntimeError("Do not support act_type: {}".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend MXNet.'.format(act_type)) def _mx_compare(new_op, wrapper): @@ -74,7 +80,8 @@ def impl(inputs, attrs): def _mx_conv2d(inputs, attrs): kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: - raise RuntimeError("non-2d kernel is not supported in conv2d") + raise tvm.error.OpAttributeInvalid( + 'Non-2D kernels are not supported for operator Conv2D.') data_layout = attrs.get_str("layout", "NCHW") channel_axis = _get_channel_axis(data_layout, "conv2d") @@ -102,10 +109,12 @@ def _mx_conv2d(inputs, attrs): def _mx_conv2d_transpose(inputs, attrs): if "target_shape" in attrs.attrs: - raise RuntimeError("target_shape is not supported in conv2d_transpose") + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "target_shape" is not supported for operator Conv2D-transpose.') kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: - raise RuntimeError("non-2d kernel is not supported in conv2d") + raise tvm.error.OpAttributeInvalid( + 'Non-2D kernels are not supported for operator Conv2D-transpose.') data_layout = attrs.get_str("layout", "NCHW") channel_axis = _get_channel_axis(data_layout, "conv2d_transpose") @@ -140,7 +149,8 @@ def _mx_pooling(inputs, attrs): def _pool2d(new_op, is_avg): kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: - raise RuntimeError("non-2d kernel is not supported in pool2d") + raise tvm.error.OpAttributeInvalid( + 'Only 2D kernels are supported for operator Pool2D.') new_attrs = {} new_attrs["pool_size"] = kernel_size new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1)) @@ -158,7 +168,8 @@ def _pool2d(new_op, is_avg): if global_pool: return _op.nn.global_avg_pool2d(inputs[0]) return _pool2d(_op.nn.avg_pool2d, True) - raise RuntimeError("Do not support pool_type:{}".format(pool_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} Pooling is not supported for frontend MXNet.'.format(pool_type.capitalize())) def _mx_dropout(inputs, attrs): @@ -172,7 +183,8 @@ def _mx_BlockGrad(inputs, attrs): #pylint: disable=unused-argument def _mx_batch_norm(inputs, attrs): if attrs.get_bool("output_mean_var", False): - raise RuntimeError("batch_norm do not support output_mean_var") + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "output_mean_var" is not 
supported for operator Batch Norm.') if attrs.get_bool("use_global_stats", False): _warn_not_used("use_global_stats", "batch_norm") new_attrs = {} @@ -182,16 +194,44 @@ def _mx_batch_norm(inputs, attrs): new_attrs["scale"] = not attrs.get_bool("fix_gamma", False) return _op.nn.batch_norm(*inputs, **new_attrs) +def _mx_layer_norm(inputs, attrs): + # TODO: implement layer norm + return inputs[0] * inputs[0] + +def _mx_div_sqrt_dim(inputs, attrs): + assert len(inputs) == 1 + data = inputs[0] + shape = _op.shape_of(data) + last_dim_index = _op.subtract(_op.sum(_op.ones_like(shape)), _expr.const(1)) + last_dim = _op.take(_op.shape_of(data), indices=last_dim_index) + return _op.divide(data, + _op.sqrt(last_dim.astype('float32'))) + +def _mx_erf(inputs, attrs): + # TODO: implement erf + return inputs[0] * inputs[0] + +def _mx_sequence_mask(inputs, attrs): + # TODO: implement seq mask + return inputs[0] * inputs[0] def _mx_slice(inputs, attrs): new_attrs = {} begin = attrs.get_int_tuple('begin', None) end = attrs.get_int_tuple('end', None) stride = attrs.get_int_tuple('step', None) - if begin is None or end is None: - raise RuntimeError("begin and end are required parameters.") - if None in begin or None in end: - raise RuntimeError("None in begin or end is not supported yet.") + if begin is None: + raise tvm.error.OpAttributeRequired( + 'Attribute "begin" not found in operator Slice.') + if end is None: + raise tvm.error.OpAttributeRequired( + 'Attribute "end" not found in operator Slice.') + if None in begin: + raise tvm.error.OpAttributeInvalid( + 'Value None in attribute "begin" of operator Slice is not valid.') + if None in end: + raise tvm.error.OpAttributeInvalid( + 'Value None in attribute "end" of operator Slice is not valid.') new_attrs = {'begin': begin, 'end': end} if stride is not None: new_attrs['strides'] = stride @@ -295,7 +335,8 @@ def _mx_leaky_relu(inputs, attrs): upper_bound = attrs.get_float("upper_bound") alpha = (lower_bound + upper_bound) / 2.0 return _op.nn.leaky_relu(inputs[0], alpha=alpha) - raise RuntimeError("act_type: {} is not supported".format(act_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend MXNet.'.format(act_type)) def _mx_make_power(power): @@ -389,21 +430,33 @@ def _mx_batch_dot(inputs, attrs): transpose_a = attrs.get_bool("transpose_a", False) transpose_b = attrs.get_bool("transpose_b", False) if transpose_a is True: - raise RuntimeError("batch_dot: only support transpose_a=False") + msg = 'Value {} in attribute "transpose_a" of operator batch_dot ' \ + 'is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(transpose_a)) if transpose_b is False: b = _op.transpose(b, axes=[0, 2, 1]) - return _op.batch_matmul(a, b) + return _op.nn.batch_matmul(a, b) def _mx_arange(inputs, attrs): assert len(inputs) == 0 if attrs.get_int("repeat", 1) != 1: - raise RuntimeError("arange doesn't support repeat") + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "repeat" is not supported in operator arange.') new_attrs = {} - new_attrs["start"] = attrs.get_float("start", 0) - new_attrs["stop"] = attrs.get_float("stop") - new_attrs["step"] = attrs.get_float("step", 1) - new_attrs["dtype"] = attrs.get_str("dtype", "float32") + stop = attrs.attrs.get('stop') + # This op has special behavior when only start is passed. 
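# Illustration (not part of the converter) of the MXNet behaviour handled in
# the branch below: arange called with only `start` acts like numpy's
# arange(stop), so when the serialized attribute stop == 'None' the converter
# remaps start -> stop and counts from 0.
import numpy as np
assert np.array_equal(np.arange(5), np.arange(0, 5, 1))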
+ if stop != 'None': + new_attrs["start"] = attrs.get_float("start", 0) + new_attrs["stop"] = attrs.get_float("stop") + new_attrs["step"] = attrs.get_float("step", 1) + new_attrs["dtype"] = attrs.get_str("dtype", "float32") + else: + new_attrs["start"] = 0 + new_attrs["stop"] = attrs.get_float("start") + new_attrs["step"] = attrs.get_float("step", 1) + new_attrs["dtype"] = attrs.get_str("dtype", "float32") + return _op.arange(**new_attrs) @@ -482,15 +535,20 @@ def _mx_box_nms(inputs, attrs): in_format = attrs.get_str('in_format', 'corner') out_format = attrs.get_str('out_format', 'corner') if coord_start != 2: - raise RuntimeError('coord_start %s is not supported.' % coord_start) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "coord_start" must equal 2 for operator box_nms.') if score_index != 1: - raise RuntimeError('score_index %s is not supported.' % score_index) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "score_index" must equal 1 for operator box_nms.') if id_index != -1 and int(id_index) != 0: - raise RuntimeError('id_index %s is not supported.' % id_index) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "id_index" must equal either -1 or 0 for operator box_nms.') if in_format != 'corner': - raise RuntimeError('in_format %s is not supported.' % in_format) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "in_format" must equal "corner" for operator box_nms.') if out_format != 'corner': - raise RuntimeError('out_format %s is not supported.' % out_format) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "out_format" must equal "corner" for operator box_nms.') ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) nms_out = _op.vision.non_max_suppression(ret[1], @@ -508,7 +566,8 @@ def _mx_l2_normalize(inputs, attrs): new_attrs = {} mode = attrs.get_str('mode', 'instance') if mode != 'channel': - raise RuntimeError('mode %s is not supported.' 
% mode) + raise tvm.error.OpAttributeInvalid( + 'Value of attribute "mode" must equal "channel" for operator l2_normalize.') new_attrs['eps'] = attrs.get_float('eps', 1e-10) new_attrs['axis'] = [1] return _op.nn.l2_normalize(inputs[0], **new_attrs) @@ -566,6 +625,34 @@ def _mx_embedding(inputs, _): return _op.take(weight, indices.astype('int32'), axis=0) +def _mx_smooth_l1(inputs, attrs): + scalar = attrs.get_float("scalar", 1.0) + scalar_sq = scalar * scalar + mask = _op.less(inputs[0], _expr.const(1.0 / scalar_sq, dtype='float32')) + return _op.where(mask, + _expr.const(scalar_sq / 2.0, dtype='float32') * inputs[0] * inputs[0], + _op.abs(inputs[0]) - _expr.const(0.5 / scalar_sq)) + + +def _mx_deformable_convolution(inputs, attrs): + new_attrs = {} + assert attrs.get_bool("no_bias") + new_attrs["kernel_size"] = attrs.get_int_tuple("kernel") + new_attrs["strides"] = attrs.get_int_tuple("stride") + new_attrs["padding"] = attrs.get_int_tuple("pad") + new_attrs["dilation"] = attrs.get_int_tuple("dilate") + new_attrs["channels"] = attrs.get_int("num_filter") + new_attrs["deformable_groups"] = attrs.get_int("num_deformable_group", 1) + new_attrs["groups"] = attrs.get_int("num_group", 1) + assert attrs.get_str("layout", "NCHW") == "NCHW", "Deformable conv2d only supports NCHW layout" + use_bias = not attrs.get_bool("no_bias", False) + res = _op.nn.deformable_conv2d(inputs[0], inputs[1], inputs[2], **new_attrs) + if use_bias: + assert len(inputs) == 4 + res = _op.nn.bias_add(res, inputs[3]) + return res + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -701,6 +788,7 @@ def _mx_embedding(inputs, _): "Embedding" : _mx_embedding, "SoftmaxOutput" : _mx_softmax_output, "SoftmaxActivation" : _mx_softmax_activation, + "smooth_l1" : _mx_smooth_l1, # vision "_contrib_BilinearResize2D" : _mx_upsampling, "_contrib_MultiBoxPrior" : _mx_multibox_prior, @@ -710,12 +798,17 @@ def _mx_embedding(inputs, _): "_contrib_Proposal" : _mx_proposal, "_contrib_MultiProposal" : _mx_proposal, "_contrib_box_nms" : _mx_box_nms, + "_contrib_DeformableConvolution" : _mx_deformable_convolution, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. # # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, + "LayerNorm": _mx_layer_norm, + "_contrib_div_sqrt_dim": _mx_div_sqrt_dim, + "erf": _mx_erf, + "SequenceMask": _mx_sequence_mask, } # set identity list @@ -754,6 +847,9 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info): attrs = StrAttrsDict(node.get("attrs", {})) node_name = node["name"] op_name = node["op"] + + + if op_name == "null": shape = shape_dict[node_name] if node_name in shape_dict else None if isinstance(dtype_info, dict): @@ -770,8 +866,13 @@ def _from_mxnet_impl(symbol, shape_dict, dtype_info): else: raise RuntimeError("unexpected type %s" % type(res)) node_map[nid] = res + + # if op_name == 'FullyConnected': + # outputs = res + # break else: - raise RuntimeError("{} is not supported in relay frontend".format(op_name)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported in frontend MXNet.'.format(op_name)) outputs = [node_map[e[0]][e[1]] for e in jgraph["heads"]] outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs) @@ -800,6 +901,7 @@ def _update_shape_dtype(shape, dtype, params): def from_mxnet(symbol, shape=None, dtype="float32", + input_symbols=None, arg_params=None, aux_params=None): """Convert from MXNet"s model into compatible relay Function. 
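# NumPy reference (sketch) for the _mx_smooth_l1 conversion above; it mirrors
# the where/less/abs expression the converter emits for a given `scalar`.
import numpy as np

def smooth_l1_ref(x, scalar=1.0):
    s2 = scalar * scalar
    # same mask/branch structure as the Relay expression built by the converter
    return np.where(x < 1.0 / s2, 0.5 * s2 * x * x, np.abs(x) - 0.5 / s2)

print(smooth_l1_ref(np.array([-2.0, 0.1, 2.0], dtype="float32")))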
@@ -850,8 +952,14 @@ def from_mxnet(symbol, params = {} for k, v in symbol.collect_params().items(): params[k] = _nd.array(v.data().asnumpy()) - data = mx.sym.Variable("data") - sym = symbol(data) + + if input_symbols is not None: + inputs = input_symbols + else: + inputs = [] + inputs.append(mx.sym.Variable("data")) + sym = symbol(*inputs) + if isinstance(sym, (list, tuple)): sym = mx.sym.Group(sym) shape, dtype = _update_shape_dtype(shape, dtype, params) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index d322da31fc197..45471e1eb8926 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3,6 +3,7 @@ from __future__ import absolute_import as _abs import logging +import tvm import numpy as np from ... import nd as _nd from .. import ir_pass @@ -18,7 +19,9 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - raise NotImplementedError("Only 2d kernel supported.") + msg = 'Only 2D kernels are supported for operator {}.' + op_name = prefix + '2d' + raise tvm.error.OpAttributeInvalid(msg.format(op_name)) return _impl @@ -29,7 +32,8 @@ def revert_caffe2_pad(pads): elif len(pads) == 2: pass else: - raise ValueError("Invalid caffe2 type padding: {}".format(pads)) + raise tvm.error.OpAttributeInvalid( + 'Number of pads must be either 2 or 4.') return pads def dimension_constraint(): @@ -102,7 +106,7 @@ def _impl_v1(cls, inputs, attr, params): 'pads': ('padding', (0, 0), revert_caffe2_pad) }, # very weird attributes here in onnx, force check - ignores=['dilations'], + ignores=['dilations', 'auto_pad'], # TODO(zhreshold): make sure ceil_mode in onnx, and layout? extras={'ceil_mode': False}, custom_check=dimension_constraint())(inputs, attr, params) @@ -156,6 +160,7 @@ def _impl_v1(cls, inputs, attr, params): 'dilations': ('dilation', (0, 0)), 'pads': ('padding', (0, 0), revert_caffe2_pad), 'group': ('groups', 1)}, + ignores=['auto_pad'], custom_check=dimension_constraint())(inputs[:2], attr, params) use_bias = len(inputs) == 3 if use_bias: @@ -328,7 +333,21 @@ def _impl_v1(cls, inputs, attr, params): shape = tuple(params[inputs[1].name_hint].asnumpy()) out = _op.reshape(inputs[0], shape) else: - out = _op.reshape_like(inputs[0], inputs[1]) + # Try to infer shape by precompute prune if possible. + # TODO: good to check inputs to be in params. + # to be enhanced when relay support list_input_names API of NNVM + logging.warning("Infering Reshape argument by precompute") + func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) + with tvm.relay.build_config(opt_level=0): + graph, lib, params = tvm.relay.build(func, target="llvm", params=params) + ctx = tvm.context("llvm", 0) + from tvm.contrib import graph_runtime + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**params) + m.run() + params_new = m.get_output(0) + inputs.pop(1) + out = _op.reshape(inputs[0], tuple(params_new.asnumpy().astype('int32').flatten())) return out @@ -447,8 +466,13 @@ class Upsample(OnnxOpConverter): """ @classmethod - def _impl_v7(cls, inputs, attr, params): + def _impl_v9(cls, inputs, attr, params): scales = attr.get('scales') + if not scales: + #Here we are going to higher OPSET version. 
+ assert len(inputs) == 2, "Upsample op take 2 inputs, {} given".format(len(inputs)) + scales = params[inputs[1].name_hint].asnumpy() + inputs = inputs[:1] assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3] mode = attr.get('mode') if mode == b'nearest': @@ -456,7 +480,8 @@ def _impl_v7(cls, inputs, attr, params): elif mode == b'linear': method = "BILINEAR" else: - raise ValueError("Invalid ONNX upsample mode: {}".format(mode)) + raise tvm.error.OpAttributeInvalid( + 'Value {} in attribute "mode" of operator Upsample is not valid.'.format(mode)) attr = {'scale':int(scales[-1]), 'method':method, 'layout':'NCHW'} return AttrCvt('upsampling')(inputs, attr) @@ -467,10 +492,7 @@ class Shape(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - # Result of this operator is prominently used by reshape operator. - # Just pass the input as it is so that reshape_like can be used there. - logging.warning("Shape: Differently implemented in relay as a bypass (dummy operator)") - return inputs[0] + return _op.shape_of(inputs[0]) class Cast(OnnxOpConverter): """ Operator converter for Cast. @@ -484,7 +506,7 @@ def _impl_v1(cls, inputs, attr, params): def _impl_v5(cls, inputs, attr, params): try: from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE - attr['to'] = TENSOR_TYPE_TO_NP_TYPE[attr['to']] + attr['to'] = str(TENSOR_TYPE_TO_NP_TYPE[attr['to']]) except ImportError as e: raise ImportError( "Unable to import onnx.mapping which is required {}".format(e)) @@ -664,6 +686,11 @@ class ReduceMean(Reduce): """ name = 'mean' +class ReduceProd(Reduce): + """ Operator converter for ArgMax. + """ + name = 'prod' + class ArgMax(OnnxOpConverter): """ Operator converter for ArgMax. """ @@ -713,8 +740,9 @@ def _impl_v1(cls, inputs, attr, params): shape = params[get_name(inputs[0])].asnumpy() else: if 'extra_shape' in attr: - raise ImportError( - "Extra Shape not supported with fill_like") + raise tvm.error.OpAttributeInvalid('Attribute "extra_shape" not ' + 'supported with "fill_like" for ' + 'operator ConstantFill.') return _op.full_like(inputs[0], inputs[1]) if 'extra_shape' in attr: @@ -807,7 +835,7 @@ def _get_convert_map(opset): # 'InstanceNormalization' # 'LpNormalization' 'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']), - 'Flatten': Renamer('flatten'), + 'Flatten': Renamer('batch_flatten'), 'LRN': LRN.get_converter(opset), # defs/reduction @@ -815,6 +843,7 @@ def _get_convert_map(opset): 'ReduceMin': ReduceMin.get_converter(opset), 'ReduceSum': ReduceSum.get_converter(opset), 'ReduceMean': ReduceMean.get_converter(opset), + 'ReduceProd': ReduceProd.get_converter(opset), # 'ReduceProd' # 'ReduceLogSumExp' 'ArgMax': ArgMax.get_converter(opset), @@ -831,8 +860,7 @@ def _get_convert_map(opset): 'Squeeze': AttrCvt('squeeze', {'axes': 'axis'}), 'Unsqueeze': Unsqueeze.get_converter(opset), 'Pad': Pad.get_converter(opset), - # TODO(zhreshold) Shape op is implemented as bypass op in relay - # 'Shape': Shape.get_converter(opset), + 'Shape': Shape.get_converter(opset), } @@ -872,6 +900,7 @@ def from_onnx(self, graph, opset): ---------- graph : onnx protobuf object The loaded onnx graph + opset : opset version Returns @@ -900,12 +929,12 @@ def from_onnx(self, graph, opset): dtype=self._params[i_name].dtype) else: self._num_input += 1 - shape = self._shape[i_name] if i_name in self._shape else () + tshape = self._shape[i_name] if i_name in self._shape else () if isinstance(self._dtype, dict): dtype = self._dtype[i_name] if i_name in self._dtype 
else d_type else: dtype = d_type - self._nodes[i_name] = new_var(i_name, shape=shape, dtype=dtype) + self._nodes[i_name] = new_var(i_name, shape=tshape, dtype=dtype) # construct nodes, nodes are stored as directed acyclic graph for node in graph.node: op_name = node.op_type @@ -925,6 +954,10 @@ def from_onnx(self, graph, opset): self._nodes[i_name] = new_var(node.output[0], shape=(), dtype=dtype) inputs.append(self._nodes[i_name]) + i_name = self._parse_value_proto(node) + attr['tvm_custom'] = {} + attr['tvm_custom']['name'] = i_name + op = self._convert_operator(op_name, inputs, attr, opset) node_output = self._fix_outputs(op_name, node.output) if not isinstance(op, _expr.TupleWrapper): diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 0efebe3cfec90..5ac0de4335f7f 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -5,6 +5,7 @@ import logging import warnings +from collections import defaultdict # Numpy support import numpy as np @@ -26,7 +27,8 @@ def _get_relay_op(op_name): op = getattr(_op.image, op_name) if not op: - raise RuntimeError("Unable to map op_name {} to relay".format(op_name)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend TensorFlow.'.format(op_name)) return op class AttrCvt(object): @@ -98,7 +100,8 @@ def __call__(self, inputs, attrs, *args): new_attrs = {} for k in attrs.keys(): if k in self._excludes: - raise NotImplementedError("Attribute {} not supported yet.".format(k)) + raise tvm.error.OpAttributeUnimplemented( + 'Attribute {} in operator {} is not supported.'.format(k, op_name)) elif k in self._disables: logging.warning("Attribute %s is disabled in relay.%s", k, op_name) elif k in self._ignores: @@ -147,7 +150,8 @@ def _required_attr(self, attr, key): """Wrapper for getting required attributes.""" assert isinstance(attr, dict) if key not in attr: - raise AttributeError("Required attribute {} not found.".format(key)) + raise tvm.error.OpAttributeRequired( + 'Attribute {} not found in operator {}'.format(key, self._op_name)) return attr[key] def _get_pad_pair(input1d, kernel1d, stride1d): @@ -177,7 +181,8 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - raise NotImplementedError("Only 2d kernel supported.") + raise tvm.error.OpAttributeInvalid( + 'Only 2D kernels are supported for operator {}'.format(prefix + '2d')) return _impl def _dimension_constraint(): @@ -237,7 +242,9 @@ def _impl(inputs, attr, params): attr['kernel_shape'] = (attr['ksize'][2], attr['ksize'][3]) attr['strides'] = (attr['strides'][2], attr['strides'][3]) else: - raise TypeError("Unsupported data_format type : {}".format(attr['data_format'])) + msg = 'Value {} of attribute "data_format" of operator Pooling ' \ + 'is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(attrs['data_format'])) if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC": tmp_shape = attr['_input_shapes'][inputs[0]] @@ -266,7 +273,9 @@ def _impl(inputs, attr, params): attr['padding'] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]] else: - raise TypeError("Unsupported padding type : {}".format(attr['padding'])) + msg = 'Value {} in attribute "padding" of operator Pooling is ' \ + 'not valid.' 
+ raise tvm.error.OpAttributeInvalid(msg.format(attr['padding'])) if name == "avg_pool": attr['count_include_pad'] = False @@ -340,7 +349,9 @@ def _impl(inputs, attr, params): attr['dilations'] = (attr['dilations'][2], attr['dilations'][3]) attr['strides'] = (attr['strides'][2], attr['strides'][3]) else: - raise TypeError("Unsupported data format type : {}".format(attr['data_format'])) + msg = 'Value {} in attribute "data_format" of operator Conv is ' \ + 'not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) if opname == 'depthwise': @@ -385,7 +396,9 @@ def _impl(inputs, attr, params): attr['padding'] = [0, 0] else: - raise TypeError("Unsupported padding type : {}".format(attr['padding'])) + msg = 'Value {} in attribute "padding" of operator Conv is not ' \ + 'valid.' + raise tvm.error.OpAttributeInvalid(msg.format(attr['padding'])) if 'kernel_layout' not in attr: if opname == 'conv': @@ -530,25 +543,23 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'newshape':tuple(shape_arg.asnumpy())}, ignores=['Tshape'])(inputs, attr) - except KeyError: + except AttributeError: # Shape operator is already pruned, hence # try to infer shape by precompute prune if possible. - if all(in_node in params for in_node in inputs[1].list_input_names()): - func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) - with tvm.relay.build_config(opt_level=0): - graph, lib, params = tvm.relay.build(func, target="llvm", params=params) - ctx = tvm.context("llvm", 0) - from tvm.contrib import graph_runtime - m = graph_runtime.create(graph, lib, ctx) - m.set_input(**params) - m.run() - params_new = m.get_output(0) - inputs.pop(1) - return AttrCvt( - op_name="reshape", - extras={'newshape':tuple(params_new.asnumpy().flatten())}, - ignores=['Tshape'])(inputs, attr) - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + func = _expr.Function(ir_pass.free_vars(inputs[1]), inputs[1]) + with tvm.relay.build_config(opt_level=0): + graph, lib, params = tvm.relay.build(func, target="llvm", params=params) + ctx = tvm.context("llvm", 0) + from tvm.contrib import graph_runtime + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**params) + m.run() + params_new = m.get_output(0) + inputs.pop(1) + return AttrCvt( + op_name="reshape", + extras={'newshape':tuple(params_new.asnumpy().astype('int64').flatten())}, + ignores=['Tshape'])(inputs, attr) return _impl def _bias_add(): @@ -790,7 +801,8 @@ def _impl(inputs, attr, params): if padlist_key in params: padlist = params.pop(padlist_key).asnumpy() else: - raise RuntimeError("Required parameter {} not fount.".format(padlist_key)) + raise tvm.error.OpAttributeRequired( + 'Attribute {} not found in operator Pad.'.format(padlist_key)) paddings = tuple([tuple(l) for l in padlist]) attr['pad_width'] = paddings attr['pad_value'] = 0 @@ -1270,6 +1282,220 @@ def _get_abs_layer_name(node): params, num_layers) return sym +# An internal list to contain all the control flow primitives used in Tensorflow +# 1.x. +_control_flow_nodes = ['Merge', 'Switch', 'NextIteration', 'Exit', 'Enter', 'LoopCond'] + +def _in_while_loop(control_flow_node_map, op_name): + """ + Check if a given control flow operator is part of a while loop execution + frame. This is based on the fact that there is only one occurrence of + `LoopCond` for a loop execution frame and it is only presented in the loop + construct. 
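# Tiny illustration of the frame map this predicate inspects (frame names and
# op sets are made up): only the execution frame that owns a LoopCond node is
# treated as a tf.while_loop frame, while a plain tf.cond frame has none.
control_flow_node_map = {
    "while": {"Enter", "Merge", "LoopCond", "Switch", "NextIteration", "Exit"},
    "cond":  {"Switch", "Merge"},
}
assert _in_while_loop(control_flow_node_map, "while")
assert not _in_while_loop(control_flow_node_map, "cond")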
+ + Parameters + ---------- + control_flow_node_map : Dict[str, Set[str]] + A dictionay contains the unqiue control flow execution frame name to + a set of primitive operators mapping. + + op_name : str + The name of a control flow primitive. + + Returns + ------- + ret : bool + Return true if the operator is in a while loop execution frame, + otherwise, return false. + """ + return op_name in control_flow_node_map and \ + "LoopCond" in control_flow_node_map[op_name] + + +class Branch: + """A class contains the components that are used to build up a Relay if + node. + + Parameters + ---------- + cond : tvm.relay.Expr + The condition of a if node. + + true_branch : tvm.relay.Expr + The body of the true branch of a if expression. + + false_branch: tvm.relay.Expr + The body of the false branch of a if expression. + + _if : tvm.relay.Expr + An internal variable indicates where an if expression is already created + for a matched TF condition construct. + + Examples + -------- + The following is a cond statement written in TensorFlow: + + .. code-block:: python + + def vanilla_cond(): + i = tf.constant(1) + j = tf.constant(4) + + def f1(): + return tf.multiply(1, 17) + + def f2(): + return tf.add(4, 23) + r = tf.cond(tf.less(i, j), f1, f2) + + This condition statement should be coverted into Relay in the following + form: + + .. code-block:: python + + fn (%Const: Tensor[(1,), int32], + %Const_1: Tensor[(1,), int32], + %cond/Mul/x: Tensor[(1,), int32], + %cond/Mul/y: Tensor[(1,), int32], + %cond/Add/x: Tensor[(1,), int32], + %cond/Add/y: Tensor[(1,), int32]) { + %0 = less(%Const, %Const_1) # ty=Tensor[(1,), bool] + %1 = min(%0) + if (%1) { + %2 = multiply(%cond/Mul/x, %cond/Mul/y) + %2 + } else { + %3 = add(%cond/Add/x, %cond/Add/y) + %3 + } + } + """ + def __init__(self): + self._if = None + self.cond = None + self.true_branch = None + self.false_branch = None + + def _if_node(self): + """An internal API to create a relay if node from the matched TF + condition construct. + """ + # `cond` returns a tensor that contains boolean values. We add a `min` + # operator to checks if there is any false value. If so, this condition + # doesn't not hold. + cond = tvm.relay.op.min(self.cond) + return tvm.relay.If(cond, self.true_branch, self.false_branch) + + def if_node(self): + """Create an tvm.relay.If node if it hasn't been created yet.""" + if self._if is None: + self._if = self._if_node() + return self._if + + +class Loop: + """ + A class contains the components that are used to build up a Relay + recursive call. + + Parameters + ---------- + loop_vars : List[tvm.relay.Expr] + The loop variables that used in a while loop. + + cond : tvm.relay.Expr + The condition of a while loop. + + body : tvm.relay.Expr + The body of a matched while loop. + + _loop : tvm.relay.Expr + An internal variable indicates where a recursive call is already created + for a matched TF while loop construct. + + Examples + -------- + The following is a vanilla loop from TensorFlow: + + .. code-block:: python + + i = tf.constant(0) + c = lambda i: tf.less(i, 10) + b = lambda i: tf.add(i, 1) + r = tf.while_loop(c, b, [i]) + + It will be converted to the following recursive call in Relay: + + .. 
code-block:: python + + fn (%while/Less/y: Tensor[(1,), int32], + %while/Add/y: Tensor[(1,), int32], + %Const: Tensor[(1,), int32]) { + %0 = fn(%loop_var0: Tensor[(1,), int32]) { + %1 = less(%loop_var0, %while/Less/y) + %2 = min(%1) + if (%2) { + %3 = add(%loop_var0, %while/Add/y) + free_var %while_loop + %4 = %while_loop(%3) + %4 + } else { + %5 = (%loop_var0,) + %5 + } + } + let %while_loop1 = %0 + %6 = %while_loop1(%Const) + %6 + } + """ + def __init__(self): + self.loop_vars = [] + self.cond = None + self.body = [] + self._loop = None + + def _while_loop(self): + """An internal API to create a Relay recurisve call for a matched TF + `while_loop` construct. + """ + wl = tvm.relay.var('while_loop') + + sb = tvm.relay.scope_builder.ScopeBuilder() + + loop_vars = [] + bind_map = {} + for i, var in enumerate(self.loop_vars): + assert isinstance(var, _expr.Var), repr(var) + v = tvm.relay.var("loop_var" + str(i), + type_annotation=var.type_annotation) + loop_vars.append(v) + bind_map[var] = v + + self.cond = tvm.relay.bind(self.cond, bind_map) + self.body = [tvm.relay.bind(b, bind_map) for b in self.body] + + cond = tvm.relay.op.min(self.cond) + + with sb.if_scope(cond): + sb.ret(wl(*self.body)) + with sb.else_scope(): + sb.ret(tvm.relay.Tuple(loop_vars)) + + loop_fn = tvm.relay.Function(loop_vars, sb.get()) + sb = tvm.relay.scope_builder.ScopeBuilder() + sb.let(wl, loop_fn) + sb.ret(wl(*self.loop_vars)) + return sb.get() + + def while_loop(self): + """Instantiate a while loop if it has not been created yet.""" + if self._loop is None: + self._loop = self._while_loop() + return self._loop + return self._loop + + class GraphProto(object): """ A helper class for handling relay graph copying from Tensorflow GraphDef. Definition: @@ -1284,6 +1510,8 @@ def __init__(self): self._num_rnn_layer = False self._outputs_are_0d = {} self._input_shapes = {} + self._loops = {} + self._branches = {} def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): """Construct relay nodes from tensorflow graph definition - GraphDef. @@ -1332,7 +1560,10 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): raise NotImplementedError( \ "The following operators are not implemented: {}".format(missing_operators)) + control_flow_node_map = defaultdict(set) for node in graph.node: + node_name_prefix = node.name.rsplit('/', 1)[0] + control_flow_node_map[node_name_prefix].add(node.op) if node.op == 'Placeholder': if shape and node.name in shape: self._input_shapes[node.name] = list(shape[node.name]) @@ -1447,12 +1678,17 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): # This means the node is 1d in Relay and 0d in TF. # See `_expand_dims_0d_aware`. 
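# A minimal sketch of the execution-frame grouping performed above, using
# hypothetical node names; the real graph traversal applies the same
# `name.rsplit('/', 1)[0]` prefix rule and the `_in_while_loop` helper defined
# earlier in this file.
from collections import defaultdict

example_nodes = [("while/Less", "Less"), ("while/LoopCond", "LoopCond"),
                 ("while/Switch", "Switch"), ("cond/Switch", "Switch"),
                 ("cond/Merge", "Merge")]
frame_map = defaultdict(set)
for name, op_type in example_nodes:
    frame_map[name.rsplit('/', 1)[0]].add(op_type)

# Only the "while" frame contains a LoopCond primitive, so only it is treated
# as a while loop; the "cond" frame is handled as a conditional.
assert _in_while_loop(frame_map, "while")
assert not _in_while_loop(frame_map, "cond")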
if self._outputs_are_0d[node_name][tensor_slot] and input_shape: - input_0d_mismatch.add(in_sym) + input_0d_mismatch.add(in_sym[0]) attr['_input_shapes'] = input_shapes attr['_input_0d_mismatch'] = input_0d_mismatch - op = self._convert_operator(node.op, inputs, attr, graph) + if node.op in _control_flow_nodes: + op = self._convert_control_flow_operator(node, inputs, + attr, + control_flow_node_map) + else: + op = self._convert_operator(node.op, inputs, attr, graph) # Check if op is converted to param if isinstance(op, np.ndarray): @@ -1493,7 +1729,10 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): out = [] if outputs is None: - out = op + if node.op == "Exit": + out = [op[0].tuple_value] + else: + out = op else: for out_name in outputs: if ":" in out_name: @@ -1529,7 +1768,9 @@ def _parse_import_prerequisites(self, graph): elif node.op == "Const": pass else: - if any([node.op in t for t in [_identity_list, _convert_map, _convert_map_rnn]]): + if any([node.op in t for t in [_identity_list, _convert_map, + _convert_map_rnn, + _control_flow_nodes]]): pass else: missing_operators.add(node.op) @@ -1656,6 +1897,89 @@ def _convert_rnn_operator(self, op_name, inputs, sym = self.rnn.process_op(op_name, inputs, attrs, params) return sym + def _convert_control_flow_operator(self, node, inputs, attrs, control_flow_node_map): + """ + Convert a TensorFlow control flow primitive into the corresponding component + of a Relay control flow construct, i.e. `tf.cond` and `tf.while_loop` + are converted into a Relay `If` and a recursive call, respectively. + + Parameters + ---------- + node: TensorFlow graph node object. + A TensorFlow graph node object. + + inputs : List[tvm.relay.Expr] + List of input symbols. + + attrs : Dict[tvm.Attrs] + Dict of operator attributes. + + control_flow_node_map : Dict[str, Set[str]] + A dictionary that maps each execution frame name to a set of + primitive operators. + + Returns + ------- + op : tvm.relay.Expr + Converted Relay expression. + """ + node_name_prefix = node.name.rsplit('/', 1)[0] + if node.op == "Merge": + if _in_while_loop(control_flow_node_map, node_name_prefix): + op = self._nodes[node.input[0]] + self._loops[node_name_prefix] = Loop() + else: + if len(self._branches) == 0: + raise RuntimeError("Cannot find a created " + "conditional for merge node") + branch = self._branches[node_name_prefix] + false_br = self._nodes[node.input[0]] + true_br = self._nodes[node.input[1]] + assert len(true_br) == 1 + assert len(false_br) == 1 + branch.true_branch = true_br[0] + branch.false_branch = false_br[0] + op = [branch.if_node()] + elif node.op == "Exit": + loop = self._loops[node_name_prefix] + exit_name = node.name.split('/')[-1] + assert str.startswith(exit_name, 'Exit') + + # TensorFlow has different naming conventions on different + # versions.
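# A minimal sketch of the suffix parsing implemented just below, with
# hypothetical node names: the first Exit node of a frame is named "Exit"
# (output 0), later ones "Exit_1", "Exit_2", ..., and the number selects
# which element of the loop's result tuple this Exit forwards.
for exit_name, expected_index in [("Exit", 0), ("Exit_1", 1), ("Exit_2", 2)]:
    suffix = exit_name[5:] if "_" in exit_name else exit_name[4:]
    assert int("0" + suffix) == expected_index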
+ if '_' in exit_name: + exit_number = int("0" + exit_name[5:]) + else: + exit_number = int("0" + exit_name[4:]) + + expr = loop.while_loop() + op = _expr.TupleGetItem(expr, exit_number) + elif node.op == "Enter": + op = self._nodes[node.input[0]] + elif node.op == "LoopCond": + op = self._nodes[node.input[0]] + assert len(op) == 1 + self._loops[node_name_prefix].cond = op[0] + elif node.op == "Switch": + op = self._nodes[node.input[0]] + assert len(op) == 1 + if _in_while_loop(control_flow_node_map, node_name_prefix): + self._loops[node_name_prefix].loop_vars.append(op[0]) + else: + if node_name_prefix not in self._branches: + self._branches[node_name_prefix] = Branch() + self._branches[node_name_prefix].cond = ir_pass.infer_type(op[0]) + elif node.op == "NextIteration": + op = self._nodes[node.input[0]] + assert len(op) == 1 + self._loops[node_name_prefix].body.append(op[0]) + else: + raise Exception("Cannot identify control flow operator: " + + "{}".format(node.op)) + + return op + + def _convert_operator(self, op_name, inputs, attrs, graph, identity_list=None, convert_map=None): """Convert from Tensorflow operator to relay operator. diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d45bb33859b2b..0e31500fe67d8 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -3,6 +3,7 @@ from __future__ import absolute_import as _abs import math import numpy as np +import tvm from .. import ir_pass from .. import expr as _expr from .. import op as _op @@ -59,8 +60,10 @@ def check_unsupported_ops(self): unsupported_ops_set.add(op_code_str) if unsupported_ops_set: - raise NotImplementedError("Unsupported Ops: %s" % ( - ','.join(unsupported_ops_set))) + msg = 'The following operators are not supported in frontend ' \ + 'TFLite: {}' + ops = str(list(unsupported_ops_set)).strip('[,]') + raise tvm.error.OpNotImplemented(msg.format(ops)) def convert_op_to_relay(self): """Convert TFLite ops to relay ops""" @@ -205,8 +208,8 @@ def convert_reshape(self, op): # finally convert back if necessary in_expr = _op.transpose(in_expr, axes=(0, 2, 3, 1)) else: - raise NotImplementedError("Not support input shape length {} of reshape : " - .format(str(input_shape_length))) + msg = 'Input shape length {} for operator Reshape is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(input_shape_length)) out = _op.reshape(in_expr, newshape=tuple(target_shape)) @@ -223,8 +226,8 @@ def convert_reshape(self, op): elif len(target_shape) == 4: out = _op.transpose(out, axes=(0, 3, 1, 2)) else: - raise NotImplementedError("Not support to reshape to shape length {}: " - .format(str(len(target_shape)))) + raise tvm.error.OpAttributeInvalid( + 'Length of target shape must be between 1 and 5 for operator Reshape.') return out @@ -330,8 +333,8 @@ def convert_squeeze(self, op): # finally convert back if necessary in_expr = _op.transpose(in_expr, axes=(0, 2, 3, 1)) else: - raise NotImplementedError("Not support input shape length {} of squeeze : " - .format(str(input_shape_length))) + msg = 'Input shape length {} for operator Squeeze is not valid.' 
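# With the typed errors this patch switches the frontends to, callers can
# branch on the exception class instead of parsing message strings; a rough
# usage sketch (model, shape_dict and dtype_dict are placeholders):
#
#     try:
#         relay.frontend.from_tflite(model, shape_dict, dtype_dict)
#     except tvm.error.OpNotImplemented:
#         ...  # an operator is missing from the frontend
#     except tvm.error.OpAttributeInvalid:
#         ...  # an operator attribute has a value the frontend cannot handle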
+ raise tvm.error.OpAttributeInvalid(msg.format(input_shape_length)) out = _op.squeeze(in_expr, axis=tuple(squeeze_axis)) @@ -348,8 +351,8 @@ def convert_squeeze(self, op): elif output_shape_length == 4: out = _op.transpose(out, axes=(0, 3, 1, 2)) else: - raise NotImplementedError("Not support to squeeze to length {} : " - .format(str(output_shape_length))) + msg = 'Output shape length {} for operator Squeeze is not valid.' + raise tvm.error.OpAttributeInvalid(msg.format(output_shape_length)) return out @@ -369,8 +372,8 @@ def convert_fused_activation_function(self, in_expr, fused_activation_fn): if fused_activation_fn == ActivationFunctionType.TANH: return _op.tanh(in_expr) fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] - raise NotImplementedError("Unsupported fused activation fn {}" - .format(fused_activation_fn_str)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend TFLite.'.format(fused_activation_fn_str)) def convert_conv(self, op, conv_type): """convolution implementation.""" @@ -409,7 +412,8 @@ def convert_conv(self, op, conv_type): assert depth_multiplier == 1, "TF frontend have transformed it be 1 " \ "no matter original value be set by 0.25, 0.5 or any else" else: - raise ValueError("Not support conv type: {}".format(conv_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend TFLite.'.format(conv_type)) stride_h = conv_options.StrideH() stride_w = conv_options.StrideW() @@ -466,7 +470,8 @@ def convert_conv(self, op, conv_type): (pad_top, pad_bottom), (pad_left, pad_right))) else: - raise NotImplementedError("Not support padding format: {}".format(padding)) + raise tvm.error.OpAttributeUnimplemented( + 'Padding format {} is not supported for operator Conv.'.format(padding)) out = _op.nn.conv2d(data=in_expr, weight=weight_expr, **params) @@ -529,14 +534,16 @@ def convert_pool2d(self, op, pool_type): pad_left, pad_right = get_pad_value(input_w, filter_w, stride_w) params['padding'] = [pad_top, pad_left, pad_bottom, pad_right] else: - raise NotImplementedError("Not support padding format: {}".format(padding)) + raise tvm.error.OpAttributeUnimplemented( + 'Padding format {} for operator Pool2D is not supported.'.format(padding)) if pool_type == "average": out = _op.nn.avg_pool2d(in_expr, **params) elif pool_type == "max": out = _op.nn.max_pool2d(in_expr, **params) else: - raise ValueError("Not support pool type: {}".format(pool_type)) + raise tvm.error.OpNotImplemented( + 'Operator {} is not supported for frontend TFLite.'.format(pool_type + ' pool')) # If we have fused activations if fused_activation_fn != ActivationFunctionType.NONE: diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index f3f8fea974127..103a422e56e6f 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -937,3 +937,21 @@ def pass_debug_print(ast, show_meta_data=True, annotate=None, gnf=True): A text representation of `ast`. """ return _ir_pass.pass_debug_print(ast, show_meta_data, annotate, gnf) + + +def partial_eval(expr, mod=None): + """ + Evaluate the static fragment of the code. + + Parameters + ---------- + expr : tvm.relay.Expr + The input expression. + + mod : Optional[tvm.relay.Module] + Returns + ------- + expr : tvm.relay.Expr + The output expression. 
+ """ + return _ir_pass.partial_eval(expr, mod) diff --git a/python/tvm/relay/network.py b/python/tvm/relay/network.py new file mode 100644 index 0000000000000..cc3ca35b9b934 --- /dev/null +++ b/python/tvm/relay/network.py @@ -0,0 +1,159 @@ +import numpy as np +import tvm +from tvm import relay +from tvm.relay import op +from tvm.relay import create_executor, Module +from tvm.relay.backend.interpreter import TensorValue +from tvm.relay.prelude import Prelude +import aot +import collections + +class OrderedSet(collections.MutableSet): + + def __init__(self, iterable=None): + self.end = end = [] + end += [None, end, end] # sentinel node for doubly linked list + self.map = {} # key --> [key, prev, next] + if iterable is not None: + self |= iterable + + def __len__(self): + return len(self.map) + + def __contains__(self, key): + return key in self.map + + def add(self, key): + if key not in self.map: + end = self.end + curr = end[1] + curr[2] = end[1] = self.map[key] = [key, curr, end] + + def discard(self, key): + if key in self.map: + key, prev, next = self.map.pop(key) + prev[2] = next + next[1] = prev + + def __iter__(self): + end = self.end + curr = end[2] + while curr is not end: + yield curr[0] + curr = curr[2] + + def __reversed__(self): + end = self.end + curr = end[1] + while curr is not end: + yield curr[0] + curr = curr[1] + + def pop(self): + key = self.last() + self.discard(key) + return key + + def last(self): + return self.end[1][0] + + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, OrderedSet): + return len(self) == len(other) and list(self) == list(other) + return set(self) == set(other) + +def initialize(param): + ty = param.type_annotation + shape = [int(i) for i in ty.shape] + return np.random.normal(0, 1, shape).astype('float32') + +def copy_var(v): + return relay.Var(v.name_hint, v.type_annotation) + +class Network: + stack = [] + cnt = 0 + + def __init__(self, *, name="f", **kwargs): + name = f"{name}_{Network.cnt}" + Network.cnt += 1 + if len(Network.stack) is not 0: + mod = Network.stack[-1].mod + p = Network.stack[-1].p + else: + mod = Module() + p = Prelude(mod) + + self.mod = mod + self.p = p + self.inputs = [] + self.weights = OrderedSet() + self.sub_network = OrderedSet() + self.f = relay.GlobalVar(name) + self.recurse = relay.Var("recurse") + self.use_recurse = False + self.ret_type = None + body = self.build(**kwargs) + assert isinstance(body, relay.Expr) + if self.use_recurse: + inputs = [copy_var(v) for v in self.inputs] + body = relay.Let(self.recurse, relay.Function(inputs, self.call_from_outside(*inputs)), body) + self.mod[self.f] = relay.Function(self.inputs + self.all_weights(), body, self.ret_type) + + def build(self, **kwargs): + Network.stack.append(self) + try: + return self.build_impl(**kwargs) + finally: + Network.stack.pop() + + def build_impl(self, *args): + raise NotImplementedError + + def weight(self, w): + assert isinstance(w, relay.Var) + self.weights.add(w) + return w + + def input(self, i): + assert isinstance(i, relay.Var) + self.inputs.append(i) + return i + + def all_weights(self): + return list(set(list(self.weights) + [w for n in self.sub_network for w in n.all_weights()])) + + def call_from_outside(self, *inputs): + return self.f(*(list(inputs) + self.all_weights())) + + def __call__(self, *inputs): + if self in Network.stack: + self.use_recurse = True + return self.recurse(*inputs) + else: + 
assert len(Network.stack) > 0 + assert Network.stack[-1].mod == self.mod + assert Network.stack[-1].p == self.p + Network.stack[-1].sub_network.add(self) + return self.call_from_outside(*inputs) + + def interface_type(self): + t = relay.ir_pass.infer_type(self.mod[self.f], mod=self.mod).checked_type + return relay.FuncType(t.arg_types[:len(self.inputs)], t.ret_type, t.type_params, t.type_constraints) + + def get(self): + weights = [] + for x in self.all_weights(): + ty = x.type_annotation + assert isinstance(ty, relay.TensorType) + assert ty.dtype == 'float32' + shape = [int(i) for i in ty.shape] + weight = relay.const(np.random.normal(0, 1, shape).astype('float32')) + weights.append(weight) + inputs = [copy_var(v) for v in self.inputs] + return relay.Function(inputs, self.f(*inputs, *weights)) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 173e97a004962..0e796294e96c9 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -3,7 +3,7 @@ from __future__ import absolute_import from ..expr import const from .op import register_gradient -from .transform import collapse_sum_like, where +from .transform import collapse_sum_like, broadcast_to_like, where from .tensor import exp, negative, power, less from .tensor import zeros_like, ones_like @@ -77,3 +77,20 @@ def divide_grad(orig, grad): x, y = orig.args return [collapse_sum_like(grad / y, x), collapse_sum_like(- (grad * orig / y), y)] + + +@register_gradient("zeros_like") +def zeros_like_grad(orig, grad): + """Returns [0]""" + return [orig] + +@register_gradient("ones_like") +def ones_like_grad(orig, grad): + """Returns [0]""" + return [zeros_like(orig.args[0])] + +@register_gradient("collapse_sum_like") +def collapse_sum_like_grad(orig, grad): + """Returns [broadcast_to_like(grad, x), 0]""" + x, y = orig.args + return [broadcast_to_like(grad, x), zeros_like(y)] diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 58de44c2e0b51..1a5c21507bd27 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -58,7 +58,7 @@ def schedule_batch_matmul(attrs, outputs, target): with target: return topi.generic.schedule_batch_matmul(outputs) -reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +reg.register_pattern("nn.batch_matmul", reg.OpPattern.OPAQUE) # conv2d @@ -85,7 +85,6 @@ def compute_conv2d(attrs, inputs, out_type, target): inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype) elif layout == "NCHW" and \ - kernel_layout == "OIHW" and \ get_const_int(inputs[1].shape[0]) == groups and \ get_const_int(inputs[1].shape[1]) == 1: out = topi.nn.depthwise_conv2d_nchw( @@ -96,6 +95,9 @@ def compute_conv2d(attrs, inputs, out_type, target): get_const_int(inputs[1].shape[3]) == 1: out = topi.nn.depthwise_conv2d_nhwc( inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype) + elif layout in ['NCHW', 'NCHW4c']: + out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, + out_dtype=out_dtype) else: raise ValueError("not support arbitrary group number for now") return [out] @@ -120,6 +122,8 @@ def schedule_conv2d(attrs, outs, target): return topi.generic.schedule_depthwise_conv2d_nchw(outs) if layout == "NHWC" and kernel_layout == "HWOI": return topi.generic.schedule_depthwise_conv2d_nhwc(outs) + if layout == "NCHW4c": + return topi.generic.schedule_group_conv2d_nchw(outs) raise ValueError("No compatible schedule") @@ -321,6 +325,58 @@ 
def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target): reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) + +# winograd nnpack related operators +@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_without_weight_transform( + attrs, inputs, out_dtype, target): + """Compute definition of conv2d_winograd_nnpack_without_weight_transform""" + # pylint: disable=assignment-from-no-return + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + data_layout = attrs.get_str("data_layout") + out_dtype = attrs.get_str("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + + # No bias + out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( + inputs[0], inputs[1], None, strides, padding, dilation, data_layout, + out_dtype) + + return [out] + +@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): + """Schedule definition of conv2d_winograd_nnpack_without_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) + +reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_without_weight_transform", + OpPattern.OPAQUE) + + +@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype, target): + """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform""" + convolution_algorithm = attrs.get_int('convolution_algorithm') + out = topi.nn.conv2d_winograd_nnpack_weight_transform( + inputs[0], convolution_algorithm, out_dtype) + return [out] + +@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): + """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + +reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", + OpPattern.OPAQUE) + + @reg.register_compute("nn.contrib_conv2d_NCHWc") def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target): """Compute definition of conv2d NCHWc""" @@ -370,3 +426,26 @@ def schedule_contrib_depthwise_conv2d_NCHWc(attrs, outs, target): reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) + +@reg.register_compute("nn.deformable_conv2d") +def compute_deformable_conv2d(attrs, inputs, out_dtype, target): + """Compute definition of deformable_conv2d""" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + deformable_groups = attrs.deformable_groups + groups = attrs.groups + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype + with target: + out = topi.nn.deformable_conv2d_nchw(inputs[0], inputs[1], inputs[2], strides, padding, + dilation, deformable_groups, groups, out_dtype) + return [out] + +@reg.register_schedule("nn.deformable_conv2d") +def schedule_deformable_conv2d(attrs, outs, 
target): + """Schedule definition of deformable_conv2d""" + with target: + return topi.generic.schedule_deformable_conv2d_nchw(outs) + +reg.register_pattern("nn.deformable_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ad8b287bb3973..ca92f70a6bf64 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1,3 +1,4 @@ +#pylint: disable=invalid-name, too-many-lines """Neural network operations.""" from __future__ import absolute_import as _abs from ...expr import TupleWrapper @@ -862,6 +863,72 @@ def contrib_conv2d_winograd_without_weight_transform(data, kernel_layout, out_layout, out_dtype) +def contrib_conv2d_winograd_nnpack_without_weight_transform(data, + weight, + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1), + groups=1, + channels=None, + kernel_size=None, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + out_dtype=""): + r"""2D convolution with the NNPACK implementation of winograd algorithm. + + The basic parameters are the same as the ones in vanilla conv2d. + It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_nnpack_weight_transform + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + weight : tvm.relay.Expr + The weight expressions. + + strides : tuple of int, optional + The strides of convoltution. + + padding : tuple of int, optional + The padding of convolution on both sides of inputs before convolution. + + dilation : tuple of int, optional + Specifies the dilation rate to be used for dilated convolution. + + groups : int, optional + Number of groups for grouped convolution. + + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial of the convolution kernel. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : str, optional + Layout of the output, by default, out_layout is the same as data_layout + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_conv2d_winograd_nnpack_without_weight_transform( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype) + + def contrib_conv2d_nchwc(data, kernel, strides=(1, 1), @@ -1013,3 +1080,101 @@ def contrib_conv2d_winograd_weight_transform(weight, The computed result. """ return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size) + + +def contrib_conv2d_winograd_nnpack_weight_transform(weight, + convolution_algorithm, + out_dtype=""): + r"""Weight Transformation part for 2D convolution with winograd algorithm. + + We separate this as a single op to enable pre-compute for inference. + Use this together with nn.contrib_conv2d_winograd_without_weight_transform + + Parameters + ---------- + weight : tvm.relay.Expr + The weight expressions. + + convolution_algorithm : int + The Tile size of winograd. E.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) + + Returns + ------- + result : tvm.relay.Expr + The computed result. 
+ """ + return _make.contrib_conv2d_winograd_nnpack_weight_transform( + weight, convolution_algorithm, out_dtype) + + +def deformable_conv2d(data, + offset, + weight, + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1), + deformable_groups=1, + groups=1, + channels=None, + kernel_size=None, + data_layout='NCHW', + kernel_layout='OIHW', + out_layout='', + out_dtype=''): + r""" Deformable 2d convolution. + + The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + offset : tvm.relay.Expr + The offset expressions. + + weight : tvm.relay.Expr + The weight expressions. + + strides : tuple of int, optional + The strides of convoltution. + + padding : tuple of int, optional + The padding of convolution on both sides of inputs before convolution. + + dilation : tuple of int, optional + Specifies the dilation rate to be used for dilated convolution. + + deformable_groups : int, optional + Number of deformable groups. + + groups : int, optional + Number of groups for grouped convolution. + + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial of the convolution kernel. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : str, optional + Layout of the output, by default, out_layout is the same as data_layout + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + + """ + return _make.deformable_conv2d(data, offset, weight, strides, padding, dilation, + deformable_groups, groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 5fa83bd96f30d..5f9b38a56aa6c 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -18,6 +18,11 @@ class Conv2DWinogradWeightTransformAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_weight_transform""" +@register_relay_attr_node +class Conv2DWinogradNNPACKWeightTransformAttrs(Attrs): + """Attribute of nn.contrib_conv2d_winograd_nnpack_weight_transform""" + + @register_relay_attr_node class GlobalPool2DAttrs(Attrs): """Attribute of nn.global_pool""" diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 26f00c5c5e6d2..6cf104ab388a5 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -29,7 +29,6 @@ def define_list_hd(self): x = Var("x", self.l(a)) y = Var("y") z = Var("z") - # Don't match nil() since it will break type checking cons_case = Clause(PatternConstructor(self.cons, [PatternVar(y), PatternVar(z)]), y) self.mod[self.hd] = Function([x], Match(x, [cons_case]), a, [a]) @@ -43,9 +42,8 @@ def define_list_tl(self): x = Var("x", self.l(a)) y = Var("y") z = Var("z") - nil_case = Clause(PatternConstructor(self.nil, []), self.nil()) cons_case = Clause(PatternConstructor(self.cons, [PatternVar(y), PatternVar(z)]), z) - self.mod[self.tl] = Function([x], Match(x, [nil_case, cons_case]), self.l(a), [a]) + self.mod[self.tl] = Function([x], Match(x, [cons_case]), self.l(a), [a]) def define_list_nth(self): """Defines a function to get the nth element of a list. 
@@ -62,6 +60,25 @@ def define_list_nth(self): s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), self.nth(self.tl(x), y)) self.mod[self.nth] = Function([x, n], Match(n, [z_case, s_case]), a, [a]) + def define_list_update(self): + """Defines a function to update the nth element of a list and return the updated list. + + update(l, i, v) : list[a] -> nat -> a -> list[a] + """ + self.update = GlobalVar("update") + a = TypeVar("a") + l = Var("l", self.l(a)) + n = Var("n", self.nat()) + v = Var("v", a) + + y = Var("y") + + z_case = Clause(PatternConstructor(self.z), self.cons(v, self.tl(l))) + s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), + self.cons(self.hd(l), self.update(self.tl(l), y, v))) + + self.mod[self.update] = Function([l, n, v], Match(n, [z_case, s_case]), self.l(a), [a]) + def define_list_map(self): """Defines a function for mapping a function over a list's elements. That is, map(f, l) returns a new list where @@ -470,6 +487,7 @@ def __init__(self, mod): self.define_nat_add() self.define_list_length() self.define_list_nth() + self.define_list_update() self.define_list_sum() self.define_tree_adt() diff --git a/python/tvm/relay/test_network.py b/python/tvm/relay/test_network.py new file mode 100644 index 0000000000000..b84895503b784 --- /dev/null +++ b/python/tvm/relay/test_network.py @@ -0,0 +1,93 @@ +from .network import Network +from tvm import relay +from tvm.relay import op, var, Var, Function, Clause, PatternConstructor, PatternVar, Match +from tvm.relay import TupleGetItem, Tuple, TensorType, TupleType + +class Linear(Network): + def build_impl(self, input_size, output_size, dtype="float32"): + x = self.input(var("linear_input", shape=(1, input_size), dtype=dtype)) + w = self.weight(var("linear_weight", shape=(output_size, input_size), dtype=dtype)) + b = self.weight(var("linear_bias", shape=(output_size,), dtype=dtype)) + return op.add(op.nn.dense(x, w), b) + +def lam(names, func): + args = [Var(name) for name in names] + return Function(args, func(*args)) + +class LSTMCell(Network): + def build_impl(self, input_size, memory_size, dtype="float32"): + t = TensorType(shape=(1, memory_size), dtype=dtype) + i = self.input(var("lstmcell_input", shape=(1, input_size), dtype=dtype)) + c = self.input(Var("lstmcell_children", self.p.l(TupleType([t, t])))) + sum = lam(["x", "y"], lambda x, y: x + y) + child_h_sum = self.p.foldl(sum, + op.zeros(shape=(1, memory_size), dtype=dtype), + self.p.map(lam(["z"], lambda z: TupleGetItem(z, 1)), c)) + ioux = Linear(input_size=input_size, output_size=memory_size * 3)(i) + iouh = Linear(input_size=memory_size, output_size=memory_size * 3)(child_h_sum) + iou = ioux + iouh + fx = Linear(input_size=input_size, output_size=memory_size)(i) + fh = Linear(input_size=memory_size, output_size=memory_size) + i, o, u = op.split(iou, 3, axis=1) + i, o, u = op.sigmoid(i), op.sigmoid(o), op.tanh(u) + def foreach_children(children): + f = op.sigmoid(fh(TupleGetItem(children, 1)) + fx) + return f * TupleGetItem(children, 0) + c = self.p.foldl(sum, i * u, self.p.map(lam(["z"], foreach_children), c)) + return Tuple([c, o * op.tanh(c)]) + +class LSTMEncoder(Network): + def build_impl(self, input_size, memory_size, dtype="float32"): + l = self.input(Var("l", self.p.l(TensorType(shape=(1, input_size), dtype=dtype)))) + cell = LSTMCell(input_size=input_size, memory_size=memory_size, dtype=dtype) + return self.p.foldl(lam(["c", "x"], lambda c, x: cell(x, self.p.cons(c, self.p.nil()))), + Tuple([op.zeros(shape=(1, memory_size), dtype=dtype), + 
op.zeros(shape=(1, memory_size), dtype=dtype)]), l) + +class LSTMTransformer(Network): + def build_impl(self, input_size, memory_size, dtype="float32"): + l = self.input(Var("l", self.p.l(TensorType(shape=(1, input_size), dtype=dtype)))) + def f(c, x): + cell = LSTMCell(input_size=input_size, memory_size=memory_size, dtype=dtype) + o = cell(x, self.p.cons(c, self.p.nil())) + return Tuple([o, TupleGetItem(o, 1)]) + res = self.p.map_accuml(lam(["c", "x"], f), + Tuple([op.zeros(shape=(1, memory_size), dtype=dtype), + op.zeros(shape=(1, memory_size), dtype=dtype)]), + l) + return Tuple([TupleGetItem(TupleGetItem(res, 0), 1), TupleGetItem(res, 1)]) + +class TreeLSTM(Network): + def build_impl(self, input_size, memory_size, dtype="float32"): + t = TensorType(shape=(1, memory_size), dtype=dtype) + self.ret_type = TupleType([t, t]) + tree_type = self.p.tree(TensorType(shape=(1, input_size), dtype=dtype)) + t = self.input(Var("tlstm_input", tree_type)) + i = Var("i", TensorType(shape=(1, input_size), dtype=dtype)) + c = Var("c", self.p.l(tree_type)) + cell = LSTMCell(input_size=input_size, memory_size=memory_size, dtype=dtype) + rose_case = Clause(PatternConstructor(self.p.rose, [PatternVar(i), PatternVar(c)]), + cell(i, self.p.map(lam(["x"], self), c))) + return Match(t, [rose_case]) + +class BiLSTM(Network): + def build_impl(self, input_size, memory_size, dtype="float32"): + l = self.input(Var("l", self.p.l(TensorType(shape=(1, input_size), dtype=dtype)))) + def LSTM(l): + return LSTMTransformer(input_size=input_size, + memory_size=memory_size, + dtype=dtype)(l) + fwd = LSTM(l) + rev = LSTM(self.p.rev(l)) + lhs = op.concatenate([TupleGetItem(fwd, 0), TupleGetItem(rev, 0)], axis=1) + t = TensorType(shape=(1, memory_size), dtype=dtype) + x = Var("x", TupleType([t, t])) # cannot infer here + rhs = self.p.map(Function([x], op.concatenate([TupleGetItem(x, 0), + TupleGetItem(x, 1)], + axis=1)), + self.p.zip(TupleGetItem(fwd, 1), TupleGetItem(rev, 1))) + return Tuple([lhs, rhs]) + +# t = BiLSTM(input_size=128, memory_size=256) +# print("type of BidirectionalLSTM, with input_size=128, memory_size=256, is:") +# print(t.interface_type()) diff --git a/python/tvm/relay/testing/mlp.py b/python/tvm/relay/testing/mlp.py index 7d7d984f75263..3eac0a8f26540 100644 --- a/python/tvm/relay/testing/mlp.py +++ b/python/tvm/relay/testing/mlp.py @@ -50,7 +50,7 @@ def get_net(batch_size, dtype=dtype) data = relay.nn.batch_flatten(data) fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=128) - fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias")) + fc1 = relay.nn.bias_add(fc1, relay.var("fc1_bias")) act1 = relay.nn.relu(fc1) fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=64) fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias")) diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py index effe19808a59d..2bbbb46fe2331 100644 --- a/python/tvm/relay/testing/tf.py +++ b/python/tvm/relay/testing/tf.py @@ -13,7 +13,7 @@ import tensorflow as tf from tensorflow.core.framework import graph_pb2 -from tvm.contrib import util +from tvm.contrib.download import download_testdata ###################################################################### # Some helper functions @@ -136,7 +136,7 @@ def id_to_string(self, node_id): return '' return self.node_lookup[node_id] -def get_workload_official(model_url, model_sub_path, temp_dir): +def get_workload_official(model_url, model_sub_path): """ Import workload from tensorflow official Parameters @@ -158,21 +158,17 @@ def get_workload_official(model_url, 
model_sub_path, temp_dir): """ model_tar_name = os.path.basename(model_url) - - from mxnet.gluon.utils import download - temp_path = temp_dir.relpath("./") - path_model = temp_path + model_tar_name - - download(model_url, path_model) + model_path = download_testdata(model_url, model_tar_name, module=['tf', 'official']) + dir_path = os.path.dirname(model_path) import tarfile - if path_model.endswith("tgz") or path_model.endswith("gz"): - tar = tarfile.open(path_model) - tar.extractall(path=temp_path) + if model_path.endswith("tgz") or model_path.endswith("gz"): + tar = tarfile.open(model_path) + tar.extractall(path=dir_path) tar.close() else: - raise RuntimeError('Could not decompress the file: ' + path_model) - return temp_path + model_sub_path + raise RuntimeError('Could not decompress the file: ' + model_path) + return os.path.join(dir_path, model_sub_path) def get_workload(model_path, model_sub_path=None): """ Import workload from frozen protobuf @@ -192,24 +188,18 @@ def get_workload(model_path, model_sub_path=None): """ - temp = util.tempdir() if model_sub_path: - path_model = get_workload_official(model_path, model_sub_path, temp) + path_model = get_workload_official(model_path, model_sub_path) else: repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/' - model_name = os.path.basename(model_path) model_url = os.path.join(repo_base, model_path) - - from mxnet.gluon.utils import download - path_model = temp.relpath(model_name) - download(model_url, path_model) + path_model = download_testdata(model_url, model_path, module='tf') # Creates graph from saved graph_def.pb. with tf.gfile.FastGFile(path_model, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) graph = tf.import_graph_def(graph_def, name='') - temp.remove() return graph_def ####################################################################### @@ -292,7 +282,7 @@ def _get_feed_dict(input_name, input_data): def _create_ptb_vocabulary(data_dir): """Read the PTB sample data input to create vocabulary""" - data_path = data_dir+'simple-examples/data/' + data_path = os.path.join(data_dir, 'simple-examples/data/') file_name = 'ptb.train.txt' def _read_words(filename): """Read the data for creating vocabulary""" @@ -341,13 +331,10 @@ def get_workload_ptb(): ptb_model_file = 'RNN/ptb/ptb_model_with_lstmblockcell.pb' import tarfile - from tvm.contrib.download import download - DATA_DIR = './ptb_data/' - if not os.path.exists(DATA_DIR): - os.mkdir(DATA_DIR) - download(sample_url, DATA_DIR+sample_data_file) - t = tarfile.open(DATA_DIR+sample_data_file, 'r') - t.extractall(DATA_DIR) - - word_to_id, id_to_word = _create_ptb_vocabulary(DATA_DIR) + file_path = download_testdata(sample_url, sample_data_file, module=['data', 'ptb_data']) + dir_path = os.path.dirname(file_path) + t = tarfile.open(file_path, 'r') + t.extractall(dir_path) + + word_to_id, id_to_word = _create_ptb_vocabulary(dir_path) return word_to_id, id_to_word, get_workload(ptb_model_file) diff --git a/rust/.rustfmt.toml b/rust/.rustfmt.toml index 9e52f9efacc8e..9f852c00254c0 100644 --- a/rust/.rustfmt.toml +++ b/rust/.rustfmt.toml @@ -45,7 +45,6 @@ use_field_init_shorthand = false force_explicit_abi = true condense_wildcard_suffixes = false color = "Auto" -required_version = "1.0.1" unstable_features = false disable_all_formatting = false skip_children = false diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc index 6f4d3cfb53bb2..e7bc7e74b6754 100644 --- 
a/src/arithmetic/detect_linear_equation.cc +++ b/src/arithmetic/detect_linear_equation.cc @@ -127,25 +127,21 @@ Array DetectLinearEquation(const Expr& e, const Array& vars) { Expr base = e; Array coeff; - if (0 == vars.size()) { - coeff.push_back(make_const(Int(32), 1)); - } else { - for (Var v : vars) { - LinearEqEntry ret; - if (!LinearEqDetector(v).Detect(base, &ret)) { - return Array(); - } - coeff.push_back(ret.coeff); - base = std::move(ret.base); + for (Var v : vars) { + LinearEqEntry ret; + if (!LinearEqDetector(v).Detect(base, &ret)) { + return Array(); } + coeff.push_back(ret.coeff); + base = std::move(ret.base); + } - std::unordered_set vset; - for (size_t i = vars.size(); i != 1; --i) { - vset.insert(vars[i - 1].get()); - // The previous coeff contains the variable - if (ExprUseVar(coeff[i - 2], vset)) { - return Array(); - } + std::unordered_set vset; + for (size_t i = vars.size(); i > 1; --i) { + vset.insert(vars[i - 1].get()); + // The previous coeff contains the variable + if (ExprUseVar(coeff[i - 2], vset)) { + return Array(); } } coeff.push_back(base); diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc index 8112beef75515..ebf8f3c1db4a1 100644 --- a/src/arithmetic/modular_set.cc +++ b/src/arithmetic/modular_set.cc @@ -36,6 +36,18 @@ struct ModularSetAnalyzer::Entry { int64_t coeff{1}; int64_t base{0}; + Entry() = default; + + Entry(int64_t coeff, int64_t base) { + CHECK_GE(coeff, 0); + this->coeff = coeff; + if (coeff != 0) { + base = base % coeff; + if (base < 0) base += coeff; + } + this->base = base; + } + bool is_const() const { return coeff == 0; } @@ -53,10 +65,7 @@ class ModularSetAnalyzer::Impl : if (!override) { CHECK(!var_map_.count(var)); } - Entry e; - e.coeff = info->coeff; - e.base = info->base; - var_map_[var] = e; + var_map_[var] = Entry(info->coeff, info->base); } // Detect useful constraints and use them in the analysis scope. 
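# A rough Python model (illustration only, not the C++ API) of the Entry
# constructor added above: an entry (coeff, base) stands for every integer of
# the form coeff * k + base, and base is folded into [0, coeff) whenever
# coeff is non-zero, so each modular set has one canonical representative.
def make_entry(coeff, base):
    assert coeff >= 0
    if coeff != 0:
        # Python's % already returns a value in [0, coeff) for coeff > 0,
        # which is what the C++ constructor computes explicitly.
        base = base % coeff
    return (coeff, base)

assert make_entry(4, -1) == (4, 3)   # -1 and 3 are equal mod 4
assert make_entry(4, 9) == (4, 1)
assert make_entry(0, 7) == (0, 7)    # coeff == 0 encodes the constant 7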
@@ -65,9 +74,7 @@ class ModularSetAnalyzer::Impl : PVar coeff, base; // pattern match interesting constraints if (((var % coeff) == base).Match(constraint)) { - Entry entry; - entry.coeff = coeff.Eval()->value; - entry.base = base.Eval()->value; + Entry entry(coeff.Eval()->value, base.Eval()->value); return UpdateByIntersect(var.Eval(), entry); } return nullptr; @@ -83,18 +90,12 @@ class ModularSetAnalyzer::Impl : } Entry VisitExpr_(const IntImm* op) final { - Entry ret; - ret.base = op->value; - ret.coeff = 0; - return ret; + return Entry(0, op->value); } Entry VisitExpr_(const UIntImm* op) final { if (op->value < std::numeric_limits::max()) { - Entry ret; - ret.base = static_cast(op->value); - ret.coeff = 0; - return ret; + return Entry(0, static_cast(op->value)); } else { return Everything(); } @@ -103,19 +104,15 @@ class ModularSetAnalyzer::Impl : Entry VisitExpr_(const Add* op) final { Entry a = VisitExpr(op->a); Entry b = VisitExpr(op->b); - Entry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base + b.base, ret.coeff); - return ret; + int64_t coeff = ZeroAwareGCD(a.coeff, b.coeff); + return Entry(coeff, a.base + b.base); } Entry VisitExpr_(const Sub* op) final { Entry a = VisitExpr(op->a); Entry b = VisitExpr(op->b); - Entry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base - b.base, ret.coeff); - return ret; + int64_t coeff = ZeroAwareGCD(a.coeff, b.coeff); + return Entry(coeff, a.base - b.base); } Entry VisitExpr_(const Mul* op) final { @@ -128,10 +125,8 @@ class ModularSetAnalyzer::Impl : int64_t pq = a.coeff * b.coeff; int64_t pm = a.coeff * b.base; int64_t qn = a.base * b.coeff; - Entry ret; - ret.coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); - ret.base = BaseSimplify(a.base * b.base, ret.coeff); - return ret; + int64_t coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); + return Entry(coeff, a.base * b.base); } Entry DivByConst(const Expr& lhs, @@ -140,20 +135,15 @@ class ModularSetAnalyzer::Impl : Entry a = VisitExpr(lhs); CHECK_NE(val, 0); if (a.coeff % val == 0) { - Entry ret; if (a.base == 0) { // a c x / c -> a x - ret.coeff = std::abs(a.coeff / val); - ret.base = 0; - return ret; + return Entry(std::abs(a.coeff / val), 0); } // positive division have a clear rounding mode. // Only handle case where we clearly know we need to round down. if (a.base > 0 && val > 0 && (round_down || parent_->CanProveGreaterEqual(lhs, 0))) { - ret.coeff = a.coeff / val; - ret.base = a.base / val; - return ret; + return Entry(a.coeff / val, a.base / val); } } return Everything(); @@ -251,41 +241,80 @@ class ModularSetAnalyzer::Impl : } int64_t base0 = a.base % coeff; int64_t base1 = b.base % coeff; - Entry ret; if (base0 == base1) { - ret.coeff = coeff; - ret.base = base0; - return ret; + return Entry(coeff, base0); } else { - ret.coeff = ZeroAwareGCD(ZeroAwareGCD(base0, base1), coeff); - ret.base = 0; - return ret; + return Entry(ZeroAwareGCD(ZeroAwareGCD(base0, base1), coeff), base0); } } + /*! + * \brief Use Extended Euclidean algorithm to solve ax + by = gcd(a, b) + * \param a The first coefficient. + * \param b The second coefficient. + * \param x The solution of x. + * \param y The solution of y. + * \return The GCD of a and b. 
+ */ + static int64_t ExtendedEuclidean(int64_t a, int64_t b, int64_t* x, int64_t* y) { + // Extended Euclidean algorithm + // if a < 0, the problem can be convert into + // |a|* (-x) + b * y = gcd(|a|, b) + // + // initial condition: + // a * 0 + b * 1 = b + // a * 1 + b * 0 = a + int64_t s = 0, old_s = 1; + int64_t r = b, old_r = a >= 0 ? a : -a; + // Iteration (r2 < r1): + // a * x1 + b * y1 = r1 + // a * x2 + b * y2 = r2 + // The above two eqs can derive the following eq (q = r1 / r2) + // a * (x1 - x2 * q) + b * (y1 - y2 * q) = r1 - r2 * q = r3 + // Because r3 < r2, the iteration can eventually terminate + while (r != 0) { + int64_t q = old_r / r; + int64_t tmp = old_r; + old_r = r; + r = tmp - q * r; + tmp = old_s; + old_s = s; + s = tmp - q * s; + } + + *x = a >= 0 ? old_s : -old_s; + if (b != 0) { + *y = (old_r - (*x) * a) / b; + } else { + *y = 1; + } + + return old_r; + } /*! * \brief Create interect of two sets. * \param a The left operand. * \param b the right operand. */ static Entry Intersect(Entry a, Entry b) { - // simple rule for now: pick higher constraints. - // TODO(team-team): Use extended euclidean algorithm. - if (a.coeff == 0) return a; - if (b.coeff == 0) return b; - if (a.coeff >= b.coeff) return a; - return b; - } - /*! - * \brief Simplify base so that it is in [0, coeff) when coeff != 0. - * \param base The base value. - * \param coeff The coeff value. - * \return The simplified base. - */ - static int64_t BaseSimplify(int64_t base, int64_t coeff) { - if (coeff == 0) return base; - base = base % coeff; - if (base < 0) base += coeff; - return base; + int64_t x, y; + int64_t c1 = a.coeff, b1 = a.base, c2 = b.coeff, b2 = b.base; + // z = c1 * p + b1 + // z = c2 * q + b2 + // c1 * x + c2 * y = gcd(c1, c2) + // -> c1 * p - c2 * q = b2 - b1 + // -> p = (b2 - b1) / gcd * x + // -> q = (b2 - b1) / gcd * (-y) + // -> z = LCM(x, y) * k + (c1 * p + b1) + int64_t gcd = ExtendedEuclidean(c1, c2, &x, &y); + int64_t v = b2 - b1; + if (v % gcd == 0) { + x = v / gcd * x; + y = v / gcd * (-y); + int64_t coeff = c1 / gcd * c2; + return Entry(coeff, x * c1 + b1); + } else { + return Nothing(); + } } /*! * \brief Take GCD of a and b. @@ -311,9 +340,14 @@ class ModularSetAnalyzer::Impl : * \return Bound that represent everything dtype can represent. */ static Entry Everything() { - Entry ret; - ret.coeff = 1; ret.base = 0; - return ret; + return Entry(1, 0); + } + /*! + * \brief return an empty set + * \return Bound that represent everything dtype can represent. 
+ */ + static Entry Nothing() { + return Entry(0, 1); } }; diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc index 17f8e010f3936..f031e094d84af 100644 --- a/src/arithmetic/rewrite_simplify.cc +++ b/src/arithmetic/rewrite_simplify.cc @@ -96,6 +96,8 @@ class RewriteSimplifier::Impl : public IRMutator { kEQ, kGT, kLT, + kGE, + kLE, kNE }; // reference to the main analyzer @@ -140,6 +142,12 @@ class RewriteSimplifier::Impl : public IRMutator { if (dbound->max_value < val) { return kLT; } + if (dbound->min_value >= val) { + return kGE; + } + if (dbound->max_value <= val) { + return kLE; + } return kUnknown; } @@ -994,12 +1002,10 @@ Mutate_(const EQ* op, const Expr& self) { if (IsIndexType(op->a.type())) { CompareResult result = TryCompare(op->a - op->b, 0); - if (result != kUnknown) { - if (result == kEQ) { - return make_const(op->type, true); - } else { - return make_const(op->type, false); - } + if (result == kEQ) { + return make_const(op->type, true); + } else if (result == kNE || result == kGT || result == kLT) { + return make_const(op->type, false); } TVM_TRY_REWRITE(x - c1 == 0, x == c1); TVM_TRY_REWRITE(c1 - x == 0, x == c1); @@ -1055,7 +1061,7 @@ Mutate_(const LT* op, const Expr& self) { if (result == kLT) { return make_const(op->type, true); } - if (result == kEQ || result == kGT) { + if (result == kEQ || result == kGT || result == kGE) { return make_const(op->type, false); } diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 6b69f97a66fe5..66aec3e13d38d 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -4,9 +4,11 @@ */ #ifdef TVM_LLVM_VERSION // Part of the code are adapted from Halide's CodeGen_LLVM - #include #include + +#include + #include "codegen_llvm.h" #include "codegen_cpu.h" #include "../../pass/ir_util.h" @@ -410,12 +412,16 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LE(begin + extent, num_elems); - std::vector indices; + std::vector indices; + indices.reserve(extent); for (int i = 0; i < extent; ++i) { - indices.push_back(begin + i); + if (begin + i >= 0 && begin + i < num_elems) { + indices.push_back(llvm::ConstantInt::get(t_int32_, begin + i)); + } else { + indices.push_back(llvm::UndefValue::get(t_int32_)); + } } - return builder_->CreateShuffleVector(vec, vec, indices); + return builder_->CreateShuffleVector(vec, vec, llvm::ConstantVector::get(indices)); } llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) { @@ -446,24 +452,31 @@ llvm::Value* CodeGenLLVM::CreateVecConcat(std::vector vecs) { v->getType()->getVectorNumElements()); } while (vecs.size() > 1) { - for (size_t i = 0; i < vecs.size(); i+=2) { - if (i + 1 >= vecs.size()) { - vecs[i / 2] = vecs[i]; continue; - } + std::vector new_vecs; + for (size_t i = 0; i < vecs.size() - 1; i += 2) { llvm::Value* lhs = vecs[i]; llvm::Value* rhs = vecs[i + 1]; - int lanes = static_cast(std::max( - lhs->getType()->getVectorNumElements(), - rhs->getType()->getVectorNumElements())); - lhs = CreateVecPad(lhs, lanes); - rhs = CreateVecPad(lhs, lanes); + const size_t lhs_lanes = lhs->getType()->getVectorNumElements(); + const size_t rhs_lanes = rhs->getType()->getVectorNumElements(); + if (lhs_lanes < rhs_lanes) { + lhs = CreateVecPad(lhs, rhs_lanes); + } else if (rhs_lanes < 
lhs_lanes) { + rhs = CreateVecPad(rhs, lhs_lanes); + } + const size_t shared_lanes = std::max(lhs_lanes, rhs_lanes); std::vector mask; - for (int i = 0; i < lanes * 2; ++i) { + for (size_t i = 0; i < lhs_lanes; ++i) { mask.push_back(i); } - vecs[i / 2] = builder_->CreateShuffleVector(lhs, rhs, mask); + for (size_t i = 0; i < rhs_lanes; ++i) { + mask.push_back(shared_lanes + i); + } + new_vecs.push_back(builder_->CreateShuffleVector(lhs, rhs, mask)); + } + if (vecs.size() % 2 != 0) { + new_vecs.push_back(vecs.back()); } - vecs.resize((vecs.size() + 1) / 2); + vecs.swap(new_vecs); } return CreateVecSlice(vecs[0], 0, total_lanes); } diff --git a/src/codegen/llvm/codegen_x86_64.cc b/src/codegen/llvm/codegen_x86_64.cc new file mode 100644 index 0000000000000..3184a830daf4c --- /dev/null +++ b/src/codegen/llvm/codegen_x86_64.cc @@ -0,0 +1,131 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file codegen_x86_64.cc + * \brief X86-64 specific code generator + */ +#ifdef TVM_LLVM_VERSION +#include "codegen_cpu.h" + +#include "llvm/MC/MCSubtargetInfo.h" + +namespace tvm { +namespace codegen { + +namespace { +bool TargetHasFeature(const llvm::TargetMachine& tm, const std::string& feature) { + // MCSubTargetInfo::checkFeatures was added in LLVM 6.0 +#if TVM_LLVM_VERSION >= 60 + const auto* MCInfo = tm.getMCSubtargetInfo(); + return MCInfo->checkFeatures(std::string("+") + feature); +#else + return false; + // TODO(tulloch) - enable this block, need to figure out how to reimplement + // this given visibility constraints, similar to + // https://github.com/rust-lang/rust/pull/31709 + + // Copied from + // https://github.com/llvm-mirror/llvm/blob/5136df4/lib/MC/MCSubtargetInfo.cpp#L78-L88. + + // auto checkFeatures = [&](const std::string FS) { + // llvm::SubtargetFeatures T(FS); + // llvm::FeatureBitset Set, All; + // for (std::string F : T.getFeatures()) { + // llvm::SubtargetFeatures::ApplyFeatureFlag(Set, F, MCInfo->ProcFeatures); + // if (F[0] == '-') { + // F[0] = '+'; + // } + // llvm::SubtargetFeatures::ApplyFeatureFlag(All, F, MCInfo->ProcFeatures); + // } + // return (MCInfo->getFeatureBits() & All) == Set; + // }; + // return checkFeatures(MCInfo, std::string("+") + feature); +#endif +} +} // namespace + +class CodeGenX86_64 final : public CodeGenCPU { + public: + llvm::Value* VisitExpr_(const Cast* op) override; + + private: + llvm::Value* CallVectorIntrin(llvm::Intrinsic::ID id, size_t intrin_lanes, llvm::Type* result_ty, + const std::vector& args); +}; + +llvm::Value* CodeGenX86_64::VisitExpr_(const Cast* op) { + // LLVM does not automatically generate the correct instruction sequences for + // half -> float conversion (i.e. using AVX2/AVX-512 vectorized variants of + // vcvtph2ps), so we explicitly generate them ourselves. 
+ const auto from = op->value.type(); + const auto to = op->type; + if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) { + CHECK_EQ(from.lanes(), to.lanes()); + CHECK_NOTNULL(target_machine_); + + const auto has_f16c = TargetHasFeature(*target_machine_, "f16c"); + const auto has_avx512 = TargetHasFeature(*target_machine_, "avx512f"); + + if (from.lanes() >= 16 && has_avx512) { + return CallVectorIntrin( + ::llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16, LLVMType(Float(32, from.lanes())), + { + MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, + ir::Call::PureIntrinsic)), + MakeValue(ir::Broadcast::make(ir::FloatImm::make(Float(32), 0), from.lanes())), + /*mask=*/MakeValue(ir::IntImm::make(Int(16), -1)), + /*rounding-mode=*/MakeValue(ir::IntImm::make(Int(32), 4)), + }); + } + + if (from.lanes() >= 8 && has_f16c) { + return CallVectorIntrin( + ::llvm::Intrinsic::x86_vcvtph2ps_256, 8, LLVMType(Float(32, from.lanes())), + {MakeValue(ir::Call::make(Int(16, from.lanes()), ir::Call::reinterpret, {op->value}, + ir::Call::PureIntrinsic))}); + } + } + + return CodeGenCPU::VisitExpr_(op); +} + +llvm::Value* CodeGenX86_64::CallVectorIntrin(llvm::Intrinsic::ID id, size_t intrin_lanes, + llvm::Type* result_ty, + + const std::vector& args) { + llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), id, {}); + if (intrin_lanes == result_ty->getVectorNumElements()) { + return builder_->CreateCall(f, args); + } + + // Otherwise, we split the vector into intrin_lanes sized elements (widening where necessary), + // compute each result, and then concatenate the vectors (slicing the result if necessary). + CHECK_LT(intrin_lanes, result_ty->getVectorNumElements()); + std::vector split_results; + for (size_t i = 0; + i < static_cast(result_ty->getVectorNumElements()); + i += intrin_lanes) { + std::vector split_args; + for (const auto& v : args) { + if (v->getType()->isVectorTy()) { + CHECK_EQ(v->getType()->getVectorNumElements(), result_ty->getVectorNumElements()); + split_args.push_back(CreateVecSlice(v, i, intrin_lanes)); + } else { + split_args.push_back(v); + } + } + split_results.push_back(CallVectorIntrin( + id, intrin_lanes, llvm::VectorType::get(result_ty->getScalarType(), intrin_lanes), + split_args)); + } + return CreateVecSlice(CreateVecConcat(split_results), 0, result_ty->getVectorNumElements()); +} + +TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_x86-64") +.set_body([](const TVMArgs& targs, TVMRetValue* rv) { + CodeGenLLVM* cg = new CodeGenX86_64(); + *rv = static_cast(cg); + }); + +} // namespace codegen +} // namespace tvm +#endif // TVM_LLVM_VERSION diff --git a/src/common/socket.h b/src/common/socket.h index fafff97b25226..5c39f409d8e21 100644 --- a/src/common/socket.h +++ b/src/common/socket.h @@ -65,7 +65,7 @@ struct SockAddr { memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; hints.ai_flags = AI_PASSIVE; - hints.ai_protocol = SOCK_STREAM; + hints.ai_socktype = SOCK_STREAM; addrinfo *res = NULL; int sig = getaddrinfo(host, NULL, &hints, &res); CHECK(sig == 0 && res != NULL) diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc index 887129819bc2e..538d29333b4a0 100644 --- a/src/contrib/nnpack/convolution.cc +++ b/src/contrib/nnpack/convolution.cc @@ -189,20 +189,20 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra CHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { - nnp_status status = nnp_convolution_inference( - 
algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels, - input_size, input_padding, kernel_size, stride_size, - static_cast(input->data) + n * input->shape[1] * - input->shape[2] * - input->shape[3], - static_cast(transformed_kernel->data), - bias ? static_cast(bias->data) : zero_bias->data(), - static_cast(output->data) + n * output->shape[1] * - output->shape[2] * - output->shape[3], - workspace_buffer, &workspace_size, - nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + nnp_status status = nnp_convolution_inference( + algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels, + input_size, input_padding, kernel_size, stride_size, + static_cast(input->data) + n * input->shape[1] * + input->shape[2] * + input->shape[3], + static_cast(transformed_kernel->data), + bias ? static_cast(bias->data) : zero_bias->data(), + static_cast(output->data) + n * output->shape[1] * + output->shape[2] * + output->shape[3], + workspace_buffer, &workspace_size, + nnp_activation_identity, nullptr, entry->threadpool, nullptr); + CHECK_EQ(status, nnp_status_success); } cpu_api->FreeWorkspace(ctx, workspace_buffer); diff --git a/src/pass/inject_copy_intrin.cc b/src/pass/inject_copy_intrin.cc index 7ca1d133bd2db..7dcfcfdae2399 100644 --- a/src/pass/inject_copy_intrin.cc +++ b/src/pass/inject_copy_intrin.cc @@ -39,7 +39,6 @@ class CopyIntrinInjector : public IRMutator { bool MatchCopyPattern(Stmt stmt, Stmt *out) { using namespace arith; Stmt body = stmt; - bool is_single_point_copy = false; // strip the loops std::vector loops; @@ -60,7 +59,6 @@ class CopyIntrinInjector : public IRMutator { const Cast* cast = store->value.as(); const Load* load = store->value.as(); if (0 == loops.size()) { - is_single_point_copy = true; CHECK(!has_cond); } // for now only support true condition matching @@ -83,9 +81,8 @@ class CopyIntrinInjector : public IRMutator { arith::DetectLinearEquation(load->index, loop_vars); if (load_strides.size() == 0 || store_strides.size() == 0) return false; Array dst_shape; - auto loop_var_size = loop_vars.size(); - if (is_single_point_copy) { - loop_var_size = 1; + const size_t loop_var_size = loop_vars.size(); + if (loop_var_size == 0) { dst_shape.push_back(make_const(Int(32), 1)); } else { for (const For* op : loops) { @@ -132,6 +129,10 @@ class CopyIntrinInjector : public IRMutator { CHECK_EQ(load_strides.size(), loop_var_size + 1); Array src_strides(load_strides.begin(), load_strides.begin() + loop_var_size); Array dst_strides(store_strides.begin(), store_strides.begin() + loop_var_size); + if (loop_var_size == 0) { + src_strides.push_back(make_const(Int(32), 1)); + dst_strides.push_back(make_const(Int(32), 1)); + } Buffer dst = BufferNode::make( Var(store->buffer_var.node_), store->value.type(), diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 3128d2a711595..77840c9be824a 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -270,16 +270,30 @@ class Interpreter : return TupleValueNode::make(values); } - Value VisitExpr_(const FunctionNode* func_node) final { - auto func = GetRef(func_node); + // TODO(@jroesch): this doesn't support mutual letrec. 
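[Editor's aside: illustrative sketch, not part of the patch. MakeClosure below ties the recursive knot by mutating the freshly built closure's environment so that a let-bound function can refer to itself. The standalone C++ below shows the same idea with std::function and a shared environment map; every name in it is ad hoc, not TVM API.]

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

int main() {
  // Environment shared (and mutated) by the closure, mirroring the captured
  // environment of a ClosureNode.
  using Env = std::map<std::string, std::function<int(int)>>;
  auto env = std::make_shared<Env>();

  // let fact = fn(n) { if n == 0 then 1 else n * fact(n - 1) }
  std::function<int(int)> fact = [env](int n) {
    return n == 0 ? 1 : n * (*env)["fact"](n - 1);
  };
  // Mutating the environment after construction makes the closure
  // self-referential, just as MakeClosure inserts the closure under
  // letrec_name after building it.
  (*env)["fact"] = fact;

  std::cout << (*env)["fact"](5) << std::endl;  // prints 120
  return 0;
}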
+ Value MakeClosure(const Function& func, const Var& letrec_name = Var()) { tvm::Map captured_mod; Array free_vars = FreeVars(func); for (const auto& var : free_vars) { - captured_mod.Set(var, Eval(var)); + // Evaluate the free var (which could be a function call) if it hasn't + // shown up in a letting binding that has invoked the function. + if (!letrec_name.defined() || letrec_name != var) { + captured_mod.Set(var, Eval(var)); + } } - return ClosureNode::make(captured_mod, func); + // We must use mutation here to build a self referential closure. + auto closure = ClosureNode::make(captured_mod, func); + auto mut_closure = + static_cast(const_cast(closure.get())); + mut_closure->env.Set(letrec_name, closure); + return closure; + } + + Value VisitExpr_(const FunctionNode* func_node) final { + auto func = GetRef(func_node); + return MakeClosure(func); } Value InvokePrimitiveOp(Function func, @@ -438,10 +452,16 @@ class Interpreter : } } - Value VisitExpr_(const LetNode* op) final { - auto value = Eval(op->value); - this->extend(op->var, value); - return Eval(op->body); + Value VisitExpr_(const LetNode* let) final { + if (auto func = let->value.as()) { + auto clo = MakeClosure(GetRef(func), let->var); + this->extend(let->var, clo); + } else { + auto value = Eval(let->value); + this->extend(let->var, value); + } + + return Eval(let->body); } Value VisitExpr_(const TupleGetItemNode* op) final { @@ -517,7 +537,7 @@ class Interpreter : CHECK_NE(cvn->constructor->tag, -1); if (op->constructor->tag == cvn->constructor->tag) { // todo(M.K.): should use ptr equality but it is broken - CHECK(op->patterns.size() == cvn->fields.size()); + CHECK_EQ(op->patterns.size(), cvn->fields.size()); for (size_t i = 0; i < op->patterns.size(); ++i) { if (!VisitPattern(op->patterns[i], cvn->fields[i])) { return false; diff --git a/src/relay/ir/error.cc b/src/relay/ir/error.cc index 24f8d1c49b6b1..bdde10daeca21 100644 --- a/src/relay/ir/error.cc +++ b/src/relay/ir/error.cc @@ -77,6 +77,9 @@ void ErrorReporter::RenderErrors(const Module& module, bool use_color) { for (auto pair : error_maps) { auto global = pair.first; auto err_map = pair.second; + for (auto x : err_map) { + std::cout << x.second << std::endl; + } auto func = module->Lookup(global); // We output the name of the function before displaying diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 29fe98ba78f5c..d6826d1a7c437 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -213,8 +213,7 @@ TVM_REGISTER_API("relay._make.Call") TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const CallNode* node, tvm::IRPrinter* p) { - p->stream << "CallNode(" << node->op << ", " << node->args << ", " - << node->attrs << ", " << node->type_args << ")"; + p->stream << "CallNode(" << node->op << ")"; }); Let LetNode::make(Var var, Expr value, Expr body) { @@ -330,5 +329,17 @@ TVM_REGISTER_API("relay._expr.TempExprRealize") *ret = temp->Realize(); }); +Annotate AnnotateNode::make(Expr expr, NodeRef annotation) { + NodePtr n = make_node(); + n->expr = std::move(expr); + n->annotation = std::move(annotation); + return Annotate(n); +} + +TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) +.set_dispatch([](const AnnotateNode* node, tvm::IRPrinter* p) { + p->stream << "AnnotateNode(" << node->expr << ")"; + }); + } // namespace relay } // namespace tvm diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 8d2163e0ecc8d..70bd080ad062c 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ 
-24,9 +24,6 @@ Expr ExprMutator::VisitExpr(const Expr& expr) { } Expr ExprMutator::VisitExpr_(const VarNode* op) { - // NOTE: var will only be mutated once - // Thanks to the memo and reused during rewriting if necessary. - // It is safe to assume that the if (op->type_annotation.defined()) { auto type = this->VisitType(op->type_annotation); if (!op->type_annotation.same_as(type)) { @@ -205,6 +202,10 @@ Pattern ExprMutator::VisitPattern(const Pattern& p) { return p; } Type ExprMutator::VisitType(const Type& t) { return t; } +Expr ExprMutator::VisitExpr_(const AnnotateNode* op) { + return AnnotateNode::make(VisitExpr(op->expr), op->annotation); +} + void ExprVisitor::VisitExpr(const Expr& expr) { auto it = visit_counter_.find(expr.get()); if (it != visit_counter_.end()) { @@ -299,6 +300,10 @@ void ExprVisitor::VisitExpr_(const MatchNode* op) { } } +void ExprVisitor::VisitExpr_(const AnnotateNode* op) { + this->VisitExpr(op->expr); +} + void ExprVisitor::VisitClause(const Clause& op) { this->VisitPattern(op->lhs); this->VisitExpr(op->rhs); diff --git a/src/relay/ir/pretty_printer.cc b/src/relay/ir/pretty_printer.cc index a030a056f7cd9..a770a2c4f0397 100644 --- a/src/relay/ir/pretty_printer.cc +++ b/src/relay/ir/pretty_printer.cc @@ -226,15 +226,55 @@ class PrettyPrinter : return Doc(unique_prefix); } + Doc Print(Kind k) { + switch (k) { + case kType: + return Doc("Type"); + case kShapeVar: + return Doc("Shape"); + case kBaseType: + return Doc("BaseType"); + case kConstraint: + return Doc("Constraint"); + case kAdtHandle: + return Doc("AdtHandle"); + case kTypeData: + return Doc("TypeData"); + default: + LOG(ERROR) << "Unknown Kind"; + throw; + } + } /*! - * \brief Allocate name to a variable. - * \param var The input variable. - * \return The corresponding name. - */ + * \brief Allocate name to a type variable. + * \param var The input type variable. + * \return The corresponding name. + */ + Doc AllocTypeVar(const TypeVar& var) { + std::string name = var->var->name_hint; + if (name.length() == 0 || !std::isalpha(name[0])) { + name = "t" + name; + } + Doc val = GetUniqueName("%" + name); + if (memo_type_.count(var)) { + val << "-malformed-ir"; + } + memo_type_[var] = val; + if (var->kind != kType) { + val << ": " << Print(var->kind); + } + return val; + } + + /*! + * \brief Allocate name to a variable. + * \param var The input variable. + * \return The corresponding name. + */ Doc AllocVar(const Var& var) { std::string name = var->name_hint(); // always make sure first name is alpha - if (name.length() != 0 && !std::isalpha(name[0])) { + if (name.length() == 0 || !std::isalpha(name[0])) { name = "v" + name; } Doc val = GetUniqueName("%" + name); @@ -368,12 +408,18 @@ class PrettyPrinter : } Doc PrintFunc(const Doc& prefix, const Function& fn) { - // TODO(tqchen, M.K.) 
support generic function - // Possibly through meta data - CHECK_EQ(fn->type_params.size(), 0U) - << "generic fn not yet supported"; Doc doc; - doc << prefix << "("; + doc << prefix; + if (fn->type_params.size() > 0) { + doc << "<"; + std::vector type_params; + for (const TypeVar& tv : fn->type_params) { + type_params.push_back(AllocTypeVar(tv)); + } + doc << PrintVec(type_params); + doc << ">"; + } + doc << "("; std::vector params; for (Var param : fn->params) { params.push_back(AllocVar(param)); @@ -416,11 +462,13 @@ class PrettyPrinter : Doc VisitExpr_(const CallNode* op) final { Doc doc; - doc << Print(op->op); + // visit args first so they are lifted before the op + // this places op closer to its call site std::vector args; for (Expr arg : op->args) { args.push_back(Print(arg)); } + doc << Print(op->op); return doc << "(" << PrintVec(args) << PrintAttrs(op->attrs, op->op) << ")"; } @@ -495,6 +543,10 @@ class PrettyPrinter : return Print(GetRef(node), true); } + Doc VisitType_(const TypeVarNode* node) final { + return AllocTypeVar(GetRef(node)); + } + Doc VisitType_(const TensorTypeNode* node) final { // scalar type if (node->shape.size() == 0) { @@ -514,6 +566,24 @@ class PrettyPrinter : return doc << "), " << PrintDType(node->dtype) << "]"; } + Doc VisitType_(const GlobalTypeVarNode* node) final { + Doc doc; + doc << node->var->name_hint; + return doc; + } + + Doc VisitType_(const TypeCallNode* node) final { + Doc doc = PrintType(node->func, false); + std::vector args; + for (const Type& t : node->args) { + args.push_back(PrintType(t, false)); + } + doc << "("; + doc << PrintVec(args); + doc << ")"; + return doc; + } + Doc VisitType_(const TupleTypeNode* node) final { std::vector fields; for (Type field : node->fields) { diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc index 25b7beb5356af..0aee5171dfac5 100644 --- a/src/relay/ir/type.cc +++ b/src/relay/ir/type.cc @@ -94,6 +94,7 @@ TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) }); TypeCall TypeCallNode::make(Type func, tvm::Array args) { + CHECK(func.as()); NodePtr n = make_node(); n->func = std::move(func); n->args = std::move(args); diff --git a/src/relay/ir/type_functor.h b/src/relay/ir/type_functor.h index 3f235d87ca212..16cdc31c4b0d3 100644 --- a/src/relay/ir/type_functor.h +++ b/src/relay/ir/type_functor.h @@ -58,6 +58,7 @@ class TypeFunctor { * \return The result of the call */ virtual R VisitType(const Type& n, Args... 
args) { + CHECK(n.defined()); static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index b53f57464e813..8c92a68132fa0 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -344,6 +344,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); +template bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -354,7 +355,7 @@ bool Conv2DWinogradRel(const Array& types, static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); - const Conv2DWinogradAttrs* param = attrs.as(); + const Param* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); @@ -467,7 +468,7 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") .add_argument("data", "Tensor", "The input tensor.") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv2DWinograd", Conv2DWinogradRel) +.add_type_rel("Conv2DWinograd", Conv2DWinogradRel) .set_attr("FInferCorrectLayout", Conv2DInferCorrectLayout); @@ -511,8 +512,8 @@ Expr MakeConv2DWinogradWeightTransform(Expr weight, TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_weight_transform") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinogradWeightTransform, args, rv); - }); + runtime::detail::unpack_call(MakeConv2DWinogradWeightTransform, args, rv); +}); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") @@ -530,6 +531,124 @@ weight transformation in advance. .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); +// Positional relay function to create conv2d winograd nnpack operator +// used by frontend FFI. +Expr MakeConv2DWinogradNNPACK(Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + auto attrs = make_node(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->groups = groups; + attrs->channels = channels; + attrs->kernel_size = std::move(kernel_size); + attrs->data_layout = std::move(data_layout); + attrs->kernel_layout = std::move(kernel_layout); + attrs->out_layout = std::move(out_layout); + attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_without_weight_transform"); + return CallNode::make(op, {data, weight}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_without_weight_transform") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeConv2DWinogradNNPACK, args, rv); +}); + +RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +.describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout. + This operator assumes the weight tensor is already pre-transformed by + nn.contrib_conv2d_winograd_nnpack_weight_transform. + +- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) +- **weight**: Any shape + We do not check the shape for this input tensor. 
Since different backend + has different layout strategy. + +- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.Conv2DAttrs") +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradNNPACKRel", Conv2DWinogradRel) +.set_attr("FInferCorrectLayout", Conv2DInferCorrectLayout); + +// relay.nn.contrib_conv2d_winograd_nnpack_weight_transform +TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); + +bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + return false; + } + + const Conv2DWinogradNNPACKWeightTransformAttrs* param = + attrs.as(); + CHECK(param != nullptr); + + CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + + std::vector oshape{ + data->shape[0], + data->shape[1], + 8, + 8, + }; + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + reporter->Assign(types[1], TensorTypeNode::make(Array(oshape), out_dtype)); + return true; +} + +Expr MakeConv2DWinogradNNPACKWeightTransform(Expr weight, + int convolution_algorithm, + DataType out_dtype) { + auto attrs = make_node(); + attrs->convolution_algorithm = convolution_algorithm; + attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_weight_transform"); + return CallNode::make(op, {weight}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_weight_transform") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeConv2DWinogradNNPACKWeightTransform, args, rv); +}); + +RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_weight_transform") +.describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. +Separate this into another symbol in order to enable Precompute Pass to compute the +weight transformation in advance. + +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) + +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); + + // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. 
Expr MakeConv2DNCHWc(Expr data, @@ -634,5 +753,148 @@ RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc") Conv2DInferCorrectLayout); +bool DeformableConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 4); + const auto* data = types[0].as(); + const auto* weight = types[2].as(); + + CHECK(data); + auto* param = attrs.as(); + CHECK_EQ(param->data_layout, "NCHW") << "data layout not supported."; + CHECK_EQ(param->kernel_layout, "OIHW") << "kernel_layout not supported."; + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x, ksize_y, ksize_x; + + // infer weight shape if kernel_size and channels are defiend + if (param->kernel_size.defined() && param->channels.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + Array wshape( + {param->channels, + data->shape[1] / param->groups, + param->kernel_size[0], + param->kernel_size[1]}); + channels = param->channels; + ksize_y = param->kernel_size[0]; + ksize_x = param->kernel_size[1]; + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + // assign result to reporter + reporter->Assign(types[2], TensorTypeNode::make(wshape, data->dtype)); + } else { + // use weight to infer the conv shape. + if (weight == nullptr) return false; + auto wshape = weight->shape; + if (param->kernel_size.defined()) { + CHECK_EQ(param->kernel_size.size(), 2); + // check the size + CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) && + reporter->AssertEQ(param->kernel_size[1], wshape[3])) + << "DeformableConv2D: shape of weight is inconsistent with kernel_size, " + << " kernel_size=" << param->kernel_size + << " wshape=" << wshape; + } + if (param->channels.defined()) { + CHECK(reporter->AssertEQ(param->channels, wshape[0])) + << "DeformableConv2D: shape of weight is inconsistent with channels, " + << " channels=" << param->channels + << " wshape=" << wshape; + } + CHECK(reporter->AssertEQ(data->shape[1] / param->groups, wshape[1])); + channels = wshape[0]; + ksize_y = wshape[2]; + ksize_x = wshape[3]; + dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; + } + // dilation + Array oshape({data->shape[0], channels, 0, 0}); + + oshape.Set(2, (data->shape[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1); + oshape.Set(3, (data->shape[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1); + DataType out_dtype = param->out_dtype; + + // infer offset shape + Array offset_shape({data->shape[0], 2 * ksize_y * ksize_x * param->deformable_groups, + oshape[2], oshape[3]}); + reporter->Assign(types[1], TensorTypeNode::make(offset_shape, data->dtype)); + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + + reporter->Assign(types[3], TensorTypeNode::make(oshape, out_dtype)); + return true; +} + + +TVM_REGISTER_NODE_TYPE(DeformableConv2DAttrs); + +RELAY_REGISTER_OP("nn.deformable_conv2d") + .describe(R"code(Compute 2-D deformable convolution on 4-D input. 
+The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 + +For 2-D deformable convolution, the shapes are +- **data**: (batch_size, channel, height, width) +- **offset**: (batch_size, deformable_groups * kernel[0] * kernel[1] * 2, out_height, out_width) +- **weight**: (num_filter, channel, kernel[0], kernel[1]) +- **out**: (batch_size, num_filter, out_height, out_width). + +If `deformable_groups` is larger than 1, denoted by *dg*, then split the +input `offset` evenly into *dg* parts along the channel axis, and also evenly split `out` +evenly into *dg* parts along the channel axis. Next compute the deformable convolution, apply the +*i*-th part of the offset part on the *i*-th out. + +If `groups` is larger than 1, denoted by *g*, then split the input `data` evenly into *g* parts +along the channel axis, and also evenly split `weight` along the first dimension. Next compute +the convolution on the *i*-th part of the data with the *i*-th weight part. The output is obtained +by concating all the *g* results. +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.DeformableConv2D") +.set_num_inputs(3) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("offset", "Tensor", "The offset tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(5) +.add_type_rel("DeformableConv2D", DeformableConv2DRel); + +// Positional relay function to create deformable_conv2d operator +// used by frontend FFI. +Expr MakeDeformableConv2D(Expr data, + Expr offset, + Expr weight, + Array strides, + Array padding, + Array dilation, + int deformable_groups, + int groups, + int channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + auto attrs = make_node(); + attrs->strides = strides; + attrs->padding = padding; + attrs->dilation = dilation; + attrs->deformable_groups = deformable_groups; + attrs->groups = groups; + attrs->channels = channels; + attrs->kernel_size = kernel_size; + attrs->data_layout = data_layout; + attrs->kernel_layout = kernel_layout; + attrs->out_layout = out_layout; + attrs->out_dtype = out_dtype; + static const Op& op = Op::Get("nn.deformable_conv2d"); + return CallNode::make(op, {data, offset, weight}, Attrs{attrs}, {}); +} + +TVM_REGISTER_API("relay.op.nn._make.deformable_conv2d") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeDeformableConv2D, args, rv); + }); + + } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 59f68d9d8880a..526c5378289bb 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -664,7 +664,7 @@ bool BatchMatmulRel(const Array& types, const auto* x = types[0].as(); const auto* y = types[1].as(); if (x == nullptr || y == nullptr) return false; - if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK (x->shape.size() == 3 && y->shape.size() == 3); CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) << "BatchDot: batch dimension doesn't match, " << " x shape=" << x->shape diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index a0ea8f2e60a36..456800255f379 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -710,9 +710,19 @@ bool TakeRel(const Array& types, // `types` contains: [data, indices, result] CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); - CHECK(data != nullptr); + if (data == nullptr) { + 
CHECK(types[0].as()) + << "must be tensor type or incomplete type"; + return false; + } + const auto* indices = types[1].as(); - CHECK(indices != nullptr); + if (indices == nullptr) { + CHECK(types[1].as()) + << "must be tensor type or incomplete type"; + return true; + } + const auto param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc index 0d2677e11c67f..091e7cf836033 100644 --- a/src/relay/pass/dead_code.cc +++ b/src/relay/pass/dead_code.cc @@ -8,105 +8,364 @@ * The algorithm is implemented by two visitor: * CalcDep turn an expr into a dependency graph of expr, * GenLet turn the dependency graph into a let list, taking only the used value. + * + * Also, Dead Code Eliminator has to take into account of effect - + * Call to foreign function should not be eliminated. + * Write to reference should not be eliminated if that reference is used. + * + * To do this we implement a simple escape analysis. + * We abstract Reference Value point to StoreId. + * Each RefCreate get a unique StoreId, + * And also assign each parameter a unique StoreId (as they might has/contain Ref). + * We then create a map of Expr -> Set StoreId, which record what StoreId Expr might depend on. + * The map is ran until a Fixpoint (it will terminate as there are finite StoreId.). + * The StoreId inside the inputs and the body are all the StoreId that is alive, + * and effect to other StoreId can be removed. + * + * We choose to implement StoreId as Expr for simplicity. + * + * Whenever a function is called, or a reference is written into, + * We make the set of reference inside depend on that call/write. */ #include #include +#include #include "let_list.h" namespace tvm { namespace relay { -bool IsBoolLit(const Expr& e, bool b) { - if (const ConstantNode* c = e.as()) { - if (c->is_scalar()) { - auto dt = c->tensor_type()->dtype; - if (dt == Bool()) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == UInt(8)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == UInt(16)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == UInt(32)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == UInt(64)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == Int(8)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == Int(16)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == Int(32)) { - return *reinterpret_cast(c->data->data) == b; - } else if (dt == Int(64)) { - return *reinterpret_cast(c->data->data) == b; +using Sid = Expr; +using SidSet = std::unordered_set; +using ExprSet = std::unordered_set; +template +using ExprMap = std::unordered_map; +template +using VarMap = std::unordered_map; +using VarSet = std::unordered_set; + +struct EscapeAnalysis : ExprFunctor, + PatternFunctor { + ExprMap map_; + SidSet live_sid_; + ExprSet root_expr_; + bool converge = false; + bool HasEffect(const Expr& e) { + struct EffectVisitor : ExprVisitor { + EscapeAnalysis* ea_; + explicit EffectVisitor(EscapeAnalysis* ea) : ea_(ea) { } + bool has_effect = false; + void Touch(const Expr& e) { + for (const Sid& s: ea_->Get(e)) { + has_effect |= (ea_->live_sid_.count(s) > 0); + if (ea_->live_sid_.count(s) > 0) { + std::cout << "HAS EFFECT" << std::endl; + return; + } + } + } + void VisitExpr_(const RefReadNode* op) final { + Touch(op->ref); + VisitExpr(op->ref); + } + void VisitExpr_(const RefWriteNode* op) final { + Touch(op->ref); + VisitExpr(op->ref); + 
VisitExpr(op->value); + } + void VisitExpr_(const CallNode* op) final { + // The args contain same sid as op, so no need to touch them. + Touch(op->op); + VisitExpr(op->op); + for (const Expr& arg: op->args) { + VisitExpr(arg); + } + } + void VisitExpr_(const FunctionNode* op) final { } + }; + std::cout << "CHECK EFFECT:" << e << std::endl; + EffectVisitor ev(this); + ev(e); + return ev.has_effect; + } + explicit EscapeAnalysis(const Expr& e) { + for (const Var& v: FreeVars(e)) { + AllocRoot(v); + } + while (!converge) { + converge = true; + Analysis(e); + } + Alive(e); + for (const Expr& r: root_expr_) { + Alive(r); + } + while (!converge) { + converge = true; + std::vector live_sid_old; + for (const Sid& s: live_sid_) { + live_sid_old.push_back(s); + } + for (const Sid& s: live_sid_old) { + Alive(s); } } } - return false; -} + void Alive(const Expr& e) { + for (const Sid& s: Get(e)) { + if (live_sid_.count(s) == 0) { + converge = false; + live_sid_.insert(s); + } + } + } + void Analysis(const Expr& e) { + VisitExpr(e, e); + } + ExprSet& Get(const Expr& e) { + if (map_.count(e) == 0) { + map_.insert({e, ExprSet()}); + } + return map_.at(e); + } + std::vector Range(const Expr& e) { + std::vector ret; + for (const auto& x: Get(e)) { + ret.push_back(x); + } + return ret; + } + void Insert(const Expr& from, const Expr& to) { + ExprSet& x = Get(from); + if (x.count(to) == 0) { + converge = false; + x.insert(to); + } + } + void Join(const Expr& from, const Expr& to) { + for (const Expr& e: Range(to)) { + Insert(from, e); + } + } + void Write(const Expr& from, const Expr& to) { + for (const Expr& e: Range(from)) { + Join(e, to); + } + } + void Alloc(const Expr& e) { + Insert(e, e); + } + void Root(const Expr& e) { + root_expr_.insert(e); + } + void AllocRoot(const Expr& e) { + Alloc(e); + Root(e); + } + void Depend(const Expr& val, const Expr& on) { + Analysis(on); + Join(val, on); + } + void VisitExpr_(const RefCreateNode* op, const Expr& e) final { + AllocRoot(e); + Depend(e, op->value); + } + void VisitExpr_(const RefWriteNode* op, const Expr& e) final { + Write(e, op->ref); + Analysis(op->ref); + Analysis(op->value); + } + void VisitExpr_(const FunctionNode* op, const Expr& e) final { + for (const Var& v: op->params) { + AllocRoot(v); + } + Root(op->body); + Depend(e, op->body); + } + void VisitExpr_(const CallNode* op, const Expr& e) final { + std::vector exprs; + Depend(e, op->op); + exprs.push_back(op->op); + for (const Expr& arg: op->args) { + Depend(e, arg); + exprs.push_back(arg); + } + for (size_t i = 0; i < exprs.size(); ++i) { + for (size_t j = i + 1; j < exprs.size(); ++j) { + Write(exprs[i], exprs[j]); + Write(exprs[j], exprs[i]); + } + } + } + void RecordVar(const Var& v) { + Get(v); + } + void VisitExpr_(const LetNode* op, const Expr& e) final { + RecordVar(op->var); + Depend(op->var, op->value); + Depend(e, op->body); + } + // From here on the uninteresting case: just declare Depend on children + void VisitExpr_(const VarNode* op, const Expr& e) final { + CHECK_GT(map_.count(GetRef(op)), 0); + } + void VisitExpr_(const ConstructorNode* op, const Expr& e) final { } + void VisitExpr_(const OpNode* op, const Expr& e) final { + // TODO(@M.K.): handle stateful op + } + void VisitExpr_(const ConstantNode* op, const Expr& e) final { } + void VisitExpr_(const GlobalVarNode* op, const Expr& e) final { } + void VisitExpr_(const MatchNode* op, const Expr& e) final { + Depend(e, op->data); + for (const Clause& c: op->clauses) { + VisitPattern(c->lhs, op->data); + Depend(e, c->rhs); + } + } 
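[Editor's aside: illustrative sketch, not part of the patch. The escape analysis above propagates "may depend on this StoreId" sets along dependency edges until nothing changes, and terminates because the number of StoreIds is finite. The standalone C++ below shows that fixpoint pattern on a toy points-to map; all names are ad hoc, not TVM API.]

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Each expression starts with the StoreIds it allocates itself.
  std::map<std::string, std::set<std::string>> may_touch = {
      {"r", {"sid0"}}, {"x", {}}, {"y", {}}};
  // "a depends on b": a's set absorbs b's set (e.g. x = ref_read(r); y = x).
  std::vector<std::pair<std::string, std::string>> deps = {{"x", "r"}, {"y", "x"}};

  bool changed = true;
  while (changed) {  // finitely many StoreIds, so the loop terminates
    changed = false;
    for (const auto& d : deps) {
      for (const auto& sid : may_touch[d.second]) {
        changed |= may_touch[d.first].insert(sid).second;
      }
    }
  }
  for (const auto& sid : may_touch["y"]) {
    std::printf("y may touch %s\n", sid.c_str());  // prints: y may touch sid0
  }
  return 0;
}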
+ void VisitPattern_(const PatternWildcardNode* op, const Expr& e) final { } + void VisitPattern_(const PatternVarNode* op, const Expr& e) final { + Depend(op->var, e); + } + void VisitPattern_(const PatternConstructorNode* op, const Expr& e) final { + for (const Pattern& pat: op->patterns) { + VisitPattern(pat, e); + } + } + void VisitExpr_(const RefReadNode* op, const Expr& e) final { + Depend(e, op->ref); + } + void VisitExpr_(const TupleNode* op, const Expr& e) final { + for (const Expr& c: op->fields) { + Depend(e, c); + } + } + void VisitExpr_(const TupleGetItemNode* op, const Expr& e) final { + Depend(e, op->tuple); + } + void VisitExpr_(const IfNode* op, const Expr& e) final { + Depend(e, op->cond); + Depend(e, op->true_branch); + Depend(e, op->false_branch); + } +}; + // calculate the dependency graph from expression -class CalcDep : private ExprMutator { +class CalcDep : private ExprVisitor { public: - static Expr Eliminate(const Expr& e) { - CalcDep cd; - auto res = cd(e); - GenLet gl(cd.var_map_); - gl(res); - return gl.lets_.Get(res); + explicit CalcDep(const Expr& v) { + VisitExpr(v); + return; + count_ = false; + while (!dead_worklist_.empty()) { + Var dead = *(dead_worklist_.begin()); + dead_worklist_.erase(dead); + CHECK_EQ(use_map_[dead], 0); + if (expr_map_.count(dead) > 0) { + LetRec([&]() { VisitExpr(expr_map_[dead]); }, dead); + } + } + } + + bool Used(const Var& v) { + return use_map_[v] > 0; + } + + bool HasLet(const Var& v) { + return (use_map_[v] > 1 || (use_map_[v] != 0 && letrec_set_.count(v) != 0)); + } + + Expr Map(const Var& v) { + return expr_map_.count(v) == 0 ? Expr(v) : expr_map_[v]; } private: - using VarMap = std::unordered_map; - VarMap var_map_; - - Expr VisitExpr_(const IfNode* i) final { - auto cond = VisitExpr(i->cond); - if (IsBoolLit(cond, true)) { - return Eliminate(i->true_branch); - } else if (IsBoolLit(cond, false)) { - return Eliminate(i->false_branch); - } else { - return IfNode::make(cond, Eliminate(i->true_branch), Eliminate(i->false_branch)); + VarMap expr_map_; + VarMap use_map_; + VarSet letrec_set_; + bool count_ = true; + VarSet dead_worklist_; + VarSet current_letrec_; + + void LetRec(const std::function& func, const Var& v) { + current_letrec_.insert(v); + func(); + current_letrec_.erase(v); + } + + void VisitExpr_(const LetNode* l) final { + if (count_) { + CHECK_EQ(expr_map_.count(l->var), 0); + CHECK_EQ(use_map_.count(l->var), 0); + expr_map_[l->var] = l->value; + use_map_[l->var] = 0; + dead_worklist_.insert(l->var); + LetRec([&]() { VisitExpr(l->value); }, l->var); } + VisitExpr(l->body); } - Expr VisitExpr_(const LetNode* l) final { - var_map_[l->var] = Eliminate(l->value); - return VisitExpr(l->body); + void VisitExpr(const Expr& e) final { + ExprFunctor::VisitExpr(e); } - Expr VisitExpr_(const FunctionNode* f) final { - return FunctionNode::make(f->params, - Eliminate(f->body), - f->ret_type, - f->type_params); + void VisitExpr_(const VarNode* v) final { + Var var = GetRef(v); + if (expr_map_.count(var) == 0) { + return; + } + if (current_letrec_.count(var) == 0) { + if (count_) { + use_map_[var] += 1; + dead_worklist_.erase(var); + } else { + CHECK_GT(use_map_[var], 0) << var; + use_map_[var] -= 1; + if (use_map_[var] == 0) { + dead_worklist_.insert(var); + } + } + } else { + letrec_set_.insert(var); + } } +}; + +class Eliminator : private ExprMutator { + public: + static Expr Eliminate(const Expr& e) { + Eliminator elm(e); + return elm(e); + } + private: + EscapeAnalysis ea_; + CalcDep cd_; + explicit Eliminator(const 
Expr& e) : ea_(e), cd_(e) { } + friend CalcDep; - // generate the let list from dependency graph - class GenLet : private ExprVisitor { - private: - LetList lets_; - VarMap var_map_; - explicit GenLet(const VarMap& var_map) : var_map_(var_map) { } - friend CalcDep; + Expr VisitExpr_(const VarNode* op) final { + Var v = GetRef(op); + return v; + std::cout << v << " map to " << cd_.Map(v) << std::endl; + return (cd_.Used(v) || ea_.HasEffect(cd_.Map(v))) ? v : cd_.Map(v); + } - void VisitExpr_(const VarNode* vnode) final { - Var v = GetRef(vnode); - auto it = var_map_.find(v); - if (it != var_map_.end()) { - Expr expr = it->second; - var_map_.erase(it); - // erase before visit to handle letrec - VisitExpr(expr); - // visit before push back so the dependency of dependency is before the dependency - lets_.Push(v, expr); - } + Expr VisitExpr_(const LetNode* op) final { + Var v = op->var; + CHECK_EQ(cd_.Map(v), op->value); + if (cd_.Used(v) || ea_.HasEffect(op->value)) { + return LetNode::make(v, VisitExpr(op->value), VisitExpr(op->body)); + } else { + return VisitExpr(op->body); } - }; + } + //Expr VisitExpr_(const IfNode* op) final { + // return IfNode::make(op->cond, Descend(op->true_branch), Descend(op->false_branch)); + //} }; Expr DeadCodeElimination(const Expr& e) { - return CalcDep::Eliminate(e); + return Eliminator::Eliminate(e); } TVM_REGISTER_API("relay._ir_pass.dead_code_elimination") diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 66ff9caf4ae41..75f89e1c32efd 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -261,9 +261,28 @@ class IndexedForwardGraph::Creator : private ExprVisitor { } void VisitExpr_(const TupleGetItemNode* op) final { - CHECK(graph_.node_map.count(op)); - Node* node = graph_.node_map.at(op); - this->Update(op->tuple, node, kOpaque); + auto tuple_type = op->tuple->checked_type().as(); + CHECK(tuple_type); + // when TVM lowers a fused function, it expects all arguments to be a Tensor or + // a tuple containing only Tensors. But this tuple may contain a reference or + // another tuple. To avoid modifying codegen logic, we do not allow fusing through this node + // if the tuple contains such non Tensor fields. However, all fields will be recursively + // visited via call to ExprVisitor::VisitExpr_(op) below and corresponding visitor methods. + bool has_non_tensor = false; + for (auto ty : tuple_type->fields) { + if (!ty.as()) { + has_non_tensor = true; + break; + } + } + if (has_non_tensor) { + this->Update(op->tuple, nullptr, kOpaque); + } else { + CHECK(graph_.node_map.count(op)); + Node* node = graph_.node_map.at(op); + node->pattern = kInjective; + this->Update(op->tuple, node, kInjective); + } ExprVisitor::VisitExpr_(op); this->AddNode(op); } @@ -809,6 +828,23 @@ class FuseMutator : private ExprMutator { return TupleNode::make(new_fields); } + Expr VisitExpr_(const TupleGetItemNode* tuple_get) { + auto* ret_group = gmap_.at(tuple_get)->FindRoot(); + auto new_tuple = GetNewArguments({tuple_get->tuple}, ret_group)[0]; + auto new_node = TupleGetItemNode::make(new_tuple, tuple_get->index); + if (ret_group == gmap_.at(tuple_get)) { + if (gmap_.at(tuple_get->tuple.get())->FindRoot() != ret_group) { + // Isolated. This case occurs when tuple is created by an Opaque op + // e.g. 
multibox_transform_loc + return ExprMutator::VisitExpr_(tuple_get); + } + // A new function whose output is a tuple field access + return MakeNewFunction(ret_group, tuple_get->checked_type(), new_node); + } + // This is an intermediate node in the group + return new_node; + } + Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) { const GroupInfo& ginfo = ginfo_[group]; auto func = FunctionNode::make(ginfo.params, body, ret_type, {}); @@ -849,6 +885,21 @@ class FuseMutator : private ExprMutator { Expr FuseOps(const Expr& expr, int fuse_opt_level) { + struct DenseToBatchMatMul : ExprMutator { + Expr VisitExpr_(const CallNode* op) final { + if (op->op.as()) { + Op o = Downcast(op->op); + if (o.same_as(Op::Get("nn.dense"))) { + CHECK_EQ(op->args.size(), 2); + auto tt = Downcast(op->args[0]->checked_type()); + if (tt->shape.size() == 3) { + return CallNode::make(Op::Get("nn.batch_matmul"), {VisitExpr(op->args[0]), VisitExpr(op->args[1])}); + } + } + } + return ExprMutator::VisitExpr_(op); + } + }; // First we convert all chains of fusable ops into // abstracted functions which we mark as primtive // then we convert these primtive functions into diff --git a/src/relay/pass/let_list.h b/src/relay/pass/let_list.h index 3afbcba96ae60..27afa3bfab269 100644 --- a/src/relay/pass/let_list.h +++ b/src/relay/pass/let_list.h @@ -62,7 +62,7 @@ class LetList { * \return a Var that hold the inserted expr. */ Var Push(Expr expr) { - return Push(IncompleteTypeNode::make(Kind::kType), expr); + return Push(Type(), expr); } /*! diff --git a/src/relay/pass/partial_eval.cc b/src/relay/pass/partial_eval.cc new file mode 100644 index 0000000000000..63ad11d2e1182 --- /dev/null +++ b/src/relay/pass/partial_eval.cc @@ -0,0 +1,1057 @@ +/*! + * Copyright (c) 2018 by Contributors + * + * \file partial_eval.cc + * + * \brief Perform known computation in compile time. + * + * The partial evaluator try to do computation at compile time, + * so it can generate code that do less work. + * Additionally, it might open more chance for further optimization, + * since the high level, structural part of the code (closure, reference, control flow) + * might get partially evaluated away, and the subsequent optimization (for example, kernel fusion) + * can reason across those structural code as it got removed. + * In the extreme case, partial evaluation can even turn the whole program + * into pure first order computation with no control flow. + * In such a case, we can compile the whole computation onto SIMD Instruction/GPU/FPGA, + * and get huge speedup. + * + * It works by making the following modifications to the standard relay interpreter: + * + * 0: The values become partially static value. + * Since we cannot know the value of every term at compile time, + * Term might get partially evaluated to 'Unknown Value'. + * Every partially static value is, hence, + * a static fragment that might not be there (partially static), + * and a dynamic fragment that is semantically equivalent to the original term, + * so the unknown part will be computed at runtime, using the dynamic fragment. + * + * 1: The interpreter holds a LetList, which preserves A Normal Form for the generated code. + * More specifically, we require that all dynamic is an atom. + * This avoids code duplication (which is both inefficient and incorrect), as atom has constant size + * and allow us to not handle capture-avoidance substitution (as atom has no binder). 
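[Editor's illustration, not part of the original comment: a minimal example, in Relay-style pseudocode, of the ANF discipline described above. If a residual (dynamic) value is the call f(%y) and it is needed twice, the evaluator pushes it onto the LetList once and refers to the bound atom,

    let %x = f(%y);
    %x * %x

rather than emitting f(%y) * f(%y), which would duplicate both the work and the effect of the call.]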
+ *
+ * 2: The map of References to partially static values is reified, as described below.
+ * Instead of a Reference having a mutable field, a Reference only has a unique identifier.
+ * There is a mutable mapping from id to partially static value, called the store.
+ * This allows us to roll back the store:
+ * when a path may or may not be executed (as in a conditional), we copy the store,
+ * recurse with the copy, and reinstate the original when the call returns,
+ * so that the effects of the computation are not preserved.
+ * We do this for if/else, for pattern matching, and for functions:
+ * when we see a function, we partially evaluate it with all arguments treated as dynamic,
+ * to generate an efficient dynamic fragment for that function.
+ *
+ * 3: The generated code reuses bindings (although they are not shadowed),
+ * so we have to deduplicate them.
+ *
+ * 4: In the generated code, multiple VarNodes might share the same Id.
+ * While this is permitted, most passes use NodeHash for Var,
+ * and having multiple VarNodes with the same Id breaks them.
+ * Thus we remap them to a single Id for now.
+ *
+ * The partial evaluator will also generate lots of dead code,
+ * so it is a good idea to feed the result through the dead code eliminator after partial evaluation.
+ *
+ * The partial evaluator makes several assumptions, so there is room for improvement:
+ *
+ * 0: The partial evaluator treats global variables as opaque.
+ * Doing PartialEval at the module level would solve this.
+ *
+ * 1: The partial evaluator assumes all functions terminate.
+ * We need to add a max_expand parameter that shrinks on every compile-time evaluation,
+ * to make sure PE does not loop forever.
+ * Additionally, we might add a termination analysis pass that lifts this requirement
+ * for functions the analysis finds terminating.
+ *
+ * 2: Every time an unknown effect happens, we clear the whole store.
+ * This is too conservative: if a local reference is created (and does not get passed outside),
+ * an unknown global function call or global reference write cannot modify it.
+ * We can pair PE with escape analysis/alias analysis.
+ *
+ * 3: We assume all unknown code has effects. Doing effect analysis can make the store more precise.
+ *
+ * 4: When doing pattern matching, we can simplify the match even in the dynamic case.
+ * Right now it is all or nothing: either a complete match, or the original dynamic code.
+ * Instead, we can build a match tree, pair it with the data, and evaluate it to a normal form.
+ * We can then reify the result.
+ *
+ * 5: Every time a function is called, its code gets expanded and partially evaluated.
+ * We can do a binding-time analysis to cache the result and avoid re-partial-evaluation.
+ *
+ * These assumptions do not affect the correctness of the algorithm, however.
+ */
+#include
+#include
+#include
+#include
+#include "../ir/type_functor.h"
+#include "pass_util.h"
+#include "let_list.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+Expr PostProcess(const Expr&);
+/*! \brief The base container type of Relay values.
*/ +class StaticNode : public RelayNode { + public: + static constexpr const char* _type_key = "relay.Value"; + TVM_DECLARE_BASE_NODE_INFO(ValueNode, RelayNode); +}; + +class Static : public NodeRef { + public: + Static() {} + explicit Static(NodePtr n) : NodeRef(n) {} + const ValueNode* operator->() const { + return static_cast(node_.get()); + } + + using ContainerType = StaticNode; +}; + +using Time = size_t; + +struct PStaticNode : Node { + static Time time() { + static Time time_ = 0; + Time ret = time_; + time_++; + return ret; + } + Static pstatic; // may be null + Expr dynamic; + Time created_time; + PStaticNode(const Static& pstatic, const Expr& dynamic) : + pstatic(pstatic), dynamic(dynamic), created_time(time()) { } + explicit PStaticNode(const Expr& dynamic) : PStaticNode(Static(), dynamic) { } + TVM_DECLARE_NODE_TYPE_INFO(PStaticNode, Node); +}; + +RELAY_DEFINE_NODE_REF(PStatic, PStaticNode, NodeRef); + +struct STupleNode : StaticNode { + std::vector fields; + explicit STupleNode(const std::vector& fields) : fields(fields) { } + TVM_DECLARE_NODE_TYPE_INFO(STupleNode, StaticNode); +}; + +RELAY_DEFINE_NODE_REF(STuple, STupleNode, Value); + +Static MkSTuple(const std::vector& fields) { + return Static(make_node(fields)); +} + +struct STensorNode : StaticNode { + runtime::NDArray data; + explicit STensorNode(const NDArray& data) : data(data) { } + TVM_DECLARE_NODE_TYPE_INFO(STupleNode, StaticNode); +}; + +RELAY_DEFINE_NODE_REF(STensor, STensorNode, Value); + +Static MkSTensor(const NDArray& data) { + return Static(make_node(data)); +} + +struct SConstructorNode : StaticNode { + Constructor constructor; + std::vector fields; + SConstructorNode(const Constructor& constructor, const std::vector& fields) : + constructor(constructor), fields(fields) { } + TVM_DECLARE_NODE_TYPE_INFO(SConstructorNode, StaticNode); +}; + +RELAY_DEFINE_NODE_REF(SConstructor, SConstructorNode, Value); + +Static MkSConstructor(const Constructor& constructor, const std::vector& fields) { + return Static(make_node(constructor, fields)); +} + +struct SRefNode : StaticNode { + // we will use the address as the guid for hashing + TVM_DECLARE_NODE_TYPE_INFO(SRefNode, StaticNode); +}; + +RELAY_DEFINE_NODE_REF(SRef, SRefNode, Value); + +Static MkSRef() { + return Static(make_node()); +} + +using Func = std::function&, + const Attrs&, + const Array&, + LetList*)>; + +struct SFuncNode : StaticNode { + Func func; + explicit SFuncNode(const Func& func) : func(func) { } + TVM_DECLARE_NODE_TYPE_INFO(SFuncNode, StaticNode); +}; + +RELAY_DEFINE_NODE_REF(SFunc, SFuncNode, Value); + +Static MkSFunc(const Func& func) { + return Static(make_node(func)); +} + +/*! + * \brief A stack frame in the Relay interpreter. + * + * Contains a mapping from relay::Var to relay::Value. + */ +struct Frame { + /*! \brief The set of local variables and arguments for the frame. 
*/ + std::unordered_map locals; + Frame() = default; +}; + +class Environment { + public: + Environment() : env_({Frame()}) { } + Environment(const Environment&) = delete; + + template + T Extend(const std::function& body) { + FrameContext fc(this); + return body(); + } + + void Insert(const Var& v, const PStatic& ps) { + CHECK(ps.defined()); + env_.back().locals[v] = ps; + } + + PStatic Lookup(const Var& v) { + auto rit = env_.rbegin(); + while (rit != env_.rend()) { + if (rit->locals.find(v) != rit->locals.end()) { + return rit->locals.find(v)->second; + } + ++rit; + } + LOG(FATAL) << "Unknown Variable: " << v << v.as(); + throw; + } + + private: + std::list env_; + + struct FrameContext { + Environment* env_; + explicit FrameContext(Environment* env) : env_(env) { + env_->env_.push_back(Frame()); + } + ~FrameContext() { + env_->env_.pop_back(); + } + }; +}; + +/*! + * \brief As our store require rollback, we implement it as a frame. + * every time we need to copy the store, a new frame is insert. + * every time we roll back, a frame is popped. + */ +struct StoreFrame { + std::unordered_map store; + /*! \brief on unknown effect, history_valid is set to true to signal above frame is outdated */ + bool history_valid = true; + explicit StoreFrame(const std::unordered_map& store) : store(store) { } + StoreFrame() = default; +}; + +class Store { + public: + Store() : store_({StoreFrame()}) { } + Store(const Store&) = delete; + + template + T Extend(const std::function& body) { + StoreFrameContext sfc(this); + return body(); + } + + void Insert(const SRefNode* r, const PStatic& ps) { + store_.back().store[r] = ps; + } + + // return null if not found + PStatic Lookup(const SRefNode* r) { + auto rit = store_.rbegin(); + while (rit != store_.rend()) { + if (!rit->history_valid) { + return PStatic(); + } + if (rit->store.find(r) != rit->store.end()) { + return rit->store.find(r)->second; + } + ++rit; + } + return PStatic(); + } + + void Invalidate() { + store_.back().history_valid = false; + } + + private: + std::list store_; + + struct StoreFrameContext { + Store* store_; + explicit StoreFrameContext(Store* store) : store_(store) { + store_->store_.push_back(StoreFrame()); + } + ~StoreFrameContext() { + store_->store_.pop_back(); + } + }; +}; + +PStatic HasStatic(const Static& stat, const Expr& dynamic) { + CHECK(stat.defined()); + return PStatic(make_node(stat, dynamic)); +} + +PStatic NoStatic(const Expr& dynamic) { + return PStatic(make_node(dynamic)); +} + +enum struct MatchStatus { + Match, NoMatch, Unknown +}; + +bool StatefulOp(const Expr& e) { + static auto op_stateful = Op::GetAttr("TOpIsStateful"); + struct StatefulOpVisitor : ExprVisitor { + bool stateful = false; + void VisitExpr_(const OpNode* op) { + stateful = stateful || op_stateful.get(GetRef(op), false); + } + }; + StatefulOpVisitor sov; + sov(e); + return sov.stateful; +} + +using FInterpreter = runtime::TypedPackedFunc; + +DLContext CPUContext() { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; +} + +FInterpreter CPUInterpreter() { + Target target = Target::create("llvm"); + // use a fresh build context + // in case we are already in a build context. 
+ BuildConfigContext fresh_build_ctx(build_config()); + + return CreateInterpreter(Module(nullptr), CPUContext(), target); +} + +bool IsAtomic(const Expr& e) { + return e.as() || e.as() || e.as() || e.as(); +} + +using FuncId = size_t; + +struct WithFuncId; + +struct WithFuncIdNode : Node { + FuncId fid; + WithFuncIdNode(FuncId fid) : fid(fid) { } + static constexpr const char* _type_key = "relay.WithFuncId"; + TVM_DECLARE_NODE_TYPE_INFO(WithFuncIdNode, Node); +}; + +RELAY_DEFINE_NODE_REF(WithFuncId, WithFuncIdNode, NodeRef); + +Annotate MkWithFuncId(const Expr& expr, FuncId fid) { + return AnnotateNode::make(expr, WithFuncId(make_node(fid))); +} + +Expr StripWithFuncId(const Expr& e); + +Expr DeDup(const Expr& e); + +Function AsFunc(const Expr& e) { + if (e.as()) { + return Downcast(e); + } else if (const AnnotateNode* a = e.as()) { + CHECK(a->annotation.as()); + return AsFunc(a->expr); + } else { + LOG(FATAL) << "Unknown case"; + throw; + } +} + +class PartialEvaluator : public ExprFunctor, + public PatternFunctor { + public: + PartialEvaluator(const tvm::Array& free_vars, + const Module& mod) : + mod_(mod) { + for (const Var& v : free_vars) { + env_.Insert(v, NoStatic(v)); + } + } + + size_t depth = 0; + PStatic VisitExpr(const Expr& e, LetList* ll) final { + PStatic ret = ExprFunctor::VisitExpr(e, ll); + CHECK(IsAtomic(ret->dynamic)) << ret->dynamic; + return ret; + } + + PStatic VisitExpr_(const ConstantNode* op, LetList* ll) final { + return HasStatic(MkSTensor(op->data.CopyTo(context_)), ll->Push(GetRef(op))); + } + + PStatic VisitExpr_(const TupleNode* op, LetList* ll) final { + std::vector value; + tvm::Array expr; + for (const Expr& e : op->fields) { + PStatic ps = VisitExpr(e, ll); + value.push_back(ps); + expr.push_back(ps->dynamic); + } + return HasStatic(MkSTuple(value), ll->Push(TupleNode::make(expr))); + } + + PStatic VisitExpr_(const TupleGetItemNode* op, LetList* ll) final { + PStatic ps = VisitExpr(op->tuple, ll); + if (ps->pstatic.defined()) { + return Downcast(ps->pstatic)->fields[op->index]; + } else { + return NoStatic(ll->Push(TupleGetItemNode::make(ps->dynamic, op->index))); + } + } + + PStatic VisitExpr_(const VarNode* op, LetList* ll) final { + return env_.Lookup(GetRef(op)); + } + + PStatic VisitExpr_(const GlobalVarNode* op, LetList* ll) final { + GlobalVar gv = GetRef(op); + if (gv_map_.count(gv) == 0) { + if (mod_.defined()) { + Function func = mod_->Lookup(gv); + InitializeFuncId(func); + Func f = VisitFuncStatic(func, gv); + gv_map_.insert({gv, HasStatic(MkSFunc(f), gv)}); + func = AsFunc(PostProcess(VisitFuncDynamic(func, f))); + mod_->Update(gv, func); + } else { + gv_map_.insert({gv, NoStatic(gv)}); + } + } + return gv_map_.at(gv); + } + + PStatic VisitExpr_(const LetNode* op, LetList* ll) final { + env_.Insert(op->var, VisitExpr(op->value, ll)); + return VisitExpr(op->body, ll); + } + + PStatic VisitExpr_(const IfNode* op, LetList* ll) final { + PStatic c = VisitExpr(op->cond, ll); + if (c->pstatic.defined()) { + NDArray cpu_array = Downcast(c->pstatic)->data.CopyTo(CPUContext()); + CHECK_EQ(TVMType2Type(cpu_array->dtype), Bool()); + if (reinterpret_cast(cpu_array->data)[0]) { + return VisitExpr(op->true_branch, ll); + } else { + return VisitExpr(op->false_branch, ll); + } + } else { + Expr t = store_.Extend([&]() { + return LetList::With([&](LetList* ll) { + return VisitExpr(op->true_branch, ll)->dynamic; + }); + }); + Expr f = store_.Extend([&]() { + return LetList::With([&](LetList* ll) { + return VisitExpr(op->false_branch, ll)->dynamic; + }); + }); 
+ store_.Invalidate(); + return NoStatic(ll->Push(IfNode::make(c->dynamic, t, f))); + } + } + + PStatic VisitExpr_(const RefCreateNode* op, LetList* ll) final { + PStatic ps = VisitExpr(op->value, ll); + Static r = MkSRef(); + store_.Insert(r.as(), ps); + return HasStatic(r, ll->Push(RefCreateNode::make(ps->dynamic))); + } + + PStatic VisitExpr_(const RefWriteNode* op, LetList* ll) final { + PStatic r = VisitExpr(op->ref, ll); + PStatic v = VisitExpr(op->value, ll); + if (r->pstatic.defined()) { + store_.Insert(r->pstatic.as(), v); + } else { + store_.Invalidate(); + } + return HasStatic(MkSTuple({}), ll->Push(RefWriteNode::make(r->dynamic, v->dynamic))); + } + + PStatic VisitExpr_(const RefReadNode* op, LetList* ll) final { + PStatic r = VisitExpr(op->ref, ll); + if (r->pstatic.defined()) { + PStatic ret = store_.Lookup(r->pstatic.as()); + if (ret) { + return ret; + } + } + return NoStatic(ll->Push(RefReadNode::make(r->dynamic))); + } + + PStatic VisitExpr_(const CallNode* op, LetList* ll) final { + PStatic f = VisitExpr(op->op, ll); + std::vector x; + tvm::Array x_dyn; + for (const Expr& e : op->args) { + PStatic ps = VisitExpr(e, ll); + x.push_back(ps); + x_dyn.push_back(ps->dynamic); + } + if (f->pstatic.defined()) { + return Downcast(f->pstatic)->func(x, op->attrs, op->type_args, ll); + } else { + store_.Invalidate(); + return NoStatic(ll->Push(CallNode::make(f->dynamic, x_dyn, op->attrs, op->type_args))); + } + } + + PStatic VisitExpr_(const AnnotateNode* op, LetList* ll) final { + CHECK(op->annotation.as()); + return VisitExpr(op->expr, ll); + } + + struct TimeFrame { + PartialEvaluator* pe_; + FuncId fid_; + std::vector