[CodeGen][CUDA] Vectorization for intrinsics

- This allows to emit vectorized loads/stores for CUDA math intrinsics. - Fixed a few intrinsics that should be lowered as CUDAMath not CUDAFastMath ones.
apache · Mar 19, 2020 · b3be9bd · b3be9bd
1 parent e1ebf06
commit b3be9bd
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 15 deletions.
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
@@ -237,6 +237,33 @@ class CodeGenC :
     return volatile_buf_.count(buf_var) != 0;
   }
 
+  /*!
+   * \brief A RAII utility class for emitting code in a scoped region.
+   */
+  class EnterScopeRAII {
+    // The codegen context.
+    CodeGenC& cg;
+
+    // The code stream being emitted.
+    std::ostringstream& stream;
+
+    // The new scope level.
+    int scope;
+
+   public:
+    EnterScopeRAII(CodeGenC& cg, std::ostringstream& stream)
+      : cg(cg), stream(stream) {
+      cg.PrintIndent();
+      stream << "{\n";
+      scope = cg.BeginScope();
+    }
+    ~EnterScopeRAII() {
+      cg.EndScope(scope);
+      cg.PrintIndent();
+      stream << "}\n";
+    }
+  };
+
  private:
   /*! \brief whether to print in SSA form */
   bool print_ssa_form_{false};

diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
@@ -418,6 +418,56 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) {
       this->PrintExpr(op->args[i * 2 + 1], os);
       os << "]" << ((i < 3) ? ", ": ")");
     }
+  } else if ((op->call_type == CallNode::PureIntrinsic ||
+              op->call_type == CallNode::PureExtern) &&
+              op->dtype.is_vector()) {
+    //
+    // Emit an unsupported vector call
+    //
+    // v = intrin_f((float4*)A[0], (float4*)B[0])
+    //
+    // as
+    //
+    // float4 __ret;
+    // {
+    //   float4 __arg0 = ((float4*)A)[0];
+    //   float4 __arg1 = ((float4*)B)[0];
+    //   __ret.x = intrin_f(__arg0.x, __arg1.x);
+    //   __ret.y = intrin_f(__arg0.y, __arg1.y);
+    //   __ret.z = intrin_f(__arg0.z, __arg1.z);
+    //   __ret.w = intrin_f(__arg0.w, __arg1.w);
+    // }
+    // v = __ret;
+    //
+    // Declare the result vector.
+    std::string sret = GetUniqueName("_");
+    this->PrintIndent();
+    this->PrintType(op->dtype, stream);
+    stream << ' ' << sret << ";\n";
+    {
+      EnterScopeRAII scope(*this, stream);
+
+      // Load arguments.
+      std::vector<std::string> sargs;
+      for (size_t i = 0; i < op->args.size(); ++i) {
+        std::string val = SSAGetID(PrintExpr(op->args[i]), op->args[i].dtype());
+        sargs.push_back(std::move(val));
+      }
+
+      // Emit a scalar call for each lane.
+      for (int i = 0; i < op->dtype.lanes(); ++i) {
+        std::ostringstream scall;
+        scall << op->name << "(";
+        for (size_t j = 0; j < op->args.size(); ++j) {
+          if (j > 0)
+            scall << ',';
+          PrintVecElemLoad(sargs[j], op->args[j].dtype(), i, scall);
+        }
+        scall << ")";
+        PrintVecElemStore(sret, op->dtype, i, scall.str());
+      }
+    }
+    os << sret;
   } else {
     CodeGenC::VisitExpr_(op, os);
   }

diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
@@ -29,14 +29,12 @@ namespace intrin {
 // Add float suffix to the intrinsics, CUDA fast math.
 struct CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1) {
-      if (t.is_float()) {
-        switch (t.bits()) {
-          case 64: return name;
-          case 32: return name + 'f';
-          case 16: return 'h' + name;
-          default: return "";
-        }
+    if (t.is_float()) {
+      switch (t.bits()) {
+        case 64: return name;
+        case 32: return name + 'f';
+        case 16: return 'h' + name;
+        default: return "";
       }
     }
     return "";
@@ -45,7 +43,7 @@ struct CUDAMath {
 
 struct CUDAFastMath : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
+    if (t.is_float() && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
       return CUDAMath::operator()(t, name);
@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath {
 
 struct CUDAFastMathTan : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float()) {
+    if (t.is_float()) {
         switch (t.bits()) {
           case 64: return name;
           // `__tanf` seems to produce some values too deviant from numpy tan version.
@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath {
 
 struct CUDAPopcount {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_uint()) {
+    if (t.is_uint()) {
       switch (t.bits()) {
         case 32: return "__popc";
         case 64: return "__popcll";
@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10")
 .set_body(DispatchExtern<CUDAFastMath>);
@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan")
 .set_body(DispatchExtern<CUDAMath>);

diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
@@ -348,6 +348,125 @@ def test_cuda_floordiv_with_vectorization():
         func(a_nd, b_nd)
         tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
 
+def sched(B):
+    s = te.create_schedule(B.op)
+    io, ii = s[B].split(s[B].op.axis[0], nparts=1)
+    iio, iii = s[B].split(ii, nparts=32)
+    _, iiii = s[B].split(iii, factor=4)
+    s[B].vectorize(iiii)
+    s[B].bind(io, bx)
+    s[B].bind(iio, tx)
+    return s
+
+def test_vectorized_intrin1():
+    test_funcs = [
+        (tvm.tir.floor, lambda x : np.floor(x)),
+        (tvm.tir.ceil,  lambda x : np.ceil(x)),
+        (tvm.tir.trunc, lambda x : np.trunc(x)),
+        (tvm.tir.abs,   lambda x : np.fabs(x)),
+        (tvm.tir.round, lambda x : np.round(x)),
+        (tvm.tir.exp,   lambda x : np.exp(x)),
+        (tvm.tir.exp2,  lambda x : np.exp2(x)),
+        (tvm.tir.exp10, lambda x : np.power(10,x)),
+        (tvm.tir.log,   lambda x : np.log(x)),
+        (tvm.tir.log2,  lambda x : np.log2(x)),
+        (tvm.tir.log10, lambda x : np.log10(x)),
+        (tvm.tir.tan,   lambda x : np.tan(x)),
+        (tvm.tir.cos,   lambda x : np.cos(x)),
+        (tvm.tir.cosh,  lambda x : np.cosh(x)),
+        (tvm.tir.sin,   lambda x : np.sin(x)),
+        (tvm.tir.sinh,  lambda x : np.sinh(x)),
+        (tvm.tir.atan,  lambda x : np.arctan(x)),
+        (tvm.tir.tanh,  lambda x : np.tanh(x)),
+        (tvm.tir.sqrt,  lambda x : np.sqrt(x)),
+    ]
+    def run_test(tvm_intrin, np_func, dtype):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because gpu does not have fp16 support")
+            return
+        # set of intrinsics does not support fp16 yet.
+        skip_set = {tvm.tir.abs,
+                    tvm.tir.round,
+                    tvm.tir.tan,
+                    tvm.tir.atan,
+                    tvm.tir.tanh,
+                    tvm.tir.cosh,
+                    tvm.tir.sinh}
+        if dtype == "float16" and tvm_intrin in skip_set:
+            print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
+
+    for func in test_funcs:
+        run_test(*func, "float32")
+        run_test(*func, "float16")
+
+def test_vectorized_intrin2(dtype="float32"):
+    c2 = tvm.tir.const(2, dtype=dtype)
+    test_funcs = [
+        (tvm.tir.power, lambda x : np.power(x, 2.0)),
+        (tvm.tir.fmod,  lambda x : np.fmod(x, 2.0))
+    ]
+    def run_test(tvm_intrin, np_func):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()), atol=1e-3, rtol=1e-3)
+
+    for func in test_funcs:
+        run_test(*func)
+
+def test_vectorized_popcount():
+    def ref_popcount(x):
+        cnt = 0
+        while x:
+            x -= x & -x
+            cnt += 1
+        return cnt
+
+    def run_test(dtype):
+        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+
+        n = 128
+        A = te.placeholder((n,), dtype=dtype, name='A')
+        B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name='B')
+        s = sched(B)
+        f = tvm.build(s, [A, B], "cuda")
+        ctx = tvm.gpu(0)
+        a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx)
+        f(a, b)
+        ref = np.vectorize(ref_popcount)(a.asnumpy())
+        tvm.testing.assert_allclose(b.asnumpy(), ref)
+
+    run_test("uint32")
+    run_test("uint64")
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
@@ -359,4 +478,7 @@ def test_cuda_floordiv_with_vectorization():
     test_rfactor_predicates()
     test_cuda_const_float_to_half()
     test_cuda_reduction()
-    test_cuda_floordiv_with_vectorization()
+    test_cuda_floordiv_with_vectorization()
+    test_vectorized_intrin1()
+    test_vectorized_intrin2()
+    test_vectorized_popcount()