[CodeGen][CUDA] Vectorization for intrinsics (apache#5101)

- This allows to emit vectorized loads/stores for CUDA math intrinsics. - A few intrinsics should be lowered as CUDAMath not CUDAFastMath ones. - Fixed the code block identation.
trevor-m · Apr 16, 2020 · a764b60 · a764b60
1 parent 9daaf1e
commit a764b60
Show file tree

Hide file tree

Showing 4 changed files with 238 additions and 53 deletions.
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
@@ -257,6 +257,29 @@ class CodeGenC :
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const VarNode*, DataType> handle_data_type_;
 
+  /*!
+   * \brief A RAII utility class for emitting code in a scoped region.
+   */
+  class EnterScopeRAII {
+    // The codegen context.
+    CodeGenC* cg;
+
+    // The new scope level.
+    int scope;
+
+   public:
+    explicit EnterScopeRAII(CodeGenC* cg) : cg(cg) {
+      cg->PrintIndent();
+      cg->stream << "{\n";
+      scope = cg->BeginScope();
+    }
+    ~EnterScopeRAII() {
+      cg->EndScope(scope);
+      cg->PrintIndent();
+      cg->stream << "}\n";
+    }
+  };
+
  private:
   /*! \brief whether to print in SSA form */
   bool print_ssa_form_{false};

diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
@@ -24,6 +24,7 @@
 #include <tvm/runtime/registry.h>
 
 #include <cmath>
+#include <utility>
 #include <vector>
 #include <string>
 #include "literal/cuda_half_t.h"
@@ -235,25 +236,19 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 void CodeGenCUDA::PrintVecBinaryOp(
     const std::string& op, DataType t,
     PrimExpr lhs, PrimExpr rhs, std::ostream& os) {  // NOLINT(*)
-  // unpacking operations.
-  int lanes = t.lanes();
-
+  // Delcare the result.
+  std::string sret = GetUniqueName("_");
+  this->PrintIndent();
+  this->PrintType(t, stream);
+  stream << ' ' << sret << ";\n";
   {
-    // The assignment below introduces side-effect, and the resulting value cannot
-    // be reused across multiple expression, thus a new scope is needed
-    int vec_scope = BeginScope();
+    EnterScopeRAII scope(this);
 
-    // default: unpack into individual ops.
+    // Unpack into individual ops.
     std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
     std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
-    std::string sret = GetUniqueName("_");
-    {
-      // delcare type.
-      this->PrintIndent();
-      this->PrintType(t, stream);
-      stream << ' ' << sret << ";\n";
-    }
-    for (int i = 0; i < lanes; ++i) {
+
+    for (int i = 0, lanes = t.lanes(); i < lanes; ++i) {
       std::ostringstream value_temp;
       if (isalpha(op[0])) {
         value_temp << op << "(";
@@ -270,9 +265,8 @@ void CodeGenCUDA::PrintVecBinaryOp(
       }
       PrintVecElemStore(sret, t, i, value_temp.str());
     }
-    os << sret;
-    EndScope(vec_scope);
   }
+  os << sret;
 }
 
 void CodeGenCUDA::PrintVecElemLoad(
@@ -418,6 +412,54 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) {
       this->PrintExpr(op->args[i * 2 + 1], os);
       os << "]" << ((i < 3) ? ", ": ")");
     }
+  } else if (op->call_type == CallNode::PureExtern && op->dtype.is_vector()) {
+    //
+    // Emit an unsupported vector call
+    //
+    // v = intrin_f((float4*)A[0], (float4*)B[0])
+    //
+    // as
+    //
+    // float4 __ret;
+    // {
+    //   float4 __arg0 = ((float4*)A)[0];
+    //   float4 __arg1 = ((float4*)B)[0];
+    //   __ret.x = intrin_f(__arg0.x, __arg1.x);
+    //   __ret.y = intrin_f(__arg0.y, __arg1.y);
+    //   __ret.z = intrin_f(__arg0.z, __arg1.z);
+    //   __ret.w = intrin_f(__arg0.w, __arg1.w);
+    // }
+    // v = __ret;
+    //
+    // Declare the result vector.
+    std::string sret = GetUniqueName("_");
+    this->PrintIndent();
+    this->PrintType(op->dtype, stream);
+    stream << ' ' << sret << ";\n";
+    {
+      EnterScopeRAII scope(this);
+
+      // Load arguments.
+      std::vector<std::string> sargs;
+      for (size_t i = 0; i < op->args.size(); ++i) {
+        std::string val = SSAGetID(PrintExpr(op->args[i]), op->args[i].dtype());
+        sargs.push_back(std::move(val));
+      }
+
+      // Emit a scalar call for each lane.
+      for (int i = 0; i < op->dtype.lanes(); ++i) {
+        std::ostringstream scall;
+        scall << op->name << "(";
+        for (size_t j = 0; j < op->args.size(); ++j) {
+          if (j > 0)
+            scall << ", ";
+          PrintVecElemLoad(sargs[j], op->args[j].dtype(), i, scall);
+        }
+        scall << ")";
+        PrintVecElemStore(sret, op->dtype, i, scall.str());
+      }
+    }
+    os << sret;
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
@@ -580,34 +622,34 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream &os) {
         op->true_value->dtype == op->dtype &&
         op->dtype.lanes() == op->condition.dtype().lanes());
 
-  int lanes = op->dtype.lanes();
-  int scope = BeginScope();
-
-  std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
-  std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
-  std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
   std::string r_var = GetUniqueName("_");
-
   this->PrintIndent();
   this->PrintType(op->dtype, stream);
   stream << ' ' << r_var << ";\n";
+  {
+    EnterScopeRAII scope(this);
+
+    std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
+    std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
+    std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
 
-  // The condition is stored as an ushort vector.
-  DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
-
-  for (int i = 0; i < lanes; ++i) {
-    std::ostringstream item;
-    item << "(bool(";
-    PrintVecElemLoad(c_var, memory_ty, i, item);
-    item << ")?";
-    PrintVecElemLoad(t_var, op->dtype, i, item);
-    item << ':';
-    PrintVecElemLoad(f_var, op->dtype, i, item);
-    item << ')';
-    PrintVecElemStore(r_var, op->dtype, i, item.str());
+    // The condition is stored as an ushort vector.
+    int lanes = op->dtype.lanes();
+    DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
+
+    for (int i = 0; i < lanes; ++i) {
+      std::ostringstream item;
+      item << "(bool(";
+      PrintVecElemLoad(c_var, memory_ty, i, item);
+      item << ")?";
+      PrintVecElemLoad(t_var, op->dtype, i, item);
+      item << ':';
+      PrintVecElemLoad(f_var, op->dtype, i, item);
+      item << ')';
+      PrintVecElemStore(r_var, op->dtype, i, item.str());
+    }
   }
   os << r_var;
-  EndScope(scope);
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) { // NOLINT(*)

diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
@@ -29,14 +29,12 @@ namespace intrin {
 // Add float suffix to the intrinsics, CUDA fast math.
 struct CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1) {
-      if (t.is_float()) {
-        switch (t.bits()) {
-          case 64: return name;
-          case 32: return name + 'f';
-          case 16: return 'h' + name;
-          default: return "";
-        }
+    if (t.is_float()) {
+      switch (t.bits()) {
+        case 64: return name;
+        case 32: return name + 'f';
+        case 16: return 'h' + name;
+        default: return "";
       }
     }
     return "";
@@ -45,7 +43,7 @@ struct CUDAMath {
 
 struct CUDAFastMath : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
+    if (t.is_float() && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
       return CUDAMath::operator()(t, name);
@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath {
 
 struct CUDAFastMathTan : public CUDAMath {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_float()) {
+    if (t.is_float()) {
         switch (t.bits()) {
           case 64: return name;
           // `__tanf` seems to produce some values too deviant from numpy tan version.
@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath {
 
 struct CUDAPopcount {
   std::string operator()(DataType t, std::string name) const {
-    if (t.lanes() == 1 && t.is_uint()) {
+    if (t.is_uint()) {
       switch (t.bits()) {
         case 32: return "__popc";
         case 64: return "__popcll";
@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10")
 .set_body(DispatchExtern<CUDAFastMath>);
@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan")
 .set_body(DispatchExtern<CUDAMath>);