[TensorIR][M2a] Reorder (apache#8767)
This PR is part of the TensorIR upstreaming effort (apache#7527). It adds a schedule primitive: reorder.

Co-authored-by: Siyuan Feng <[email protected]>
Co-authored-by: Bohan Hou <[email protected]>
Co-authored-by: Ruihang Lai <[email protected]>
Co-authored-by: Wuwei Lin <[email protected]>
Co-authored-by: Junru Shao <[email protected]>
6 people authored and ylc committed Jan 13, 2022
1 parent 3f831b4 commit d14454a
Showing 10 changed files with 736 additions and 0 deletions.
13 changes: 13 additions & 0 deletions include/tvm/tir/schedule/schedule.h
@@ -219,6 +219,19 @@ class ScheduleNode : public runtime::Object {
* \return The new loops after split
*/
virtual Array<LoopRV> Split(const LoopRV& loop_rv, const Array<Optional<ExprRV>>& factors) = 0;
/*!
* \brief Reorder a list of loops. It doesn't require the loops to be consecutive.
* It requires:
* 1) The loops are in the same chain. That means: the loops can be ordered as [l_1, l_2, ...,
* l_n] where l_i is an ancestor of l_{i+1} and there are only single-branch loops between
* l_1 and l_n (which also indicates they are under the same scope).
* 2) After reordering, the domain of an outer loop cannot depend on any of the inner loops.
* 3) For every block under the loop nests, its block binding must be affine, and the block
* variables must be either data parallel or reduction.
* 4) No duplicated loops are allowed in the arguments.
* \param ordered_loop_rvs The loops in the new order
*/
virtual void Reorder(const Array<LoopRV>& ordered_loop_rvs) = 0;
/******** Schedule: Manipulate ForKind ********/
/*!
* \brief Parallelize the input loop. It requires:
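To make requirement (1) above concrete, here is a minimal sketch (not part of this diff) that reorders two non-consecutive loops of a three-level nest. It assumes the TVMScript dialect used in the Python docstring further down; the function and block names here are hypothetical.

```python
# Hedged sketch: the reordered loops need not be adjacent, only on the same
# single-branch loop chain. Mirrors the docstring example below, one dim deeper.
import tvm
from tvm import tir
from tvm.script import ty


@tvm.script.tir
def elementwise_three(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (32, 32, 32))
    B = tir.match_buffer(b, (32, 32, 32))
    for i, j, k in tir.grid(32, 32, 32):
        with tir.block([32, 32, 32], "B") as [vi, vj, vk]:
            B[vi, vj, vk] = A[vi, vj, vk] * 2.0


sch = tir.Schedule(elementwise_three)
i, j, k = sch.get_loops(sch.get_block("B"))
# i and k are not adjacent, but they lie on one single-branch chain, so only
# the two of them are passed; j keeps its position in between (k, j, i).
sch.reorder(k, i)
```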
59 changes: 59 additions & 0 deletions python/tvm/tir/schedule/schedule.py
@@ -442,6 +442,65 @@ def after_split(a: ty.handle, b: ty.handle) -> None:
        # that there is at most one None in `factors`
        return _ffi_api.ScheduleSplit(self, loop, factors)  # type: ignore # pylint: disable=no-member

    def reorder(self, *ordered_loops: List[LoopRV]) -> None:
        """
        Reorder a list of loops. It doesn't require the loops to be consecutive.
        It requires:
        1) The loops are in the same chain. That means: the loops can be ordered as
        [l_1, l_2, ..., l_n] where l_i is an ancestor of l_{i+1}, and there are only
        single-branch loops between l_1 and l_n (which also indicates they are under the same scope).
        2) After reordering, the domain of an outer loop cannot depend on any of the inner loops.
        3) For every block under the loop nests, its block binding must be affine, and the block
        variables must be either data parallel or reduction.
        4) No duplicated loops are allowed in the arguments.

        Parameters
        ----------
        *ordered_loops : List[LoopRV]
            The loops in the new order

        Examples
        --------

        Before reorder, in TensorIR, the IR is:

        .. code-block:: python

            @tvm.script.tir
            def before_reorder(a: ty.handle, b: ty.handle) -> None:
                A = tir.match_buffer(a, (128, 128))
                B = tir.match_buffer(b, (128, 128))
                for i, j in tir.grid(128, 128):
                    with tir.block([128, 128], "B") as [vi, vj]:
                        B[vi, vj] = A[vi, vj] * 2.0

        Create the schedule and do reorder:

        .. code-block:: python

            sch = tir.Schedule(before_reorder)
            i, j = sch.get_loops(sch.get_block("B"))
            sch.reorder(j, i)
            print(tvm.script.asscript(sch.mod["main"]))

        After applying reorder, the IR becomes:

        .. code-block:: python

            @tvm.script.tir
            def after_reorder(a: ty.handle, b: ty.handle) -> None:
                A = tir.match_buffer(a, (128, 128))
                B = tir.match_buffer(b, (128, 128))
                # Here j and i are reordered
                for j, i in tir.grid(128, 128):
                    with tir.block([128, 128], "B") as [vi, vj]:
                        tir.bind(vi, i)
                        tir.bind(vj, j)
                        B[vi, vj] = A[vi, vj] * 2.0
        """
        _ffi_api.ScheduleReorder(self, ordered_loops)  # type: ignore # pylint: disable=no-member

    ########## Schedule: Manipulate ForKind ##########

    def parallel(self, loop: LoopRV) -> None:
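Complementing the docstring example, here is a hedged sketch of requirement (4) being violated: passing the same loop twice is expected to make reorder fail. The exact exception type is an assumption on my part; catching tvm.TVMError is the conservative choice.

```python
# Hedged sketch (not part of this diff): a duplicated loop RV violates
# requirement (4), so reorder should raise. `before_reorder` refers to the
# TVMScript function shown in the docstring above.
import tvm
from tvm import tir

sch = tir.Schedule(before_reorder)
i, j = sch.get_loops(sch.get_block("B"))
try:
    sch.reorder(i, j, i)         # loop `i` appears twice
except tvm.TVMError as err:      # assumed: ScheduleError derives from TVMError
    print("reorder rejected:", err)
```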
7 changes: 7 additions & 0 deletions src/tir/schedule/concrete_schedule.cc
@@ -346,6 +346,13 @@ Array<LoopRV> ConcreteScheduleNode::Split(const LoopRV& loop_rv,
return CreateRV<LoopRV>(results);
}

void ConcreteScheduleNode::Reorder(const Array<LoopRV>& ordered_loop_rvs) {
TVM_TIR_SCHEDULE_BEGIN();
tir::Reorder(state_, GetSRefs(ordered_loop_rvs));
TVM_TIR_SCHEDULE_END("reorder", this->error_render_level_);
this->state_->DebugVerify();
}

/******** Schedule: Manipulate ForKind ********/

void ConcreteScheduleNode::Parallel(const LoopRV& loop_rv) {
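The wrapper above renders failures according to error_render_level_ and re-checks the IR with DebugVerify. On the Python side these correspond to constructor arguments of tir.Schedule; the sketch below assumes the parameter names error_render_level and debug_mask, which may differ between TVM versions.

```python
# Hedged sketch (not part of this diff): constructor knobs that feed the
# error rendering and the DebugVerify call seen above. Parameter names are
# assumptions and may vary across TVM versions.
from tvm import tir

sch = tir.Schedule(
    before_reorder,               # the TVMScript function from the docstring above
    debug_mask="all",             # verify the schedule state after every primitive
    error_render_level="detail",  # render the full TIR context when reorder fails
)
```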
1 change: 1 addition & 0 deletions src/tir/schedule/concrete_schedule.h
@@ -81,6 +81,7 @@ class ConcreteScheduleNode : public ScheduleNode {
/******** Schedule: Transform loops ********/
LoopRV Fuse(const Array<LoopRV>& loop_rvs) override;
Array<LoopRV> Split(const LoopRV& loop_rv, const Array<Optional<ExprRV>>& factors) override;
void Reorder(const Array<LoopRV>& ordered_loop_rvs) override;
/******** Schedule: Manipulate ForKind ********/
void Parallel(const LoopRV& loop_rv) override;
void Vectorize(const LoopRV& loop_rv) override;
15 changes: 15 additions & 0 deletions src/tir/schedule/primitive.h
@@ -63,6 +63,21 @@ TVM_DLL Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref,
* \return The sref to the fused loop
*/
TVM_DLL StmtSRef Fuse(ScheduleState self, const Array<StmtSRef>& loop_srefs);
/*!
* \brief Reorder a list of loops. It doesn't require the loops to be consecutive.
* It requires:
* 1) The loops are in the same chain. That means: the loops can be ordered as [l_1, l_2, ...,
* l_n] where l_i is an ancestor of l_{i+1} and there are only single-branch loops between
* l_1 and l_n (which also indicates they are under the same scope).
* 2) After reordering, the domain of an outer loop cannot depend on any of the inner loops.
* 3) For every block under the loop nests, its block binding must be affine, and the block
* variables must be either data parallel or reduction.
* 4) No duplicated loops are allowed in the arguments.
* \param self The state of the schedule
* \param ordered_loop_srefs An array of srefs which indicates the new order of loops
*/
TVM_DLL void Reorder(ScheduleState self, const Array<StmtSRef>& ordered_loop_srefs);

/******** Schedule: Manipulate ForKind ********/
/*!
* \brief Parallelize the input loop. It requires: