[LLVMCPU] Add an option for tiling reduction only to LLVMCPUTile. (ir…

…ee-org#13821) If the option is true, only the ops that have reduction loops are tiled. This is useful because it allows us to tile the reduction ops first and apply tileAndFuse to the other operations later. We can greedily apply tileAndFuse on consumers because the reduction op will no longer be pulled in: there is an scf.for that acts as a barrier to stop fusion on reductions. The changes to LLVMCPUTileAndFuse are needed together because we follow the same pipeline behavior. Now we need to use TileAndFuse in the last level of tiling for consumers. If there are no consumers, it will not be applied on reduction ops. It is a step toward iree-org#13706 and iree-org#13474.
NatashaKnk · Jul 6, 2023 · 7297be0 · 7297be0
1 parent 5435c54
commit 7297be0
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 8 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
@@ -69,7 +69,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTileAndFusePass(
 
 /// Pass to tile TilingInterface ops with given tilingLevel.
 std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTilePass(
-    int64_t tilingLevel = -1);
+    int64_t tilingLevel = -1, bool reductionOnly = false);
 
 /// Replaces llvm.intr.fma with its unfused mul and add ops.
 std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUUnfuseFMAOpsPass();

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTile.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTile.cpp
@@ -46,8 +46,9 @@ static SmallVector<Value> buildTileSizesForOp(OpBuilder &b, Operation *op,
 /// be specified. It picks the `tilingLevel`-th list as tiling sizes from
 /// lowering_config.
 struct LLVMCPUTilePass : LLVMCPUTileBase<LLVMCPUTilePass> {
-  LLVMCPUTilePass(int64_t tilingLevel = -1) {
+  LLVMCPUTilePass(int64_t tilingLevel, bool reductionOnly) {
     this->tilingLevel.setValue(tilingLevel);
+    this->reductionOnly.setValue(reductionOnly);
   }
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<arith::ArithDialect, affine::AffineDialect,
@@ -89,6 +90,13 @@ void LLVMCPUTilePass::runOnOperation() {
     // Need a better way for handling this, but this works for now.
     if (isa<tensor::PadOp>(computeOp)) continue;
 
+    if (reductionOnly &&
+        llvm::none_of(op.getLoopIteratorTypes(), [](auto iterType) {
+          return iterType == utils::IteratorType::reduction;
+        })) {
+      continue;
+    }
+
     LLVM_DEBUG(llvm::dbgs() << "candidate: " << op << "\n");
 
     IRRewriter rewriter(context);
@@ -117,8 +125,8 @@ void LLVMCPUTilePass::runOnOperation() {
 }  // namespace
 
 std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTilePass(
-    int64_t tilingLevel) {
-  return std::make_unique<LLVMCPUTilePass>(tilingLevel);
+    int64_t tilingLevel, bool reductionOnly) {
+  return std::make_unique<LLVMCPUTilePass>(tilingLevel, reductionOnly);
 }
 
 }  // namespace iree_compiler

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp
@@ -221,6 +221,16 @@ void LLVMCPUTileAndFusePass::runOnOperation() {
     return;
   }
 
+  auto iterTypes = consumerOp.getLoopIteratorTypes();
+  for (auto [idx, size] : llvm::enumerate(tilingSizes)) {
+    if (size == 0) continue;
+    if (iterTypes[idx] == utils::IteratorType::reduction) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "----- skip, can't tile and fuse reduction dims -----\n");
+      return;
+    }
+  }
+
   auto options = scf::SCFTilingOptions().setTileSizes(tilingSizes);
   IRRewriter rewriter(context);
   if (failed(applyTileAndFuse(rewriter, consumerOp, options))) {

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -442,12 +442,21 @@ void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
     nestedModulePM.addNestedPass<func::FuncOp>(
         createConcretizePadResultShapePass());
   }
-  // Run SplitReductionPass before the final reduction Fuse pass, because
-  // SplitReductionPass takes care of banked-tiling.
+
+  // Apply vector level of tiling on the ops. We firstly tile the reduction
+  // ops, and then handle the consumer ops. There are no differences between
+  // LLVMCPUTile and LLVMCPUTileAndFuse if we have a single consumer op. They
+  // will just tile the consumer op. But it's important if there is a consumer
+  // ops chain, e.g., reduction + broadcast + tensor.pack/pad ops. We want to
+  // tile and fuse `broadcast + pack` ops.
+  // SplitReductionPass is run before the final reduction Fuse pass, because it
+  // takes care of banked-tiling.
   nestedModulePM.addNestedPass<func::FuncOp>(
       createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(numLevels - 1));
+      createLLVMCPUTilePass(numLevels - 1, /*reductionOnly=*/true));
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createLLVMCPUTileAndFusePass(numLevels - 1));
 
   nestedModulePM.addNestedPass<func::FuncOp>(
       createFuseTensorPadWithConsumerPass());

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -50,6 +50,7 @@ iree_lit_test_suite(
             "synchronize_symbol_visibility.mlir",
             "tensor_pad.mlir",
             "test_config_mmt4d.mlir",
+            "tile.mlir",
             "tile_and_fuse.mlir",
             "transform_dialect_bufferize.mlir",
             "transform_dialect_iree_tile_to_forall.mlir",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -45,6 +45,7 @@ iree_lit_test_suite(
     "synchronize_symbol_visibility.mlir"
     "tensor_pad.mlir"
     "test_config_mmt4d.mlir"
+    "tile.mlir"
     "tile_and_fuse.mlir"
     "transform_dialect_bufferize.mlir"
     "transform_dialect_iree_tile_to_forall.mlir"

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile.mlir
@@ -0,0 +1,44 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=0}))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=0 reduction-only=true}))" --split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION
+
+func.func @matmul_bias_add(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?xf32>) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %init = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+  %0 = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
+      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %2 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1)-> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]}
+    ins(%1, %arg2 : tensor<?x?xf32>, tensor<?xf32>)
+    outs(%init : tensor<?x?xf32>) {
+      ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+        %3 = arith.addf %arg3, %arg4 : f32
+        linalg.yield %3 : f32
+    } -> tensor<?x?xf32>
+  return %2 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func.func @matmul_bias_add(
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             linalg.fill
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             scf.for
+// CHECK:             linalg.matmul
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             linalg.generic
+// CHECK-REDUCTION-LABEL: func.func @matmul_bias_add(
+// CHECK-REDUCTION:         linalg.fill
+// CHECK-REDUCTION:         scf.for
+// CHECK-REDUCTION:           scf.for
+// CHECK-REDUCTION:             scf.for
+// CHECK-REDUCTION:             linalg.matmul
+// CHECK-REDUCTION-NOT:     scf.for
+// CHECK-REDUCTION:         linalg.generic
diff --git a/compiler/src/iree/compiler/Codegen/Passes.td b/compiler/src/iree/compiler/Codegen/Passes.td
@@ -527,7 +527,9 @@ def LLVMCPUTile :
       "mlir::iree_compiler::createLLVMCPUTilePass()";
   let options = [
     Option<"tilingLevel", "tiling-level", "int64_t", /*default=*/"-1",
-      "Use default tiling level used to retrieve the configuration from lowering_config">
+      "Use default tiling level used to retrieve the configuration from lowering_config">,
+    Option<"reductionOnly", "reduction-only", "bool", /*default=*/"false",
+      "Only reduction ops are tiled if the option is true">,
   ];
 }