Skip to content

Commit

Permalink
[LLVMCPU] Add an option for tiling reduction only to LLVMCPUTile. (ir…
Browse files Browse the repository at this point in the history
…ee-org#13821)

If the option is true, only the ops that have reduction loops are tiled. This is useful because it allows us to tile the reduction ops first and apply tileAndFuse to the other operations later. We can greedily apply tileAndFuse on consumers because the reduction op will no longer be pulled in: there is an scf.for that acts as a barrier and stops fusion on reductions.

The changes to LLVMCPUTileAndFuse are needed together with this one because we follow the same pipeline behavior. We now need to use TileAndFuse in the last level of tiling for consumers. If there are no consumers, it will not be applied to reduction ops.

It is a step toward iree-org#13706 and iree-org#13474
  • Loading branch information
hanhanW authored and NatashaKnk committed Jul 6, 2023
1 parent 5435c54 commit 7297be0
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 8 deletions.
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTileAndFusePass(

/// Pass to tile TilingInterface ops with given tilingLevel.
std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTilePass(
int64_t tilingLevel = -1);
int64_t tilingLevel = -1, bool reductionOnly = false);

/// Replaces llvm.intr.fma with its unfused mul and add ops.
std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUUnfuseFMAOpsPass();
Expand Down
14 changes: 11 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@ static SmallVector<Value> buildTileSizesForOp(OpBuilder &b, Operation *op,
/// be specified. It picks the `tilingLevel`-th list as tiling sizes from
/// lowering_config.
struct LLVMCPUTilePass : LLVMCPUTileBase<LLVMCPUTilePass> {
LLVMCPUTilePass(int64_t tilingLevel = -1) {
LLVMCPUTilePass(int64_t tilingLevel, bool reductionOnly) {
this->tilingLevel.setValue(tilingLevel);
this->reductionOnly.setValue(reductionOnly);
}
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<arith::ArithDialect, affine::AffineDialect,
Expand Down Expand Up @@ -89,6 +90,13 @@ void LLVMCPUTilePass::runOnOperation() {
// Need a better way for handling this, but this works for now.
if (isa<tensor::PadOp>(computeOp)) continue;

if (reductionOnly &&
llvm::none_of(op.getLoopIteratorTypes(), [](auto iterType) {
return iterType == utils::IteratorType::reduction;
})) {
continue;
}

LLVM_DEBUG(llvm::dbgs() << "candidate: " << op << "\n");

IRRewriter rewriter(context);
Expand Down Expand Up @@ -117,8 +125,8 @@ void LLVMCPUTilePass::runOnOperation() {
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>> createLLVMCPUTilePass(
int64_t tilingLevel) {
return std::make_unique<LLVMCPUTilePass>(tilingLevel);
int64_t tilingLevel, bool reductionOnly) {
return std::make_unique<LLVMCPUTilePass>(tilingLevel, reductionOnly);
}

} // namespace iree_compiler
Expand Down
10 changes: 10 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,16 @@ void LLVMCPUTileAndFusePass::runOnOperation() {
return;
}

auto iterTypes = consumerOp.getLoopIteratorTypes();
for (auto [idx, size] : llvm::enumerate(tilingSizes)) {
if (size == 0) continue;
if (iterTypes[idx] == utils::IteratorType::reduction) {
LLVM_DEBUG(llvm::dbgs()
<< "----- skip, can't tile and fuse reduction dims -----\n");
return;
}
}

auto options = scf::SCFTilingOptions().setTileSizes(tilingSizes);
IRRewriter rewriter(context);
if (failed(applyTileAndFuse(rewriter, consumerOp, options))) {
Expand Down
15 changes: 12 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,12 +442,21 @@ void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
nestedModulePM.addNestedPass<func::FuncOp>(
createConcretizePadResultShapePass());
}
// Run SplitReductionPass before the final reduction Fuse pass, because
// SplitReductionPass takes care of banked-tiling.

// Apply vector level of tiling on the ops. We first tile the reduction ops,
// and then handle the consumer ops. There is no difference between
// LLVMCPUTile and LLVMCPUTileAndFuse if we have a single consumer op: both
// will just tile the consumer op. The distinction matters when there is a
// chain of consumer ops, e.g., reduction + broadcast + tensor.pack/pad ops,
// where we want to tile and fuse the `broadcast + pack` ops.
// SplitReductionPass is run before the final reduction Fuse pass, because it
// takes care of banked-tiling.
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUTilePass(numLevels - 1));
createLLVMCPUTilePass(numLevels - 1, /*reductionOnly=*/true));
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMCPUTileAndFusePass(numLevels - 1));

nestedModulePM.addNestedPass<func::FuncOp>(
createFuseTensorPadWithConsumerPass());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ iree_lit_test_suite(
"synchronize_symbol_visibility.mlir",
"tensor_pad.mlir",
"test_config_mmt4d.mlir",
"tile.mlir",
"tile_and_fuse.mlir",
"transform_dialect_bufferize.mlir",
"transform_dialect_iree_tile_to_forall.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ iree_lit_test_suite(
"synchronize_symbol_visibility.mlir"
"tensor_pad.mlir"
"test_config_mmt4d.mlir"
"tile.mlir"
"tile_and_fuse.mlir"
"transform_dialect_bufferize.mlir"
"transform_dialect_iree_tile_to_forall.mlir"
Expand Down
44 changes: 44 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=0}))" --split-input-file %s | FileCheck %s
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile{tiling-level=0 reduction-only=true}))" --split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION

func.func @matmul_bias_add(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?xf32>) -> tensor<?x?xf32> {
%cst = arith.constant 0.0 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%init = tensor.empty(%d0, %d1) : tensor<?x?xf32>
%0 = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
%1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1)-> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%1, %arg2 : tensor<?x?xf32>, tensor<?xf32>)
outs(%init : tensor<?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%3 = arith.addf %arg3, %arg4 : f32
linalg.yield %3 : f32
} -> tensor<?x?xf32>
return %2 : tensor<?x?xf32>
}
// CHECK-LABEL: func.func @matmul_bias_add(
// CHECK: scf.for
// CHECK: scf.for
// CHECK: linalg.fill
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK: linalg.matmul
// CHECK: scf.for
// CHECK: scf.for
// CHECK: linalg.generic
// CHECK-REDUCTION-LABEL: func.func @matmul_bias_add(
// CHECK-REDUCTION: linalg.fill
// CHECK-REDUCTION: scf.for
// CHECK-REDUCTION: scf.for
// CHECK-REDUCTION: scf.for
// CHECK-REDUCTION: linalg.matmul
// CHECK-REDUCTION-NOT: scf.for
// CHECK-REDUCTION: linalg.generic
4 changes: 3 additions & 1 deletion compiler/src/iree/compiler/Codegen/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,9 @@ def LLVMCPUTile :
"mlir::iree_compiler::createLLVMCPUTilePass()";
let options = [
Option<"tilingLevel", "tiling-level", "int64_t", /*default=*/"-1",
"Use default tiling level used to retrieve the configuration from lowering_config">
"Use default tiling level used to retrieve the configuration from lowering_config">,
Option<"reductionOnly", "reduction-only", "bool", /*default=*/"false",
"Only reduction ops are tiled if the option is true">,
];
}

Expand Down

0 comments on commit 7297be0

Please sign in to comment.