From 510acc5a3c18ca6f2254ef9f33a82e93b7a118b7 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 23 Jun 2023 17:49:34 +0000 Subject: [PATCH] tile-1x1 --- .../Codegen/Common/DecomposePackUnPackOps.cpp | 5 ++ .../MaterializeEncodingIntoPackUnPack.cpp | 15 +++--- .../test/decompose_pack_unpack_ops.mlir | 12 +++++ .../Dialect/Flow/Transforms/Passes.cpp | 6 ++- .../compiler/Dialect/Flow/Transforms/Passes.h | 4 +- .../Dialect/Flow/Transforms/Passes.td | 2 +- .../Dialect/Flow/Transforms/SetEncoding.cpp | 48 +++++++++++++----- compiler/src/iree/compiler/Utils/BUILD.bazel | 1 + .../src/iree/compiler/Utils/CMakeLists.txt | 1 + .../Utils/DataTilingUniversalPadding.h | 49 +++++++++++++++++++ runtime/src/iree/modules/vmvx/module.c | 3 ++ 11 files changed, 125 insertions(+), 21 deletions(-) create mode 100644 compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h diff --git a/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp b/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp index 56cad06759db..ad212bb36472 100644 --- a/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp @@ -83,6 +83,11 @@ struct FoldTrailingUnitTranspose } if (numDropDims == 0) return failure(); + if (numDropDims == inputTy.getRank()) { + rewriter.replaceOp(op, op.getInput()); + return success(); + } + Location loc = op.getLoc(); SmallVector srcMixedSizes = tensor::createDimValues(rewriter, loc, op.getInput()); diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index 0958fb47517d..1449d09089ce 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -18,6 +18,7 @@ #include "iree/compiler/Dialect/Flow/IR/FlowOps.h" #include 
"iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "iree/compiler/Dialect/VMVX/IR/VMVXOps.h" +#include "iree/compiler/Utils/DataTilingUniversalPadding.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" @@ -278,6 +279,7 @@ void adjustTileSizesToNarrowStaticShape(MaterializeEncodingInfo &encodingInfo, // Dynamic sizes are assumed to be large enough, not to be candidates for // narrow kernels. if (ShapedType::isDynamic(size)) continue; + int64_t &tileSize = encodingInfo.innerTileSizes[i]; // Let's not try to handle any dynamic tile sizes here. We could handle the // case where size==1 (as whatever is the runtime value of tileSize, it @@ -285,12 +287,13 @@ void adjustTileSizesToNarrowStaticShape(MaterializeEncodingInfo &encodingInfo, // in general, adjusting dynamic tile sizes has to be done by the // materializeEncodingValueFn which we obtain those tileSizes from. if (ShapedType::isDynamic(tileSize)) continue; - auto generateNarrowTileSize = [&](int64_t n) { - if (size <= n && tileSize >= n) tileSize = n; - }; - generateNarrowTileSize(1); - generateNarrowTileSize(2); - generateNarrowTileSize(4); + // Honor the contract in the comment on DataTilingUniversalPadding: + // For tile sizes along statically-sized dimensions that are smaller than + // DataTilingUniversalPadding, ensure that we never generate a tile size + // greater than the next power of two. 
+ for (int po2 = 1; po2 < DataTilingUniversalPadding; po2 *= 2) { + if (size <= po2 && tileSize >= po2) tileSize = po2; + } } } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir b/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir index f7cc2b528281..80dbc441e762 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir @@ -35,6 +35,18 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x // ----- +func.func @simple_pack_1x1_to_1x1x1x1(%input: tensor<1x1xf32>, %output: tensor<1x1x1x1xf32>, %pad: f32) -> tensor<1x1x1x1xf32> { + %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %output : tensor<1x1xf32> -> tensor<1x1x1x1xf32> + return %0 : tensor<1x1x1x1xf32> +} +// CHECK-LABEL: func.func @simple_pack_1x1_to_1x1x1x1 +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[IN]] into %[[OUT]][0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] +// CHECK: return %[[INSERT]] + +// ----- + func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32>{ %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32> return %0 : tensor<1x1x32x8xf32> diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp index 16b5d2ca712d..63d57804c1ba 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp @@ -6,6 +6,8 @@ #include "iree/compiler/Dialect/Flow/Transforms/Passes.h" +#include + #include #include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h" @@ -262,7 +264,9 @@ 
void buildFlowTransformPassPipeline(OpPassManager &passManager, .addPredicatedPass(clNormalizeInputIndexingMap, createInterchangeTransposeGenericOpsPass) // Enable data tiling after all linalg level transformations. - .addPredicatedPass(clEnableDataTiling, createSetEncodingPass) + .addPredicatedPass( + clEnableDataTiling, + []() { return createSetEncodingPass(DataTilingUniversalPadding); }) //////////////////////////////////////////////////////////////////////// // Dispatch region formation. .addPredicatedPass(!clDispatchTransformFileName.empty(), diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h index bc0a0009653b..4bca665e4b14 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h @@ -8,6 +8,7 @@ #define IREE_COMPILER_DIALECT_FLOW_TRANSFORMS_PASSES_H_ #include +#include #include "iree/compiler/Dialect/Flow/IR/FlowOps.h" #include "llvm/ADT/StringMap.h" @@ -121,7 +122,8 @@ std::unique_ptr createConvertToFlowPass(); std::unique_ptr createOptimizeNumericsPass(); // Sets encoding for tensors to allow tiled execution of operations. -std::unique_ptr createSetEncodingPass(); +std::unique_ptr createSetEncodingPass( + std::optional padding = std::nullopt); // Strips the signed/unsigned portion off of tensors. 
std::unique_ptr> diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td index 83e90f15fe72..ac0e2577e213 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td @@ -251,7 +251,7 @@ def SetEncoding : Pass<"iree-flow-set-encoding", ""> { let summary = "Introduce tensor encoding for compute operations"; let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()"; let options = [ - Option<"defaultPadding", "default-padding", "int64_t", + Option<"optionDefaultPadding", "default-padding", "int64_t", /*default=*/"16", "Default padding to use so packing can be done without padding during the packing"> ]; diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp index e72e849a4cc3..b75f48920fe5 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp @@ -9,11 +9,14 @@ // operations in tiled layouts. //===---------------------------------------------------------------------===// +#include + #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "iree-dialects/Dialect/LinalgExt/Utils/Utils.h" #include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Flow/Transforms/Passes.h" +#include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" @@ -61,14 +64,11 @@ static FailureOr getZero(OpBuilder &builder, Location loc, .getResult(); } -/// Pads `value` to `padding` if needed. If no padding is specified, -/// return `value` itself. 
-static FailureOr padIfNeeded( - OpBuilder &builder, Location loc, Value value, - std::optional padding = std::nullopt) { - if (!padding) return value; - - OpFoldResult paddingOfr = builder.getIndexAttr(padding.value()); +/// Pads `value` to `padding` if needed. As described in the comment on +/// DataTilingUniversalPadding, statically-sized dimensions smaller than +/// `padding` only get padded to the next power of two. +static FailureOr padIfNeeded(OpBuilder &builder, Location loc, + Value value, int64_t universalPadding) { FailureOr> shape = LinalgExt::getDims(builder, loc, value); if (failed(shape)) { @@ -80,12 +80,21 @@ static FailureOr padIfNeeded( SmallVector highPad(shape->size(), zero); // The low padding is always zero. The high padding is - // shape.ceildDiv(padding) - shape. + // shape.ceilDiv(padding) * padding - shape. AffineExpr paddingExpr, shapeExpr; bindSymbols(builder.getContext(), paddingExpr, shapeExpr); AffineExpr highPadExpr = shapeExpr.ceilDiv(paddingExpr) * paddingExpr - shapeExpr; for (auto shape : llvm::enumerate(shape.value())) { + int64_t padding = universalPadding; + // Case of small static sizes - round to the next power of two instead of + // the universal padding amount. 
+ if (auto constant_value = getConstantIntValue(shape.value())) { + if (constant_value.value() < padding) { + padding = llvm::PowerOf2Ceil(constant_value.value()); + } + } + OpFoldResult paddingOfr = builder.getIndexAttr(padding); highPad[shape.index()] = affine::makeComposedFoldedAffineApply( builder, loc, highPadExpr, {paddingOfr, shape.value()}); } @@ -259,11 +268,16 @@ struct FoldFillWithSetEncoding }; struct SetEncodingPass : public SetEncodingBase { + SetEncodingPass(std::optional padding) : padding(padding) {} + void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); } void runOnOperation() override; + LogicalResult initializeOptions(StringRef options) override; + + std::optional padding; }; } // namespace @@ -271,7 +285,7 @@ void SetEncodingPass::runOnOperation() { MLIRContext *context = &getContext(); { RewritePatternSet patterns(context); - patterns.insert(context, defaultPadding); + patterns.insert(context, padding.value()); linalg::FillOp::getCanonicalizationPatterns(patterns, context); patterns.insert(context); memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); @@ -282,8 +296,18 @@ void SetEncodingPass::runOnOperation() { } } -std::unique_ptr createSetEncodingPass() { - return std::make_unique(); +LogicalResult SetEncodingPass::initializeOptions(StringRef options) { + if (failed(Pass::initializeOptions(options))) { + return failure(); + } + if (!padding) { + padding = optionDefaultPadding; + } + return success(); +} + +std::unique_ptr createSetEncodingPass(std::optional padding) { + return std::make_unique(padding); } } // namespace Flow diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel index aaa6ca5ecccc..ec1881377537 100644 --- a/compiler/src/iree/compiler/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Utils/BUILD.bazel @@ -29,6 +29,7 @@ iree_compiler_cc_library( ], hdrs = [ "ConversionUtils.h", + "DataTilingUniversalPadding.h", 
"ElementPackingUtils.h", "FlatbufferUtils.h", "IndexSet.h", diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt index d7ef3b56ddc2..5dcd30f43a92 100644 --- a/compiler/src/iree/compiler/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt @@ -15,6 +15,7 @@ iree_cc_library( Utils HDRS "ConversionUtils.h" + "DataTilingUniversalPadding.h" "ElementPackingUtils.h" "FlatbufferUtils.h" "IndexSet.h" diff --git a/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h b/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h new file mode 100644 index 000000000000..87d8296fd3e1 --- /dev/null +++ b/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h @@ -0,0 +1,49 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_ +#define IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_ + +namespace mlir { +namespace iree_compiler { + +// When using data-tiling, during Flow, the SetEncoding pass must ensure that +// allocated buffers will be large enough for the eventual padded-and-tiled +// buffers. Those will only be created in the MaterializeEncoding pass, in HAL. +// Until then, the exact tile sizes aren't know. Our short-term approach to +// unblock this is to let SetEncoding pad everything to the next multiple of +// a "universal" padding size. In order for this to work, this universal padding +// value must be greater than or equal to any actual tile size that can occur. +// +// This widening of tensors is particularly problematic for narrow tensors. For +// example, it is inefficient to rewrite a tensor<1x1024xf32> into +// tensor<16x1024xf32>, using only row 0, leaving the other 15 rows unused. 
To +// remedy that in the short term until a better solution is found, we have the +// following contract: for any dimension that is statically sized and whose size +// is less than DataTilingUniversalPadding, the largest tile size that +// MaterializeEncoding is allowed to choose is the original dimension size +// rounded up to the next power of two. +// +// Example. If DataTilingUniversalPadding=16, then: +// +// For the source tensor type | MaterializeEncoding can choose tile sizes up to +// -------------------------- | ----------------------------------------------- +// tensor<20x40xf32> | 16x16 +// tensor<20x1xf32> | 16x1 +// tensor<1x40xf32> | 1x16 +// tensor<1x1xf32> | 1x1 +// tensor<20x2xf32> | 16x2 +// tensor<20x3xf32> | 16x4 +// tensor<20x4xf32> | 16x4 +// tensor<20x5xf32> | 16x8 +// +// TODO(#11632) - find a way to do without universal padding. +const int DataTilingUniversalPadding = 16; + +} // namespace iree_compiler +} // namespace mlir + +#endif // IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_ \ No newline at end of file diff --git a/runtime/src/iree/modules/vmvx/module.c b/runtime/src/iree/modules/vmvx/module.c index 3d00c539ce72..5f91b4df37f6 100644 --- a/runtime/src/iree/modules/vmvx/module.c +++ b/runtime/src/iree/modules/vmvx/module.c @@ -590,6 +590,9 @@ IREE_VMVX_ABI_EXPORT(iree_vmvx_mmt4d, mmt4d, v) { .K0 = K0, .cpu_data = (const iree_uk_uint64_t*)iree_cpu_data_fields(), }; + fprintf(stderr, "M0=%ld N0=%ld K0=%ld\n", M0, N0, K0); + fprintf(stderr, "lhs_buffer=%p rhs_buffer=%p out_buffer=%p\n", lhs_buffer, + rhs_buffer, out_buffer); iree_uk_mmt4d(&ukernel_params); IREE_TRACE_ZONE_END(z0); return iree_ok_status();