Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust the universal flow-level padding for narrow static-sized dimensions #14206

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ struct FoldTrailingUnitTranspose
}
if (numDropDims == 0) return failure();

if (numDropDims == inputTy.getRank()) {
rewriter.replaceOp(op, op.getInput());
return success();
}

Location loc = op.getLoc();
SmallVector<OpFoldResult> srcMixedSizes =
tensor::createDimValues(rewriter, loc, op.getInput());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/VMVX/IR/VMVXOps.h"
#include "iree/compiler/Utils/DataTilingUniversalPadding.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
Expand Down Expand Up @@ -278,19 +279,21 @@ void adjustTileSizesToNarrowStaticShape(MaterializeEncodingInfo &encodingInfo,
// Dynamic sizes are assumed to be large enough, not to be candidates for
// narrow kernels.
if (ShapedType::isDynamic(size)) continue;

int64_t &tileSize = encodingInfo.innerTileSizes[i];
// Let's not try to handle any dynamic tile sizes here. We could handle the
// case where size==1 (as whatever is the runtime value of tileSize, it
// can't be less than that, so it should be OK to replace it with 1) but
// in general, adjusting dynamic tile sizes has to be done by the
// materializeEncodingValueFn which we obtain those tileSizes from.
if (ShapedType::isDynamic(tileSize)) continue;
auto generateNarrowTileSize = [&](int64_t n) {
if (size <= n && tileSize >= n) tileSize = n;
};
generateNarrowTileSize(1);
generateNarrowTileSize(2);
generateNarrowTileSize(4);
// Honor the contract in the comment on DataTilingUniversalPadding:
// For tile sizes along statically-sized dimensions that are smaller than
// DataTilingUniversalPadding, ensure that we never generate a tile size
// greater than the next power of two.
for (int po2 = 1; po2 < DataTilingUniversalPadding; po2 *= 2) {
if (size <= po2 && tileSize >= po2) tileSize = po2;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x

// -----

// Packs a tensor<1x1xf32> with inner tiles [1, 1] into a tensor<1x1x1x1xf32>,
// i.e. every source dimension and every inner tile has size 1. The checks
// below expect the pack to be rewritten into a single tensor.insert_slice of
// the input into the output, whose result is returned directly.
func.func @simple_pack_1x1_to_1x1x1x1(%input: tensor<1x1xf32>, %output: tensor<1x1x1x1xf32>, %pad: f32) -> tensor<1x1x1x1xf32> {
%0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %output : tensor<1x1xf32> -> tensor<1x1x1x1xf32>
return %0 : tensor<1x1x1x1xf32>
}
// CHECK-LABEL: func.func @simple_pack_1x1_to_1x1x1x1
// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[IN]] into %[[OUT]][0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1]
// CHECK: return %[[INSERT]]

// -----

func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32>{
%0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32>
return %0 : tensor<1x1x32x8xf32>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"

#include <iree/compiler/Utils/DataTilingUniversalPadding.h>

#include <memory>

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
Expand Down Expand Up @@ -262,7 +264,9 @@ void buildFlowTransformPassPipeline(OpPassManager &passManager,
.addPredicatedPass(clNormalizeInputIndexingMap,
createInterchangeTransposeGenericOpsPass)
// Enable data tiling after all linalg level transformations.
.addPredicatedPass(clEnableDataTiling, createSetEncodingPass)
.addPredicatedPass(
clEnableDataTiling,
[]() { return createSetEncodingPass(DataTilingUniversalPadding); })
////////////////////////////////////////////////////////////////////////
// Dispatch region formation.
.addPredicatedPass(!clDispatchTransformFileName.empty(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define IREE_COMPILER_DIALECT_FLOW_TRANSFORMS_PASSES_H_

#include <functional>
#include <optional>

#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "llvm/ADT/StringMap.h"
Expand Down Expand Up @@ -121,7 +122,8 @@ std::unique_ptr<Pass> createConvertToFlowPass();
std::unique_ptr<Pass> createOptimizeNumericsPass();

// Sets encoding for tensors to allow tiled execution of operations.
std::unique_ptr<Pass> createSetEncodingPass();
std::unique_ptr<Pass> createSetEncodingPass(
std::optional<int64_t> padding = std::nullopt);

// Strips the signed/unsigned portion off of tensors.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def SetEncoding : Pass<"iree-flow-set-encoding", ""> {
let summary = "Introduce tensor encoding for compute operations";
let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()";
let options = [
Option<"defaultPadding", "default-padding", "int64_t",
Option<"optionDefaultPadding", "default-padding", "int64_t",
/*default=*/"16",
"Default padding to use so packing can be done without padding during the packing">
];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
// operations in tiled layouts.
//===---------------------------------------------------------------------===//

#include <iree/compiler/Utils/DataTilingUniversalPadding.h>

#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree-dialects/Dialect/LinalgExt/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
#include "llvm/Support/MathExtras.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
Expand Down Expand Up @@ -61,14 +64,11 @@ static FailureOr<Value> getZero(OpBuilder &builder, Location loc,
.getResult();
}

/// Pads `value` to `padding` if needed. If no padding is specified,
/// return `value` itself.
static FailureOr<Value> padIfNeeded(
OpBuilder &builder, Location loc, Value value,
std::optional<int64_t> padding = std::nullopt) {
if (!padding) return value;

OpFoldResult paddingOfr = builder.getIndexAttr(padding.value());
/// Pads `value` up to a multiple of `universalPadding` if needed. As described
/// in the comment on DataTilingUniversalPadding, statically-sized dimensions
/// smaller than `universalPadding` only get padded to the next power of two.
static FailureOr<Value> padIfNeeded(OpBuilder &builder, Location loc,
Value value, int64_t universalPadding) {
FailureOr<SmallVector<OpFoldResult>> shape =
LinalgExt::getDims(builder, loc, value);
if (failed(shape)) {
Expand All @@ -80,12 +80,21 @@ static FailureOr<Value> padIfNeeded(
SmallVector<OpFoldResult> highPad(shape->size(), zero);

// The low padding is always zero. The high padding is
// shape.ceildDiv(padding) - shape.
// shape.ceilDiv(padding) * padding - shape.
AffineExpr paddingExpr, shapeExpr;
bindSymbols(builder.getContext(), paddingExpr, shapeExpr);
AffineExpr highPadExpr =
shapeExpr.ceilDiv(paddingExpr) * paddingExpr - shapeExpr;
for (auto shape : llvm::enumerate(shape.value())) {
int64_t padding = universalPadding;
// Case of small static sizes - round to the next power of two instead of
// the universal padding amount.
if (auto constant_value = getConstantIntValue(shape.value())) {
if (constant_value.value() < padding) {
padding = llvm::PowerOf2Ceil(constant_value.value());
}
}
OpFoldResult paddingOfr = builder.getIndexAttr(padding);
highPad[shape.index()] = affine::makeComposedFoldedAffineApply(
builder, loc, highPadExpr, {paddingOfr, shape.value()});
}
Expand Down Expand Up @@ -259,19 +268,24 @@ struct FoldFillWithSetEncoding
};

/// Pass that introduces tensor encodings on compute operations.
///
/// `padding` is the padding amount forwarded to the SetMatmulEncoding pattern
/// in runOnOperation(). It can be supplied at construction time; when it is
/// left as std::nullopt, initializeOptions() fills it in from the
/// `default-padding` pass option.
struct SetEncodingPass : public SetEncodingBase<SetEncodingPass> {
  // `explicit` so that an integer or std::optional<int64_t> can never be
  // implicitly converted into a pass object.
  explicit SetEncodingPass(std::optional<int64_t> padding)
      : padding(padding) {}

  /// Registers the dialects whose ops this pass may create.
  void getDependentDialects(DialectRegistry &registry) const override {
    registry.insert<IREE::LinalgExt::IREELinalgExtDialect>();
  }

  void runOnOperation() override;

  /// Parses the textual pass options, then resolves `padding` if it was not
  /// given to the constructor.
  LogicalResult initializeOptions(StringRef options) override;

  /// Padding amount; std::nullopt until resolved by initializeOptions().
  std::optional<int64_t> padding;
};
} // namespace

void SetEncodingPass::runOnOperation() {
MLIRContext *context = &getContext();
{
RewritePatternSet patterns(context);
patterns.insert<SetMatmulEncoding>(context, defaultPadding);
patterns.insert<SetMatmulEncoding>(context, padding.value());
linalg::FillOp::getCanonicalizationPatterns(patterns, context);
patterns.insert<FoldFillWithSetEncoding>(context);
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
Expand All @@ -282,8 +296,18 @@ void SetEncodingPass::runOnOperation() {
}
}

std::unique_ptr<Pass> createSetEncodingPass() {
return std::make_unique<SetEncodingPass>();
/// Parses `options` through the base Pass implementation. On success, a
/// padding value that was not supplied at construction time is taken from the
/// `default-padding` pass option; a constructor-supplied value is kept as is.
LogicalResult SetEncodingPass::initializeOptions(StringRef options) {
  LogicalResult parseResult = Pass::initializeOptions(options);
  if (failed(parseResult)) return failure();

  // Only fall back to the option when the constructor left `padding` unset.
  if (!padding.has_value()) padding = optionDefaultPadding;
  return success();
}

/// Creates the SetEncoding pass. `padding` is forwarded to the pass
/// constructor; when it is std::nullopt the pass resolves it later from its
/// `default-padding` option in initializeOptions().
std::unique_ptr<Pass> createSetEncodingPass(std::optional<int64_t> padding) {
  std::unique_ptr<Pass> pass = std::make_unique<SetEncodingPass>(padding);
  return pass;
}

} // namespace Flow
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Utils/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ iree_compiler_cc_library(
],
hdrs = [
"ConversionUtils.h",
"DataTilingUniversalPadding.h",
"ElementPackingUtils.h",
"FlatbufferUtils.h",
"IndexSet.h",
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ iree_cc_library(
Utils
HDRS
"ConversionUtils.h"
"DataTilingUniversalPadding.h"
"ElementPackingUtils.h"
"FlatbufferUtils.h"
"IndexSet.h"
Expand Down
49 changes: 49 additions & 0 deletions compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_
#define IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_

namespace mlir {
namespace iree_compiler {

// "Universal" padding size used by data-tiling.
//
// With data-tiling enabled, the SetEncoding pass (in Flow) has to guarantee
// that the buffers being allocated are large enough to hold the eventual
// padded-and-tiled tensors. Those are only created later, by the
// MaterializeEncoding pass (in HAL), so the exact tile sizes are not yet known
// when SetEncoding runs. The short-term approach to unblock this is to have
// SetEncoding pad everything to the next multiple of a "universal" padding
// size. For this to work, the universal padding value must be greater than or
// equal to any actual tile size that can occur.
//
// Widening tensors this way is particularly wasteful for narrow tensors. For
// example, rewriting a tensor<1x1024xf32> into tensor<16x1024xf32> uses only
// row 0 and leaves the other 15 rows unused. As a short-term remedy, until a
// better solution is found, the following contract applies: for any dimension
// that is statically sized and whose size is less than
// DataTilingUniversalPadding, the largest tile size that MaterializeEncoding
// is allowed to choose is the original dimension size rounded up to the next
// power of two.
//
// Example, with DataTilingUniversalPadding=16:
//
//   For the source tensor type | MaterializeEncoding can choose tile sizes up to
//   -------------------------- | -----------------------------------------------
//   tensor<20x40xf32>          | 16x16
//   tensor<20x1xf32>           | 16x1
//   tensor<1x40xf32>           | 1x16
//   tensor<1x1xf32>            | 1x1
//   tensor<20x2xf32>           | 16x2
//   tensor<20x3xf32>           | 16x4
//   tensor<20x4xf32>           | 16x4
//   tensor<20x5xf32>           | 16x8
//
// TODO(#11632) - find a way to do without universal padding.
constexpr int DataTilingUniversalPadding = 16;

}  // namespace iree_compiler
}  // namespace mlir

#endif // IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_
3 changes: 3 additions & 0 deletions runtime/src/iree/modules/vmvx/module.c
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,9 @@ IREE_VMVX_ABI_EXPORT(iree_vmvx_mmt4d, mmt4d, v) {
.K0 = K0,
.cpu_data = (const iree_uk_uint64_t*)iree_cpu_data_fields(),
};
fprintf(stderr, "M0=%ld N0=%ld K0=%ld\n", M0, N0, K0);
fprintf(stderr, "lhs_buffer=%p rhs_buffer=%p out_buffer=%p\n", lhs_buffer,
rhs_buffer, out_buffer);
iree_uk_mmt4d(&ukernel_params);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
Expand Down
Loading