From 510acc5a3c18ca6f2254ef9f33a82e93b7a118b7 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Fri, 23 Jun 2023 17:49:34 +0000
Subject: [PATCH] tile-1x1

---
 .../Codegen/Common/DecomposePackUnPackOps.cpp |  5 ++
 .../MaterializeEncodingIntoPackUnPack.cpp     | 15 +++---
 .../test/decompose_pack_unpack_ops.mlir       | 12 +++++
 .../Dialect/Flow/Transforms/Passes.cpp        |  6 ++-
 .../compiler/Dialect/Flow/Transforms/Passes.h |  4 +-
 .../Dialect/Flow/Transforms/Passes.td         |  2 +-
 .../Dialect/Flow/Transforms/SetEncoding.cpp   | 48 +++++++++++++-----
 compiler/src/iree/compiler/Utils/BUILD.bazel  |  1 +
 .../src/iree/compiler/Utils/CMakeLists.txt    |  1 +
 .../Utils/DataTilingUniversalPadding.h        | 49 +++++++++++++++++++
 runtime/src/iree/modules/vmvx/module.c        |  3 ++
 11 files changed, 125 insertions(+), 21 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h
diff --git a/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp b/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp
index 56cad06759db..ad212bb36472 100644
--- a/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/DecomposePackUnPackOps.cpp
@@ -83,6 +83,11 @@ struct FoldTrailingUnitTranspose
     }
     if (numDropDims == 0) return failure();
 
+    if (numDropDims == inputTy.getRank()) {
+      rewriter.replaceOp(op, op.getInput());
+      return success();
+    }
+
     Location loc = op.getLoc();
     SmallVector<OpFoldResult> srcMixedSizes =
         tensor::createDimValues(rewriter, loc, op.getInput());
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
index 0958fb47517d..1449d09089ce 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
@@ -18,6 +18,7 @@
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "iree/compiler/Dialect/VMVX/IR/VMVXOps.h"
+#include "iree/compiler/Utils/DataTilingUniversalPadding.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
@@ -278,6 +279,7 @@ void adjustTileSizesToNarrowStaticShape(MaterializeEncodingInfo &encodingInfo,
     // Dynamic sizes are assumed to be large enough, not to be candidates for
     // narrow kernels.
     if (ShapedType::isDynamic(size)) continue;
+
     int64_t &tileSize = encodingInfo.innerTileSizes[i];
     // Let's not try to handle any dynamic tile sizes here. We could handle the
     // case where size==1 (as whatever is the runtime value of tileSize, it
@@ -285,12 +287,13 @@ void adjustTileSizesToNarrowStaticShape(MaterializeEncodingInfo &encodingInfo,
     // in general, adjusting dynamic tile sizes has to be done by the
     // materializeEncodingValueFn which we obtain those tileSizes from.
     if (ShapedType::isDynamic(tileSize)) continue;
-    auto generateNarrowTileSize = [&](int64_t n) {
-      if (size <= n && tileSize >= n) tileSize = n;
-    };
-    generateNarrowTileSize(1);
-    generateNarrowTileSize(2);
-    generateNarrowTileSize(4);
+    // Honor the contract in the comment on DataTilingUniversalPadding:
+    // For tile sizes along statically-sized dimensions that are smaller than
+    // DataTilingUniversalPadding, ensure that we never generate a tile size
+    // greater than the next power of two.
+    for (int po2 = 1; po2 < DataTilingUniversalPadding; po2 *= 2) {
+      if (size <= po2 && tileSize >= po2) tileSize = po2;
+    }
   }
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir b/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir
index f7cc2b528281..80dbc441e762 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/decompose_pack_unpack_ops.mlir
@@ -35,6 +35,18 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x
 
 // -----
 
+func.func @simple_pack_1x1_to_1x1x1x1(%input: tensor<1x1xf32>, %output: tensor<1x1x1x1xf32>, %pad: f32) -> tensor<1x1x1x1xf32> {
+  %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %output : tensor<1x1xf32> -> tensor<1x1x1x1xf32>
+  return %0 : tensor<1x1x1x1xf32>
+}
+// CHECK-LABEL: func.func @simple_pack_1x1_to_1x1x1x1
+// CHECK-SAME:    %[[IN:[A-Za-z0-9]+]]:
+// CHECK-SAME:    %[[OUT:[A-Za-z0-9]+]]:
+// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[IN]] into %[[OUT]][0, 0, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1]
+// CHECK:         return %[[INSERT]]
+
+// -----
+
 func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32>{
   %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32>
   return %0 : tensor<1x1x32x8xf32>
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index 16b5d2ca712d..63d57804c1ba 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -6,6 +6,8 @@
 
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
 
+#include <iree/compiler/Utils/DataTilingUniversalPadding.h>
+
 #include <memory>
 
 #include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
@@ -262,7 +264,9 @@ void buildFlowTransformPassPipeline(OpPassManager &passManager,
       .addPredicatedPass(clNormalizeInputIndexingMap,
                          createInterchangeTransposeGenericOpsPass)
       // Enable data tiling after all linalg level transformations.
-      .addPredicatedPass(clEnableDataTiling, createSetEncodingPass)
+      .addPredicatedPass(
+          clEnableDataTiling,
+          []() { return createSetEncodingPass(DataTilingUniversalPadding); })
       ////////////////////////////////////////////////////////////////////////
       // Dispatch region formation.
       .addPredicatedPass(!clDispatchTransformFileName.empty(),
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
index bc0a0009653b..4bca665e4b14 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
@@ -8,6 +8,7 @@
 #define IREE_COMPILER_DIALECT_FLOW_TRANSFORMS_PASSES_H_
 
 #include <functional>
+#include <optional>
 
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "llvm/ADT/StringMap.h"
@@ -121,7 +122,8 @@ std::unique_ptr<Pass> createConvertToFlowPass();
 std::unique_ptr<Pass> createOptimizeNumericsPass();
 
 // Sets encoding for tensors to allow tiled execution of operations.
-std::unique_ptr<Pass> createSetEncodingPass();
+std::unique_ptr<Pass> createSetEncodingPass(
+    std::optional<int64_t> padding = std::nullopt);
 
 // Strips the signed/unsigned portion off of tensors.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
index 83e90f15fe72..ac0e2577e213 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
@@ -251,7 +251,7 @@ def SetEncoding : Pass<"iree-flow-set-encoding", ""> {
   let summary = "Introduce tensor encoding for compute operations";
   let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()";
   let options = [
-    Option<"defaultPadding", "default-padding", "int64_t",
+    Option<"optionDefaultPadding", "default-padding", "int64_t",
            /*default=*/"16",
            "Default padding to use so packing can be done without padding during the packing">
   ];
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp
index e72e849a4cc3..b75f48920fe5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/SetEncoding.cpp
@@ -9,11 +9,14 @@
 // operations in tiled layouts.
 //===---------------------------------------------------------------------===//
 
+#include <iree/compiler/Utils/DataTilingUniversalPadding.h>
+
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree-dialects/Dialect/LinalgExt/Utils/Utils.h"
 #include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "llvm/Support/MathExtras.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
@@ -61,14 +64,11 @@ static FailureOr<Value> getZero(OpBuilder &builder, Location loc,
       .getResult();
 }
 
-/// Pads `value` to `padding` if needed. If no padding is specified,
-/// return `value` itself.
-static FailureOr<Value> padIfNeeded(
-    OpBuilder &builder, Location loc, Value value,
-    std::optional<int64_t> padding = std::nullopt) {
-  if (!padding) return value;
-
-  OpFoldResult paddingOfr = builder.getIndexAttr(padding.value());
+/// Pads `value` to `universalPadding` if needed. As described in the comment
+/// on DataTilingUniversalPadding, statically-sized dimensions smaller than
+/// `universalPadding` only get padded to the next power of two.
+static FailureOr<Value> padIfNeeded(OpBuilder &builder, Location loc,
+                                    Value value, int64_t universalPadding) {
   FailureOr<SmallVector<OpFoldResult>> shape =
       LinalgExt::getDims(builder, loc, value);
   if (failed(shape)) {
@@ -80,12 +80,21 @@ static FailureOr<Value> padIfNeeded(
   SmallVector<OpFoldResult> highPad(shape->size(), zero);
 
   // The low padding is always zero. The high padding is
-  // shape.ceildDiv(padding) - shape.
+  // shape.ceilDiv(padding) * padding - shape.
   AffineExpr paddingExpr, shapeExpr;
   bindSymbols(builder.getContext(), paddingExpr, shapeExpr);
   AffineExpr highPadExpr =
       shapeExpr.ceilDiv(paddingExpr) * paddingExpr - shapeExpr;
   for (auto shape : llvm::enumerate(shape.value())) {
+    int64_t padding = universalPadding;
+    // Case of small static sizes - round to the next power of two instead of
+    // the universal padding amount.
+    if (auto constant_value = getConstantIntValue(shape.value())) {
+      if (constant_value.value() < padding) {
+        padding = llvm::PowerOf2Ceil(constant_value.value());
+      }
+    }
+    OpFoldResult paddingOfr = builder.getIndexAttr(padding);
     highPad[shape.index()] = affine::makeComposedFoldedAffineApply(
         builder, loc, highPadExpr, {paddingOfr, shape.value()});
   }
@@ -259,11 +268,16 @@ struct FoldFillWithSetEncoding
 };
 
 struct SetEncodingPass : public SetEncodingBase<SetEncodingPass> {
+  explicit SetEncodingPass(std::optional<int64_t> padding) : padding(padding) {}
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<IREE::LinalgExt::IREELinalgExtDialect>();
   }
 
   void runOnOperation() override;
+  LogicalResult initializeOptions(StringRef options) override;
+
+  std::optional<int64_t> padding;  // Unset => filled by initializeOptions().
 };
 }  // namespace
 
@@ -271,7 +285,10 @@ void SetEncodingPass::runOnOperation() {
   MLIRContext *context = &getContext();
   {
     RewritePatternSet patterns(context);
-    patterns.insert<SetMatmulEncoding>(context, defaultPadding);
+    // `padding` is still unset when the pass was built with std::nullopt and
+    // initializeOptions() never ran, so fall back to the pass option here.
+    patterns.insert<SetMatmulEncoding>(context,
+                                       padding.value_or(optionDefaultPadding));
     linalg::FillOp::getCanonicalizationPatterns(patterns, context);
     patterns.insert<FoldFillWithSetEncoding>(context);
     memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
@@ -282,8 +299,20 @@ void SetEncodingPass::runOnOperation() {
   }
 }
 
-std::unique_ptr<Pass> createSetEncodingPass() {
-  return std::make_unique<SetEncodingPass>();
+// Parses the pass options, then adopts the `default-padding` option value
+// unless a padding was already supplied at construction.
+LogicalResult SetEncodingPass::initializeOptions(StringRef options) {
+  if (failed(Pass::initializeOptions(options))) {
+    return failure();
+  }
+  if (!padding) {
+    padding = optionDefaultPadding;
+  }
+  return success();
+}
+
+std::unique_ptr<Pass> createSetEncodingPass(std::optional<int64_t> padding) {
+  return std::make_unique<SetEncodingPass>(padding);
 }
 
 }  // namespace Flow
diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel
index aaa6ca5ecccc..ec1881377537 100644
--- a/compiler/src/iree/compiler/Utils/BUILD.bazel
+++ b/compiler/src/iree/compiler/Utils/BUILD.bazel
@@ -29,6 +29,7 @@ iree_compiler_cc_library(
     ],
     hdrs = [
         "ConversionUtils.h",
+        "DataTilingUniversalPadding.h",
         "ElementPackingUtils.h",
         "FlatbufferUtils.h",
         "IndexSet.h",
diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt
index d7ef3b56ddc2..5dcd30f43a92 100644
--- a/compiler/src/iree/compiler/Utils/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt
@@ -15,6 +15,7 @@ iree_cc_library(
     Utils
   HDRS
     "ConversionUtils.h"
+    "DataTilingUniversalPadding.h"
     "ElementPackingUtils.h"
     "FlatbufferUtils.h"
     "IndexSet.h"
diff --git a/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h b/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h
new file mode 100644
index 000000000000..87d8296fd3e1
--- /dev/null
+++ b/compiler/src/iree/compiler/Utils/DataTilingUniversalPadding.h
@@ -0,0 +1,49 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_
+#define IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_
+
+namespace mlir {
+namespace iree_compiler {
+
+// When using data-tiling, during Flow, the SetEncoding pass must ensure that
+// allocated buffers will be large enough for the eventual padded-and-tiled
+// buffers. Those will only be created in the MaterializeEncoding pass, in HAL.
+// Until then, the exact tile sizes aren't known. Our short-term approach to
+// unblock this is to let SetEncoding pad everything to the next multiple of
+// a "universal" padding size. In order for this to work, this universal padding
+// value must be greater than or equal to any actual tile size that can occur.
+//
+// This widening of tensors is particularly problematic for narrow tensors. For
+// example, it is inefficient to rewrite a tensor<1x1024xf32> into
+// tensor<16x1024xf32>, using only row 0, leaving the other 15 rows unused. To
+// remedy that in the short term until a better solution is found, we have the
+// following contract: for any dimension that is statically sized and whose size
+// is less than DataTilingUniversalPadding, the largest tile size that
+// MaterializeEncoding is allowed to choose is the original dimension size
+// rounded up to the next power of two.
+//
+// Example. If DataTilingUniversalPadding=16, then:
+//
+// For the source tensor type | MaterializeEncoding can choose tile sizes up to
+// -------------------------- | -----------------------------------------------
+// tensor<20x40xf32>          | 16x16
+// tensor<20x1xf32>           | 16x1
+// tensor<1x40xf32>           | 1x16
+// tensor<1x1xf32>            | 1x1
+// tensor<20x2xf32>           | 16x2
+// tensor<20x3xf32>           | 16x4
+// tensor<20x4xf32>           | 16x4
+// tensor<20x5xf32>           | 16x8
+//
+// TODO(#11632) - find a way to do without universal padding.
+const int DataTilingUniversalPadding = 16;
+
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_UTILS_DATATILINGUNIVERSALPADDING_H_
\ No newline at end of file
diff --git a/runtime/src/iree/modules/vmvx/module.c b/runtime/src/iree/modules/vmvx/module.c
index 3d00c539ce72..5f91b4df37f6 100644
--- a/runtime/src/iree/modules/vmvx/module.c
+++ b/runtime/src/iree/modules/vmvx/module.c
@@ -590,6 +590,9 @@ IREE_VMVX_ABI_EXPORT(iree_vmvx_mmt4d, mmt4d, v) {
       .K0 = K0,
       .cpu_data = (const iree_uk_uint64_t*)iree_cpu_data_fields(),
   };
+  fprintf(stderr, "M0=%ld N0=%ld K0=%ld\n", M0, N0, K0);
+  fprintf(stderr, "lhs_buffer=%p rhs_buffer=%p out_buffer=%p\n", lhs_buffer,
+          rhs_buffer, out_buffer);
   iree_uk_mmt4d(&ukernel_params);
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();