Skip to content

Commit

Permalink
[Codegen][SME] Recognize transposed output indexing map as transpose (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
MacDue authored Aug 30, 2024
1 parent 3ceffb6 commit a7ed2d7
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
3 changes: 2 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ bool isLinalgGeneric2DTranspose(linalg::GenericOp genericOp) {
// Check that the two indexing maps are a permutation of each other.
SmallVector<AffineMap> indexingMaps = genericOp.getIndexingMapsArray();
bool isTranspose =
indexingMaps[0].isPermutation() && indexingMaps[1].isIdentity();
(indexingMaps[0].isPermutation() && indexingMaps[1].isIdentity()) ||
(indexingMaps[1].isPermutation() && indexingMaps[0].isIdentity());
if (!isTranspose)
return false;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,36 @@ func.func @transpose_f32() attributes {hal.executable.target = #executable_targe
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
// Regression test for recognizing a transpose expressed through the *output*
// indexing map: the input map is the identity (d0, d1) and the output map is
// the permutation (d1, d0). The Utils.cpp change in this commit makes
// isLinalgGeneric2DTranspose accept this form in addition to the
// input-permutation form, so this generic should get the SME transpose
// lowering configuration (checked by the CHECK lines below).
func.func @transpose_output_indexing_map_f32() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
%c0 = arith.constant 0 : index
// Bind the 32x32 f32 input (read-only) and output buffers.
%0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x32xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x32xf32>> -> tensor<32x32xf32>
%3 = tensor.empty() : tensor<32x32xf32>
// Transpose via the output map: ins map is identity, outs map is (d1, d0).
// The body just yields the input element; the permutation lives entirely in
// the indexing maps.
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<32x32xf32>) outs(%3 : tensor<32x32xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<32x32xf32>
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32xf32>>
return
}

// CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 16], {{\[}}[4], [4]], [0, 0], [0, 0]]>
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @transpose_output_indexing_map_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]

// -----

#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>
]>
]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
func.func @transpose_f64() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x32xf64>>
Expand Down

0 comments on commit a7ed2d7

Please sign in to comment.