diff --git a/iree/compiler/Conversion/LinalgToNVVM/ConvertToNVVM.cpp b/iree/compiler/Conversion/LinalgToNVVM/ConvertToNVVM.cpp
index fef7158d02eb..786fa3156a31 100644
--- a/iree/compiler/Conversion/LinalgToNVVM/ConvertToNVVM.cpp
+++ b/iree/compiler/Conversion/LinalgToNVVM/ConvertToNVVM.cpp
@@ -126,6 +126,41 @@ class ConvertIREEBindingOp : public ConvertToLLVMPattern {
   }
 };
 
+/// A pattern to convert hal.interface.workgroup.id/count/size into
+/// corresponding NVVM ops.
+template <typename InterfaceOpTy, typename XOp, typename YOp, typename ZOp>
+struct HALInterfaceWorkgroupOpsConverter final
+    : public OpConversionPattern<InterfaceOpTy> {
+  using OpConversionPattern<InterfaceOpTy>::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(
+      InterfaceOpTy op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Type i32Type = rewriter.getI32Type();
+    Value newOp;
+    int32_t index = static_cast<int32_t>(op.dimension().getSExtValue());
+    switch (index) {
+      case 0:
+        newOp = rewriter.create<XOp>(loc, i32Type);
+        break;
+      case 1:
+        newOp = rewriter.create<YOp>(loc, i32Type);
+        break;
+      case 2:
+        newOp = rewriter.create<ZOp>(loc, i32Type);
+        break;
+      default:
+        return failure();
+    }
+
+    newOp =
+        rewriter.create<LLVM::ZExtOp>(loc, rewriter.getIntegerType(64), newOp);
+    rewriter.replaceOp(op, {newOp});
+    return success();
+  }
+};
+
 /// A pass that replaces all occurrences of GPU device operations with their
 /// corresponding NVVM equivalent.
 ///
@@ -157,6 +192,16 @@ struct ConvertToNVVMPass
     OwningRewritePatternList llvmPatterns;
     llvmPatterns.insert<ConvertFunc, ConvertIREEBindingOp>(m.getContext(),
                                                            converter);
+    llvmPatterns
+        .insert<HALInterfaceWorkgroupOpsConverter<
+                    IREE::HAL::InterfaceWorkgroupIDOp, NVVM::BlockIdXOp,
+                    NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
+                HALInterfaceWorkgroupOpsConverter<
+                    IREE::HAL::InterfaceWorkgroupCountOp, NVVM::GridDimXOp,
+                    NVVM::GridDimYOp, NVVM::GridDimZOp>,
+                HALInterfaceWorkgroupOpsConverter<
+                    IREE::HAL::InterfaceWorkgroupSizeOp, NVVM::BlockDimXOp,
+                    NVVM::BlockDimYOp, NVVM::BlockDimZOp>>(m.getContext());
     populateStdToLLVMConversionPatterns(converter, llvmPatterns);
     populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
     LLVMConversionTarget target(getContext());
diff --git a/iree/compiler/Conversion/LinalgToNVVM/Passes.cpp b/iree/compiler/Conversion/LinalgToNVVM/Passes.cpp
index c99eeb3f9623..b78c06a0d29f 100644
--- a/iree/compiler/Conversion/LinalgToNVVM/Passes.cpp
+++ b/iree/compiler/Conversion/LinalgToNVVM/Passes.cpp
@@ -32,6 +32,7 @@ static void addLinalgToNVVMPasses(OpPassManager &pm) {
   //===--------------------------------------------------------------------===//
   // Initial clean up.
   //===--------------------------------------------------------------------===//
+  pm.addPass(createLowerAffinePass());
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
 
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp b/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp
index f0807c48d909..f885ab158eeb 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp
@@ -312,6 +312,51 @@ static LogicalResult getConfigForCooperativeMatmul(
   return success();
 }
 
+/// Launch config for element-wise linalg.generic.
+template <>
+LogicalResult getOpLaunchConfig(linalg::GenericOp op,
+                                const spirv::TargetEnv &targetEnv,
+                                const SPIRVCodegenOptions &options,
+                                TileSizesListType &tileSizes,
+                                LaunchConfigInfo &config) {
+  int64_t subgroupSize =
+      targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
+  config.workgroupSize[0] = subgroupSize;
+  config.workgroupSize[1] = 1;
+  config.workgroupSize[2] = 1;
+  ShapedType outputShape = op.getOutputShapedType(0);
+
+  SmallVector<int64_t, 4> sizes;
+  // When vectorization is not enabled we skip the second level of tiling and
+  // fall back to convertToGPU, which will map one element to one thread. To
+  // avoid a mismatch in the number of workgroups dispatched, we pick a tile
+  // size of one element per thread.
+  // TODO: Remove this once we switch to the linalg-on-tensors path.
+  if (options.enableVectorization) {
+    sizes.append({4 * subgroupSize, 2 * subgroupSize});
+  }
+  sizes.push_back(subgroupSize);
+  // Use the first tile size that divides the shape. If the shape is not
+  // aligned on any of the tile sizes, pick the smallest tile of one element
+  // per thread.
+  int64_t lowerTs = config.workgroupSize[0];
+  for (int64_t size : sizes) {
+    if (outputShape.getShape().back() % size != 0) continue;
+    lowerTs = size;
+    break;
+  }
+  SmallVector<int64_t, 4> ts;
+  size_t numLoops = getNumOuterParallelLoops(op);
+  ts.resize(numLoops, 1);
+  ts.back() = lowerTs;
+  tileSizes.emplace_back(ts);
+  tileSizes.emplace_back();
+  ts.back() = lowerTs / subgroupSize;
+  tileSizes.emplace_back(ts);
+  config.vectorize = options.enableVectorization;
+  return success();
+}
+
 /// Launch configuration for different known GPU configuration.
 static LogicalResult getTargetSpecificConfig(
     linalg::MatmulOp op, const spirv::TargetEnv &targetEnv,
@@ -708,6 +753,27 @@ Optional<LaunchConfig> initGPULaunchConfig(
 #undef DISPATCH
   }
 
+  if (!rootOperation) {
+    for (linalg::LinalgOp linalgOp : linalgOps) {
+      if (auto op = dyn_cast<linalg::GenericOp>(linalgOp.getOperation())) {
+        if (getNumOuterParallelLoops(linalgOp) == 0 ||
+            llvm::any_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
+              return !map.isProjectedPermutation();
+            })) {
+          continue;
+        }
+        TileSizesListType tileSizesInfo;
+        if (failed(getOpLaunchConfig(op, targetEnv, options, tileSizesInfo,
+                                     config))) {
+          continue;
+        }
+        launchConfig.setTileSizes(op, tileSizesInfo);
+        launchConfig.setRootOperation(op);
+        break;
+      }
+    }
+  }
+
   launchConfig.setWorkgroupSize(config.workgroupSize);
   launchConfig.setNumSubgroups(config.numSubgroups);
   launchConfig.setVectorize(config.vectorize);
diff --git a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
index fc3d8536d8af..13df37fa0388 100644
--- a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
@@ -612,13 +612,13 @@ struct TileAndDistributeOnTensorsPattern
     SmallVector<Value, 4> count = llvm::to_vector<4>(
         llvm::map_range(linalgOp.createLoopRanges(rewriter, loc),
                         [](Range r) { return r.size; }));
-    // NOTE: Special treatment for convolution, which have more than 3 parallel
-    // dimensions. We want to ignore the batch dimension and tile along the
-    // next three.
-    // TODO(#5048): figure out a better way to avoid this special case.
-    if (isa<linalg::ConvInputNHWCFilterHWCFOp>(op)) {
-      count.erase(count.begin());
+    size_t numParallelLoops = getNumOuterParallelLoops(op);
+    // Flow currently allows only 3 levels of tiling. If there are more
+    // parallel dimensions, drop the higher dimensions.
+    if (numParallelLoops > kNumMaxParallelDims) {
+      count.erase(
+          count.begin(),
+          std::next(count.begin(), numParallelLoops - kNumMaxParallelDims));
     }
     count.resize(getNumTilableLoops(op));
     auto workload = convertToWorkload(rewriter, loc, count);
@@ -849,6 +849,23 @@ static void decideFusableLinalgOps(FuncOp funcOp) {
                          builder.getI64ArrayAttr(fusionGroups));
       }
     }
+
+    // As a second step, mark all element-wise linalg ops that were not fused
+    // as roots so that they get tiled and distributed.
+    for (linalg::LinalgOp linalgOp : linalgOps) {
+      Operation *op = linalgOp.getOperation();
+      if (!isa<linalg::GenericOp, linalg::IndexedGenericOp>(op) ||
+          getNumOuterParallelLoops(linalgOp) == 0 ||
+          llvm::any_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
+            return !map.isProjectedPermutation();
+          })) {
+        continue;
+      }
+
+      if (op->hasAttr(kRootOpAttr) || op->hasAttr(kFusionGroupsAttr)) continue;
+      unsigned currGroupNum = numRootOps++;
+      op->setAttr(kRootOpAttr, builder.getI64IntegerAttr(currGroupNum));
+    }
   }
 }
 
@@ -906,11 +923,8 @@ void DispatchLinalgOnTensorsPass::runOnOperation() {
     // parallel dimensions. We want to ignore the batch dimension and tile
     // along the next three. That means setting the first position to zero.
     // TODO(#5048): figure out a better way to avoid this special case.
-    bool isConvOp = isa<linalg::ConvInputNHWCFilterHWCFOp>(op);
-
     for (size_t dim = 0; dim < numTiledLoops; ++dim) {
-      useTileSizes[(isConvOp ? numParallelDims : numTiledLoops) - dim - 1] =
+      useTileSizes[numParallelDims - dim - 1] =
           buildFlowWorkgroupInfoOp(builder, dim);
     }
     return useTileSizes;
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_dynamic.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_dynamic.mlir
index 9a41ff06a0be..ee1f3fea0f18 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_dynamic.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_dynamic.mlir
@@ -66,6 +66,7 @@ func @generic_op(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> tensor<?x?xf32> {
   } -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
+// CHECK: #[[MULMAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
 // CHECK: func @generic_op
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
@@ -80,13 +81,27 @@ func @generic_op(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>) -> tensor<?x?xf32> {
 // CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:?x?xf32>
-// CHECK-DAG: %[[LOAD2:.+]] = flow.dispatch.tensor.load %[[ARG2]] : !flow.dispatch.tensor<readonly:?x?xf32>
-// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor [%[[ARG4]], %[[ARG5]]]
-// CHECK-DAG: %[[LOAD3:.+]] = flow.dispatch.tensor.load %[[ARG3]] : !flow.dispatch.tensor<readonly:?x?xf32>
-// CHECK: %[[RESULT:.+]] = linalg.generic
-// CHECK-SAME: ins(%[[LOAD2]], %[[LOAD3]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[INIT]] : tensor<?x?xf32>)
-// CHECK: flow.dispatch.tensor.store %[[RESULT]], %[[ARG6]]
+// CHECK-DAG: %[[WGSIZE_X:.+]] = flow.dispatch.workgroup.size[0]
+// CHECK-DAG: %[[WGSIZE_Y:.+]] = flow.dispatch.workgroup.size[1]
+// CHECK-DAG: %[[WGID_X:.+]] = flow.dispatch.workgroup.id[0]
+// CHECK-DAG: %[[WGID_Y:.+]] = flow.dispatch.workgroup.id[1]
+// CHECK-DAG: %[[WGCOUNT_X:.+]] = flow.dispatch.workgroup.count[0]
+// CHECK-DAG: %[[WGCOUNT_Y:.+]] = flow.dispatch.workgroup.count[1]
+// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[MULMAP]]()[%[[WGID_Y]], %[[WGSIZE_Y]]]
+// CHECK: %[[STEP_Y:.+]] = affine.apply #[[MULMAP]]()[%[[WGCOUNT_Y]], %[[WGSIZE_Y]]]
+// CHECK: scf.for %[[ARG7:.+]] = %[[OFFSET_Y]]
+// CHECK-SAME: to %{{.+}} step %[[STEP_Y]]
+// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[MULMAP]]()[%[[WGID_X]], %[[WGSIZE_X]]]
+// CHECK: %[[STEP_X:.+]] = affine.apply #[[MULMAP]]()[%[[WGCOUNT_X]], %[[WGSIZE_X]]]
+// CHECK: scf.for %[[ARG8:.+]] = %[[OFFSET_X]]
+// CHECK-SAME: to %{{.+}} step %[[STEP_X]]
+// CHECK-DAG: %[[LOAD2:.+]] = flow.dispatch.tensor.load %[[ARG2]]
+// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
+// CHECK-DAG: %[[LOAD3:.+]] = flow.dispatch.tensor.load %[[ARG3]]
+// CHECK: %[[RESULT:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[LOAD2]], %[[LOAD3]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[INIT]] : tensor<?x?xf32>)
+// CHECK: flow.dispatch.tensor.store %[[RESULT]], %[[ARG6]]
 
 // -----
 
@@ -295,6 +310,8 @@ func @generic_op_4D
   } -> tensor<?x?x?x?xf32>
   return %1 : tensor<?x?x?x?xf32>
 }
+// For ops of rank greater than 3 we serialize the higher dimensions. When flow
+// supports larger ranks this can be changed.
 // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
 // CHECK: func @generic_op_4D
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?x?xf32>
 // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?x?xf32>
@@ -306,8 +323,7 @@ func @generic_op_4D
 // CHECK-DAG: %[[D1:.+]] = memref.dim %[[ARG0]], %[[C1]]
 // CHECK-DAG: %[[D2:.+]] = memref.dim %[[ARG0]], %[[C2]]
 // CHECK-DAG: %[[D3:.+]] = memref.dim %[[ARG0]], %[[C3]]
-// CHECK: %[[WORKLOAD_Z:.+]] = affine.apply #[[MAP0]]()[%[[D0]], %[[D1]]]
-// CHECK: flow.dispatch.workgroups[%[[D3]], %[[D2]], %[[WORKLOAD_Z]]]
+// CHECK: flow.dispatch.workgroups[%[[D3]], %[[D2]], %[[D1]]]
 
 // -----
 
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/BUILD b/iree/compiler/Dialect/HAL/Target/CUDA/BUILD
index e1b63ad1e20a..862989b1ca9a 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/BUILD
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/BUILD
@@ -49,6 +49,7 @@ cc_library(
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
         "@llvm-project//mlir:NVVMDialect",
+        "@llvm-project//mlir:NVVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:ToLLVMIRTranslation",
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt b/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
index 95c008303bcc..00e222577978 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
@@ -29,6 +29,7 @@ iree_cc_library(
     MLIRLLVMIR
     MLIRLLVMToLLVMIRTranslation
     MLIRNVVMIR
+    MLIRNVVMToLLVMIRTranslation
     MLIRPass
     MLIRSupport
     MLIRTargetLLVMIRExport
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 0edb024b40de..86b0e7506ad7 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -28,6 +28,7 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
 namespace mlir {
@@ -64,6 +65,7 @@ class CUDATargetBackend final : public TargetBackend {
 
   void getDependentDialects(DialectRegistry &registry) const override {
     mlir::registerLLVMDialectTranslation(registry);
+    mlir::registerNVVMDialectTranslation(registry);
   }
 
   void buildTranslationPassPipeline(OpPassManager &passManager) override {
diff --git a/iree/test/e2e/xla_ops/add.mlir b/iree/test/e2e/xla_ops/add.mlir
index 6a57bb622c16..04d28b89e18f 100644
--- a/iree/test/e2e/xla_ops/add.mlir
+++ b/iree/test/e2e/xla_ops/add.mlir
@@ -5,3 +5,24 @@ func @tensor() attributes { iree.module.export } {
   check.expect_almost_eq_const(%result, dense<[6.0, 8.0, 10.0, 12.0]> : tensor<4xf32>) : tensor<4xf32>
   return
 }
+
+func @tensor_4d() attributes { iree.module.export } {
+  %0 = iree.unfoldable_constant dense<[[[[1.0, 2.0], [3.0, 4.0]],
+                                        [[5.0, 6.0], [7.0, 8.0]]],
+                                       [[[9.0, 10.0], [11.0, 12.0]],
+                                        [[13.0, 14.0], [15.0, 16.0]]]]> :
+                                 tensor<2x2x2x2xf32>
+  %1 = iree.unfoldable_constant dense<[[[[1.0, 2.0], [3.0, 4.0]],
+                                        [[5.0, 6.0], [7.0, 8.0]]],
+                                       [[[9.0, 10.0], [11.0, 12.0]],
+                                        [[13.0, 14.0], [15.0, 16.0]]]]> :
+                                 tensor<2x2x2x2xf32>
+  %result = "mhlo.add"(%0, %1) : (tensor<2x2x2x2xf32>, tensor<2x2x2x2xf32>)
+                               -> tensor<2x2x2x2xf32>
+  check.expect_almost_eq_const(%result, dense<[[[[2.0, 4.0], [6.0, 8.0]],
+                                                [[10.0, 12.0], [14.0, 16.0]]],
+                                               [[[18.0, 20.0], [22.0, 24.0]],
+                                                [[26.0, 28.0], [30.0, 32.0]]]]> :
+                                          tensor<2x2x2x2xf32>) : tensor<2x2x2x2xf32>
+  return
+}
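
Note on the tile sizes produced by the new getOpLaunchConfig specialization for element-wise linalg.generic: the candidates are {4 * subgroupSize, 2 * subgroupSize, subgroupSize}, the first one that divides the innermost parallel dimension wins, and the third tiling level is that workgroup tile divided by the subgroup size. The standalone C++ sketch below is not part of the patch; it only mirrors that selection under an assumed subgroup size of 32 with vectorization enabled, and the helper name pickTileSizes is made up for illustration.

#include <cstdint>
#include <iostream>
#include <vector>

// Made-up helper mirroring the selection logic in getOpLaunchConfig: returns
// the three tiling levels (workgroup, subgroup, vector) with only the
// innermost parallel dimension tiled.
static std::vector<std::vector<int64_t>> pickTileSizes(int64_t innerDim,
                                                       int64_t subgroupSize,
                                                       unsigned numLoops) {
  std::vector<int64_t> candidates = {4 * subgroupSize, 2 * subgroupSize,
                                     subgroupSize};
  // Default to one element per thread if no candidate divides the shape.
  int64_t lowerTs = subgroupSize;
  for (int64_t size : candidates) {
    if (innerDim % size != 0) continue;
    lowerTs = size;
    break;
  }
  std::vector<int64_t> ts(numLoops, 1);
  ts.back() = lowerTs;
  std::vector<std::vector<int64_t>> tileSizes;
  tileSizes.push_back(ts);  // First level: workgroup tile.
  tileSizes.push_back({});  // Second level: skipped.
  ts.back() = lowerTs / subgroupSize;
  tileSizes.push_back(ts);  // Third level: per-thread (vector) tile.
  return tileSizes;
}

int main() {
  // A ?x4096 output with subgroup size 32: 4096 % 128 == 0, so the workgroup
  // tile is [1, 128] and the per-thread tile is [1, 4].
  for (const auto &level : pickTileSizes(/*innerDim=*/4096,
                                         /*subgroupSize=*/32,
                                         /*numLoops=*/2)) {
    std::cout << "[ ";
    for (int64_t v : level) std::cout << v << " ";
    std::cout << "]\n";
  }
  return 0;
}

For that example the sketch prints [1 128], [] and [1 4]: a 128-wide workgroup tile, no second-level tile, and a 4-element per-thread tile, which is the 4 * subgroupSize case with subgroupSize = 32.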
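Note on the workload truncation in TileAndDistributeOnTensorsPattern: when an op has more than kNumMaxParallelDims (3, per the comment in the patch) outer parallel loops, the leading loop ranges are erased so only the innermost three feed the flow workload. A minimal sketch, not part of the patch, using made-up loop range sizes for a 4-D element-wise op:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

static constexpr std::size_t kNumMaxParallelDims = 3;

int main() {
  // Hypothetical loop range sizes of a 4-D element-wise op, outermost first.
  std::vector<int64_t> count = {2, 8, 16, 32};
  std::size_t numParallelLoops = count.size();
  if (numParallelLoops > kNumMaxParallelDims) {
    // Drop the leading (outermost) dimensions, mirroring the erase call in
    // the patch.
    count.erase(count.begin(),
                std::next(count.begin(),
                          numParallelLoops - kNumMaxParallelDims));
  }
  // Prints "8 16 32": only the innermost three dimensions remain.
  for (int64_t c : count) std::cout << c << " ";
  std::cout << "\n";
  return 0;
}

The surviving sizes (d1, d2, d3) become the 3-D workload, emitted innermost-first, which is what the updated dispatch_linalg_on_tensors_dynamic.mlir test checks as flow.dispatch.workgroups[%[[D3]], %[[D2]], %[[D1]]]; the dropped outermost dimension is serialized, as noted in the updated test comment.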