forked from iree-org/iree
-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'openxla:main' into shark
- Loading branch information
Showing 96 changed files with 3,735 additions and 3,084 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 27 additions and 0 deletions in compiler/plugins/target/ROCM/test/target_device_features.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// Checks that both MI300-series chips (gfx940 and gfx942) attach the same
// MFMA intrinsic list to the compiled executable target, using the shared
// MI300 FileCheck prefix.
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targets=rocm},iree-hal-transformation-pipeline{serialize-executables=false})' --iree-rocm-target-chip=gfx940 %s | FileCheck %s --check-prefix=MI300
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targets=rocm},iree-hal-transformation-pipeline{serialize-executables=false})' --iree-rocm-target-chip=gfx942 %s | FileCheck %s --check-prefix=MI300

// Both chips are expected to advertise the F16 16x16x16 and F16 32x32x8 MFMA
// layouts in the target's mma_intrinsics attribute.
// MI300: mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>]

// Minimal dispatch payload (a 16-element f32 sum reduction) that drives the
// HAL transformation pipeline far enough to materialize the target attributes.
stream.executable public @reduce_dispatch {
  // Workgroup count is derived from the workload via the DAG root.
  stream.executable.export @reduce_dispatch workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @reduce_dispatch(%arg0_binding: !stream.binding, %arg1_binding: !stream.binding) {
      %c0 = arith.constant 0 : index
      %arg0 = stream.binding.subspan %arg0_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<16xf32>>
      %arg1 = stream.binding.subspan %arg1_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<f32>>
      %0 = tensor.empty() : tensor<f32>
      %1 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16xf32>> -> tensor<16xf32>
      // Sum-reduce the 16 input elements into a rank-0 result tensor.
      %3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%1 : tensor<16xf32>) outs(%0 : tensor<f32>) {
      ^bb0(%arg2: f32, %arg3: f32):
        %4 = arith.addf %arg2, %arg3 : f32
        linalg.yield %4 : f32
      } -> tensor<f32>
      flow.dispatch.tensor.store %3, %arg1, offsets=[], sizes=[], strides=[] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
      return
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions and 0 deletions in compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
// Copyright 2024 The IREE Authors | ||
// | ||
// Licensed under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
||
#include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h" | ||
#include "llvm/ADT/APInt.h" | ||
#include "llvm/Support/Debug.h" | ||
#include "llvm/Support/MathExtras.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
|
||
#define DEBUG_TYPE "iree-codegen-gpu-heuristics" | ||
|
||
using llvm::APIntOps::GreatestCommonDivisor; | ||
|
||
namespace mlir::iree_compiler {

/// Picks an MMA schedule (intrinsic choice + subgroup/tile distribution) for
/// the matmul described by |problem| from the candidate |intrinsics|, steered
/// by the heuristic budgets in |seeds|.
///
/// Candidates are tried in order; the first one whose element types match the
/// problem and whose M/N/K sizes evenly divide the problem sizes is used.
/// Returns std::nullopt when no candidate qualifies.
std::optional<GPUMMASchedule>
deduceMMASchedule(const GPUMatmulShapeType &problem,
                  ArrayRef<GPUMatmulShapeType> intrinsics,
                  const GPUMMAHeuristicSeeds &seeds) {
  for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
    if (problem.aType != intrinsic.aType || problem.bType != intrinsic.bType ||
        problem.cType != intrinsic.cType) {
      continue; // Cannot use this intrinsic for mismatched types
    }

    if (problem.mSize % intrinsic.mSize != 0 ||
        problem.nSize % intrinsic.nSize != 0 ||
        problem.kSize % intrinsic.kSize != 0) {
      continue; // Cannot use this intrinsic for misaligned cases
    }

    // How many intrinsic-sized tiles cover the whole problem along M and N.
    int64_t mTotalTileCount = problem.mSize / intrinsic.mSize;
    int64_t nTotalTileCount = problem.nSize / intrinsic.nSize;

    // Budgets to hand out: subgroups (warps) per workgroup, and M*N tiles per
    // subgroup. Both are consumed as we assign counts to M and N below.
    int64_t remainingWarps = seeds.bestSubgroupCountPerWorkgroup;
    int64_t remainingTiles = seeds.bestMNTileCountPerSubgroup;
    // Assign more warps to the M dimension (used later) to balance thread
    // counts along X and Y dimensions.
    // warpSqrt rounds the log2-split up (the larger half goes to M) while
    // tileSqrt rounds down; both are powers of two by construction.
    int64_t warpSqrt = 1ull
                       << (llvm::divideCeil(llvm::Log2_64(remainingWarps), 2));
    int64_t tileSqrt = 1ull << (llvm::Log2_64(remainingTiles) / 2);

    int64_t mWarpCount = 0, nWarpCount = 0;
    int64_t mTileCount = 0, nTileCount = 0;

    // See if the square root can divide mTotalTileCount. If so it means we can
    // distribute to both dimensions evenly. Otherwise, try to distribute to N
    // and then M.
    if (mTotalTileCount > (warpSqrt * tileSqrt) &&
        mTotalTileCount % (warpSqrt * tileSqrt) == 0) {
      // Even split: M takes its square-root share of warps and tiles...
      mWarpCount = warpSqrt;
      mTileCount = tileSqrt;

      remainingWarps /= warpSqrt;
      remainingTiles /= tileSqrt;

      // ...then N takes as much of the leftover budget as divides its tile
      // count exactly (GCD keeps the assignment an even divisor).
      APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
                                         APInt(64, remainingWarps));
      nWarpCount = nGCD.getSExtValue();
      nTotalTileCount /= nWarpCount;
      remainingWarps /= nWarpCount;

      nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
                                   APInt(64, remainingTiles));
      nTileCount = nGCD.getSExtValue();
    } else {
      // Uneven case: greedily distribute warps then tiles to N first, and
      // give M whatever budget remains afterwards.
      APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
                                         APInt(64, remainingWarps));
      nWarpCount = nGCD.getSExtValue();
      nTotalTileCount /= nWarpCount;
      remainingWarps /= nWarpCount;

      nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
                                   APInt(64, remainingTiles));
      nTileCount = nGCD.getSExtValue();
      remainingTiles /= nTileCount;

      APInt mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount),
                                         APInt(64, remainingWarps));
      mWarpCount = mGCD.getSExtValue();
      mTotalTileCount /= mWarpCount;
      remainingWarps /= mWarpCount;

      mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount),
                                   APInt(64, remainingTiles));
      mTileCount = mGCD.getSExtValue();
    }

    // K tiles stay within a single subgroup (no warp distribution along K);
    // cap the count at the seed budget while keeping the split exact via GCD.
    const uint64_t kTotalTileCount = problem.kSize / intrinsic.kSize;
    APInt kGCD = GreatestCommonDivisor(
        APInt(64, kTotalTileCount), APInt(64, seeds.bestKTileCountPerSubgroup));
    int64_t kTileCount = kGCD.getSExtValue();

    LLVM_DEBUG({
      llvm::dbgs() << "chosen MMA schedule:\n";
      llvm::dbgs() << " intrinsic (M, N, K) = (" << intrinsic.mSize << ", "
                   << intrinsic.nSize << ", " << intrinsic.kSize << ")\n";
      llvm::dbgs() << " subgroup count (M, N) = (" << mWarpCount << ", "
                   << nWarpCount << ")\n";
      llvm::dbgs() << " subgroup tile count (M, N, K) = (" << mTileCount
                   << ", " << nTileCount << ", " << kTileCount << ")\n";
    });
    return GPUMMASchedule{index, intrinsic.mSize, intrinsic.nSize,
                          intrinsic.kSize, mWarpCount, nWarpCount,
                          mTileCount, nTileCount, kTileCount};
  }
  return std::nullopt;
}

} // namespace mlir::iree_compiler
Oops, something went wrong.