Skip to content

Commit

Permalink
[GPU] Follow the official naming convention for WMMA attributes. (ire…
Browse files Browse the repository at this point in the history
…e-org#18147)

iree-org@82012e6
missed the `WMMA_F32_16x16x16_F16` case. The `WMMA_F16_16x16x16_F16` is
fine because the input type and output type are all F16.

The revision addresses the failure on main branch:
https://github.com/iree-org/iree/actions/runs/10289449633/job/28478608054

The change is generated by the below command.

```
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.h
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.td
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.cpp
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.mlir
sed -i "s/WMMA_F16_16x16x16_F32/WMMA_F32_16x16x16_F16/g" **/*.py
```

ci-extra:
build_packages,test_amd_mi250,test_amd_mi300,test_amd_w7900,test_nvidia_t4

---------

Signed-off-by: hanhanW <[email protected]>
  • Loading branch information
hanhanW authored Aug 7, 2024
1 parent 235e110 commit e9e24f8
Show file tree
Hide file tree
Showing 18 changed files with 46 additions and 46 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// GFX940-SAME: mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],

// GFX1100: target = #iree_gpu.target<arch = "gfx1100",
// GFX1100-SAME: mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>]
// GFX1100-SAME: mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>]
// GFX1100-SAME: subgroup_size_choices = [32, 64]

// GFX941: target = #iree_gpu.target<arch = "gfx941",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ func.func @contract_to_wmma_16x16x16_mm(%a : vector<16x16xf16>, %b : vector<16x1
indexing_maps = [#map1, #map2, #map3],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>

%O = iree_vector_ext.to_layout %output to #layout_c : vector<16x16xf32>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
%A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>

%TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16>
Expand All @@ -596,7 +596,7 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
kind = #vector.kind<add>,
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>}
iree.amdgpu.mma = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>}
%A2, %B2, %C2 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>

func.return %M2 : vector<16x16xf32>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2689,7 +2689,7 @@ hal.executable private @set_size_to_tilesize_when_divisible {
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 32, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ static OpaqueMmaLayout getOpaqueMFMALayout(MLIRContext *context,
case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
return OpaqueMmaLayout{32, 32, 16, i8, i8, i32};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
return OpaqueMmaLayout{16, 16, 16, f16, f16, f32};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
Expand Down Expand Up @@ -353,7 +353,7 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context,
return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout,
bNLayout, cMLayout, cNLayout};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
// #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]>
// #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [1, 16]>
Expand Down Expand Up @@ -463,7 +463,7 @@ MMAAttr::getABCVectorTypes() const {
auto cType = VectorType::get({16}, getCType());
return std::make_tuple(aType, bType, cType);
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
auto aType = VectorType::get({16}, getAType());
auto bType = VectorType::get({16}, getBType());
Expand Down Expand Up @@ -492,7 +492,7 @@ int64_t MMAAttr::getBlockSize() const {
case MMAIntrinsic::MFMA_I32_16x16x32_I8:
case MMAIntrinsic::MFMA_I32_32x32x16_I8:
case MMAIntrinsic::WMMA_F16_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F32: {
case MMAIntrinsic::WMMA_F32_16x16x16_F16: {
return 1;
}
}
Expand All @@ -510,7 +510,7 @@ int64_t MMAAttr::getSubgroupSize() const {
case MMAIntrinsic::MFMA_I32_32x32x16_I8: {
return 64;
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return 32;
}
Expand Down Expand Up @@ -542,7 +542,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const {
return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32},
/*element=*/{1, 8}};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 16},
/*element=*/{1, 16}};
Expand Down Expand Up @@ -574,7 +574,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const {
return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
/*element=*/{8, 1}};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1},
/*element=*/{16, 1}};
Expand All @@ -597,7 +597,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const {
return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1},
/*element=*/{4, 1}};
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*strides=*/{16, 1},
/*element=*/{1, 1}};
Expand Down Expand Up @@ -644,7 +644,7 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
rhs, acc)
.getResult();
}
case MMAIntrinsic::WMMA_F16_16x16x16_F32:
case MMAIntrinsic::WMMA_F32_16x16x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x16_F16: {
return builder.create<amdgpu::WMMAOp>(loc, resultType, lhs, rhs, acc)
.getResult();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def MFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"MFMA_F32_16x16x32_F8E4M3FNU
def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 4>;
def MFMA_I32_32x32x16_I8 : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 5>;
// TODO: Create separate WMMA ops for AMD and NVIDIA GPUs
def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 6>;
def WMMA_F32_16x16x16_F16 : I32EnumAttrCase<"WMMA_F32_16x16x16_F16", 6>;
def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>;

def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic",
Expand All @@ -117,7 +117,7 @@ def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic",
MFMA_F32_16x16x32_F8E4M3FNUZ,
MFMA_I32_16x16x32_I8,
MFMA_I32_32x32x16_I8,
WMMA_F16_16x16x16_F32,
WMMA_F32_16x16x16_F16,
WMMA_F16_16x16x16_F16
]>;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ module {

module {
func.func @test_wmma_f16_16x16x16_f32() attributes {
mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>} {
mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>} {
return
}
}
// CHECK-LABEL: func @test_wmma_f16_16x16x16_f32
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>

module {
func.func @test_any_lowering_config() attributes {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ const WgpDetails *getCDNA1WgpDetails() {

const WgpDetails *getRDNA3WgpDetails() {
static const MMAIntrinsic rdna3MMAOps[] = {
MMAIntrinsic::WMMA_F16_16x16x16_F32,
MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails rdna3Wgp = {
Expand Down Expand Up @@ -355,7 +355,7 @@ StringRef normalizeARMGPUTarget(StringRef target) {

const WgpDetails *getAmpereWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
MMAIntrinsic::WMMA_F16_16x16x16_F32,
MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails ampereWgp = {
Expand All @@ -368,7 +368,7 @@ const WgpDetails *getAmpereWgpDetails() {

const WgpDetails *getTuringWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
MMAIntrinsic::WMMA_F16_16x16x16_F32,
MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
static const WgpDetails turingWgp = {
Expand All @@ -381,7 +381,7 @@ const WgpDetails *getTuringWgpDetails() {

const WgpDetails *getVoltaWgpDetails() {
static const MMAIntrinsic mmaOps[] = {
MMAIntrinsic::WMMA_F16_16x16x16_F32,
MMAIntrinsic::WMMA_F32_16x16x16_F16,
MMAIntrinsic::WMMA_F16_16x16x16_F16,
};
// clang-format off
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func.func @lower_multi_mma_wmma_16x16x16(%lhs: vector<16xf16>, %rhs: vector<16xf
%0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
indexing_maps = #contraction_accesses,
iterator_types = [],
kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
} : vector<16xf16>, vector<16xf16> into vector<8xf32>
return %0 : vector<8xf32>
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ func.func @conv_nchwc() {
// WMMA: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 64]{{\]}}
// WMMA: #iree_codegen.translation_info<LLVMGPUVectorDistribute
// WMMA-SAME: mma_schedule = #iree_gpu.mma_schedule
// WMMA-SAME: intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
// WMMA-SAME: intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
// WMMA-SAME: subgroup_m_count = 2, subgroup_n_count = 2

#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
}

// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// RDNA3-SAME: prefetch_shared_memory

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ builtin.module attributes { transform.with_named_sequence } {

// -----

#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
Expand Down Expand Up @@ -271,7 +271,7 @@ builtin.module attributes { transform.with_named_sequence } {

// -----

#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>

#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ builtin.module attributes { transform.with_named_sequence } {

// -----

#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
#layout = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func.func @mfma_matmul_96x64x16_mm_cannot_downcast(%lhs: vector<96x16xf16>, %rhs

func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf16>) -> vector<48x32xf16> attributes {
mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
subgroup_m_count = 1, subgroup_n_count = 1>,
workgroup_size = [32, 1, 1]} {
%0 = vector.contract {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ func.func @matmul_16x16x256_fused(%lhs: memref<16x32xf16>,
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 1, 1]
subgroup_size = 32,
{mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
{mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>

// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [1, 16], elements_per_thread = [16, 1], subgroup_strides = [0, 0], thread_strides = [0, 1]>
Expand All @@ -283,7 +283,7 @@ func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 1, 1]
subgroup_size = 32,
{mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 1>}>
{mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 1>}>

// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [16, 1], elements_per_thread = [1, 16], subgroup_strides = [0, 0], thread_strides = [1, 0]>
Expand Down Expand Up @@ -383,7 +383,7 @@ func.func @matmul_192x64x16_mmt_multi_m_and_n(%lhs: vector<4x64x16xf16>, %rhs: v
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
workgroup_size = [32, 4, 1]
subgroup_size = 32,
{mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4>}>
{mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4>}>

// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [4, 1], outers_per_batch = [1, 1], threads_per_outer = [32, 4], elements_per_thread = [1, 32], subgroup_strides = [0, 0], thread_strides = [4, 1]>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
hal.executable @dispatch {
hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "rdna3", features = "spirv:v1.6,cap:Shader",
wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>}>) {
hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
Expand Down Expand Up @@ -272,7 +272,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
Expand Down Expand Up @@ -307,7 +307,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
Expand Down Expand Up @@ -342,7 +342,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
Expand Down Expand Up @@ -377,7 +377,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
iree.gpu.target = #iree_gpu.target<arch = "", features = "spirv:v1.6,cap:Shader", wgp = <
compute = fp32|fp16|int32, storage = b32|b16, subgroup = none, dot = none,
mma = [<WMMA_F16_16x16x16_F32>, <WMMA_F16_16x16x16_F16>],
mma = [<WMMA_F32_16x16x16_F16>, <WMMA_F16_16x16x16_F16>],
subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
}>
Expand Down
Loading

0 comments on commit e9e24f8

Please sign in to comment.