diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir index 55098bf8cf5b..d21ca0a23443 100644 --- a/compiler/plugins/target/ROCM/test/target_device_features.mlir +++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir @@ -15,7 +15,7 @@ // GFX940-SAME: mma = [, , , , , ], // GFX1100: target = #iree_gpu.target, ] +// GFX1100-SAME: mma = [, ] // GFX1100-SAME: subgroup_size_choices = [32, 64] // GFX941: target = #iree_gpu.target, %b : vector<16x1 indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout + iree.amdgpu.mma = #iree_gpu.mma_layout } %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> %O = iree_vector_ext.to_layout %output to #layout_c : vector<16x16xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index adb278c3712e..032028c1c073 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -582,7 +582,7 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1 affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} + iree.amdgpu.mma = #iree_gpu.mma_layout} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> %TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16> @@ -596,7 +596,7 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1 affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} + iree.amdgpu.mma = #iree_gpu.mma_layout} %A2, %B2, %C2 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> func.return %M2 : vector<16x16xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir index 42e5487db5fc..89ac36091ca1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir @@ -2689,7 +2689,7 @@ hal.executable private @set_size_to_tilesize_when_divisible { hal.return %x, %y, %z : index, index, index } builtin.module { - func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { + func.func @set_size_to_tilesize_when_divisible() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f16 diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 7ccfccb4f617..4b6cd7b8dd66 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -231,7 +231,7 @@ static OpaqueMmaLayout getOpaqueMFMALayout(MLIRContext *context, case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return OpaqueMmaLayout{32, 32, 16, i8, i8, i32}; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: { + case MMAIntrinsic::WMMA_F32_16x16x16_F16: { return OpaqueMmaLayout{16, 16, 16, f16, f16, f32}; } case MMAIntrinsic::WMMA_F16_16x16x16_F16: { @@ -353,7 +353,7 @@ static ConcreteMmaLayout getConcreteMFMALayout(MLIRContext *context, return ConcreteMmaLayout{opaqueLayout, aMLayout, aKLayout, bKLayout, bNLayout, cMLayout, cNLayout}; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { // #outer = #iree_vector_ext.per_dim_layout<[LANEX], [16]> // #inner = #iree_vector_ext.per_dim_layout<[LANEY, VECTORX], [1, 16]> @@ -463,7 +463,7 @@ MMAAttr::getABCVectorTypes() const { auto cType = VectorType::get({16}, getCType()); return std::make_tuple(aType, bType, cType); } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { auto aType = VectorType::get({16}, getAType()); auto bType = VectorType::get({16}, getBType()); @@ -492,7 +492,7 @@ int64_t MMAAttr::getBlockSize() const { case MMAIntrinsic::MFMA_I32_16x16x32_I8: case MMAIntrinsic::MFMA_I32_32x32x16_I8: case MMAIntrinsic::WMMA_F16_16x16x16_F16: - case MMAIntrinsic::WMMA_F16_16x16x16_F32: { + case MMAIntrinsic::WMMA_F32_16x16x16_F16: { return 1; } } @@ -510,7 +510,7 @@ int64_t MMAAttr::getSubgroupSize() const { case MMAIntrinsic::MFMA_I32_32x32x16_I8: { return 64; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { return 32; } @@ -542,7 +542,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { return {/*outer=*/{1, 1}, /*thread=*/{32, 2}, /*strides=*/{1, 32}, /*element=*/{1, 8}}; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 16}, /*element=*/{1, 16}}; @@ -574,7 +574,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { return {/*outer=*/{1, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1}, /*element=*/{8, 1}}; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1}, /*element=*/{16, 1}}; @@ -597,7 +597,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const { return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*strides=*/{32, 1}, /*element=*/{4, 1}}; } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { return {/*outer=*/{8, 1}, /*thread=*/{2, 16}, /*strides=*/{16, 1}, /*element=*/{1, 1}}; @@ -644,7 +644,7 @@ FailureOr MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc, rhs, acc) .getResult(); } - case MMAIntrinsic::WMMA_F16_16x16x16_F32: + case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { return builder.create(loc, resultType, lhs, rhs, acc) .getResult(); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td index a68a8a3fe1fb..d9f45e1c49eb 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td @@ -106,7 +106,7 @@ def MFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"MFMA_F32_16x16x32_F8E4M3FNU def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 4>; def MFMA_I32_32x32x16_I8 : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 5>; // TODO: Create separate WMMA ops for AMD and NVIDIA GPUs -def WMMA_F16_16x16x16_F32 : I32EnumAttrCase<"WMMA_F16_16x16x16_F32", 6>; +def WMMA_F32_16x16x16_F16 : I32EnumAttrCase<"WMMA_F32_16x16x16_F16", 6>; def WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"WMMA_F16_16x16x16_F16", 7>; def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic", @@ -117,7 +117,7 @@ def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic", MFMA_F32_16x16x32_F8E4M3FNUZ, MFMA_I32_16x16x32_I8, MFMA_I32_32x32x16_I8, - WMMA_F16_16x16x16_F32, + WMMA_F32_16x16x16_F16, WMMA_F16_16x16x16_F16 ]>; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir index 07663501c79f..d1003069e94a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir @@ -20,12 +20,12 @@ module { module { func.func @test_wmma_f16_16x16x16_f32() attributes { - mma_types = #iree_gpu.mma_layout} { + mma_types = #iree_gpu.mma_layout} { return } } // CHECK-LABEL: func @test_wmma_f16_16x16x16_f32 -// CHECK-SAME: mma_types = #iree_gpu.mma_layout +// CHECK-SAME: mma_types = #iree_gpu.mma_layout module { func.func @test_any_lowering_config() attributes { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index 6c50a8a96b6e..6f32054be429 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -165,7 +165,7 @@ const WgpDetails *getCDNA1WgpDetails() { const WgpDetails *getRDNA3WgpDetails() { static const MMAIntrinsic rdna3MMAOps[] = { - MMAIntrinsic::WMMA_F16_16x16x16_F32, + MMAIntrinsic::WMMA_F32_16x16x16_F16, MMAIntrinsic::WMMA_F16_16x16x16_F16, }; static const WgpDetails rdna3Wgp = { @@ -355,7 +355,7 @@ StringRef normalizeARMGPUTarget(StringRef target) { const WgpDetails *getAmpereWgpDetails() { static const MMAIntrinsic mmaOps[] = { - MMAIntrinsic::WMMA_F16_16x16x16_F32, + MMAIntrinsic::WMMA_F32_16x16x16_F16, MMAIntrinsic::WMMA_F16_16x16x16_F16, }; static const WgpDetails ampereWgp = { @@ -368,7 +368,7 @@ const WgpDetails *getAmpereWgpDetails() { const WgpDetails *getTuringWgpDetails() { static const MMAIntrinsic mmaOps[] = { - MMAIntrinsic::WMMA_F16_16x16x16_F32, + MMAIntrinsic::WMMA_F32_16x16x16_F16, MMAIntrinsic::WMMA_F16_16x16x16_F16, }; static const WgpDetails turingWgp = { @@ -381,7 +381,7 @@ const WgpDetails *getTuringWgpDetails() { const WgpDetails *getVoltaWgpDetails() { static const MMAIntrinsic mmaOps[] = { - MMAIntrinsic::WMMA_F16_16x16x16_F32, + MMAIntrinsic::WMMA_F32_16x16x16_F16, MMAIntrinsic::WMMA_F16_16x16x16_F16, }; // clang-format off diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir index c8700153b482..893994160471 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir @@ -77,7 +77,7 @@ func.func @lower_multi_mma_wmma_16x16x16(%lhs: vector<16xf16>, %rhs: vector<16xf %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { indexing_maps = #contraction_accesses, iterator_types = [], - kind = #iree_gpu.mma_layout + kind = #iree_gpu.mma_layout } : vector<16xf16>, vector<16xf16> into vector<8xf32> return %0 : vector<8xf32> } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir index d6bea0cbabe4..9f0e6cc9a466 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir @@ -203,7 +203,7 @@ func.func @conv_nchwc() { // WMMA: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config +// WMMA-SAME: intrinsic = #iree_gpu.mma_layout // WMMA-SAME: subgroup_m_count = 2, subgroup_n_count = 2 #pipeline_layout = #hal.pipeline.layout) { } // RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, +// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule, // RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2> // RDNA3-SAME: prefetch_shared_memory diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir index 76e9f9bcb800..cc0688ac332e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir @@ -218,7 +218,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d1, d0)> @@ -271,7 +271,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir index 54fd9b1fd433..972143537dec 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir @@ -65,7 +65,7 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_gpu.mma_layout +#layout = #iree_gpu.mma_layout #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d1, d0)> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir index d69b8e503492..f8c43416b95f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir @@ -67,7 +67,7 @@ func.func @mfma_matmul_96x64x16_mm_cannot_downcast(%lhs: vector<96x16xf16>, %rhs func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf16>) -> vector<48x32xf16> attributes { mma_schedule = #iree_gpu.mma_schedule< - intrinsic = #iree_gpu.mma_layout, + intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1>, workgroup_size = [32, 1, 1]} { %0 = vector.contract { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir index 98855831f160..2f12b1595f0e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir @@ -258,7 +258,7 @@ func.func @matmul_16x16x256_fused(%lhs: memref<16x32xf16>, #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> + {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1>}> // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout @@ -283,7 +283,7 @@ func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> + {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 1>}> // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout @@ -383,7 +383,7 @@ func.func @matmul_192x64x16_mmt_multi_m_and_n(%lhs: vector<4x64x16xf16>, %rhs: v #translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}> + {mma_schedule = #iree_gpu.mma_schedule, subgroup_m_count = 1, subgroup_n_count = 4>}> // CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir index d8d3770c1848..b1da56d15761 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir @@ -3,7 +3,7 @@ hal.executable @dispatch { hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + wgp = , ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>}>) { hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout]>]>) { ^bb0(%arg0: !hal.device): diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir index bb7668854bd6..ba502da223b8 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir @@ -237,7 +237,7 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>> }> @@ -272,7 +272,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>> }> @@ -307,7 +307,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>> }> @@ -342,7 +342,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>> }> @@ -377,7 +377,7 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target, ], + mma = [, ], subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>> }> diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py index c57af2bd9516..b4d371d5546a 100644 --- a/tests/e2e/matmul/generate_e2e_matmul_tests.py +++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py @@ -292,13 +292,13 @@ def get_rocm_test_compilation_infos( ] elif intrinsic == "WMMA": schedules = [ - MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 1), - MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 1, 2), - MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 1, 2, 1), - MMASchedule("WMMA_F16_16x16x16_F32", 1, 1, 2, 1, 1), - MMASchedule("WMMA_F16_16x16x16_F32", 2, 2, 1, 1, 1), - MMASchedule("WMMA_F16_16x16x16_F32", 2, 4, 2, 1, 2), - MMASchedule("WMMA_F16_16x16x16_F32", 4, 2, 4, 2, 2), + MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 1), + MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 1, 2), + MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 1, 2, 1), + MMASchedule("WMMA_F32_16x16x16_F16", 1, 1, 2, 1, 1), + MMASchedule("WMMA_F32_16x16x16_F16", 2, 2, 1, 1, 1), + MMASchedule("WMMA_F32_16x16x16_F16", 2, 4, 2, 1, 2), + MMASchedule("WMMA_F32_16x16x16_F16", 4, 2, 4, 2, 2), ] else: raise NotImplementedError("unhandled intrinsic case") @@ -338,7 +338,7 @@ def get_rocm_test_compilation_infos( wg_tile_m = schedule.m_count * schedule.m_tile_count * 32 wg_tile_n = schedule.n_count * schedule.n_tile_count * 32 wg_tile_k = schedule.k_tile_count * 16 - elif schedule.intrinsic == "WMMA_F16_16x16x16_F32": + elif schedule.intrinsic == "WMMA_F32_16x16x16_F16": wg_tile_m = schedule.m_count * schedule.m_tile_count * 16 wg_tile_n = schedule.n_count * schedule.n_tile_count * 16 wg_tile_k = schedule.k_tile_count * 16