https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/125836
>From 6a184d4af1ab15e105155aa0d3463a467e16c89c Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.rit...@amd.com>
Date: Wed, 5 Feb 2025 05:50:12 -0500
Subject: [PATCH 1/2] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in
 MLIR

gfx940 and gfx941 are no longer supported. This is one of a series of PRs to
remove them from the code base.

For SWDEV-512631
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  2 +-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  8 +++----
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 22 +++++++++----------
 .../ArithToAMDGPU/ArithToAMDGPU.cpp           |  2 +-
 .../AMDGPU/Transforms/EmulateAtomics.cpp      |  8 +------
 .../AMDGPUToROCDL/8-bit-floats.mlir           |  2 +-
 mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir  |  2 +-
 .../ArithToAMDGPU/8-bit-float-saturation.mlir |  2 +-
 .../ArithToAMDGPU/8-bit-floats.mlir           |  2 +-
 .../Dialect/AMDGPU/AMDGPUUtilsTest.cpp        | 20 +++++++----------
 10 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 69745addfd748..24f541587cba8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
     order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).

    The negateA, negateB, and negateC flags are only supported for double-precision
-    operations on gfx940+.
+    operations on gfx942+.
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 7efa4ffa2aa6f..77401bd6de4bd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
 def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
 def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
 def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
-// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a
-// NEG bitfield. See IntrinsicsAMDGPU.td for more info.
+// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
+// a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
 def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
 def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
-// New in gfx940.
+// New in gfx942.
 def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
 def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
 def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
@@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
 def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
 def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;

-// 2:4 Sparsity ops (GFX940)
+// 2:4 Sparsity ops (GFX942)
 def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
 def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
 def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c62314e504dcc..36fbdbed4ae2f 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -80,7 +80,7 @@ namespace {
 // Define commonly used chipsets versions for convenience.
 constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
-constexpr Chipset kGfx940 = Chipset(9, 4, 0);
+constexpr Chipset kGfx942 = Chipset(9, 4, 2);

 /// Define lowering patterns for raw buffer ops
 template <typename GpuOp, typename Intrinsic>
@@ -483,7 +483,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   destElem = destType.getElementType();

   if (sourceElem.isF32() && destElem.isF32()) {
-    if (mfma.getReducePrecision() && chipset >= kGfx940) {
+    if (mfma.getReducePrecision() && chipset >= kGfx942) {
       if (m == 32 && n == 32 && k == 4 && b == 1)
         return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
       if (m == 16 && n == 16 && k == 8 && b == 1)
@@ -551,9 +551,9 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
       return ROCDL::mfma_i32_32x32x8i8::getOperationName();
     if (m == 16 && n == 16 && k == 16 && b == 1)
       return ROCDL::mfma_i32_16x16x16i8::getOperationName();
-    if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940)
+    if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
       return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
-    if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940)
+    if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
       return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
   }

@@ -565,7 +565,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   }

   if (isa<Float8E5M2FNUZType>(sourceElem) && destElem.isF32() &&
-      chipset >= kGfx940) {
+      chipset >= kGfx942) {
     // Known to be correct because there are no scalar f8 instructions and
     // because a length mismatch will have been caught by the verifier.
     Type sourceBElem =
@@ -585,7 +585,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   }

   if (isa<Float8E4M3FNUZType>(sourceElem) && destElem.isF32() &&
-      chipset >= kGfx940) {
+      chipset >= kGfx942) {
     Type sourceBElem =
         cast<VectorType>(mfma.getSourceB().getType()).getElementType();
     if (m == 16 && n == 16 && k == 32 && b == 1) {
@@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
       return op->emitOpError("MFMA only supported on gfx908+");
     uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
     if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
-      if (chipset < kGfx940)
-        return op.emitOpError("negation unsupported on older than gfx940");
+      if (chipset < kGfx942)
+        return op.emitOpError("negation unsupported on older than gfx942");
       getBlgpField |=
           op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
     }
@@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
     ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
              "architecture and their emulation is not implemented");
@@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
     PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
             "architecture and their emulation is not implemented");
@@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
     PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
-  if (chipset.majorVersion != 9 || chipset < kGfx940)
+  if (chipset.majorVersion != 9 || chipset < kGfx942)
     return rewriter.notifyMatchFailure(
         loc, "Fp8 conversion instructions are not available on target "
             "architecture and their emulation is not implemented");
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 60a002c41bfb2..b22d852f7c543 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -384,7 +384,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {
   }

   bool convertFP8Arithmetic =
-      maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);
+      maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);
   arith::populateArithToAMDGPUConversionPatterns(
       patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
       *maybeChipset);
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 77f972e0e5894..7459a6503cddf 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
   }
   // gfx9 has no to a very limited support for floating-point min and max.
   if (chipset.majorVersion == 9) {
-    if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) {
+    if (chipset >= Chipset(9, 0, 0xa)) {
       // gfx90a supports f64 max (and min, but we don't have a min wrapper right
       // now) but all other types need to be emulated.
       target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
@@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
-    if (chipset == Chipset(9, 4, 1)) {
-      // gfx941 requires non-CAS atomics to be implemented with CAS loops.
-      // The workaround here mirrors HIP and OpenMP.
-      target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
-                          RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
-    }
   }
   patterns.add<
       RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
index 7818a525d17b5..a313aaffdf5cc 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s

 // CHECK-LABEL: func @ext_scalar
 // CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
index f8a60d37801eb..52db1421dc3c6 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s
 func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>,
                          %arg2 : vector<16xf32>, %arg3 : vector<4xf32>,
                          %arg4 : vector<4xf16>, %arg5 : vector<4xi8>,
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
index cd921da2294e1..07a428566d488 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt --split-input-file %s \
-// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
+// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \
 // RUN:   | FileCheck %s

 // CHECK-LABEL: func.func @scalar_trunc
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
index 985fb532ea74a..6bb5b9771c015 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s

 // CHECK-LABEL: func.func @scalar_ext
 // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 976ff2e7382ed..570d56f3c6ff1 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
   EXPECT_EQ(chipset->minorVersion, 0u);
   EXPECT_EQ(chipset->steppingVersion, 0xau);

-  chipset = Chipset::parse("gfx940");
+  chipset = Chipset::parse("gfx942");
   ASSERT_TRUE(succeeded(chipset));
   EXPECT_EQ(chipset->majorVersion, 9u);
   EXPECT_EQ(chipset->minorVersion, 4u);
-  EXPECT_EQ(chipset->steppingVersion, 0u);
+  EXPECT_EQ(chipset->steppingVersion, 2u);

   chipset = Chipset::parse("gfx1103");
   ASSERT_TRUE(succeeded(chipset));
@@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
   EXPECT_TRUE(failed(Chipset::parse("navi33")));
   EXPECT_TRUE(failed(Chipset::parse("rdna2")));
   EXPECT_TRUE(failed(Chipset::parse("sm_80")));
-  EXPECT_TRUE(failed(Chipset::parse("GFX940")));
-  EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
+  EXPECT_TRUE(failed(Chipset::parse("GFX942")));
+  EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
   EXPECT_TRUE(failed(Chipset::parse("gfx9")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
   EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
   EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
 }

 TEST(ChipsetTest, Comparison) {
-  EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
-  EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
+  EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
   EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));

   EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
   EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
-  EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
   EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));

   EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
   EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
-  EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
+  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
 }

 } // namespace

>From 61eb3ccce685b1dd44483ebcd197ea7e30f7a4e8 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.rit...@amd.com>
Date: Tue, 11 Feb 2025 04:18:54 -0500
Subject: [PATCH 2/2] Use "gfx94x" in docs and comments.

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 24f541587cba8..f795dd89b79a1 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
     order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).

    The negateA, negateB, and negateC flags are only supported for double-precision
-    operations on gfx942+.
+    operations on gfx94x.
  }];
  let assemblyFormat = [{
    $sourceA `*` $sourceB `+` $destC
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 77401bd6de4bd..c268311d52741 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
 def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
 def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
 def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
-// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
+// Note: in gfx94x, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
 // a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
 def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
 def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
-// New in gfx942.
+// New in gfx94x.
 def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
 def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
 def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
@@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
 def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
 def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;

-// 2:4 Sparsity ops (GFX942)
+// 2:4 Sparsity ops (GFX94x)
 def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
 def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
 def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
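For readers skimming the diff: the simplification above works because mlir's Chipset compares as an ordered (majorVersion, minorVersion, steppingVersion) triple, as exercised in AMDGPUUtilsTest.cpp, so a single `chipset >= kGfx942` check now covers what previously needed gfx940/gfx941 special cases. Below is a minimal, self-contained C++ sketch of that comparison pattern; it is an illustrative simplification only, not the actual Chipset class from the AMDGPU dialect utilities (which also provides parsing and further operators).

  // Illustrative stand-in for MLIR's AMDGPU Chipset triple, showing the
  // lexicographic ordering that checks such as `chipset >= kGfx942` rely on.
  #include <cstdint>
  #include <iostream>
  #include <tuple>

  struct Chipset {
    uint32_t majorVersion, minorVersion, steppingVersion;
  };

  // Lexicographic comparison over (major, minor, stepping).
  bool operator<(Chipset a, Chipset b) {
    return std::tie(a.majorVersion, a.minorVersion, a.steppingVersion) <
           std::tie(b.majorVersion, b.minorVersion, b.steppingVersion);
  }
  bool operator>=(Chipset a, Chipset b) { return !(a < b); }

  // gfx942 (9, 4, 2) is now the oldest gfx9 chipset with FP8 conversion
  // support in the lowering touched by this patch.
  constexpr Chipset kGfx90a{9, 0, 0xa};
  constexpr Chipset kGfx942{9, 4, 2};

  // Mirrors the style of gating used in AMDGPUToROCDL.cpp after the patch.
  bool hasFp8ConversionInstrs(Chipset chipset) {
    return chipset.majorVersion == 9 && chipset >= kGfx942;
  }

  int main() {
    std::cout << std::boolalpha
              << hasFp8ConversionInstrs(kGfx90a) << '\n'   // false
              << hasFp8ConversionInstrs(kGfx942) << '\n';  // true
  }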