https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/184176
>From 0da7138b70532ac54dee9978075d4b3d246df248 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <[email protected]> Date: Mon, 2 Mar 2026 08:57:08 -0800 Subject: [PATCH 1/3] [AMDGPU] Add suffix _D4 to tensor load/store with 4 groups D#, NFC Rename TENSOR_LOAD_TO_LDS to TENSOR_LOAD_TO_LDS_D4 Rename TENSOR_STORE_FROM_LDS to TENSOR_STORE_FROM_LDS_D4 Also rename function names in a couple of tests to reflect this change. --- ...iltins-amdgcn-gfx1250-tensor-load-store.cl | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 8 ++++---- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 +++++------ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 ++-- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 ++-- .../AMDGPU/llvm.amdgcn.tensor.load.store.ll | 20 +++++++++---------- 8 files changed, 30 insertions(+), 30 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl index cb106805d24bd..5c97067cdd971 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl @@ -8,12 +8,12 @@ typedef int v8i __attribute__((ext_vector_type(8))); static v4i v4i_zeros = (v4i){0,0,0,0}; static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0}; -// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds( +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d4( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0) // CHECK-GFX1250-NEXT: ret void // -void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3) +void test_amdgcn_tensor_load_to_lds_d4(v4i sg0, v8i sg1, v4i sg2, v4i sg3) { __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, 
sg3, v8i_zeros, 0); } @@ -28,12 +28,12 @@ void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1) __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27); } -// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds( +// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d4( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22) // CHECK-GFX1250-NEXT: ret void // -void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3) +void test_amdgcn_tensor_store_from_lds_d4(v4i sg0, v8i sg1, v4i sg2, v4i sg3) { __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4acda590ed5b6..a7324417ea151 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3006,7 +3006,7 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) { void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) { bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds; unsigned Opc = - IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS; + IsLoad ? 
AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4; SmallVector<SDValue, 7> TensorOps; // First two groups diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3ae638f14ee40..5a6676e58f23c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3794,7 +3794,7 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const { bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds; unsigned Opc = - IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS; + IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4; int NumGroups = 4; // A lamda function to check whether an operand is a vector of all 0s. diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 0521e199c31dd..6b37a87ba44d0 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2052,7 +2052,7 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_no class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : InstSI<(outs ), (ins ), "", []>, - SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> { + SIMCInstr<opName#!if(_UpTo2D, "_D2", "_D4"), SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -2077,8 +2077,8 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : } let SubtargetPredicate = isGFX125xOnly in { -def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; -def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; +def TENSOR_LOAD_TO_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; +def TENSOR_STORE_FROM_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>; def TENSOR_STORE_FROM_LDS_D2 : 
VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>; } // End SubtargetPredicate = isGFX125xOnly. @@ -2114,7 +2114,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> { let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in { - foreach DSuffix = ["_D2", ""] in { + foreach DSuffix = ["_D2", "_D4"] in { defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix); def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 41608cfb3457b..f24d2bd4678ad 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7530,12 +7530,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return nullptr; } - // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS, - // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar. - if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS || - MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 || - MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS || - MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) { + // Legalize TENSOR_LOAD_TO_LDS_D2/_D4, TENSOR_STORE_FROM_LDS_D2/_D4. All their + // operands are scalar. 
+ if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 || + MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D4 || + MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2 || + MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D4) { for (MachineOperand &Src : MI.explicit_operands()) { if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dd4c97f0c5746..2fb408c06d535 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -827,8 +827,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned Opc = MI.getOpcode(); // Exclude instructions that read FROM LDS (not write to it) return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD && - Opc != AMDGPU::TENSOR_STORE_FROM_LDS && - Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2; + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2 && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D4; } static bool isSBarrierSCCWrite(unsigned Opcode) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index c1337f27a0f70..1c4380d8cce43 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -773,8 +773,8 @@ bool isAsyncStore(unsigned Opc) { } bool isTensorStore(unsigned Opc) { - return Opc == TENSOR_STORE_FROM_LDS_gfx1250 || - Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250; + return Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250 || + Opc == TENSOR_STORE_FROM_LDS_D4_gfx1250; } unsigned getTemporalHintType(const MCInstrDesc TID) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll index a8bba2e384377..2ce533c299dce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll @@ -6,8 +6,8 @@ declare void 
@llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol) -define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { -; GFX1250-LABEL: tensor_load_to_lds: +define amdgpu_ps void @tensor_load_to_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { +; GFX1250-LABEL: tensor_load_to_lds_d4: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] @@ -16,8 +16,8 @@ define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg % ret void } -define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { -; GFX1250-SDAG-LABEL: tensor_load_to_lds_vector: +define amdgpu_ps void @tensor_load_to_lds_d4_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { +; GFX1250-SDAG-LABEL: tensor_load_to_lds_d4_vector: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 @@ -44,7 +44,7 @@ define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, < ; GFX1250-SDAG-NEXT: tensor_load_to_lds s[8:11], s[0:7], s[12:15], s[16:19] ; GFX1250-SDAG-NEXT: s_endpgm ; -; GFX1250-GISEL-LABEL: tensor_load_to_lds_vector: +; GFX1250-GISEL-LABEL: tensor_load_to_lds_d4_vector: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 @@ -126,8 +126,8 @@ define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1 ret void } -define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inreg 
%D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { -; GFX1250-LABEL: tensor_store_from_lds: +define amdgpu_ps void @tensor_store_from_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { +; GFX1250-LABEL: tensor_store_from_lds_d4: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV @@ -136,8 +136,8 @@ define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inre ret void } -define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { -; GFX1250-SDAG-LABEL: tensor_store_from_lds_vector: +define amdgpu_ps void @tensor_store_from_lds_d4_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { +; GFX1250-SDAG-LABEL: tensor_store_from_lds_d4_vector: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 @@ -164,7 +164,7 @@ define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1 ; GFX1250-SDAG-NEXT: tensor_store_from_lds s[8:11], s[0:7], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_endpgm ; -; GFX1250-GISEL-LABEL: tensor_store_from_lds_vector: +; GFX1250-GISEL-LABEL: tensor_store_from_lds_d4_vector: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 >From 92aaddbce5e594c3b1477a67005b9bbbad9b514d Mon Sep 17 00:00:00 2001 From: Changpeng Fang <[email protected]> Date: Tue, 3 Mar 2026 12:53:11 -0800 Subject: [PATCH 2/3] [AMDGPU] Compiler invented suffixes should use lowercase --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 6 +++--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 +++--- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 12 ++++++------ 
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 10 +++++----- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 ++-- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 ++-- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index a7324417ea151..cc2058a5a1d4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3006,7 +3006,7 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) { void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) { bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds; unsigned Opc = - IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4; + IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4; SmallVector<SDValue, 7> TensorOps; // First two groups @@ -3018,8 +3018,8 @@ void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) { SDValue Group3 = N->getOperand(5); if (ISD::isBuildVectorAllZeros(Group2.getNode()) && ISD::isBuildVectorAllZeros(Group3.getNode())) { - Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 - : AMDGPU::TENSOR_STORE_FROM_LDS_D2; + Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2 + : AMDGPU::TENSOR_STORE_FROM_LDS_d2; } else { // Has at least 4 groups TensorOps.push_back(Group2); // D# group 2 TensorOps.push_back(Group3); // D# group 3 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5a6676e58f23c..61b70dc7585aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3794,7 +3794,7 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const { bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds; unsigned Opc = - IsLoad ? 
AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4; + IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4; int NumGroups = 4; // A lamda function to check whether an operand is a vector of all 0s. @@ -3808,8 +3808,8 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI, // Use _D2 version if both group 2 and 3 are zero-initialized. if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) { NumGroups = 2; - Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 - : AMDGPU::TENSOR_STORE_FROM_LDS_D2; + Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2 + : AMDGPU::TENSOR_STORE_FROM_LDS_d2; } // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 6b37a87ba44d0..03159cf9398ca 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -2052,7 +2052,7 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_no class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : InstSI<(outs ), (ins ), "", []>, - SIMCInstr<opName#!if(_UpTo2D, "_D2", "_D4"), SIEncodingFamily.NONE> { + SIMCInstr<opName#!if(_UpTo2D, "_d2", "_d4"), SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -2077,10 +2077,10 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : } let SubtargetPredicate = isGFX125xOnly in { -def TENSOR_LOAD_TO_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; -def TENSOR_STORE_FROM_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; -def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>; -def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>; +def TENSOR_LOAD_TO_LDS_d4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; +def TENSOR_STORE_FROM_LDS_d4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; +def TENSOR_LOAD_TO_LDS_d2 : 
VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>; +def TENSOR_STORE_FROM_LDS_d2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>; } // End SubtargetPredicate = isGFX125xOnly. class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> : @@ -2114,7 +2114,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> { let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in { - foreach DSuffix = ["_D2", "_D4"] in { + foreach DSuffix = ["_d2", "_d4"] in { defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix); def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f24d2bd4678ad..7d9e621110330 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7530,12 +7530,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return nullptr; } - // Legalize TENSOR_LOAD_TO_LDS_D2/_D4, TENSOR_STORE_FROM_LDS_D2/_D4. All their + // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their // operands are scalar. 
- if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 || - MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D4 || - MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2 || - MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D4) { + if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 || + MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 || + MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 || + MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) { for (MachineOperand &Src : MI.explicit_operands()) { if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2fb408c06d535..f363560784730 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -827,8 +827,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned Opc = MI.getOpcode(); // Exclude instructions that read FROM LDS (not write to it) return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD && - Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2 && - Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D4; + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_d2 && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_d4; } static bool isSBarrierSCCWrite(unsigned Opcode) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1c4380d8cce43..865770031bb06 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -773,8 +773,8 @@ bool isAsyncStore(unsigned Opc) { } bool isTensorStore(unsigned Opc) { - return Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250 || - Opc == TENSOR_STORE_FROM_LDS_D4_gfx1250; + return Opc == TENSOR_STORE_FROM_LDS_d2_gfx1250 || + Opc == TENSOR_STORE_FROM_LDS_d4_gfx1250; } unsigned getTemporalHintType(const MCInstrDesc TID) { >From d2b15f3351d6b8a11506cf9af9604afe352377fe Mon Sep 17 00:00:00 2001 From: Changpeng Fang <[email 
protected]> Date: Tue, 3 Mar 2026 13:01:59 -0800 Subject: [PATCH 3/3] [AMDGPU] Fix instruction suffixes for reg-coalescer-subreg-liveness.mir --- .../CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir index f098618018839..00c4ec981111e 100644 --- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir @@ -18,7 +18,7 @@ body: | ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: {{ $}} @@ -27,8 +27,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt - ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def 
dead $tensorcnt, implicit $exec, implicit $tensorcnt ; CHECK-NEXT: $vcc_lo = COPY $exec_lo ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1 @@ -47,7 +47,7 @@ body: | undef %3.sub0:sgpr_128 = COPY %2 %4:sreg_32 = S_MOV_B32 0 undef %5.sub0:sgpr_256 = COPY %4 - TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %6:sgpr_128 = COPY killed %3 %6.sub1:sgpr_128 = COPY killed %1 %7:sreg_32 = COPY $exec_lo @@ -62,11 +62,11 @@ body: | %11.sub1:sgpr_128 = COPY killed %10 %11.sub2:sgpr_128 = COPY %2 %11.sub3:sgpr_128 = COPY %2 - TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt %12:sreg_32 = COPY killed %9 %13:sgpr_128 = COPY %6 %13.sub2:sgpr_128 = COPY killed %12 - TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + TENSOR_LOAD_TO_LDS_d2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt $vcc_lo = COPY %7 %8:sreg_32 = COPY %4 %9:sreg_32 = COPY %2 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
