https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77470
>From ae231d88c5b5e2e0996edefd45389992f8e97d05 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.f...@amd.com>
Date: Tue, 9 Jan 2024 13:16:24 +0000
Subject: [PATCH 1/3] [AMDGPU] Precommit tests for broken combine

Add tests for sign-extending the result of an unsigned subword buffer
load from the wrong width.

---
 .../llvm.amdgcn.struct.buffer.load.ll | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 81c0f7557e6417..fcd7821a86897e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -500,6 +500,47 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ret float %cast
 }
 
+define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX8-NEXT: {{ $}}
+  ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX8-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
+  ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+  ;
+  ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+  ; GFX12: bb.1 (%ir-block.0):
+  ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX12-NEXT: {{ $}}
+  ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX12-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+  ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+  %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+  %trunc = trunc i8 %val to i4
+  %ext = sext i4 %trunc to i32
+  %cast = bitcast i32 %ext to float
+  ret float %cast
+}
+
 define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   ; GFX8-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
   ; GFX8: bb.1 (%ir-block.0):
@@ -580,6 +621,47 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ret float %cast
 }
 
+define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  ; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+  ; GFX8: bb.1 (%ir-block.0):
+  ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX8-NEXT: {{ $}}
+  ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX8-NEXT: [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+  ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
+  ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+  ;
+  ; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+  ; GFX12: bb.1 (%ir-block.0):
+  ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+  ; GFX12-NEXT: {{ $}}
+  ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+  ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+  ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+  ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+  ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; GFX12-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+  ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+  ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+  %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+  %trunc = trunc i16 %val to i8
+  %ext = sext i8 %trunc to i32
+  %cast = bitcast i32 %ext to float
+  ret float %cast
+}
+
 ; Natural mapping
 define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   ; GFX8-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
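For concreteness, the miscompile these tests pin down: the broken combine folded a G_SEXT_INREG of width 4 (from the trunc-to-i4 / sext-to-i32 pair above) into a signed byte load, which sign-extends from bit 7 rather than bit 3. A hand-worked C++ sketch of the difference, using an assumed loaded byte of 0x0F (illustration only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t Loaded = 0x0F; // bit 3 set, bit 7 clear
      // Correct semantics: sext_inreg(x, 4) replicates bit 3 of the low nibble.
      int32_t Correct = ((int32_t)(Loaded & 0xF) ^ 8) - 8; // -1
      // The broken combine emitted buffer_load_sbyte, i.e. a sign extension
      // from bit 7 of the whole byte.
      int32_t Broken = (int8_t)Loaded; // 15
      assert(Correct == -1 && Broken == 15);
    }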
>From 32a84fbfc8428cd4f8fb1992f12027d220eb64e9 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.f...@amd.com>
Date: Tue, 9 Jan 2024 12:58:32 +0000
Subject: [PATCH 2/3] [AMDGPU] Rewrite G_SEXT_INREG combine. NFC.

Simplify the apply function by storing the new opcode in the matchdata.

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +-
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 45 ++++++++++---------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 0c77fe72595880..b9411e2052120d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -111,7 +111,7 @@ def smulu64 : GICombineRule<
   [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
   (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
 
-def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+def sign_exension_in_reg_matchdata : GIDefMatchData<"std::pair<MachineInstr *, unsigned>">;
 
 def sign_extension_in_reg : GICombineRule<
   (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 21bfab52c6c4b5..441375ea6e458c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -99,10 +99,10 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
 
   // Combine unsigned buffer load and signed extension instructions to generate
   // signed buffer load instructions.
-  bool matchCombineSignExtendInReg(MachineInstr &MI,
-                                   MachineInstr *&MatchInfo) const;
-  void applyCombineSignExtendInReg(MachineInstr &MI,
-                                   MachineInstr *&MatchInfo) const;
+  bool matchCombineSignExtendInReg(
+      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
+  void applyCombineSignExtendInReg(
+      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
 
   // Find the s_mul_u64 instructions where the higher bits are either
   // zero-extended or sign-extended.
@@ -395,34 +395,35 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
 
 // Identify buffer_load_{u8, u16}.
 bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
-    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
-  Register Op0Reg = MI.getOperand(1).getReg();
-  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
-
-  if (!MRI.hasOneNonDBGUse(Op0Reg))
+    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+  Register LoadReg = MI.getOperand(1).getReg();
+  if (!MRI.hasOneNonDBGUse(LoadReg))
     return false;
 
   // Check if the first operand of the sign extension is a subword buffer load
   // instruction.
-  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
-         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
+  switch (LoadMI->getOpcode()) {
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
+    return true;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
+    return true;
+  }
+  return false;
 }
 
 // Combine buffer_load_{u8, u16} and the sign extension instruction to generate
 // buffer_load_{i8, i16}.
 void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
-    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
-  // Modify the opcode and the destination of buffer_load_{u8, u16}:
-  // Replace the opcode.
-  unsigned Opc =
-      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
-          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
-          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
-  SubwordBufferLoad->setDesc(TII.get(Opc));
-  // Update the destination register of SubwordBufferLoad with the destination
-  // register of the sign extension.
+    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+  auto [LoadMI, NewOpcode] = MatchData;
+  LoadMI->setDesc(TII.get(NewOpcode));
+  // Update the destination register of the load with the destination register
+  // of the sign extension.
   Register SignExtendInsnDst = MI.getOperand(0).getReg();
-  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
   // Remove the sign extension.
   MI.eraseFromParent();
 }
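A note on the refactor: the matchdata now carries the replacement opcode from the match step to the apply step, so apply no longer re-derives it from the load's opcode. Schematically, the generated combiner drives the pair like this (a simplified sketch of the match/apply contract, not code from the patch):

    // Matchdata is filled while matching and consumed while rewriting, so a
    // decision made during the match (which signed opcode to substitute) is
    // recorded once instead of being recomputed in the apply step.
    std::pair<MachineInstr *, unsigned> MatchData;
    if (matchCombineSignExtendInReg(MI, MatchData))
      applyCombineSignExtendInReg(MI, MatchData);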
>From e85ff44794f0c1cd9b341ad720d4df2c89c46b57 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.f...@amd.com>
Date: Tue, 9 Jan 2024 13:20:30 +0000
Subject: [PATCH 3/3] [AMDGPU] Fix broken sign-extended subword buffer load
 combine

---
 .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 5 +++--
 .../llvm.amdgcn.struct.buffer.load.ll | 20 +++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 441375ea6e458c..bb1d6cb72e8071 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -403,13 +403,14 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
   // Check if the first operand of the sign extension is a subword buffer load
   // instruction.
   MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
+  int64_t Width = MI.getOperand(2).getImm();
   switch (LoadMI->getOpcode()) {
   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
     MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
-    return true;
+    return Width == 8;
   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
     MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
-    return true;
+    return Width == 16;
   }
   return false;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index fcd7821a86897e..94ce8aac8a4c68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -514,8 +514,9 @@ define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %
   ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX8-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
-  ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
+  ; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 4, implicit $exec
+  ; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
   ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
@@ -531,8 +532,9 @@ define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %
   ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX12-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
-  ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+  ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+  ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+  ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %trunc = trunc i8 %val to i4
@@ -635,8 +637,9 @@ define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg
   ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX8-NEXT: [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
-  ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
+  ; GFX8-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+  ; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 8, implicit $exec
+  ; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   ;
   ; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
@@ -652,8 +655,9 @@ define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg
   ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; GFX12-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
-  ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+  ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+  ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+  ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
   ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %trunc = trunc i16 %val to i8
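Summing up the fix: the combine may only fold G_SEXT_INREG into a signed buffer load when the sign-extension width equals the width the load actually reads; a narrower width now survives to selection, where it becomes a V_BFE_I32 after the unsigned load, as the updated checks show. A paraphrase of the fixed condition as a hypothetical standalone helper (assumes the AMDGPU backend's opcode enums; the real check lives inline in matchCombineSignExtendInReg):

    // Fire only when the G_SEXT_INREG width matches the loaded width, since
    // the signed load instructions sign-extend from bit 7 or bit 15.
    static bool widthMatchesLoad(unsigned LoadOpcode, int64_t Width) {
      switch (LoadOpcode) {
      case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
        return Width == 8;
      case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
        return Width == 16;
      default:
        return false;
      }
    }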