llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Christudasan Devadasan (cdevadas)

<details>
<summary>Changes</summary>

Use the constrained buffer load opcodes while combining under-aligned loads for XNACK-enabled subtargets.

---

Patch is 47.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101619.diff

3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+63-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+38-18)
- (modified) llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir (+512-52)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae537b194f50c..7553c370f694f 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -352,6 +352,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -363,6 +365,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
@@ -374,6 +378,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -385,6 +391,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
@@ -499,12 +507,20 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
     return S_BUFFER_LOAD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     return S_BUFFER_LOAD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -587,12 +603,20 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
@@ -703,6 +727,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
     Result.SOffset = true;
     [[fallthrough]];
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -710,6 +738,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
@@ -1679,6 +1711,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
   return New;
 }
 
+static bool needsConstraintedOpcode(const GCNSubtarget &STM,
+                                    const MachineMemOperand *MMO,
+                                    unsigned Width) {
+  return STM.isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+}
+
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
   const unsigned Width = CI.Width + Paired.Width;
@@ -1696,38 +1734,51 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
   case UNKNOWN:
     llvm_unreachable("Unknown instruction class");
-  case S_BUFFER_LOAD_IMM:
+  case S_BUFFER_LOAD_IMM: {
+    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+    bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
     switch (Width) {
     default:
       return 0;
     case 2:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     case 3:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
     case 8:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
-  case S_BUFFER_LOAD_SGPR_IMM:
+  }
+  case S_BUFFER_LOAD_SGPR_IMM: {
+    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
+    bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
     switch (Width) {
     default:
       return 0;
     case 2:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
     case 3:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
     case 4:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
     case 8:
-      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
+                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
+  }
   case S_LOAD_IMM: {
     // If XNACK is enabled, use the constrained opcodes when the first load is
     // under-aligned.
     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
-    bool NeedsConstrainedOpc =
-        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
+    bool NeedsConstrainedOpc = needsConstraintedOpcode(*STM, MMO, Width);
     switch (Width) {
     default:
       return 0;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 074489b9ff505..d085b3c768a86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
 ; GFX67-NEXT:    exp mrt0 v0, v1, v0, v0 done vm
 ; GFX67-NEXT:    s_endpgm
 ;
-; GFX8910-LABEL: s_buffer_load_imm_mergex2:
-; GFX8910:       ; %bb.0: ; %main_body
-; GFX8910-NEXT:    s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
-; GFX8910-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8910-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8910-NEXT:    exp mrt0 v0, v1, v0, v0 done vm
-; GFX8910-NEXT:    s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex2:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    exp mrt0 v0, v1, v0, v0 done vm
+; GFX8-NEXT:    s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex2:
+; GFX910:       ; %bb.0: ; %main_body
+; GFX910-NEXT:    s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4
+; GFX910-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX910-NEXT:    v_mov_b32_e32 v0, s4
+; GFX910-NEXT:    v_mov_b32_e32 v1, s5
+; GFX910-NEXT:    exp mrt0 v0, v1, v0, v0 done vm
+; GFX910-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_buffer_load_imm_mergex2:
 ; GFX11:       ; %bb.0: ; %main_body
@@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
 ; GFX67-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
 ; GFX67-NEXT:    s_endpgm
 ;
-; GFX8910-LABEL: s_buffer_load_imm_mergex4:
-; GFX8910:       ; %bb.0: ; %main_body
-; GFX8910-NEXT:    s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
-; GFX8910-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8910-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8910-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8910-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8910-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8910-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
-; GFX8910-NEXT:    s_endpgm
+; GFX8-LABEL: s_buffer_load_imm_mergex4:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
+; GFX8-NEXT:    s_endpgm
+;
+; GFX910-LABEL: s_buffer_load_imm_mergex4:
+; GFX910:       ; %bb.0: ; %main_body
+; GFX910-NEXT:    s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8
+; GFX910-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX910-NEXT:    v_mov_b32_e32 v0, s4
+; GFX910-NEXT:    v_mov_b32_e32 v1, s5
+; GFX910-NEXT:    v_mov_b32_e32 v2, s6
+; GFX910-NEXT:    v_mov_b32_e32 v3, s7
+; GFX910-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
+; GFX910-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_buffer_load_imm_mergex4:
 ; GFX11:       ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
index f8502091f8b78..02c1a328f4825 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
@@ -9,14 +9,23 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3
 
-    ; CHECK-LABEL: name: merge_s_buffer_load_x2
-    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; GFX10-LABEL: name: merge_s_buffer_load_x2
+    ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1
+    ; GFX10-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: merge_s_buffer_load_x2
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{ $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1
+    ; GFX12-NEXT: S_ENDPGM 0
     %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
     %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
@@ -86,9 +95,9 @@ body:             |
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX10-NEXT: {{ $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub2_sub3
+    ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0
@@ -170,9 +179,9 @@ body:             |
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX10-NEXT: {{ $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0
@@ -231,9 +240,9 @@ body:             |
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX10-NEXT: {{ $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub0_sub1_sub2_sub3
     ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1
     ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3
     ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub1
@@ -288,18 +297,31 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3
 
-    ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
-    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
-    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
-    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+    ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+    ; GFX10-NEXT: S_ENDPGM 0
+    ;
+    ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: {{ $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1
+    ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3
+    ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3
+    ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1
+    ; GFX12-NEXT: S_ENDPGM 0
     %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
    %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
@@ -316,14 +338,23 @@ body:             |
   bb.0:
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3
 
-    ; CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
-    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-    ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_B... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/101619
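For readers skimming the truncated diff, the whole change reduces to one predicate: a merged load counts as under-aligned when its known alignment is smaller than the merged width in bytes (Width dwords × 4), and only XNACK-enabled subtargets need the constrained `_ec` variants, whose early-clobber destination (visible in the GFX10 MIR checks above) keeps a load that may be replayed after an XNACK fault from overwriting its own source operands. Below is a minimal standalone sketch of that rule; the function name and plain scalar parameters are illustrative stand-ins for the patch's `needsConstraintedOpcode(const GCNSubtarget &, const MachineMemOperand *, unsigned)`:

```cpp
#include <cstdio>

// Standalone sketch of the patch's alignment rule (hypothetical scalar
// inputs; the real helper takes a GCNSubtarget and a MachineMemOperand).
// A merged load of `widthDwords` dwords needs the constrained `_ec`
// opcode when XNACK is enabled and the known alignment is smaller than
// the merged access size in bytes.
static bool needsConstrainedOpcode(bool xnackEnabled, unsigned alignBytes,
                                   unsigned widthDwords) {
  return xnackEnabled && alignBytes < widthDwords * 4;
}

int main() {
  // Two dword loads (align 4) merged into one x2 load (8 bytes): on an
  // XNACK subtarget this selects S_BUFFER_LOAD_DWORDX2_IMM_ec.
  printf("%d\n", needsConstrainedOpcode(true, 4, 2));  // 1
  // The same merge with align 8 keeps the ordinary opcode.
  printf("%d\n", needsConstrainedOpcode(true, 8, 2));  // 0
  // Without XNACK the constrained form is never required.
  printf("%d\n", needsConstrainedOpcode(false, 4, 8)); // 0
  return 0;
}
```

This also matches the updated merge-sbuffer-load.mir checks, where the GFX10 run flips to `early-clobber ... _ec` while the GFX12 run keeps the plain opcodes, presumably because only the former enables XNACK.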