llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

---

Patch is 297.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102346.diff

13 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+218-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+34-42)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (+6-1)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir (+333-584)
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir (+171-221)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/frame-index.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-1)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 76da1f0eb4f7d..81337c62ffe17 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2086,7 +2086,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2268,6 +2268,208 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+          .add(*OtherOp)
+          .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+          .addReg(MaterializedReg, MaterializedRegFlags)
+          .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
+
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_AND_B32: {
@@ -2336,7 +2538,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     } else {
       if (MaterializedReg)
         OtherOp.ChangeToRegister(MaterializedReg, false);
-      FIOp.ChangeToImmediate(NewOffset);
+      FIOp->ChangeToImmediate(NewOffset);
     }
 
     return true;
@@ -2354,7 +2556,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2407,18 +2609,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
@@ -2427,8 +2629,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     Register TmpReg =
        RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2457,8 +2659,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     if (!TmpSReg) {
       // Use frame register and restore it after.
      TmpSReg = FrameReg;
-      FIOp.setReg(FrameReg);
-      FIOp.setIsKill(false);
+      FIOp->setReg(FrameReg);
+      FIOp->setIsKill(false);
     }
 
     if (NeedSaveSCC) {
@@ -2706,7 +2908,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
 
@@ -2737,13 +2939,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       // If the offset is simply too big, don't convert to a scratch wave offset
      // relative index.
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg =
             RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 83f2329feb8f2..e2eac156ea787 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -105,15 +105,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-LABEL: store_load_vindex_kernel:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
-; GFX9-NEXT:    v_add_u32_e32 v1, 0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -128,8 +126,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0, v1
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
@@ -140,12 +136,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
-; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 0, v0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
@@ -160,7 +155,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0, v1
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
@@ -539,15 +533,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x100, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x100, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -557,14 +551,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -577,8 +571,8 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX940-NEXT:    v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT:    v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT:    v_add_u32_e32 v1, 0x100, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
@@ -593,11 +587,12 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, s32, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 glc dlc
@@ -855,15 +850,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, s0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -873,14 +868,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s32, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
@@ -893,8 +888,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT:    v_add_u32_e32 v1, s0, v1
+; GFX940-NEXT:    v_add_u32_e32 v1, s32, v1
+; GFX940-NEXT:    v_add_u32_e32 v1, 0x4004, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
@@ -913,9 +908,10 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-NEXT: ...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/102346
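For readers skimming the truncated diff, the codegen effect is easiest to see in the GFX9 checks for `store_load_vindex_small_offset_foo` in the GlobalISel `flat-scratch.ll` hunks above: rather than materializing `s32 + 0x100` into an SGPR with a separate `s_add_i32` before each VALU add, the frame register and the constant offset are now folded directly into the `v_add_u32` chain. A condensed before/after, with the lines copied from that hunk and the surrounding instructions omitted for brevity:

```
; Before: a scalar add materializes the frame offset into s0 first
s_add_i32 s0, s32, 0x100
v_add_u32_e32 v1, s0, v1

; After: the frame register and offset are folded into the VALU adds
v_add_u32_e32 v1, s32, v1
v_add_u32_e32 v1, 0x100, v1
```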