https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/115059
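The fold being extended rewrites %vgpr = COPY (S_ADD_I32 x, frameindex) into the matching VALU instruction; with this patch the same rewrite also applies to S_OR_B32, S_AND_B32 and S_MUL_I32 through the new convertToVALUOp mapping. A rough before/after sketch in MIR (the stack object, immediate and register numbers are illustrative, in the style of the new tests below, not output from a real compile):

  ; before: scalar op on a frame index whose result is copied to a VGPR
  %0:sreg_32 = S_OR_B32 %stack.0, 64, implicit-def dead $scc
  %1:vgpr_32 = COPY %0

  ; after SIFoldOperands
  %1:vgpr_32 = V_OR_B32_e64 64, %stack.0, implicit $exec

The fold only fires when one source operand is a frame index and $scc is dead; the VOP3 (e64) encoding is chosen when the other operand is a register or an inline constant such as 64, a wider literal like 128 keeps the e32 form, and S_MUL_I32 always maps to V_MUL_LO_U32_e64.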
Further extend workaround for the lack of proper regbankselect for frame indexes.

>From 493a45c9a65aca2402c950bae122bad477e0e5b0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Fri, 1 Nov 2024 12:24:37 -0700
Subject: [PATCH] AMDGPU: Fold more scalar operations on frame index to VALU

Further extend workaround for the lack of proper regbankselect
for frame indexes.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 121 ++++++++-----
 .../fold-operands-s-add-copy-to-vgpr.mir      | 167 ++++++++++++++++--
 2 files changed, 229 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 28bcbd58dc0376..de7dec8831daec 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -78,9 +78,25 @@ class SIFoldOperandsImpl {
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                          const MachineOperand &OpToFold) const;
 
-  /// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
-  ///
-  /// => %vgpr = V_ADD_U32 x, frameindex
+  // TODO: Just use TII::getVALUOp
+  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
+    switch (Opc) {
+    case AMDGPU::S_ADD_I32: {
+      if (ST->hasAddNoCarry())
+        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
+      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+    }
+    case AMDGPU::S_OR_B32:
+      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
+    case AMDGPU::S_AND_B32:
+      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
+    case AMDGPU::S_MUL_I32:
+      return AMDGPU::V_MUL_LO_U32_e64;
+    default:
+      return AMDGPU::INSTRUCTION_LIST_END;
+    }
+  }
+
   bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                              MachineInstr &MI) const;
 
@@ -202,6 +218,8 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
   const unsigned Opc = UseMI.getOpcode();
   switch (Opc) {
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_AND_B32:
   case AMDGPU::V_ADD_U32_e32:
   case AMDGPU::V_ADD_CO_U32_e32:
     // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
@@ -238,53 +256,62 @@ bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
   if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
       MRI->hasOneNonDBGUse(SrcReg)) {
     MachineInstr *Def = MRI->getVRegDef(SrcReg);
-    if (Def && Def->getOpcode() == AMDGPU::S_ADD_I32 &&
-        Def->getOperand(3).isDead()) {
-      MachineOperand *Src0 = &Def->getOperand(1);
-      MachineOperand *Src1 = &Def->getOperand(2);
-
-      // TODO: This is profitable with more operand types, and for more
-      // opcodes. But ultimately this is working around poor / nonexistent
-      // regbankselect.
-      if (!Src0->isFI() && !Src1->isFI())
-        return false;
+    if (!Def || Def->getNumOperands() != 4)
+      return false;
 
-      if (Src0->isFI())
-        std::swap(Src0, Src1);
-
-      MachineBasicBlock *MBB = Def->getParent();
-      const DebugLoc &DL = Def->getDebugLoc();
-      if (ST->hasAddNoCarry()) {
-        bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
-        MachineInstrBuilder Add =
-            BuildMI(*MBB, *Def, DL,
-                    TII->get(UseVOP3 ? AMDGPU::V_ADD_U32_e64
-                                     : AMDGPU::V_ADD_U32_e32),
-                    DstReg)
-                .add(*Src0)
-                .add(*Src1)
-                .setMIFlags(Def->getFlags());
-        if (UseVOP3)
-          Add.addImm(0);
-
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
-      }
+    MachineOperand *Src0 = &Def->getOperand(1);
+    MachineOperand *Src1 = &Def->getOperand(2);
 
-      MachineBasicBlock::LivenessQueryResult Liveness =
-          MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
-      if (Liveness == MachineBasicBlock::LQR_Dead) {
-        // TODO: If src1 satisfies operand constraints, use vop3 version.
-        BuildMI(*MBB, *Def, DL, TII->get(AMDGPU::V_ADD_CO_U32_e32), DstReg)
-            .add(*Src0)
-            .add(*Src1)
-            .setOperandDead(3) // implicit-def $vcc
-            .setMIFlags(Def->getFlags());
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
+    // TODO: This is profitable with more operand types, and for more
+    // opcodes. But ultimately this is working around poor / nonexistent
+    // regbankselect.
+    if (!Src0->isFI() && !Src1->isFI())
+      return false;
+
+    if (Src0->isFI())
+      std::swap(Src0, Src1);
+
+    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
+    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
+    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
+        !Def->getOperand(3).isDead()) // Check if scc is dead
+      return false;
+
+    MachineBasicBlock *MBB = Def->getParent();
+    const DebugLoc &DL = Def->getDebugLoc();
+    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
+      MachineInstrBuilder Add =
+          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
+
+      if (Add->getDesc().getNumDefs() == 2) {
+        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
+        Add.addDef(CarryOutReg, RegState::Dead);
+        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
       }
+
+      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
+      if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
+        Add.addImm(0);
+
+      Def->eraseFromParent();
+      MI.eraseFromParent();
+      return true;
+    }
+
+    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
+
+    MachineBasicBlock::LivenessQueryResult Liveness =
+        MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
+    if (Liveness == MachineBasicBlock::LQR_Dead) {
+      // TODO: If src1 satisfies operand constraints, use vop3 version.
+      BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
+          .add(*Src0)
+          .add(*Src1)
+          .setOperandDead(3) // implicit-def $vcc
+          .setMIFlags(Def->getFlags());
+      Def->eraseFromParent();
+      MI.eraseFromParent();
+      return true;
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index 683f02b413315e..8c88c7a97174e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -75,8 +75,8 @@ stack:
 body: |
   bb.0:
     ; GFX8-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -98,8 +98,8 @@ stack:
 body: |
   bb.0:
    ; GFX8-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -202,8 +202,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
    ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__mov_fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -239,8 +239,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_copy_mov_fi_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -337,8 +337,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -371,8 +371,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_fi_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -392,3 +392,146 @@ body: |
     %2:vgpr_32 = COPY %1
     SI_RETURN implicit %2
 ...
+
+---
+name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e64_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 128, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = S_MUL_I32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    liveins: $sgpr4
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+    ; CHECK: liveins: $sgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = COPY $sgpr4
+    %1:sreg_32 = S_MUL_I32 %stack.0, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits