[llvm-branch-commits] [llvm] b76014a - RegionInfo: use a range-based for loop [NFCI]
Author: Nicolai Hähnle
Date: 2020-12-29T16:00:26+01:00
New Revision: b76014a4f15ad9f3151862fcc6c6ab2f0f505199

URL: https://github.com/llvm/llvm-project/commit/b76014a4f15ad9f3151862fcc6c6ab2f0f505199
DIFF: https://github.com/llvm/llvm-project/commit/b76014a4f15ad9f3151862fcc6c6ab2f0f505199.diff

LOG: RegionInfo: use a range-based for loop [NFCI]

Change-Id: I9985d72191a2b0680195032acf8a14ad2ba954ed

Differential Revision: https://reviews.llvm.org/D92932

Added:

Modified:
    llvm/include/llvm/Analysis/RegionInfoImpl.h

Removed:


diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h
index 8d9ec646f519..bbd3dd5dc0ea 100644
--- a/llvm/include/llvm/Analysis/RegionInfoImpl.h
+++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h
@@ -585,10 +585,8 @@ bool RegionInfoBase<Tr>::isRegion(BlockT *entry, BlockT *exit) const {
   // Exit is the header of a loop that contains the entry. In this case,
   // the dominance frontier must only contain the exit.
   if (!DT->dominates(entry, exit)) {
-    for (typename DST::iterator SI = entrySuccs->begin(),
-                                SE = entrySuccs->end();
-         SI != SE; ++SI) {
-      if (*SI != exit && *SI != entry)
+    for (BlockT *successor : *entrySuccs) {
+      if (successor != exit && successor != entry)
         return false;
     }

___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y nhaehnle wrote: ```suggestion // A == true jump to end or Y, result is A(true) or Y ``` https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y +// A == false jump to B +// Notice that when negating expression, we simply flip Neg on each Pred +// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&). +std::function Pred; +bool Neg; // Neg of Pred is calculated before jump +unsigned TJumpOffset; +unsigned FJumpOffset; + }; + + SmallVector Expression; + + Predicate(std::function Pred) { +Expression.push_back({Pred, false, 1, 1}); + }; + + Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); }; nhaehnle wrote: I find having a constructor that is destructive on its argument in this way to be quite surprising. It is against good patterns in modern C++. Better have it take an rvalue reference (&&-reference). Then its callers have to explicitly `std::move`, but that makes it more obvious what happens at the call sites. Also, this constructor should be private just like `Elt`. https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
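For illustration, a minimal sketch of that shape, reusing the names from the quoted hunk (the exact `std::function` signature and the `SmallVector` inline size are assumptions here):

```c++
class Predicate {
  struct Elt {
    std::function<bool(const MachineInstr &)> Pred;
    bool Neg;
    unsigned TJumpOffset;
    unsigned FJumpOffset;
  };

  SmallVector<Elt, 8> Expression;

  // Private: takes ownership of an already-built expression. Callers have to
  // spell out the transfer with std::move, which makes the "destructive"
  // behaviour visible at the call site.
  Predicate(SmallVector<Elt, 8> &&Expr) : Expression(std::move(Expr)) {}

public:
  Predicate(std::function<bool(const MachineInstr &)> Pred) {
    Expression.push_back({std::move(Pred), false, 1, 1});
  }
};
```

A caller such as `operator!` then ends with `return Predicate(std::move(NegExpression));`, so the transfer of ownership is explicit.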
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y +// A == false jump to B +// Notice that when negating expression, we simply flip Neg on each Pred +// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&). +std::function Pred; +bool Neg; // Neg of Pred is calculated before jump +unsigned TJumpOffset; +unsigned FJumpOffset; + }; + + SmallVector Expression; nhaehnle wrote: Make this and `Elt` private. `Elt` is quite subtle and should be hidden as an implementation detail. https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y +// A == false jump to B +// Notice that when negating expression, we simply flip Neg on each Pred +// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&). +std::function Pred; +bool Neg; // Neg of Pred is calculated before jump +unsigned TJumpOffset; +unsigned FJumpOffset; + }; + + SmallVector Expression; + + Predicate(std::function Pred) { +Expression.push_back({Pred, false, 1, 1}); + }; + + Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); }; + + bool operator()(const MachineInstr &MI) const { +unsigned Idx = 0; +unsigned ResultIdx = Expression.size(); +bool Result; +do { + Result = Expression[Idx].Pred(MI); + Result = Expression[Idx].Neg ? !Result : Result; + if (Result) { +Idx += Expression[Idx].TJumpOffset; + } else { +Idx += Expression[Idx].FJumpOffset; + } +} while ((Idx != ResultIdx)); + +return Result; + }; + + Predicate operator!() { +SmallVector NegExpression; +for (Elt &ExprElt : Expression) { + NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset, + ExprElt.TJumpOffset}); +} +return Predicate(NegExpression); + }; + + Predicate operator&&(Predicate &RHS) { nhaehnle wrote: Similar to the constructor above, the baseline of this should be a const method that accepts a const reference as an argument. Do you have a good argument for the rvalue-argument overload below? If not, please just remove it. The same applies to `operator||`. https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
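One way the const-qualified `&&` could then be written, following the jump-table encoding described in the quoted comment and the ownership-taking constructor sketched earlier (a sketch, not taken from the patch):

```c++
Predicate operator&&(const Predicate &RHS) const {
  SmallVector<Elt, 8> AndExpression = Expression;

  // A false result of the LHS must now skip over the appended RHS to reach
  // the new end; a true result simply falls through into the RHS, which
  // starts exactly where the old end used to be.
  unsigned RHSSize = RHS.Expression.size();
  unsigned OldEnd = Expression.size();
  for (unsigned Idx = 0; Idx < OldEnd; ++Idx) {
    if (Idx + AndExpression[Idx].FJumpOffset == OldEnd)
      AndExpression[Idx].FJumpOffset += RHSSize;
  }

  AndExpression.append(RHS.Expression.begin(), RHS.Expression.end());
  return Predicate(std::move(AndExpression));
}
```

`operator||` is the mirror image (extend `TJumpOffset` instead), and `operator!` stays a const method that flips `Neg` and swaps the two offsets.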
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -290,7 +504,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); - addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}}); + bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; + bool hasSMRDSmall = ST->hasScalarSubwordLoads(); + + Predicate isAlign16([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(16); + }); + + Predicate isAlign4([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(4); + }); + + Predicate isAtomicMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isAtomic(); + }); + + Predicate isUniMMO([](const MachineInstr &MI) -> bool { +return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin()); + }); + + Predicate isConst([](const MachineInstr &MI) -> bool { +// Address space in MMO be different then address space on pointer. +const MachineMemOperand *MMO = *MI.memoperands_begin(); +const unsigned AS = MMO->getAddrSpace(); +return AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; + }); + + Predicate isVolatileMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isVolatile(); + }); + + Predicate isInvMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isInvariant(); + }); + + Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getFlags() & MONoClobber; + }); + + Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool { +const MachineMemOperand *MMO = *MI.memoperands_begin(); +const unsigned MemSize = 8 * MMO->getSize().getValue(); +return (MemSize == 16 && MMO->getAlign() >= Align(2)) || + (MemSize == 8 && MMO->getAlign() >= Align(1)); + }); + + auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) && + (isConst || isInvMMO || isNoClobberMMO); nhaehnle wrote: What's the logic behind the `isConst || !isVolatileMMO` part of this predicate? const && volatile doesn't make much sense to me, so why isn't this just `!isVolatileMMO`? https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
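For reference, the simplification that question points at would read (predicate names as in the quoted hunk):

```c++
auto isUL = !isAtomicMMO && isUniMMO && !isVolatileMMO &&
            (isConst || isInvMMO || isNoClobberMMO);
```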
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -290,7 +504,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); - addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}}); + bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; nhaehnle wrote: "unaligned" is a single word: ```suggestion bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; ``` https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/nhaehnle commented: I have a bunch of comments, but apart from that the change LGTM https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { return GRules.at(GRulesAlias.at(Opc)); } +// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. +class Predicate { +public: + struct Elt { +// Save formula composed of Pred, '&&', '||' and '!' as a jump table. +// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C +// Sequences of && and || will be represented by jumps, for example: +// (A && B && ... X) or (A && B && ... X) || Y +// A == true jump to B +// A == false jump to end or Y, result is A(false) or Y +// (A || B || ... X) or (A || B || ... X) && Y +// A == true jump to end or Y, result is B(true) or Y +// A == false jump to B +// Notice that when negating expression, we simply flip Neg on each Pred +// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&). +std::function Pred; +bool Neg; // Neg of Pred is calculated before jump +unsigned TJumpOffset; +unsigned FJumpOffset; + }; + + SmallVector Expression; + + Predicate(std::function Pred) { +Expression.push_back({Pred, false, 1, 1}); + }; + + Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); }; + + bool operator()(const MachineInstr &MI) const { +unsigned Idx = 0; +unsigned ResultIdx = Expression.size(); +bool Result; +do { + Result = Expression[Idx].Pred(MI); + Result = Expression[Idx].Neg ? !Result : Result; + if (Result) { +Idx += Expression[Idx].TJumpOffset; + } else { +Idx += Expression[Idx].FJumpOffset; + } +} while ((Idx != ResultIdx)); + +return Result; + }; + + Predicate operator!() { nhaehnle wrote: Should be a const method. https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); + } else { +B128 = LLT::scalar(128); + } + if (Size / 128 == 2) +splitLoad(MI, {B128, B128}); + if (Size / 128 == 4) +splitLoad(MI, {B128, B128, B128, B128}); nhaehnle wrote: I would expect this to be an else-if chain with an `else llvm_unreachable` at the end. https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
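A sketch of that structure, with the names from the quoted hunk:

```c++
if (Size / 128 == 2)
  splitLoad(MI, {B128, B128});
else if (Size / 128 == 4)
  splitLoad(MI, {B128, B128, B128, B128});
else
  llvm_unreachable("SplitLoad: unexpected size");
```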
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
https://github.com/nhaehnle commented: LGTM aside from a small nit https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { nhaehnle wrote: I don't think it's trivial to avoid this a priori. I agree that a separate cleanup optimization could do it, in any case I'd say it's best left to a separate focused change. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -9,7 +9,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H +#include "AMDGPURegisterBankInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" nhaehnle wrote: We should avoid unnecessary `#include`s in headers (for compile-time reasons). I doubt that all of these are really needed. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/nhaehnle approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
@@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). + const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } nhaehnle wrote: I don't understand this logic. Why are you looking at operand 0? What's the meaning of `This` vs. `ConstantValue`? This looks very fishy. https://github.com/llvm/llvm-project/pull/112866 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
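For comparison, the IR-level helper the quoted code is modeled on, `PHINode::hasConstantOrUndefValue()`, looks roughly like this; in the G_PHI version, operand 0 is the PHI's own def, so `This` plays the role of `this`:

```c++
bool PHINode::hasConstantOrUndefValue() const {
  Value *ConstantValue = nullptr;
  for (unsigned i = 0, e = getNumIncomingValues(); i != e; ++i) {
    Value *Incoming = getIncomingValue(i);
    if (Incoming != this && !isa<UndefValue>(Incoming)) {
      if (ConstantValue && ConstantValue != Incoming)
        return false;
      ConstantValue = Incoming;
    }
  }
  return true;
}
```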
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -219,6 +220,54 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + + for (auto [Inst, UseInst, Cycle] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +Register MergedMask = MRI->createVirtualRegister(BoolS1); +Register PrevIterMask = MRI->createVirtualRegister(BoolS1); + +MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader(); +SmallVector ExitingBlocks; +Cycle->getExitingBlocks(ExitingBlocks); +assert(ExitingBlocks.size() == 1); +MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0]; + +B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin()); +auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask); + +// We only care about cycle iterration path - merge Reg with previous +// iteration. For other incomings use implicit def. +// Predecessors should be CyclePredecessor and CycleExitingMBB. +// In older versions of irreducible control flow lowering there could be +// cases with more predecessors. To keep this lowering as generic as +// possible also handle those cases. +for (auto MBB : CycleHeaderMBB->predecessors()) { + if (MBB == CycleExitingMBB) { +CrossIterPHI.addReg(MergedMask); + } else { +B.setInsertPt(*MBB, MBB->getFirstTerminator()); +auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {}); +CrossIterPHI.addReg(ImplDef.getReg(0)); + } + CrossIterPHI.addMBB(MBB); +} + +MachineBasicBlock *MBB = Inst->getParent(); +buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask, +PrevIterMask, Reg); nhaehnle wrote: In this case, it would be better to move the lane merging to directly after `Inst`. The register pressure calculus is exactly opposite from the non-i1 case: * In the non-i1 case, shifting the COPY down is good because it reduces the live range of the VGPR while potentially making the live range of the SGPR longer. VGPRs are more expensive than SGPRs, so this is a good trade-off. * In the i1 case, we'll have a live range for the merged mask extending across the entire cycle anyway. By moving the lane merging closer to Inst, we leave the live range of the merged mask unchanged, but we (most likely) reduce the live range of the i1 value produced by Inst. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
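Concretely, the suggested placement could look like this (names as in the quoted hunk, with `SkipPHIsAndLabels` used the same way as in the non-i1 lowering):

```c++
// Merge the lane masks right after Inst instead of at the bottom of its
// block, so the live range of Inst's i1 result is not stretched further.
MachineBasicBlock *MBB = Inst->getParent();
auto MergePt = MBB->SkipPHIsAndLabels(std::next(Inst->getIterator()));
buildMergeLaneMasks(*MBB, MergePt, {}, MergedMask, PrevIterMask, Reg);
```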
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTemporalDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineBasicBlock *MBB = Inst->getParent(); +B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator(; + +Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); +B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg}) +.addUse(ExecReg, RegState::Implicit); + +replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg); + } + return false; +} nhaehnle wrote: I do have one high-level comment about this. Every `Inst` may potentially appear with many `UseInst`s in the temporal divergence list. The current code will create multiple new registers and multiple `COPY` instructions, which seems wasteful even if downstream passes can often clean it up. I would suggest capturing the created register in a `DenseMap` for re-use. Also, how about inserting the `COPY` at the end of `Inst->getParent()`? That way, the live range of the VGPR is reduced. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
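A sketch of the cached variant combining both suggestions (the map name is hypothetical; everything else reuses names from the quoted hunk):

```c++
DenseMap<MachineInstr *, Register> VgprDefs; // hypothetical cache name

for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
  Register Reg = Inst->getOperand(0).getReg();
  if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
      ILMA.isS32S64LaneMask(Reg))
    continue;

  Register &VgprReg = VgprDefs[Inst];
  if (!VgprReg) {
    // Create the COPY only once per Inst, at the end of the defining block,
    // to keep the VGPR live range as short as possible.
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->getFirstTerminator());
    VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);
  }
  replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
}
```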
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle commented: I haven't done a detailed review of the code, but from a high-level algorithmic view this change already looks pretty reasonable to me. https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -219,6 +220,54 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + + for (auto [Inst, UseInst, Cycle] : MUI->getTemporalDivergenceList()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +Register MergedMask = MRI->createVirtualRegister(BoolS1); +Register PrevIterMask = MRI->createVirtualRegister(BoolS1); + +MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader(); +SmallVector ExitingBlocks; +Cycle->getExitingBlocks(ExitingBlocks); +assert(ExitingBlocks.size() == 1); +MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0]; + +B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin()); +auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask); + +// We only care about cycle iterration path - merge Reg with previous +// iteration. For other incomings use implicit def. +// Predecessors should be CyclePredecessor and CycleExitingMBB. +// In older versions of irreducible control flow lowering there could be +// cases with more predecessors. To keep this lowering as generic as +// possible also handle those cases. +for (auto MBB : CycleHeaderMBB->predecessors()) { + if (MBB == CycleExitingMBB) { +CrossIterPHI.addReg(MergedMask); + } else { +B.setInsertPt(*MBB, MBB->getFirstTerminator()); +auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {}); +CrossIterPHI.addReg(ImplDef.getReg(0)); + } + CrossIterPHI.addMBB(MBB); +} nhaehnle wrote: This still feels vaguely off to me. First of all, can you please add a `Cycle->isReducible()` assert here just in case, since you're relying on there only being one header? (Actually, that assert should probably be in `GenericCycle::getHeader`.) Second, I'm worried about that case distinction between different kinds of predecessors. It really doesn't feel properly generic to me. Latches and exiting blocks aren't necessarily the same thing. (The current structurizer tends to make it that way, but even with the structurizer we probably don't have a guarantee, let alone with a future different control flow lowering.) Let's think through some cases: * If the predecessor is outside the cycle, it should be an `IMPLICIT_DEF` * If it is inside the cycle, i.e. a latch, there are two cases: * If the predecessor is dominated by `Inst`, then clearly use `MergedMask` * Otherwise, it's actually not clear: could it be possible that control flow leaves the dominance region of `Inst` and goes around the loop again? On that last point, you could perhaps make a case that because the loop is temporally divergent, having the CFG in wave CFG form requires a certain structure. But I'm not convinced. In fact, consider: ```mermaid flowchart TD Header --> B Header --> Inst Inst --> B B --> Header Inst --> C C --> Header C --> Exit ``` Let's say the branches of Header and C are uniform; only the branch of the basic block containing Inst is divergent. Then we have temporal divergence in Inst, and need to handle the `i1`. But the CFG is already *reconvergent*: the divergent branch is post-dominated by one of its immediate successors. This means it is possible to insert EXEC masking code for this CFG correctly without structuring it. Therefore, the code here should really handle this case correctly. 
I think this calls for using MachineSSAUpdater for a robust solution: * Initialize the updater with IMPLICIT_DEF in every predecessor of the header that is outside the cycle. * Insert the MergedMask into the updater for the `Inst->getParent()`. * Then build the lane mask merging, querying the updater for the previous mask (i.e., let the updater create that register). * The MergedMask can still be used by UseInst due to dominance. Except now there's an additional issue with caching the MergedMask that doesn't exist for the non-i1 case: Different `UseInst`s could appear with respect to different `Cycle`s. The non-i1 case doesn't actually care about the cycle structure, but the code here does. In particular, with my suggestions, it will use the cycle to determine how to initialize the updater. We can still get away with a single `MergedMask` per `Inst`: * The lazy way to do it would be to not initialize the updater with IMPLICIT_DEFs and just let it figure them out by itself. This would generate correct code. * However, this has a compile-time cost and could end up being too pessimistic in terms of the generated code. * The Inst and all its uses could be contained in a cycle. This naive way would lead to a phi node at the header of this cycle, and therefore needlessly extend the live range of the merged mask. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle commented: Thank you for moving the insert point. An analogous comment applies here like in the non-i1 case: it would be good to cache the `MergedMask` register for re-use when the same `Inst` occurs with multiple `UseInst`s. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { In.Reg = Copy.getReg(0); } +void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst, +Register NewReg) { + for (MachineOperand &Op : Inst->operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } +} + +bool DivergenceLoweringHelper::lowerTempDivergence() { + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF); + + for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) { +Register Reg = Inst->getOperand(0).getReg(); +if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) || +ILMA.isS32S64LaneMask(Reg)) + continue; + +MachineInstr *MI = const_cast(Inst); nhaehnle wrote: These come out of the analysis. The analysis itself uses const pointers/references in its implementation, which I believe is a good idea for const correctness. I wouldn't change that. So a `const_cast` is needed at some point. The only question is where. I think here is as good a place as any, though perhaps grouping them together with a small explanation is in order. Something like: ```c++ // As an analysis, UniformityAnalysis treats instructions as const. We have the parent function // as non-const, so casting const away here is inelegant but justified. MachineInstr *MI = const_cast(Inst); MachineInstr *UseMI = const_cast(UseInst); ``` https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
https://github.com/nhaehnle approved this pull request. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/nhaehnle approved this pull request. https://github.com/llvm/llvm-project/pull/112866 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -251,8 +245,11 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 -; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) -; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s1) +; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 +; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] +; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 +; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] +; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]] nhaehnle wrote: Similar here: This could be combined down to just a no-op -- don't combiners do that already? They should, and so this should probably not be handled separately by legalization https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -269,10 +266,12 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 -; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) -; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s1) -; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF -; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[ANYEXT]](s32), [[DEF]](s32) +; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 +; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] +; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 +; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] +; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]] +; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C1]](s32) nhaehnle wrote: Could just be a single G_ANYEXT https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -233,8 +222,13 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 -; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) -; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s16) = G_ANYEXT [[TRUNC]](s1) +; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 +; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] +; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 +; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] +; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s16) = G_CONSTANT i16 1 +; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s16) = G_CONSTANT i16 0 +; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s16) = G_SELECT [[ICMP]](s1), [[C2]], [[C3]] nhaehnle wrote: This is unnecessarily convoluted. A single `G_TRUNC` should do the trick. (Isn't that something a combiner could do?) https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -215,8 +205,7 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 -; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) -; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1) +; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32) nhaehnle wrote: Isn't this a correctness regression? I'm not entirely certain because I remember there was some weirdness around what G_TRUNC means semantically. Can you explain why there is no need for a trunc or bitwise and or something like that in the output? Note that `anyext_s1_to_s32_vgpr` does leave a G_AND, so either that test shows a code quality issue or this test is incorrect. https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -160,8 +154,7 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]] -; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF -; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[DEF]](s32) +; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), [[C1]](s32) nhaehnle wrote: This change is a code quality regression: the input has `G_ANYEXT`, so the high half can be undefined. https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
https://github.com/nhaehnle commented: I didn't look at everything, I just went through some of the tests. https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)
@@ -13,7 +12,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 -; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32) +; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF +; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[DEF]](s32) nhaehnle wrote: Why are we legalizing this G_ANYEXT to G_MERGE_VALUES, but in `anyext_s1_to_s64_scc` we generate a new `G_ANYEXT`? https://github.com/llvm/llvm-project/pull/132383 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + MachineSSAUpdater SSAUpdater(*MF); + + // In case of use outside muliple nested cycles or muliple uses we only need + // to merge lane mask across largest relevant cycle. + SmallDenseMap> LRCCache; + for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first; +if (CachedLRC) { + LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC; + assert(LRC->contains(CachedLRC)); +} + +LRCCache[Reg] = {LRC, {}}; + } + + for (auto LRCIter : LRCCache) { +Register Reg = LRCIter.first; +const MachineCycle *Cycle = LRCIter.second.first; + +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +Register MergedMask = MRI->createVirtualRegister(BoolS1); +SSAUpdater.Initialize(MergedMask); + +MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent(); +SSAUpdater.AddAvailableValue(MBB, MergedMask); + +for (auto Entry : Cycle->getEntries()) { + for (MachineBasicBlock *Pred : Entry->predecessors()) { +if (!Cycle->contains(Pred)) { + B.setInsertPt(*Pred, Pred->getFirstTerminator()); + auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {}); + SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0)); +} + } +} + +buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask, +SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg); + +LRCCache[Reg].second = MergedMask; nhaehnle wrote: Should be able to just keep LRCIter/Entry as a reference and update via that instead of repeating the cache lookup. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
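In other words (same names as the quoted hunk):

```c++
for (auto &Entry : LRCCache) {
  Register Reg = Entry.first;
  const MachineCycle *Cycle = Entry.second.first;

  // ... build MergedMask for this Reg/Cycle exactly as before ...

  Entry.second.second = MergedMask; // no second hash lookup
}
```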
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle commented: Thanks, this now looks good to me in terms of the overall flow. I have a bunch of nitpickier, mostly style-related comments. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + MachineSSAUpdater SSAUpdater(*MF); + + // In case of use outside muliple nested cycles or muliple uses we only need + // to merge lane mask across largest relevant cycle. + SmallDenseMap> LRCCache; + for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first; +if (CachedLRC) { + LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC; + assert(LRC->contains(CachedLRC)); +} + +LRCCache[Reg] = {LRC, {}}; + } + + for (auto LRCIter : LRCCache) { nhaehnle wrote: Naming: this isn't an iterator, a more accurate generic name would be just "Entry" (or LRCCacheEntry, but that's long) https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/nhaehnle approved this pull request. Thanks! https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + MachineSSAUpdater SSAUpdater(*MF); + + // In case of use outside muliple nested cycles or muliple uses we only need + // to merge lane mask across largest relevant cycle. + SmallDenseMap> LRCCache; + for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first; +if (CachedLRC) { + LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC; + assert(LRC->contains(CachedLRC)); +} + +LRCCache[Reg] = {LRC, {}}; nhaehnle wrote: This is actually a great use case for try_emplace to do the cache lookup only once: ```suggestion const MachineCycle *&CachedLRC = LRCCache.try_emplace(Reg); if (!CachedLRC || !CachedLRC->contains(LRC)) CachedLRC = LRC; ``` https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
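Since `try_emplace` returns an iterator/bool pair, the compilable form of that idea would be something like the following (map type as in the quoted hunk); the later `LRCCache[Reg] = {LRC, {}};` store is then no longer needed:

```c++
auto [LRCIt, Inserted] = LRCCache.try_emplace(Reg);
const MachineCycle *&CachedLRC = LRCIt->second.first;
if (Inserted || !CachedLRC->contains(LRC))
  CachedLRC = LRC;
```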
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + MachineSSAUpdater SSAUpdater(*MF); + + // In case of use outside muliple nested cycles or muliple uses we only need + // to merge lane mask across largest relevant cycle. + SmallDenseMap> LRCCache; + for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first; +if (CachedLRC) { + LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC; + assert(LRC->contains(CachedLRC)); +} + +LRCCache[Reg] = {LRC, {}}; + } + + for (auto LRCIter : LRCCache) { +Register Reg = LRCIter.first; +const MachineCycle *Cycle = LRCIter.second.first; + +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; nhaehnle wrote: This check is now redundant https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() { return false; } +bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() { + MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)}; + initializeLaneMaskRegisterAttributes(BoolS1); + MachineSSAUpdater SSAUpdater(*MF); + + // In case of use outside muliple nested cycles or muliple uses we only need + // to merge lane mask across largest relevant cycle. + SmallDenseMap> LRCCache; + for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first; +if (CachedLRC) { + LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC; + assert(LRC->contains(CachedLRC)); +} + +LRCCache[Reg] = {LRC, {}}; + } + + for (auto LRCIter : LRCCache) { +Register Reg = LRCIter.first; +const MachineCycle *Cycle = LRCIter.second.first; + +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +Register MergedMask = MRI->createVirtualRegister(BoolS1); +SSAUpdater.Initialize(MergedMask); + +MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent(); +SSAUpdater.AddAvailableValue(MBB, MergedMask); + +for (auto Entry : Cycle->getEntries()) { + for (MachineBasicBlock *Pred : Entry->predecessors()) { +if (!Cycle->contains(Pred)) { + B.setInsertPt(*Pred, Pred->getFirstTerminator()); + auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {}); + SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0)); +} + } +} + +buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask, +SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg); + +LRCCache[Reg].second = MergedMask; + } + + for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) { +if (MRI->getType(Reg) != LLT::scalar(1)) + continue; + +replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache[Reg].second); nhaehnle wrote: Can use .lookup instead of operator[] for consistency with above. https://github.com/llvm/llvm-project/pull/124299 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Update divergence lowering tests (PR #128702)
https://github.com/nhaehnle approved this pull request. https://github.com/llvm/llvm-project/pull/128702 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
nhaehnle wrote: How about this comment from earlier: > Every Inst may potentially appear with many UseInsts in the temporal > divergence list. The current code will create multiple new registers and > multiple COPY instructions, which seems wasteful even if downstream passes > can often clean it up. > > I would suggest capturing the created register in a DenseMap<MachineInstr *, Register> for re-use. > > Also, how about inserting the COPY at the end of Inst->getParent()? That way, > the live range of the VGPR is reduced. ? https://github.com/llvm/llvm-project/pull/124298 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/145912 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)
https://github.com/nhaehnle approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/145912 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/145912 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)
@@ -57,6 +57,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +bool RegBankLegalizeHelper::executeInWaterfallLoop( +MachineIRBuilder &B, iterator_range Range, +SmallSet &SGPROperandRegs) { + // Track use registers which have already been expanded with a readfirstlane + // sequence. This may have multiple uses if moving a sequence. + DenseMap WaterfalledRegMap; + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction &MF = B.getMF(); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg; + if (IsWave32) { nhaehnle wrote: Long ago, I actually thought it would be nice to have a "LaneMaskHelper" class that does this setup, since it's duplicated in a bunch of places. But I don't feel particularly strongly about it -- not for this PR. https://github.com/llvm/llvm-project/pull/145912 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #145911)
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner { VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; - bool isLaneMask(Register Reg) { -const RegisterBank *RB = MRI.getRegBankOrNull(Reg); -if (RB && RB->getID() == AMDGPU::VCCRegBankID) - return true; + bool isLaneMask(Register Reg); + std::pair tryMatch(Register Src, unsigned Opcode); + std::pair tryMatchRALFromUnmerge(Register Src); + Register getReadAnyLaneSrc(Register Src); + void replaceRegWithOrBuildCopy(Register Dst, Register Src); -const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); -return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); - } + bool tryEliminateReadAnyLane(MachineInstr &Copy); + void tryCombineCopy(MachineInstr &MI); + void tryCombineS1AnyExt(MachineInstr &MI); +}; - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { -MI.eraseFromParent(); -if (Optional0 && isTriviallyDead(*Optional0, MRI)) - Optional0->eraseFromParent(); - } +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) +return true; - std::pair tryMatch(Register Src, unsigned Opcode) { -MachineInstr *MatchMI = MRI.getVRegDef(Src); -if (MatchMI->getOpcode() != Opcode) - return {nullptr, Register()}; -return {MatchMI, MatchMI->getOperand(1).getReg()}; - } + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); +} - void tryCombineCopy(MachineInstr &MI) { -Register Dst = MI.getOperand(0).getReg(); -Register Src = MI.getOperand(1).getReg(); -// Skip copies of physical registers. -if (!Dst.isVirtual() || !Src.isVirtual()) - return; - -// This is a cross bank copy, sgpr S1 to lane mask. -// -// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) -// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) -// -> -// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) -if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); - assert(Trunc && MRI.getType(TruncS32Src) == S32 && - "sgpr S1 must be result of G_TRUNC of sgpr S32"); - - B.setInstr(MI); - // Ensure that truncated bits in BoolSrc are 0. - auto One = B.buildConstant({SgprRB, S32}, 1); - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); - cleanUpAfterCombine(MI, Trunc); - return; -} +std::pair +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) +return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; +} nhaehnle wrote: Is there a better name for this than `tryMatch`? Come to think of it, can the generic matching infrastructure be used for this? https://github.com/llvm/llvm-project/pull/145911 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
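For the G_TRUNC case, the generic matchers from MIPatternMatch could be used along these lines (a sketch, not taken from the patch):

```c++
using namespace MIPatternMatch;

Register TruncSrc;
if (mi_match(Src, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
  // TruncSrc is the register feeding the G_TRUNC.
}
```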
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #145911)
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner { VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; - bool isLaneMask(Register Reg) { -const RegisterBank *RB = MRI.getRegBankOrNull(Reg); -if (RB && RB->getID() == AMDGPU::VCCRegBankID) - return true; + bool isLaneMask(Register Reg); + std::pair tryMatch(Register Src, unsigned Opcode); + std::pair tryMatchRALFromUnmerge(Register Src); + Register getReadAnyLaneSrc(Register Src); + void replaceRegWithOrBuildCopy(Register Dst, Register Src); -const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); -return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); - } + bool tryEliminateReadAnyLane(MachineInstr &Copy); + void tryCombineCopy(MachineInstr &MI); + void tryCombineS1AnyExt(MachineInstr &MI); +}; - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { -MI.eraseFromParent(); -if (Optional0 && isTriviallyDead(*Optional0, MRI)) - Optional0->eraseFromParent(); - } +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) +return true; - std::pair tryMatch(Register Src, unsigned Opcode) { -MachineInstr *MatchMI = MRI.getVRegDef(Src); -if (MatchMI->getOpcode() != Opcode) - return {nullptr, Register()}; -return {MatchMI, MatchMI->getOperand(1).getReg()}; - } + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); +} - void tryCombineCopy(MachineInstr &MI) { -Register Dst = MI.getOperand(0).getReg(); -Register Src = MI.getOperand(1).getReg(); -// Skip copies of physical registers. -if (!Dst.isVirtual() || !Src.isVirtual()) - return; - -// This is a cross bank copy, sgpr S1 to lane mask. -// -// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) -// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) -// -> -// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) -if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); - assert(Trunc && MRI.getType(TruncS32Src) == S32 && - "sgpr S1 must be result of G_TRUNC of sgpr S32"); - - B.setInstr(MI); - // Ensure that truncated bits in BoolSrc are 0. 
- auto One = B.buildConstant({SgprRB, S32}, 1); - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); - cleanUpAfterCombine(MI, Trunc); - return; -} +std::pair +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) +return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; +} + +std::pair +AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { + MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); + if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) +return {nullptr, -1}; + + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + if (auto *UnMerge = getOpcodeDef(RALSrc, MRI)) +return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; -// Src = G_AMDGPU_READANYLANE RALSrc -// Dst = COPY Src -// -> -// Dst = RALSrc -if (MRI.getRegBankOrNull(Dst) == VgprRB && -MRI.getRegBankOrNull(Src) == SgprRB) { - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (!RAL) -return; - - assert(MRI.getRegBank(RALSrc) == VgprRB); - MRI.replaceRegWith(Dst, RALSrc); - cleanUpAfterCombine(MI, RAL); - return; + return {nullptr, -1}; +} + +Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { + // Src = G_AMDGPU_READANYLANE RALSrc + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); + if (RAL) +return RALSrc; + + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc + // LoSgpr = G_AMDGPU_READANYLANE LoVgpr + // HiSgpr = G_AMDGPU_READANYLANE HiVgpr + // Src G_MERGE_VALUES LoSgpr, HiSgpr + auto *Merge = getOpcodeDef(Src, MRI); + if (Merge) { +unsigned NumElts = Merge->getNumSources(); +auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); +if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + return {}; + +// Check if all elements are from same unmerge and there is no shuffling. +for (unsigned i = 1; i < NumElts; ++i) { + auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); + if (UnmergeI != Unmerge || (unsigned)IdxI != i) +return {}; } +return Unmerge->getSourceReg(); } - void tryCombineS1AnyExt(MachineInstr &MI) { -// %Src:sgpr(S1) = G_TRUNC %TruncSrc -