[llvm-branch-commits] [llvm] b76014a - RegionInfo: use a range-based for loop [NFCI]

2020-12-29 Thread Nicolai Hähnle via llvm-branch-commits

Author: Nicolai Hähnle
Date: 2020-12-29T16:00:26+01:00
New Revision: b76014a4f15ad9f3151862fcc6c6ab2f0f505199

URL: 
https://github.com/llvm/llvm-project/commit/b76014a4f15ad9f3151862fcc6c6ab2f0f505199
DIFF: 
https://github.com/llvm/llvm-project/commit/b76014a4f15ad9f3151862fcc6c6ab2f0f505199.diff

LOG: RegionInfo: use a range-based for loop [NFCI]

Change-Id: I9985d72191a2b0680195032acf8a14ad2ba954ed

Differential Revision: https://reviews.llvm.org/D92932

Added: 


Modified: 
llvm/include/llvm/Analysis/RegionInfoImpl.h

Removed: 




diff  --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h 
b/llvm/include/llvm/Analysis/RegionInfoImpl.h
index 8d9ec646f519..bbd3dd5dc0ea 100644
--- a/llvm/include/llvm/Analysis/RegionInfoImpl.h
+++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h
@@ -585,10 +585,8 @@ bool RegionInfoBase::isRegion(BlockT *entry, BlockT 
*exit) const {
   // Exit is the header of a loop that contains the entry. In this case,
   // the dominance frontier must only contain the exit.
   if (!DT->dominates(entry, exit)) {
-for (typename DST::iterator SI = entrySuccs->begin(),
-SE = entrySuccs->end();
- SI != SE; ++SI) {
-  if (*SI != exit && *SI != entry)
+for (BlockT *successor : *entrySuccs) {
+  if (successor != exit && successor != entry)
 return false;
 }
 



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) 
const {
   return GRules.at(GRulesAlias.at(Opc));
 }
 
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and 
'!'.
+class Predicate {
+public:
+  struct Elt {
+// Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+// Sequences of && and || will be represented by jumps, for example:
+// (A && B && ... X) or (A && B && ... X) || Y
+//   A == true jump to B
+//   A == false jump to end or Y, result is A(false) or Y
+// (A || B || ... X) or (A || B || ... X) && Y
+//   A == true jump to end or Y, result is B(true) or Y

nhaehnle wrote:

```suggestion
//   A == true jump to end or Y, result is A(true) or Y
```

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) 
const {
   return GRules.at(GRulesAlias.at(Opc));
 }
 
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and 
'!'.
+class Predicate {
+public:
+  struct Elt {
+// Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+// Sequences of && and || will be represented by jumps, for example:
+// (A && B && ... X) or (A && B && ... X) || Y
+//   A == true jump to B
+//   A == false jump to end or Y, result is A(false) or Y
+// (A || B || ... X) or (A || B || ... X) && Y
+//   A == true jump to end or Y, result is B(true) or Y
+//   A == false jump to B
+// Notice that when negating expression, we simply flip Neg on each Pred
+// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+std::function Pred;
+bool Neg; // Neg of Pred is calculated before jump
+unsigned TJumpOffset;
+unsigned FJumpOffset;
+  };
+
+  SmallVector Expression;
+
+  Predicate(std::function Pred) {
+Expression.push_back({Pred, false, 1, 1});
+  };
+
+  Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); };

nhaehnle wrote:

I find having a constructor that is destructive on its argument in this way quite 
surprising. It goes against good patterns in modern C++. Better to have it 
take an rvalue reference (&&-reference). Then its callers have to explicitly 
`std::move`, but that makes it more obvious what happens at the call sites.

Also, this constructor should be private just like `Elt`.
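
For illustration, a minimal sketch of that shape, assuming the `Elt` fields from 
the patch; not the exact upstream code:

```c++
// Assumes the usual LLVM includes (SmallVector.h, MachineInstr.h, <functional>).
class Predicate {
  struct Elt {
    std::function<bool(const MachineInstr &)> Pred;
    bool Neg;
    unsigned TJumpOffset;
    unsigned FJumpOffset;
  };
  SmallVector<Elt, 8> Expression;

  // Private rvalue-reference constructor: callers must write
  // Predicate(std::move(Expr)), which makes the ownership transfer explicit
  // at the call site.
  Predicate(SmallVectorImpl<Elt> &&Expr) { Expression = std::move(Expr); }

public:
  Predicate(std::function<bool(const MachineInstr &)> Pred) {
    Expression.push_back({std::move(Pred), false, 1, 1});
  }
};
```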

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) 
const {
   return GRules.at(GRulesAlias.at(Opc));
 }
 
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and 
'!'.
+class Predicate {
+public:
+  struct Elt {
+// Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+// Sequences of && and || will be represented by jumps, for example:
+// (A && B && ... X) or (A && B && ... X) || Y
+//   A == true jump to B
+//   A == false jump to end or Y, result is A(false) or Y
+// (A || B || ... X) or (A || B || ... X) && Y
+//   A == true jump to end or Y, result is B(true) or Y
+//   A == false jump to B
+// Notice that when negating expression, we simply flip Neg on each Pred
+// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+std::function Pred;
+bool Neg; // Neg of Pred is calculated before jump
+unsigned TJumpOffset;
+unsigned FJumpOffset;
+  };
+
+  SmallVector Expression;

nhaehnle wrote:

Make this and `Elt` private. `Elt` is quite subtle and should be hidden as an 
implementation detail.

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) 
const {
   return GRules.at(GRulesAlias.at(Opc));
 }
 
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and 
'!'.
+class Predicate {
+public:
+  struct Elt {
+// Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+// Sequences of && and || will be represented by jumps, for example:
+// (A && B && ... X) or (A && B && ... X) || Y
+//   A == true jump to B
+//   A == false jump to end or Y, result is A(false) or Y
+// (A || B || ... X) or (A || B || ... X) && Y
+//   A == true jump to end or Y, result is B(true) or Y
+//   A == false jump to B
+// Notice that when negating expression, we simply flip Neg on each Pred
+// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+std::function Pred;
+bool Neg; // Neg of Pred is calculated before jump
+unsigned TJumpOffset;
+unsigned FJumpOffset;
+  };
+
+  SmallVector Expression;
+
+  Predicate(std::function Pred) {
+Expression.push_back({Pred, false, 1, 1});
+  };
+
+  Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); };
+
+  bool operator()(const MachineInstr &MI) const {
+unsigned Idx = 0;
+unsigned ResultIdx = Expression.size();
+bool Result;
+do {
+  Result = Expression[Idx].Pred(MI);
+  Result = Expression[Idx].Neg ? !Result : Result;
+  if (Result) {
+Idx += Expression[Idx].TJumpOffset;
+  } else {
+Idx += Expression[Idx].FJumpOffset;
+  }
+} while ((Idx != ResultIdx));
+
+return Result;
+  };
+
+  Predicate operator!() {
+SmallVector NegExpression;
+for (Elt &ExprElt : Expression) {
+  NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
+   ExprElt.TJumpOffset});
+}
+return Predicate(NegExpression);
+  };
+
+  Predicate operator&&(Predicate &RHS) {

nhaehnle wrote:

Similar to the constructor above, the baseline of this should be a const method 
that accepts a const reference as an argument.

Do you have a good argument for the rvalue-argument overload below? If not, 
please just remove it.

The same applies to `operator||`.
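
A sketch of the suggested baseline, reusing the jump-table layout documented in 
`Elt`; the offset fixup is one way to read that scheme, not necessarily the exact 
upstream implementation:

```c++
Predicate operator&&(const Predicate &RHS) const {
  // Append RHS directly after LHS: LHS true-jumps that used to land on "end"
  // now fall through into RHS; LHS false-jumps to "end" must skip RHS too.
  Predicate Result = *this;
  unsigned LHSSize = Expression.size();
  unsigned ResultIdx = LHSSize + RHS.Expression.size();
  for (unsigned I = 0; I != LHSSize; ++I) {
    if (I + Result.Expression[I].FJumpOffset == LHSSize)
      Result.Expression[I].FJumpOffset = ResultIdx - I;
  }
  Result.Expression.append(RHS.Expression.begin(), RHS.Expression.end());
  return Result;
}
```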

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -290,7 +504,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const 
GCNSubtarget &_ST,
   .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
   .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
 
-  addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}});
+  bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
+  bool hasSMRDSmall = ST->hasScalarSubwordLoads();
+
+  Predicate isAlign16([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->getAlign() >= Align(16);
+  });
+
+  Predicate isAlign4([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->getAlign() >= Align(4);
+  });
+
+  Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->isAtomic();
+  });
+
+  Predicate isUniMMO([](const MachineInstr &MI) -> bool {
+return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin());
+  });
+
+  Predicate isConst([](const MachineInstr &MI) -> bool {
+// Address space in MMO be different then address space on pointer.
+const MachineMemOperand *MMO = *MI.memoperands_begin();
+const unsigned AS = MMO->getAddrSpace();
+return AS == AMDGPUAS::CONSTANT_ADDRESS ||
+   AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+  });
+
+  Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->isVolatile();
+  });
+
+  Predicate isInvMMO([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->isInvariant();
+  });
+
+  Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
+return (*MI.memoperands_begin())->getFlags() & MONoClobber;
+  });
+
+  Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool {
+const MachineMemOperand *MMO = *MI.memoperands_begin();
+const unsigned MemSize = 8 * MMO->getSize().getValue();
+return (MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+   (MemSize == 8 && MMO->getAlign() >= Align(1));
+  });
+
+  auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
+  (isConst || isInvMMO || isNoClobberMMO);

nhaehnle wrote:

What's the logic behind the `isConst || !isVolatileMMO` part of this predicate? 
const && volatile doesn't make much sense to me, so why isn't this just 
`!isVolatileMMO`?

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -290,7 +504,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const 
GCNSubtarget &_ST,
   .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
   .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
 
-  addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}});
+  bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;

nhaehnle wrote:

"unaligned" is a single word:
```suggestion
  bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
```

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

I have a bunch of comments, but apart from that the change LGTM

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,6 +327,126 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) 
const {
   return GRules.at(GRulesAlias.at(Opc));
 }
 
+// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and 
'!'.
+class Predicate {
+public:
+  struct Elt {
+// Save formula composed of Pred, '&&', '||' and '!' as a jump table.
+// Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
+// Sequences of && and || will be represented by jumps, for example:
+// (A && B && ... X) or (A && B && ... X) || Y
+//   A == true jump to B
+//   A == false jump to end or Y, result is A(false) or Y
+// (A || B || ... X) or (A || B || ... X) && Y
+//   A == true jump to end or Y, result is B(true) or Y
+//   A == false jump to B
+// Notice that when negating expression, we simply flip Neg on each Pred
+// and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
+std::function Pred;
+bool Neg; // Neg of Pred is calculated before jump
+unsigned TJumpOffset;
+unsigned FJumpOffset;
+  };
+
+  SmallVector Expression;
+
+  Predicate(std::function Pred) {
+Expression.push_back({Pred, false, 1, 1});
+  };
+
+  Predicate(SmallVectorImpl &Expr) { Expression.swap(Expr); };
+
+  bool operator()(const MachineInstr &MI) const {
+unsigned Idx = 0;
+unsigned ResultIdx = Expression.size();
+bool Result;
+do {
+  Result = Expression[Idx].Pred(MI);
+  Result = Expression[Idx].Neg ? !Result : Result;
+  if (Result) {
+Idx += Expression[Idx].TJumpOffset;
+  } else {
+Idx += Expression[Idx].FJumpOffset;
+  }
+} while ((Idx != ResultIdx));
+
+return Result;
+  };
+
+  Predicate operator!() {

nhaehnle wrote:

Should be a const method.

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2024-11-28 Thread Nicolai Hähnle via llvm-branch-commits


@@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
 MI.eraseFromParent();
 break;
   }
+  case SplitLoad: {
+LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+unsigned Size = DstTy.getSizeInBits();
+// Even split to 128-bit loads
+if (Size > 128) {
+  LLT B128;
+  if (DstTy.isVector()) {
+LLT EltTy = DstTy.getElementType();
+B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
+  } else {
+B128 = LLT::scalar(128);
+  }
+  if (Size / 128 == 2)
+splitLoad(MI, {B128, B128});
+  if (Size / 128 == 4)
+splitLoad(MI, {B128, B128, B128, B128});

nhaehnle wrote:

I would expect this to be an else-if chain with an `else llvm_unreachable` at 
the end.
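
Roughly (a sketch against the hunk above; the message string is made up):

```c++
if (Size / 128 == 2)
  splitLoad(MI, {B128, B128});
else if (Size / 128 == 4)
  splitLoad(MI, {B128, B128, B128, B128});
else
  llvm_unreachable("unexpected load size in SplitLoad lowering");
```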

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

LGTM aside from a small nit

https://github.com/llvm/llvm-project/pull/112864
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits


@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr 
&I) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {

nhaehnle wrote:

I don't think it's trivial to avoid this a priori. I agree that a separate 
cleanup optimization could do it; in any case, I'd say it's best left to a 
separate, focused change.

https://github.com/llvm/llvm-project/pull/112864
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/112864
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits


@@ -9,7 +9,11 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
 
+#include "AMDGPURegisterBankInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

nhaehnle wrote:

We should avoid unnecessary `#include`s in headers (for compile-time reasons). I 
doubt that all of these are really needed.
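
For reference, the usual shape is to forward-declare what the header only names 
and keep the heavy includes in the .cpp; which declarations actually suffice here 
would need checking, so this is illustrative only:

```c++
// In the header: forward declarations instead of full includes.
namespace llvm {
class MachineIRBuilder;
class MachineRegisterInfo;
class RegisterBankInfo;
} // namespace llvm

// The corresponding #includes then move to the .cpp file, the only place
// that needs the complete definitions.
```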

https://github.com/llvm/llvm-project/pull/112864
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/112882
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)

2025-01-06 Thread Nicolai Hähnle via llvm-branch-commits


@@ -54,9 +54,34 @@ const MachineBasicBlock 
*MachineSSAContext::getDefBlock(Register value) const {
   return F->getRegInfo().getVRegDef(value)->getParent();
 }
 
+static bool isUndef(const MachineInstr &MI) {
+  return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF ||
+ MI.getOpcode() == TargetOpcode::IMPLICIT_DEF;
+}
+
+/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI.
 template <>
 bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
-  return Phi.isConstantValuePHI();
+  if (!Phi.isPHI())
+return false;
+
+  // In later passes PHI may appear with an undef operand, getVRegDef can fail.
+  if (Phi.getOpcode() == TargetOpcode::PHI)
+return Phi.isConstantValuePHI();
+
+  // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue().
+  const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo();
+  Register This = Phi.getOperand(0).getReg();
+  Register ConstantValue;
+  for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) {
+Register Incoming = Phi.getOperand(i).getReg();
+if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) {
+  if (ConstantValue && ConstantValue != Incoming)
+return false;
+  ConstantValue = Incoming;
+}
+  }

nhaehnle wrote:

I don't understand this logic. Why are you looking at operand 0? What's the 
meaning of `This` vs. `ConstantValue`? This looks very fishy.

https://github.com/llvm/llvm-project/pull/112866
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits


@@ -219,6 +220,54 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+
+  for (auto [Inst, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
+Register Reg = Inst->getOperand(0).getReg();
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+Register MergedMask = MRI->createVirtualRegister(BoolS1);
+Register PrevIterMask = MRI->createVirtualRegister(BoolS1);
+
+MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader();
+SmallVector ExitingBlocks;
+Cycle->getExitingBlocks(ExitingBlocks);
+assert(ExitingBlocks.size() == 1);
+MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0];
+
+B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin());
+auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask);
+
+// We only care about cycle iterration path - merge Reg with previous
+// iteration. For other incomings use implicit def.
+// Predecessors should be CyclePredecessor and CycleExitingMBB.
+// In older versions of irreducible control flow lowering there could be
+// cases with more predecessors. To keep this lowering as generic as
+// possible also handle those cases.
+for (auto MBB : CycleHeaderMBB->predecessors()) {
+  if (MBB == CycleExitingMBB) {
+CrossIterPHI.addReg(MergedMask);
+  } else {
+B.setInsertPt(*MBB, MBB->getFirstTerminator());
+auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+CrossIterPHI.addReg(ImplDef.getReg(0));
+  }
+  CrossIterPHI.addMBB(MBB);
+}
+
+MachineBasicBlock *MBB = Inst->getParent();
+buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
+PrevIterMask, Reg);

nhaehnle wrote:

In this case, it would be better to move the lane merging to directly after 
`Inst`.

The register pressure calculus is exactly the opposite of the non-i1 case:

* In the non-i1 case, shifting the COPY down is good because it reduces the 
live range of the VGPR while potentially making the live range of the SGPR 
longer. VGPRs are more expensive than SGPRs, so this is a good trade-off.
* In the i1 case, we'll have a live range for the merged mask extending across 
the entire cycle anyway. By moving the lane merging closer to Inst, we leave 
the live range of the merged mask unchanged, but we (most likely) reduce the 
live range of the i1 value produced by Inst.

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits


@@ -188,6 +190,35 @@ void 
DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
   In.Reg = Copy.getReg(0);
 }
 
+void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
+Register NewReg) {
+  for (MachineOperand &Op : Inst->operands()) {
+if (Op.isReg() && Op.getReg() == Reg)
+  Op.setReg(NewReg);
+  }
+}
+
+bool DivergenceLoweringHelper::lowerTemporalDivergence() {
+  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
+
+  for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
+Register Reg = Inst->getOperand(0).getReg();
+if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
+ILMA.isS32S64LaneMask(Reg))
+  continue;
+
+MachineBasicBlock *MBB = Inst->getParent();
+B.setInsertPt(*MBB, 
MBB->SkipPHIsAndLabels(std::next(Inst->getIterator(;
+
+Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
+B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
+.addUse(ExecReg, RegState::Implicit);
+
+replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
+  }
+  return false;
+}

nhaehnle wrote:

I do have one high-level comment about this.

Every `Inst` may potentially appear with many `UseInst`s in the temporal 
divergence list. The current code will create multiple new registers and 
multiple `COPY` instructions, which seems wasteful even if downstream passes 
can often clean it up.

I would suggest capturing the created register in a `DenseMap` for re-use.

Also, how about inserting the `COPY` at the end of `Inst->getParent()`? That 
way, the live range of the VGPR is reduced.

https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

I haven't done a detailed review of the code, but from a high-level algorithmic 
view this change already looks pretty reasonable to me.

https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits


@@ -219,6 +220,54 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+
+  for (auto [Inst, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
+Register Reg = Inst->getOperand(0).getReg();
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+Register MergedMask = MRI->createVirtualRegister(BoolS1);
+Register PrevIterMask = MRI->createVirtualRegister(BoolS1);
+
+MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader();
+SmallVector ExitingBlocks;
+Cycle->getExitingBlocks(ExitingBlocks);
+assert(ExitingBlocks.size() == 1);
+MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0];
+
+B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin());
+auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask);
+
+// We only care about cycle iterration path - merge Reg with previous
+// iteration. For other incomings use implicit def.
+// Predecessors should be CyclePredecessor and CycleExitingMBB.
+// In older versions of irreducible control flow lowering there could be
+// cases with more predecessors. To keep this lowering as generic as
+// possible also handle those cases.
+for (auto MBB : CycleHeaderMBB->predecessors()) {
+  if (MBB == CycleExitingMBB) {
+CrossIterPHI.addReg(MergedMask);
+  } else {
+B.setInsertPt(*MBB, MBB->getFirstTerminator());
+auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+CrossIterPHI.addReg(ImplDef.getReg(0));
+  }
+  CrossIterPHI.addMBB(MBB);
+}

nhaehnle wrote:

This still feels vaguely off to me.

First of all, can you please add a `Cycle->isReducible()` assert here just in 
case, since you're relying on there only being one header? (Actually, that 
assert should probably be in `GenericCycle::getHeader`.)

Second, I'm worried about that case distinction between different kinds of 
predecessors. It really doesn't feel properly generic to me. Latches and 
exiting blocks aren't necessarily the same thing. (The current structurizer 
tends to make it that way, but even with the structurizer we probably don't 
have a guarantee, let alone with a future different control flow lowering.)

Let's think through some cases:

* If the predecessor is outside the cycle, it should be an `IMPLICIT_DEF`
* If it is inside the cycle, i.e. a latch, there are two cases:
  * If the predecessor is dominated by `Inst`, then clearly use `MergedMask`
  * Otherwise, it's actually not clear: could it be possible that control flow 
leaves the dominance region of `Inst` and goes around the loop again?

On that last point, you could perhaps make a case that because the loop is 
temporally divergent, having the CFG in wave CFG form requires a certain 
structure. But I'm not convinced. In fact, consider:
```mermaid
flowchart TD
Header --> B
Header --> Inst
Inst --> B
B --> Header
Inst --> C
C --> Header
C --> Exit
```
Let's say the branches of Header and C are uniform; only the branch of the 
basic block containing Inst is divergent.

Then we have temporal divergence in Inst, and need to handle the `i1`. But the 
CFG is already *reconvergent*: the divergent branch is post-dominated by one of 
its immediate successors. This means it is possible to insert EXEC masking code 
for this CFG correctly without structuring it. Therefore, the code here should 
really handle this case correctly.

I think this calls for using MachineSSAUpdater for a robust solution:

* Initialize the updater with IMPLICIT_DEF in every predecessor of the header 
that is outside the cycle.
* Insert the MergedMask into the updater for `Inst->getParent()`
* Then build the lane mask merging, querying the updater for the previous mask 
(i.e., let the updater create that register) 
* The MergedMask can still be used by UseInst due to dominance
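
A rough sketch of that flow, borrowing names from the patch (`BoolS1`, 
`MergedMask`, `buildMergeLaneMasks`); caching across multiple uses, discussed 
below, is left out:

```c++
MachineSSAUpdater SSAUpdater(*MF);
Register MergedMask = MRI->createVirtualRegister(BoolS1);
SSAUpdater.Initialize(MergedMask);

// Lane masks coming into the cycle from outside are "don't care".
for (MachineBasicBlock *Entry : Cycle->getEntries()) {
  for (MachineBasicBlock *Pred : Entry->predecessors()) {
    if (!Cycle->contains(Pred)) {
      B.setInsertPt(*Pred, Pred->getFirstTerminator());
      auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
      SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
    }
  }
}

// Define the merged mask in Inst's block; the updater synthesizes whatever
// phis are needed to carry the previous iteration's value to this point.
MachineBasicBlock *MBB = Inst->getParent();
SSAUpdater.AddAvailableValue(MBB, MergedMask);
buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                    SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

// MergedMask dominates UseInst, so it can be used there directly.
replaceUsesOfRegInInstWith(Reg, UseInst, MergedMask);
```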

Except now there's an additional issue with caching the MergedMask that doesn't 
exist for the non-i1 case: Different `UseInst`s could appear with respect to 
different `Cycle`s. The non-i1 case doesn't actually care about the cycle 
structure, but the code here does.

In particular, with my suggestions, it will use the cycle to determine how to 
initialize the updater.

We can still get away with a single `MergedMask` per `Inst`:

* The lazy way to do it would be to not initialize the updater with 
IMPLICIT_DEFs and just let it figure them out by itself. This would generate 
correct code.
  * However, this has a compile-time cost and could end up being too 
pessimistic in terms of the generated code.
  * The Inst and all its uses could be contained in a cycle. This naive way 
would lead to a phi node at the header of this cycle, and therefore needlessly 
extend the live range of the merged mask.

[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

Thank you for moving the insert point.

A comment analogous to the non-i1 case applies here as well: it would be good to 
cache the `MergedMask` register for re-use when the same `Inst` occurs with 
multiple `UseInst`s.

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-02-02 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-01-30 Thread Nicolai Hähnle via llvm-branch-commits


@@ -188,6 +190,37 @@ void 
DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
   In.Reg = Copy.getReg(0);
 }
 
+void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
+Register NewReg) {
+  for (MachineOperand &Op : Inst->operands()) {
+if (Op.isReg() && Op.getReg() == Reg)
+  Op.setReg(NewReg);
+  }
+}
+
+bool DivergenceLoweringHelper::lowerTempDivergence() {
+  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
+
+  for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) {
+Register Reg = Inst->getOperand(0).getReg();
+if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
+ILMA.isS32S64LaneMask(Reg))
+  continue;
+
+MachineInstr *MI = const_cast(Inst);

nhaehnle wrote:

These come out of the analysis. The analysis itself uses const 
pointers/references in its implementation, which I believe is a good idea for 
const correctness. I wouldn't change that.

So a `const_cast` is needed at some point. The only question is where.

I think here is as good a place as any, though perhaps grouping them together 
with a small explanation is in order. Something like:
```c++
// As an analysis, UniformityAnalysis treats instructions as const. We have
// the parent function as non-const, so casting const away here is inelegant
// but justified.
MachineInstr *MI = const_cast<MachineInstr *>(Inst);
MachineInstr *UseMI = const_cast<MachineInstr *>(UseInst);
```

https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)

2025-01-23 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.


https://github.com/llvm/llvm-project/pull/112864
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)

2025-01-23 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.


https://github.com/llvm/llvm-project/pull/112866
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -251,8 +245,11 @@ body: |
 ; CHECK: liveins: $vgpr0
 ; CHECK-NEXT: {{  $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
-; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s1)
+; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), 
[[C1]]
+; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], 
[[C1]]

nhaehnle wrote:

Similar here: this could be combined down to just a no-op -- don't combiners do 
that already? They should, and so this should probably not be handled 
separately by legalization.

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -269,10 +266,12 @@ body: |
 ; CHECK: liveins: $vgpr0
 ; CHECK-NEXT: {{  $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
-; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s1)
-; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[ANYEXT]](s32), 
[[DEF]](s32)
+; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), 
[[C1]]
+; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], 
[[C1]]
+; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), 
[[C1]](s32)

nhaehnle wrote:

Could just be a single G_ANYEXT

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -233,8 +222,13 @@ body: |
 ; CHECK: liveins: $vgpr0
 ; CHECK-NEXT: {{  $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
-; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s16) = G_ANYEXT [[TRUNC]](s1)
+; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), 
[[C1]]
+; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s16) = G_CONSTANT i16 1
+; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s16) = G_CONSTANT i16 0
+; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s16) = G_SELECT [[ICMP]](s1), 
[[C2]], [[C3]]

nhaehnle wrote:

This is unnecessarily convoluted. A single `G_TRUNC` should do the trick. 
(Isn't that something a combiner could do?)

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -215,8 +205,7 @@ body: |
 ; CHECK: liveins: $sgpr0
 ; CHECK-NEXT: {{  $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
-; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
-; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[TRUNC]](s1)
+; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32)

nhaehnle wrote:

Isn't this a correctness regression? I'm not entirely certain because I 
remember there was some weirdness around what G_TRUNC means semantically. Can 
you explain why there is no need for a trunc or bitwise and or something like 
that in the output?

Note that `anyext_s1_to_s32_vgpr` does leave a G_AND, so either that test shows 
a code quality issue or this test is incorrect.

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -160,8 +154,7 @@ body: |
 ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
 ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
 ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], 
[[C1]]
-; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), 
[[DEF]](s32)
+; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SELECT]](s32), 
[[C1]](s32)

nhaehnle wrote:

This change is a code quality regression: the input has `G_ANYEXT`, so the high 
half can be undefined.

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

I didn't look at everything, I just went through some of the tests.

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc (PR #132383)

2025-03-26 Thread Nicolai Hähnle via llvm-branch-commits


@@ -13,7 +12,8 @@ body: |
 ; CHECK: liveins: $sgpr0
 ; CHECK-NEXT: {{  $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
-; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s64) = G_ANYEXT [[COPY]](s32)
+; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
+; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[COPY]](s32), 
[[DEF]](s32)

nhaehnle wrote:

Why are we legalizing this G_ANYEXT to G_MERGE_VALUES, but in 
`anyext_s1_to_s64_scc` we generate a new `G_ANYEXT`?

https://github.com/llvm/llvm-project/pull/132383
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits


@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of use outside muliple nested cycles or muliple uses we only need
+  // to merge lane mask across largest relevant cycle.
+  SmallDenseMap> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
+if (CachedLRC) {
+  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
+  assert(LRC->contains(CachedLRC));
+}
+
+LRCCache[Reg] = {LRC, {}};
+  }
+
+  for (auto LRCIter : LRCCache) {
+Register Reg = LRCIter.first;
+const MachineCycle *Cycle = LRCIter.second.first;
+
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+Register MergedMask = MRI->createVirtualRegister(BoolS1);
+SSAUpdater.Initialize(MergedMask);
+
+MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
+SSAUpdater.AddAvailableValue(MBB, MergedMask);
+
+for (auto Entry : Cycle->getEntries()) {
+  for (MachineBasicBlock *Pred : Entry->predecessors()) {
+if (!Cycle->contains(Pred)) {
+  B.setInsertPt(*Pred, Pred->getFirstTerminator());
+  auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+  SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
+}
+  }
+}
+
+buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
+SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);
+
+LRCCache[Reg].second = MergedMask;

nhaehnle wrote:

Should be able to just keep LRCIter/Entry as a reference and update via that 
instead of repeating the cache lookup.
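
I.e. something along these lines (sketch):

```c++
for (auto &Entry : LRCCache) {
  Register Reg = Entry.first;
  const MachineCycle *Cycle = Entry.second.first;

  Register MergedMask = MRI->createVirtualRegister(BoolS1);
  // ... SSAUpdater setup and buildMergeLaneMasks as before ...

  Entry.second.second = MergedMask; // write through the reference, no re-lookup
}
```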

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle commented:

Thanks, this now looks good to me in terms of the overall flow. I have a bunch 
of nitpickier, mostly style-related comments.

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits


@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of use outside muliple nested cycles or muliple uses we only need
+  // to merge lane mask across largest relevant cycle.
+  SmallDenseMap> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
+if (CachedLRC) {
+  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
+  assert(LRC->contains(CachedLRC));
+}
+
+LRCCache[Reg] = {LRC, {}};
+  }
+
+  for (auto LRCIter : LRCCache) {

nhaehnle wrote:

Naming: this isn't an iterator; a more accurate generic name would be just 
"Entry" (or LRCCacheEntry, but that's long).

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.

Thanks!

https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits


@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of use outside muliple nested cycles or muliple uses we only need
+  // to merge lane mask across largest relevant cycle.
+  SmallDenseMap> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
+if (CachedLRC) {
+  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
+  assert(LRC->contains(CachedLRC));
+}
+
+LRCCache[Reg] = {LRC, {}};

nhaehnle wrote:

This is actually a great use case for try_emplace to do the cache lookup only 
once:
```suggestion
const MachineCycle *&CachedLRC = LRCCache.try_emplace(Reg);
if (!CachedLRC || !CachedLRC->contains(LRC))
  CachedLRC = LRC;
```

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits


@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of use outside muliple nested cycles or muliple uses we only need
+  // to merge lane mask across largest relevant cycle.
+  SmallDenseMap> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
+if (CachedLRC) {
+  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
+  assert(LRC->contains(CachedLRC));
+}
+
+LRCCache[Reg] = {LRC, {}};
+  }
+
+  for (auto LRCIter : LRCCache) {
+Register Reg = LRCIter.first;
+const MachineCycle *Cycle = LRCIter.second.first;
+
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;

nhaehnle wrote:

This check is now redundant

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits


@@ -228,6 +229,66 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
   return false;
 }
 
+bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
+  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+  initializeLaneMaskRegisterAttributes(BoolS1);
+  MachineSSAUpdater SSAUpdater(*MF);
+
+  // In case of use outside muliple nested cycles or muliple uses we only need
+  // to merge lane mask across largest relevant cycle.
+  SmallDenseMap> LRCCache;
+  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
+if (CachedLRC) {
+  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
+  assert(LRC->contains(CachedLRC));
+}
+
+LRCCache[Reg] = {LRC, {}};
+  }
+
+  for (auto LRCIter : LRCCache) {
+Register Reg = LRCIter.first;
+const MachineCycle *Cycle = LRCIter.second.first;
+
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+Register MergedMask = MRI->createVirtualRegister(BoolS1);
+SSAUpdater.Initialize(MergedMask);
+
+MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
+SSAUpdater.AddAvailableValue(MBB, MergedMask);
+
+for (auto Entry : Cycle->getEntries()) {
+  for (MachineBasicBlock *Pred : Entry->predecessors()) {
+if (!Cycle->contains(Pred)) {
+  B.setInsertPt(*Pred, Pred->getFirstTerminator());
+  auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+  SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
+}
+  }
+}
+
+buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
+SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);
+
+LRCCache[Reg].second = MergedMask;
+  }
+
+  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
+if (MRI->getType(Reg) != LLT::scalar(1))
+  continue;
+
+replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache[Reg].second);

nhaehnle wrote:

Can use .lookup instead of operator[] for consistency with above.

https://github.com/llvm/llvm-project/pull/124299
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Update divergence lowering tests (PR #128702)

2025-03-07 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.


https://github.com/llvm/llvm-project/pull/128702
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)

2025-02-21 Thread Nicolai Hähnle via llvm-branch-commits

nhaehnle wrote:

How about this comment from earlier:

> Every Inst may potentially appear with many UseInsts in the temporal 
> divergence list. The current code will create multiple new registers and 
> multiple COPY instructions, which seems wasteful even if downstream passes 
> can often clean it up.
>
> I would suggest capturing the created register in a DenseMap (keyed by Inst, 
> mapping to the created Register) for re-use.
>
> Also, how about inserting the COPY at the end of Inst->getParent()? That way, 
> the live range of the VGPR is reduced.

?

https://github.com/llvm/llvm-project/pull/124298
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/145912
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/145912
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle edited 
https://github.com/llvm/llvm-project/pull/145912
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize (PR #145912)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits


@@ -57,6 +57,224 @@ void 
RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
   lower(MI, Mapping, WaterfallSgprs);
 }
 
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+MachineIRBuilder &B, iterator_range Range,
+SmallSet &SGPROperandRegs) {
+  // Track use registers which have already been expanded with a readfirstlane
+  // sequence. This may have multiple uses if moving a sequence.
+  DenseMap WaterfalledRegMap;
+
+  MachineBasicBlock &MBB = B.getMBB();
+  MachineFunction &MF = B.getMF();
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+  if (IsWave32) {

nhaehnle wrote:

Long ago, I actually thought it would be nice to have a "LaneMaskHelper" class 
that does this setup, since it's duplicated in a bunch of places. But I don't 
feel particularly strongly about it -- not for this PR.
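
For reference, roughly what I had in mind (untested sketch; the name and exact
member list are made up):

```cpp
struct LaneMaskHelper {
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc;
  MCRegister ExecReg;
  const TargetRegisterClass *WaveRC;

  LaneMaskHelper(const GCNSubtarget &ST) {
    WaveRC = ST.getRegisterInfo()->getWaveMaskRegClass();
    if (ST.isWave32()) {
      MovExecOpc = AMDGPU::S_MOV_B32;
      MovExecTermOpc = AMDGPU::S_MOV_B32_term;
      XorTermOpc = AMDGPU::S_XOR_B32_term;
      AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
      ExecReg = AMDGPU::EXEC_LO;
    } else {
      MovExecOpc = AMDGPU::S_MOV_B64;
      MovExecTermOpc = AMDGPU::S_MOV_B64_term;
      XorTermOpc = AMDGPU::S_XOR_B64_term;
      AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
      ExecReg = AMDGPU::EXEC;
    }
  }
};
```

The waterfall lowering here and the other copies of this setup could then take
one of these instead of repeating the wave32/wave64 switch.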

https://github.com/llvm/llvm-project/pull/145912
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #145911)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits


@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
 
-  bool isLaneMask(Register Reg) {
-const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
-if (RB && RB->getID() == AMDGPU::VCCRegBankID)
-  return true;
+  bool isLaneMask(Register Reg);
+  std::pair tryMatch(Register Src, unsigned Opcode);
+  std::pair tryMatchRALFromUnmerge(Register Src);
+  Register getReadAnyLaneSrc(Register Src);
+  void replaceRegWithOrBuildCopy(Register Dst, Register Src);
 
-const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
-return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
-  }
+  bool tryEliminateReadAnyLane(MachineInstr &Copy);
+  void tryCombineCopy(MachineInstr &MI);
+  void tryCombineS1AnyExt(MachineInstr &MI);
+};
 
-  void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
-MI.eraseFromParent();
-if (Optional0 && isTriviallyDead(*Optional0, MRI))
-  Optional0->eraseFromParent();
-  }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+  const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+  if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+return true;
 
-  std::pair tryMatch(Register Src, unsigned Opcode) {
-MachineInstr *MatchMI = MRI.getVRegDef(Src);
-if (MatchMI->getOpcode() != Opcode)
-  return {nullptr, Register()};
-return {MatchMI, MatchMI->getOperand(1).getReg()};
-  }
+  const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+  return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
 
-  void tryCombineCopy(MachineInstr &MI) {
-Register Dst = MI.getOperand(0).getReg();
-Register Src = MI.getOperand(1).getReg();
-// Skip copies of physical registers.
-if (!Dst.isVirtual() || !Src.isVirtual())
-  return;
-
-// This is a cross bank copy, sgpr S1 to lane mask.
-//
-// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
-// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
-// ->
-// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
-if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
-  auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
-  assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
-  B.setInstr(MI);
-  // Ensure that truncated bits in BoolSrc are 0.
-  auto One = B.buildConstant({SgprRB, S32}, 1);
-  auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
-  B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
-  cleanUpAfterCombine(MI, Trunc);
-  return;
-}
+std::pair
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+  MachineInstr *MatchMI = MRI.getVRegDef(Src);
+  if (MatchMI->getOpcode() != Opcode)
+return {nullptr, Register()};
+  return {MatchMI, MatchMI->getOperand(1).getReg()};
+}

nhaehnle wrote:

Is there a better name for this than `tryMatch`? Come to think of it, can the 
generic matching infrastructure be used for this?
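
For example, the G_TRUNC case above could use MIPatternMatch (sketch; the
target-specific G_AMDGPU_READANYLANE matches would still need a plain opcode
check):

```cpp
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
using namespace llvm::MIPatternMatch;

Register TruncSrc;
if (mi_match(Src, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
  // TruncSrc is the sgpr(s32) source of the G_TRUNC.
}
```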

https://github.com/llvm/llvm-project/pull/145911
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #145911)

2025-06-27 Thread Nicolai Hähnle via llvm-branch-commits


@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
 
-  bool isLaneMask(Register Reg) {
-const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
-if (RB && RB->getID() == AMDGPU::VCCRegBankID)
-  return true;
+  bool isLaneMask(Register Reg);
+  std::pair tryMatch(Register Src, unsigned Opcode);
+  std::pair tryMatchRALFromUnmerge(Register Src);
+  Register getReadAnyLaneSrc(Register Src);
+  void replaceRegWithOrBuildCopy(Register Dst, Register Src);
 
-const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
-return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
-  }
+  bool tryEliminateReadAnyLane(MachineInstr &Copy);
+  void tryCombineCopy(MachineInstr &MI);
+  void tryCombineS1AnyExt(MachineInstr &MI);
+};
 
-  void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
-MI.eraseFromParent();
-if (Optional0 && isTriviallyDead(*Optional0, MRI))
-  Optional0->eraseFromParent();
-  }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+  const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+  if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+return true;
 
-  std::pair tryMatch(Register Src, unsigned Opcode) {
-MachineInstr *MatchMI = MRI.getVRegDef(Src);
-if (MatchMI->getOpcode() != Opcode)
-  return {nullptr, Register()};
-return {MatchMI, MatchMI->getOperand(1).getReg()};
-  }
+  const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+  return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
 
-  void tryCombineCopy(MachineInstr &MI) {
-Register Dst = MI.getOperand(0).getReg();
-Register Src = MI.getOperand(1).getReg();
-// Skip copies of physical registers.
-if (!Dst.isVirtual() || !Src.isVirtual())
-  return;
-
-// This is a cross bank copy, sgpr S1 to lane mask.
-//
-// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
-// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
-// ->
-// %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
-if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
-  auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
-  assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
-  B.setInstr(MI);
-  // Ensure that truncated bits in BoolSrc are 0.
-  auto One = B.buildConstant({SgprRB, S32}, 1);
-  auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
-  B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
-  cleanUpAfterCombine(MI, Trunc);
-  return;
-}
+std::pair
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+  MachineInstr *MatchMI = MRI.getVRegDef(Src);
+  if (MatchMI->getOpcode() != Opcode)
+return {nullptr, Register()};
+  return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+  MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+  if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+return {nullptr, -1};
+
+  Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+  if (auto *UnMerge = getOpcodeDef(RALSrc, MRI))
+return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
 
-// Src = G_AMDGPU_READANYLANE RALSrc
-// Dst = COPY Src
-// ->
-// Dst = RALSrc
-if (MRI.getRegBankOrNull(Dst) == VgprRB &&
-MRI.getRegBankOrNull(Src) == SgprRB) {
-  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
-  if (!RAL)
-return;
-
-  assert(MRI.getRegBank(RALSrc) == VgprRB);
-  MRI.replaceRegWith(Dst, RALSrc);
-  cleanUpAfterCombine(MI, RAL);
-  return;
+  return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+  // Src = G_AMDGPU_READANYLANE RALSrc
+  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+  if (RAL)
+return RALSrc;
+
+  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+  // Src G_MERGE_VALUES LoSgpr, HiSgpr
+  auto *Merge = getOpcodeDef(Src, MRI);
+  if (Merge) {
+unsigned NumElts = Merge->getNumSources();
+auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+  return {};
+
+// Check if all elements are from same unmerge and there is no shuffling.
+for (unsigned i = 1; i < NumElts; ++i) {
+  auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+  if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+return {};
 }
+return Unmerge->getSourceReg();
   }
 
-  void tryCombineS1AnyExt(MachineInstr &MI) {
-// %Src:sgpr(S1) = G_TRUNC %TruncSrc
-