[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
llvmbot wrote:

@llvm/pr-subscribers-mlir-vector

Author: Andrzej Warzyński (banach-space)

Changes

- **[mlir][linalg] Refactor vectorization hooks to improve code reuse**
- **[mlir][linalg] Simplify `createWriteOrMaskedWrite` (NFC)**

---

Full diff: https://github.com/llvm/llvm-project/pull/141567.diff

1 Files Affected:

- (modified) mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp (+40-78)

```diff
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 0113ba86a5ae3..2abb2f0ea467c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1590,61 +1590,46 @@ static bool isMaskTriviallyFoldable(SmallVector<OpFoldResult> &maskSizes,
 /// Creates an optionally masked TransferWriteOp
 ///
 /// Generates the following operation:
-///   %res = vector.transfer_write %vectorToStore into %dest
+///   %res = vector.transfer_write %vecToStore into %dest
 ///
-/// If the leading N dimensions of the vector to store do not match
-/// `inputVecSizesForLeadingDims` (N = rank(inputVecSizesForLeadingDims)),
-/// masking is applied to ensure correctness:
+/// If shape(vecToStore) != shape(dest), masking is used to ensure correctness:
 ///
-///   %mask = vector.create_mask(%destShape) : %vectorToStoreShape
+///   %mask = vector.create_mask(%destShape) : %vecToStoreShape
 ///   %res = vector.mask %mask {
-///     vector.transfer_write %vectorToStore into %dest
+///     vector.transfer_write %vecToStore into %dest
 ///   }
 ///
-/// The mask shape is identical to `vectorToStore` (with the element type ==
+/// The mask shape is identical to `vecToStore` (with the element type ==
 /// i1), and the mask values are based on the shape of the `dest` tensor.
 ///
 /// If `useInBoundsInsteadOfMasking` is set to `true`, the `in_bounds` attribute
 /// is used instead of masking:
 ///
-///   %write = vector.transfer_write %vectorToStore into %dest
+///   %write = vector.transfer_write %vecToStore into %dest
 ///       in_bounds_flags = (...)
 ///   %res = vector.transfer_write %input into %dest
 ///       {in_bounds = in_bounds_flags}
 ///
-/// `writeIndices` specifies the offsets to use. If empty, all indices are set
-/// to 0.
-///
-/// NOTE: When N < rank(vectorToStore), the missing vector sizes are taken from
-/// `valueToStore`.
-/// TODO: `inputVecSizesForLeadingDims` should not be required - these sizes are
-/// already provided in `vectorToStore`.
+/// Finally, `writeIndices` specifies the offsets to use. If empty, all indices
+/// are set to 0.
 static Operation *
-createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
-                         Value dest,
-                         ArrayRef<int64_t> inputVecSizesForLeadingDims,
-                         SmallVector<Value> writeIndices = {},
+createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vecToStore,
+                         Value dest, SmallVector<Value> writeIndices = {},
                          bool useInBoundsInsteadOfMasking = false) {
   ShapedType destType = cast<ShapedType>(dest.getType());
   int64_t destRank = destType.getRank();
   auto destShape = destType.getShape();
 
-  VectorType vecToStoreType = cast<VectorType>(vectorToStore.getType());
+  VectorType vecToStoreType = cast<VectorType>(vecToStore.getType());
   int64_t vecToStoreRank = vecToStoreType.getRank();
   auto vecToStoreShape = vecToStoreType.getShape();
 
   // Compute the in_bounds attribute
   SmallVector<bool> inBoundsVal(vecToStoreRank, true);
   if (useInBoundsInsteadOfMasking) {
-    // In this case, assume that all the required vector sizes have been
-    // provided.
-    assert(inputVecSizesForLeadingDims.size() ==
-               static_cast<size_t>(vecToStoreType.getRank()) &&
-           "Insufficient number of input vector sizes!");
-    // Update the inBounds attribute.
     for (unsigned i = 0; i < destRank; i++)
-      inBoundsVal[i] = (destShape[i] == inputVecSizesForLeadingDims[i]) &&
+      inBoundsVal[i] = (destShape[i] == vecToStoreShape[i]) &&
                        !ShapedType::isDynamic(destShape[i]);
   }
 
@@ -1660,7 +1645,7 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
   // Generate the xfer_write Op
   Operation *write = builder.create<vector::TransferWriteOp>(loc,
-                                              /*vector=*/vectorToStore,
+                                              /*vector=*/vecToStore,
                                               /*source=*/dest,
                                               /*indices=*/writeIndices,
                                               /*inBounds=*/inBoundsVal);
@@ -1669,46 +1654,25 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
   if (useInBoundsInsteadOfMasking)
     return write;
 
-  assert(llvm::none_of(
-             destShape.drop_front(inputVecSizesForLeadingDims.size()),
-             [](int64_t size) { return size == ShapedType::kDyna
```
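For illustration, a minimal standalone sketch of the in_bounds computation that the simplified hook now performs: a dimension is in bounds only when the destination extent is static and matches the extent of the vector being stored. The helper name, the kDynamic stand-in, and the sample shapes below are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for ShapedType::kDynamic; only used to mark dynamic extents here.
constexpr int64_t kDynamic = INT64_MIN;

// Mirrors the per-dimension check in createWriteOrMaskedWrite after the
// change: compare destShape[i] against vecToStoreShape[i] directly.
std::vector<bool> computeInBounds(const std::vector<int64_t> &destShape,
                                  const std::vector<int64_t> &vecShape) {
  std::vector<bool> inBounds(vecShape.size(), true);
  for (size_t i = 0; i < destShape.size() && i < vecShape.size(); ++i)
    inBounds[i] = destShape[i] == vecShape[i] && destShape[i] != kDynamic;
  return inBounds;
}

int main() {
  // Hypothetical shapes: dest tensor 8x16, vector to store 8x8.
  for (bool b : computeInBounds({8, 16}, {8, 8}))
    std::cout << (b ? "true " : "false ");
  std::cout << '\n'; // prints: true false
}
```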
[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -665,3 +724,9 @@ def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Re // sir -> sir 0 def : InstAlias<"sir", (SIR 0), 0>; + +// pause reg_or_imm -> wrasr %g0, reg_or_imm, %asr27 +let Predicates = [HasOSA2011] in { +def : InstAlias<"pause $rs2", (WRASRrr ASR27, G0, IntRegs:$rs2), 1>; +def : InstAlias<"pause $simm13", (WRASRri ASR27, G0, simm13Op:$simm13), 1>; s-barannikov wrote: Indentation ```suggestion def : InstAlias<"pause $rs2", (WRASRrr ASR27, G0, IntRegs:$rs2)>; def : InstAlias<"pause $simm13", (WRASRri ASR27, G0, simm13Op:$simm13)>; ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -141,6 +147,26 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   return 0;
 }
 
+unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr() &&
+         "getSImm5OpValue expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+
+  // Constant value, no fixup is needed
+  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+    return CE->getValue();
+
+  llvm_unreachable("simm5 operands can only be used with constants!");

s-barannikov wrote:

Shouldn't it be a `R_SPARC_5` relocation? Either way, if this code is reachable, it shouldn't be `llvm_unreachable`.

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -331,6 +331,25 @@ multiclass reg_cond_alias { Requires<[Is64Bit]>; } +// Instruction aliases for compare-and-branch. +multiclass cwb_cond_alias { + def : InstAliashttps://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -408,6 +427,46 @@ defm : reg_cond_alias<"ne", 0b101>;
 defm : reg_cond_alias<"gz", 0b110>;
 defm : reg_cond_alias<"gez", 0b111>;
 
+defm : cwb_cond_alias<"ne", 0b1001>;
+defm : cwb_cond_alias<"e",  0b0001>;
+defm : cwb_cond_alias<"g",  0b1010>;
+defm : cwb_cond_alias<"le", 0b0010>;
+defm : cwb_cond_alias<"ge", 0b1011>;
+defm : cwb_cond_alias<"l",  0b0011>;
+defm : cwb_cond_alias<"gu", 0b1100>;
+defm : cwb_cond_alias<"leu", 0b0100>;
+defm : cwb_cond_alias<"cc", 0b1101>;
+defm : cwb_cond_alias<"cs", 0b0101>;
+defm : cwb_cond_alias<"pos", 0b1110>;
+defm : cwb_cond_alias<"neg", 0b0110>;
+defm : cwb_cond_alias<"vc", 0b1111>;
+defm : cwb_cond_alias<"vs", 0b0111>;
+let EmitPriority = 0 in
+{

s-barannikov wrote:

```suggestion
let EmitPriority = 0 in {
```

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -102,6 +102,49 @@ class F2_4 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { s-barannikov wrote: ```suggestion : InstSP { ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -217,6 +243,18 @@ unsigned SparcMCCodeEmitter::getBranchOnRegTargetOpValue(
   return 0;
 }
 
+unsigned SparcMCCodeEmitter::getCompareAndBranchTargetOpValue(
+    const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())

s-barannikov wrote:

It can't be a register, can it?

https://github.com/llvm/llvm-project/pull/138403
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -50,6 +50,15 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { return (d16hi << 20) | d16lo; } + case ELF::R_SPARC_WDISP10: { +// 7.17 Compare and Branch +// Inst{20-19} = d10hi; +// Inst{12-5} = d10lo; +unsigned d10hi = (Value >> 10) & 0x3; s-barannikov wrote: Assert that `Value` is a multiple of 4? (Assuming it is guaranteed elsewhere.) https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
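As a side note, the displacement packing under discussion can be modeled in isolation. The sketch below mirrors the adjustFixupValue logic for R_SPARC_WDISP10 (2-bit high part in instruction bits 20-19, 8-bit low part in bits 12-5) and adds the alignment assertion the reviewer suggests; the function name and sample offset are illustrative, not the actual backend code:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative model of the R_SPARC_WDISP10 fixup adjustment: Value is the
// byte displacement; the 10-bit word displacement is split across two
// instruction fields.
uint32_t encodeWDISP10(uint64_t Value) {
  assert((Value & 0x3) == 0 && "branch target must be 4-byte aligned");
  uint32_t d10hi = (Value >> 10) & 0x3;  // bits 11-10 of the byte offset
  uint32_t d10lo = (Value >> 2) & 0xff;  // bits 9-2 of the byte offset
  return (d10hi << 19) | (d10lo << 5);   // Inst{20-19} and Inst{12-5}
}

int main() {
  // Hypothetical branch offset of 24 bytes (6 words).
  std::cout << std::hex << encodeWDISP10(24) << '\n'; // prints c0
}
```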
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -102,6 +102,49 @@ class F2_4 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { + bits<10> imm10; + bits<5> rs1; + bits<5> rs2; + bits<4> cond; + + let op = 0; // op = 0 + + let Inst{29}= cond{3}; + let Inst{28}= 1; + let Inst{27-25} = cond{2-0}; + let Inst{24-22} = 0b011; + let Inst{21}= cc; + let Inst{20-19} = imm10{9-8}; + let Inst{18-14} = rs1; + let Inst{13}= 0; // i = 0 + let Inst{12-5} = imm10{7-0}; + let Inst{4-0} = rs2; +} + +class F2_6 pattern = [], InstrItinClass itin = NoItinerary> + : InstSP { s-barannikov wrote: ```suggestion : InstSP { ``` https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [SPARC][IAS] Add definitions for OSA 2011 instructions (PR #138403)
@@ -0,0 +1,267 @@ +! RUN: llvm-mc -triple=sparcv9 -mattr=+osa2011 -filetype=obj %s | llvm-objdump --mattr=+osa2011 --no-print-imm-hex -d - | FileCheck %s --check-prefix=BIN + +!! SPARCv9/SPARC64 BPr branches have different offset encoding from the others, s-barannikov wrote: Indentation looks unnecessary https://github.com/llvm/llvm-project/pull/138403 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LAA] Support monotonic pointers in LoopAccessAnalysis (PR #140721)
skachkov-sc wrote: Gentle ping https://github.com/llvm/llvm-project/pull/140721 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] users/banach space/vector/update create write (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [compiler-rt] 0412d56 - Revert "[compiler-rt][XRay] Make `xray_interface.h` C compliant (#140068)"
Author: Jan Patrick Lehr
Date: 2025-05-27T11:14:59+02:00
New Revision: 0412d56eaebb747cc77bd025969ed46e2b5cb12d

URL: https://github.com/llvm/llvm-project/commit/0412d56eaebb747cc77bd025969ed46e2b5cb12d
DIFF: https://github.com/llvm/llvm-project/commit/0412d56eaebb747cc77bd025969ed46e2b5cb12d.diff

LOG: Revert "[compiler-rt][XRay] Make `xray_interface.h` C compliant (#140068)"

This reverts commit 80da58da343620e458e34f01df95b329e7a5763c.

Added: 

Modified: 
    compiler-rt/include/xray/xray_interface.h

Removed: 
    compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c


diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h
index 3ef8ee348540f..675ea0cbc48c8 100644
--- a/compiler-rt/include/xray/xray_interface.h
+++ b/compiler-rt/include/xray/xray_interface.h
@@ -1,4 +1,4 @@
-//===- xray_interface.h ---===//
+//===- xray_interface.h -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,17 +14,10 @@
 #ifndef XRAY_XRAY_INTERFACE_H
 #define XRAY_XRAY_INTERFACE_H
 
-#ifdef __cplusplus
 #include <cstddef>
 #include <cstdint>
-#else
-#include <stddef.h>
-#include <stdint.h>
-#endif
 
-#ifdef __cplusplus
 extern "C" {
-#endif
 
 /// Synchronize this with AsmPrinter::SledKind in LLVM.
 enum XRayEntryType {
@@ -56,7 +49,7 @@ enum XRayEntryType {
 /// achieved by marking them all with: __attribute__((xray_never_instrument))
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler(void (*entry)(int32_t, enum XRayEntryType));
+extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType));
 
 /// This removes whatever the currently provided handler is. Returns 1 on
 /// success, 0 on error.
@@ -67,7 +60,7 @@ extern int __xray_remove_handler();
 /// start logging their subsequent affected function calls (if patched).
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
+extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
                                                   uint64_t));
 
 /// Disables the XRay handler used to log first arguments of function calls.
@@ -75,7 +68,7 @@ extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
 extern int __xray_remove_handler_arg1();
 
 /// Provide a function to invoke when XRay encounters a custom event.
-extern int __xray_set_customevent_handler(void (*entry)(void *, size_t));
+extern int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));
 
 /// This removes whatever the currently provided custom event handler is.
 /// Returns 1 on success, 0 on error.
@@ -102,39 +95,39 @@ enum XRayPatchingStatus {
 
 /// This tells XRay to patch the instrumentation points in all currently loaded
 /// objects. See XRayPatchingStatus for possible result values.
-extern enum XRayPatchingStatus __xray_patch();
+extern XRayPatchingStatus __xray_patch();
 
 /// This tells XRay to patch the instrumentation points in the given object.
 /// See XRayPatchingStatus for possible result values.
-extern enum XRayPatchingStatus __xray_patch_object(int32_t ObjId);
+extern XRayPatchingStatus __xray_patch_object(int32_t ObjId);
 
 /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_unpatch();
+extern XRayPatchingStatus __xray_unpatch();
 
 /// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for
 /// possible result values.
-extern enum XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
+extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
 
 /// This unpacks the given (packed) function id and patches
 /// the corresponding function. See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_patch_function(int32_t FuncId);
+extern XRayPatchingStatus __xray_patch_function(int32_t FuncId);
 
 /// This patches a specific function in the given object. See XRayPatchingStatus
 /// for possible result values.
-extern enum XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
-                                                                int32_t ObjId);
+extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
+                                                           int32_t ObjId);
 
 /// This unpacks the given (packed) function id and unpatches
 /// the corresponding function. See XRayPatchingStatus for possible
 /// result values.
-extern enum XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
+extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
 
 /// This unpatches a specific function in the given object.
 /// See XRayPatchingStatus for pos
[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for #139317 (PR #140607)
arsenm wrote: ping https://github.com/llvm/llvm-project/pull/140607 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [mlir][linalg] Simplify `createWriteOrMaskedWrite` (NFC) (PR #141567)
https://github.com/banach-space edited https://github.com/llvm/llvm-project/pull/141567 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix tracking subreg defs when folding through reg_sequence (PR #140608)
arsenm wrote: ping https://github.com/llvm/llvm-project/pull/140608 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
@@ -0,0 +1,92 @@ +//===- TargetImpl.h -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLD_ELF_ARCH_TARGETIMPL_H +#define LLD_ELF_ARCH_TARGETIMPL_H + +#include "InputFiles.h" +#include "InputSection.h" +#include "Relocations.h" +#include "Symbols.h" +#include "llvm/BinaryFormat/ELF.h" + +namespace lld { +namespace elf { + +// getControlTransferAddend: If this relocation is used for control transfer +// instructions (e.g. branch, branch-link or call) or code references (e.g. +// virtual function pointers) and indicates an address-insignificant reference, +// return the effective addend for the relocation, otherwise return +// std::nullopt. The effective addend for a relocation is the addend that is +// used to determine its branch destination. +// +// getBranchInfo: If a control transfer relocation referring to is+offset +// directly transfers control to a relocated branch instruction in the specified +// section, return the relocation for the branch target as well as its effective +// addend (see above). Otherwise return {nullptr, 0}. +// +// mergeControlTransferRelocations: Given r1, a relocation for which +// getControlTransferAddend() returned a value, and r2, a relocation returned by +// getBranchInfo(), modify r1 so that it branches directly to the target of r2. +template +inline void applyBranchToBranchOptImpl( +Ctx &ctx, GetBranchInfo getBranchInfo, +GetControlTransferAddend getControlTransferAddend, +MergeControlTransferRelocations mergeControlTransferRelocations) { + // Needs to run serially because it writes to the relocations array as well as + // reading relocations of other sections. + for (ELFFileBase *f : ctx.objectFiles) { +auto getRelocBranchInfo = +[&getBranchInfo](Relocation &r, + uint64_t addend) -> std::pair { + auto *target = dyn_cast_or_null(r.sym); + // We don't allow preemptible symbols or ifuncs (may go somewhere else), + // absolute symbols (runtime behavior unknown), non-executable memory + // (ditto) or non-regular sections (no section data). + if (!target || target->isPreemptible || target->isGnuIFunc() || smithp35 wrote: Yes, just checked and it does copy the relocation addend. I agree that this wouldn't need a test case. As an aside when checking where the addends were read in I ran into this bit of copyRelocations https://github.com/llvm/llvm-project/blob/main/lld/ELF/InputSection.cpp#L433 ``` if (ctx.arg.relax && !ctx.arg.relocatable && (ctx.arg.emachine == EM_RISCV || ctx.arg.emachine == EM_LOONGARCH)) { // On LoongArch and RISC-V, relaxation might change relocations: copy // from internal ones that are updated by relaxation. InputSectionBase *sec = getRelocatedSection(); copyRelocations( ctx, buf, llvm::make_range(sec->relocations.begin(), sec->relocations.end())); ``` I think I mentioned in a previous comment that bolt uses emit-relocations so it may be worth following suite here when the transformation is applied. I suspect that if bolt trusts the original relocation then in worst case the transformation is undone though. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [Driver] Fix link order of BareMetal toolchain object (PR #132806)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/132806 >From 597b0e0514f26f22e9425f25652657194aa48fc0 Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Mon, 24 Mar 2025 06:17:42 -0700 Subject: [PATCH] [Driver] Fix link order of BareMetal toolchain object The linker job in BareMetal toolchain object will be used by gnuld and lld both. However, gnuld process the arguments in the order in which they appear on command line, whereas there is no such restriction with lld. The previous order was: LibraryPaths -> Libraries -> LTOOptions -> LinkerInputs The new iorder is: LibraryPaths -> LTOOptions -> LinkerInputs -> Libraries LTO options need to be added before adding any linker inputs because file format after compile stage during LTO is bitcode which gnuld natively cannot process. Hence iwill need to pass appropriate plugins before adding any bitcode file on the command line. Object files that are getting linked need to be passed before processing any libraries so that gnuld can appropriately do symbol resolution for the symbols for which no definition is provided through user code. Similar link order is also followed by other linker jobs for gnuld such as in gnutools::Linker in Gnu.cpp This is the 3rd patch in the series of patches of merging RISCVToolchain into BareMetal toolchain object. RFC: https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524 Change-Id: I0e68e403c08b5687cc3346e833981f7b9f3819c4 --- clang/lib/Driver/ToolChains/BareMetal.cpp | 32 - clang/test/Driver/aarch64-toolchain-extra.c | 2 +- clang/test/Driver/aarch64-toolchain.c | 28 clang/test/Driver/arm-toolchain-extra.c | 2 +- clang/test/Driver/arm-toolchain.c | 28 clang/test/Driver/baremetal-multilib.yaml | 3 +- clang/test/Driver/baremetal-sysroot.cpp | 8 ++- clang/test/Driver/baremetal.cpp | 79 + 8 files changed, 102 insertions(+), 80 deletions(-) diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 8aee32c04d46a..35cc052429270 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -545,8 +545,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, const llvm::Triple::ArchType Arch = TC.getArch(); const llvm::Triple &Triple = getToolChain().getEffectiveTriple(); - AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); - CmdArgs.push_back("-Bstatic"); if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) @@ -596,6 +594,22 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, for (const auto &LibPath : TC.getLibraryPaths()) CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath))); + if (D.isUsingLTO()) { +assert(!Inputs.empty() && "Must have at least one input."); +// Find the first filename InputInfo object. +auto Input = llvm::find_if( +Inputs, [](const InputInfo &II) -> bool { return II.isFilename(); }); +if (Input == Inputs.end()) + // For a very rare case, all of the inputs to the linker are + // InputArg. If that happens, just use the first InputInfo. 
+ Input = Inputs.begin(); + +addLTOOptions(TC, Args, CmdArgs, Output, *Input, + D.getLTOMode() == LTOK_Thin); + } + + AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); + if (TC.ShouldLinkCXXStdlib(Args)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); @@ -616,20 +630,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--end-group"); } - if (D.isUsingLTO()) { -assert(!Inputs.empty() && "Must have at least one input."); -// Find the first filename InputInfo object. -auto Input = llvm::find_if( -Inputs, [](const InputInfo &II) -> bool { return II.isFilename(); }); -if (Input == Inputs.end()) - // For a very rare case, all of the inputs to the linker are - // InputArg. If that happens, just use the first InputInfo. - Input = Inputs.begin(); - -addLTOOptions(TC, Args, CmdArgs, Output, *Input, - D.getLTOMode() == LTOK_Thin); - } - if ((TC.hasValidGCCInstallation() || hasGCCToolChainAlongSideClang(D)) && NeedCRTs) CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd))); diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c index 2a930e35acd45..a0b5f2902962f 100644 --- a/clang/test/Driver/aarch64-toolchain-extra.c +++ b/clang/test/Driver/aarch64-toolchain-extra.c @@ -31,5 +31,5 @@ // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o" // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o" // C-AARCH64-BAREMETAL-NOGCC
[llvm-branch-commits] [clang] [RISCV] Integrate RISCV target in baremetal toolchain object and deprecate RISCVToolchain object (PR #121831)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/121831 >From 08e30252fc4bd87d84decc81161c081c774d398e Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Mon, 6 Jan 2025 10:05:08 -0800 Subject: [PATCH] [RISCV] Integrate RISCV target in baremetal toolchain object and deprecate RISCVToolchain object This patch: - Adds CXXStdlib, runtimelib and unwindlib defaults for riscv target to BareMetal toolchain object. - Add riscv 32 and 64-bit emulation flags to linker job of BareMetal toolchain. - Removes call to RISCVToolChain object from llvm. This PR is last patch in the series of patches of merging RISCVToolchain object into BareMetal toolchain object. RFC: https: //discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524 Change-Id: Ic5d64a4ed3ebc58c30c12d9827e7e57a02eb13ca --- clang/lib/Driver/CMakeLists.txt | 1 - clang/lib/Driver/Driver.cpp | 10 +- clang/lib/Driver/ToolChains/BareMetal.cpp | 20 ++ clang/lib/Driver/ToolChains/BareMetal.h | 11 +- .../lib/Driver/ToolChains/RISCVToolchain.cpp | 232 -- clang/lib/Driver/ToolChains/RISCVToolchain.h | 67 - .../test/Driver/baremetal-undefined-symbols.c | 14 +- clang/test/Driver/riscv32-toolchain-extra.c | 7 +- clang/test/Driver/riscv32-toolchain.c | 26 +- clang/test/Driver/riscv64-toolchain-extra.c | 7 +- clang/test/Driver/riscv64-toolchain.c | 20 +- 11 files changed, 61 insertions(+), 354 deletions(-) delete mode 100644 clang/lib/Driver/ToolChains/RISCVToolchain.cpp delete mode 100644 clang/lib/Driver/ToolChains/RISCVToolchain.h diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 5bdb6614389cf..eee29af5d181a 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -74,7 +74,6 @@ add_clang_library(clangDriver ToolChains/OHOS.cpp ToolChains/OpenBSD.cpp ToolChains/PS4CPU.cpp - ToolChains/RISCVToolchain.cpp ToolChains/Solaris.cpp ToolChains/SPIRV.cpp ToolChains/SPIRVOpenMP.cpp diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 07e36ea2efba4..cfc0ba63d5749 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -41,7 +41,6 @@ #include "ToolChains/PPCFreeBSD.h" #include "ToolChains/PPCLinux.h" #include "ToolChains/PS4CPU.h" -#include "ToolChains/RISCVToolchain.h" #include "ToolChains/SPIRV.h" #include "ToolChains/SPIRVOpenMP.h" #include "ToolChains/SYCL.h" @@ -6889,16 +6888,11 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::msp430: -TC = -std::make_unique(*this, Target, Args); +TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: -if (toolchains::RISCVToolChain::hasGCCToolchain(*this, Args)) - TC = - std::make_unique(*this, Target, Args); -else - TC = std::make_unique(*this, Target, Args); +TC = std::make_unique(*this, Target, Args); break; case llvm::Triple::ve: TC = std::make_unique(*this, Target, Args); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index ac85757442e18..b9302433a1d63 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -352,6 +352,26 @@ BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const { return llvm::reverse(Default); } +ToolChain::CXXStdlibType BareMetal::GetDefaultCXXStdlibType() const { + if (getTriple().isRISCV() && GCCInstallation.isValid()) +return ToolChain::CST_Libstdcxx; + return ToolChain::CST_Libcxx; +} + 
+ToolChain::RuntimeLibType BareMetal::GetDefaultRuntimeLibType() const { + if (getTriple().isRISCV() && GCCInstallation.isValid()) +return ToolChain::RLT_Libgcc; + return ToolChain::RLT_CompilerRT; +} + +ToolChain::UnwindLibType +BareMetal::GetUnwindLibType(const llvm::opt::ArgList &Args) const { + if (getTriple().isRISCV()) +return ToolChain::UNW_None; + + return ToolChain::GetUnwindLibType(Args); +} + void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc)) diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h index 87f173342def2..580f5c6903c1f 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.h +++ b/clang/lib/Driver/ToolChains/BareMetal.h @@ -54,12 +54,11 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF { return UnwindTableLevel::None; } - RuntimeLibType GetDefaultRuntimeLibType() const override { -return ToolChain::RLT_CompilerRT; - } - CXXStdlibType GetDefaultCXXStdlibType() const overr
[llvm-branch-commits] [clang] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain (PR #134442)
https://github.com/quic-garvgupt updated https://github.com/llvm/llvm-project/pull/134442 >From 46b1136d0bab3cfc30029070597b76b4c2cbcbcf Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Fri, 4 Apr 2025 12:51:19 -0700 Subject: [PATCH] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain Change-Id: Ifce8a3a7f1df9c12561d35ca3c923595e3619428 --- clang/lib/Driver/ToolChains/BareMetal.cpp | 15 - clang/lib/Driver/ToolChains/CommonArgs.cpp | 70 ++ clang/lib/Driver/ToolChains/CommonArgs.h | 2 + clang/lib/Driver/ToolChains/Gnu.cpp| 70 -- clang/test/Driver/baremetal.cpp| 28 - 5 files changed, 99 insertions(+), 86 deletions(-) diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 691b6c7336c7a..ac85757442e18 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -550,8 +550,19 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-Bstatic"); - if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) -CmdArgs.push_back("--no-relax"); + if (Triple.isRISCV()) { +if (const char *LDMOption = getLDMOption(TC.getTriple(), Args)) { + CmdArgs.push_back("-m"); + CmdArgs.push_back(LDMOption); +} else { + D.Diag(diag::err_target_unknown_triple) << Triple.str(); + return; +} + +CmdArgs.push_back("-X"); +if (Args.hasArg(options::OPT_mno_relax)) + CmdArgs.push_back("--no-relax"); + } if (Triple.isARM() || Triple.isThumb()) { bool IsBigEndian = arm::isARMBigEndian(Triple, Args); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index ddeadff8f6dfb..292d52acdc002 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -535,6 +535,76 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, } } +const char *tools::getLDMOption(const llvm::Triple &T, const ArgList &Args) { + switch (T.getArch()) { + case llvm::Triple::x86: +if (T.isOSIAMCU()) + return "elf_iamcu"; +return "elf_i386"; + case llvm::Triple::aarch64: +return "aarch64linux"; + case llvm::Triple::aarch64_be: +return "aarch64linuxb"; + case llvm::Triple::arm: + case llvm::Triple::thumb: + case llvm::Triple::armeb: + case llvm::Triple::thumbeb: +return tools::arm::isARMBigEndian(T, Args) ? 
"armelfb_linux_eabi" + : "armelf_linux_eabi"; + case llvm::Triple::m68k: +return "m68kelf"; + case llvm::Triple::ppc: +if (T.isOSLinux()) + return "elf32ppclinux"; +return "elf32ppc"; + case llvm::Triple::ppcle: +if (T.isOSLinux()) + return "elf32lppclinux"; +return "elf32lppc"; + case llvm::Triple::ppc64: +return "elf64ppc"; + case llvm::Triple::ppc64le: +return "elf64lppc"; + case llvm::Triple::riscv32: +return "elf32lriscv"; + case llvm::Triple::riscv64: +return "elf64lriscv"; + case llvm::Triple::sparc: + case llvm::Triple::sparcel: +return "elf32_sparc"; + case llvm::Triple::sparcv9: +return "elf64_sparc"; + case llvm::Triple::loongarch32: +return "elf32loongarch"; + case llvm::Triple::loongarch64: +return "elf64loongarch"; + case llvm::Triple::mips: +return "elf32btsmip"; + case llvm::Triple::mipsel: +return "elf32ltsmip"; + case llvm::Triple::mips64: +if (tools::mips::hasMipsAbiArg(Args, "n32") || T.isABIN32()) + return "elf32btsmipn32"; +return "elf64btsmip"; + case llvm::Triple::mips64el: +if (tools::mips::hasMipsAbiArg(Args, "n32") || T.isABIN32()) + return "elf32ltsmipn32"; +return "elf64ltsmip"; + case llvm::Triple::systemz: +return "elf64_s390"; + case llvm::Triple::x86_64: +if (T.isX32()) + return "elf32_x86_64"; +return "elf_x86_64"; + case llvm::Triple::ve: +return "elf64ve"; + case llvm::Triple::csky: +return "cskyelf_linux"; + default: +return nullptr; + } +} + void tools::addLinkerCompressDebugSectionsOption( const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 96bc0619dcbc0..875354e969a2a 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -31,6 +31,8 @@ void AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const JobAction &JA); +const char *getLDMOption(const llvm::Triple &T, const llvm::opt::ArgList &Args); + void addLinkerCompressDebugSectionsOption(const ToolChain &TC, const llvm::opt::ArgList &Args,
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
Pierre-vh wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests

* **#141591**
* **#141590**
* **#141589** 👈 (this PR)
* **#141588**
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/141589
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/141589 None >From e5f24775ff988e5c6ac302f36b010fc0421eca34 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 11:16:16 +0200 Subject: [PATCH] [AMDGPU] Move S_BFE lowering into RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 -- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. 
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/
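For readers following the packing logic in the patch above, here is a standalone sketch of how the offset and width are combined into the second S_BFE source operand: bits [5:0] hold the offset and bits [22:16] the width, as the patch's comments describe. The helper name and sample values are illustrative only:

```cpp
#include <cstdint>
#include <iostream>

// Illustrative model of the operand packing done before emitting
// S_BFE_{I|U}{32|64}: clamp the offset to 6 bits, place the width at bit 16,
// and OR the two fields together.
uint32_t packBFEOperand(uint32_t offset, uint32_t width) {
  uint32_t clampedOffset = offset & 0x3f; // keep only the low 6 bits
  uint32_t shiftedWidth = width << 16;    // width field starts at bit 16
  return clampedOffset | shiftedWidth;
}

int main() {
  // Hypothetical extract: 8 bits starting at bit 4.
  std::cout << std::hex << packBFEOperand(4, 8) << '\n'; // prints 80004
}
```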
[llvm-branch-commits] [llvm] AMDGPU: Fix tracking subreg defs when folding through reg_sequence (PR #140608)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140608 >From 6e7ab227c9d12cf82958ea0dd11461ec49bc4945 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 15 May 2025 10:51:39 +0200 Subject: [PATCH] AMDGPU: Fix tracking subreg defs when folding through reg_sequence We weren't fully respecting the type of a def of an immediate vs. the type at the use point. Refactor the folding logic to track the value to fold, as well as a subregister to apply to the underlying value. This is similar to how PeepholeOpt tracks subregisters (though only for pure copy-like instructions, no constants). Fixes #139317 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 408 +++--- ...issue139317-bad-opsel-reg-sequence-fold.ll | 3 +- .../si-fold-operands-subreg-imm.gfx942.mir| 8 +- .../AMDGPU/si-fold-operands-subreg-imm.mir| 4 +- 4 files changed, 253 insertions(+), 170 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index cc18d6b4aba10..d7a4fa85e4034 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,52 +25,151 @@ using namespace llvm; namespace { -struct FoldCandidate { - MachineInstr *UseMI; +/// Track a value we may want to fold into downstream users, applying +/// subregister extracts along the way. +struct FoldableDef { union { -MachineOperand *OpToFold; +MachineOperand *OpToFold = nullptr; uint64_t ImmToFold; int FrameIndexToFold; }; - int ShrinkOpcode; - unsigned UseOpNo; + + /// Register class of the originally defined value. + const TargetRegisterClass *DefRC = nullptr; + + /// Track the original defining instruction for the value. + const MachineInstr *DefMI = nullptr; + + /// Subregister to apply to the value at the use point. + unsigned DefSubReg = AMDGPU::NoSubRegister; + + /// Kind of value stored in the union. MachineOperand::MachineOperandType Kind; - bool Commuted; - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, -bool Commuted_ = false, -int ShrinkOp = -1) : -UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), -Kind(FoldOp->getType()), -Commuted(Commuted_) { -if (FoldOp->isImm()) { - ImmToFold = FoldOp->getImm(); -} else if (FoldOp->isFI()) { - FrameIndexToFold = FoldOp->getIndex(); + FoldableDef() = delete; + FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC, + unsigned DefSubReg = AMDGPU::NoSubRegister) + : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) { + +if (FoldOp.isImm()) { + ImmToFold = FoldOp.getImm(); +} else if (FoldOp.isFI()) { + FrameIndexToFold = FoldOp.getIndex(); } else { - assert(FoldOp->isReg() || FoldOp->isGlobal()); - OpToFold = FoldOp; + assert(FoldOp.isReg() || FoldOp.isGlobal()); + OpToFold = &FoldOp; } + +DefMI = FoldOp.getParent(); } - FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm, -bool Commuted_ = false, int ShrinkOp = -1) - : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), -Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {} + FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC, + unsigned DefSubReg = AMDGPU::NoSubRegister) + : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg), +Kind(MachineOperand::MO_Immediate) {} + + /// Copy the current def and apply \p SubReg to the value. 
+ FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const { +FoldableDef Copy(*this); +Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg); +return Copy; + } + + bool isReg() const { return Kind == MachineOperand::MO_Register; } + + Register getReg() const { +assert(isReg()); +return OpToFold->getReg(); + } + + unsigned getSubReg() const { +assert(isReg()); +return OpToFold->getSubReg(); + } + + bool isImm() const { return Kind == MachineOperand::MO_Immediate; } bool isFI() const { return Kind == MachineOperand::MO_FrameIndex; } - bool isImm() const { -return Kind == MachineOperand::MO_Immediate; + int getFI() const { +assert(isFI()); +return FrameIndexToFold; } - bool isReg() const { -return Kind == MachineOperand::MO_Register; + bool isGlobal() const { return OpToFold->isGlobal(); } + + /// Return the effective immediate value defined by this instruction, after + /// application of any subregister extracts which may exist between the use + /// and def instruction. + std::optional getEffectiveImmVal() const { +assert(isImm()); +return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg); } - bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; } + /// Check if it is legal to fold this effective value into
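A rough standalone model of the idea behind the subregister-aware immediate tracking in this patch: reading a 64-bit constant through a 32-bit subregister yields its low or high half. The enum and helper below are illustrative stand-ins, not the actual SIInstrInfo::extractSubregFromImm implementation:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Illustrative stand-ins for AMDGPU subregister indices.
enum class SubReg { None, Sub0, Sub1 };

// Model of extracting the value seen through a 32-bit subregister of a
// 64-bit immediate: sub0 is the low half, sub1 the high half.
std::optional<int64_t> extractSubregFromImm(int64_t Imm, SubReg Idx) {
  switch (Idx) {
  case SubReg::None: return Imm;
  case SubReg::Sub0: return static_cast<int64_t>(static_cast<uint32_t>(Imm));
  case SubReg::Sub1: return static_cast<int64_t>(static_cast<uint32_t>(Imm >> 32));
  }
  return std::nullopt;
}

int main() {
  int64_t imm = 0x123456789abcdef0;
  std::cout << std::hex << *extractSubregFromImm(imm, SubReg::Sub0) << '\n'; // 9abcdef0
  std::cout << std::hex << *extractSubregFromImm(imm, SubReg::Sub1) << '\n'; // 12345678
}
```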
[llvm-branch-commits] [llvm] AMDGPU: Handle folding vector splats of inline split f64 inline immediates (PR #140878)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140878 >From 609dc72abf36343e62c4bb0bc149f9ba453f4236 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 19 May 2025 21:51:06 +0200 Subject: [PATCH] AMDGPU: Handle folding vector splats of inline split f64 inline immediates Recognize a reg_sequence with 32-bit elements that produce a 64-bit splat value. This enables folding f64 constants into mfma operands --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 103 -- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 41 +-- 2 files changed, 76 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d7a4fa85e4034..7cf2549804bee 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -227,12 +227,12 @@ class SIFoldOperandsImpl { getRegSeqInit(SmallVectorImpl> &Defs, Register UseReg) const; - std::pair + std::pair isRegSeqSplat(MachineInstr &RegSeg) const; - MachineOperand *tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx, - MachineOperand *SplatVal, - const TargetRegisterClass *SplatRC) const; + bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx, + int64_t SplatVal, + const TargetRegisterClass *SplatRC) const; bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx, @@ -966,15 +966,15 @@ const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit( return getRegSeqInit(*Def, Defs); } -std::pair +std::pair SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { SmallVector, 32> Defs; const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs); if (!SrcRC) return {}; - // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize - // every other other element is 0 for 64-bit immediates) + bool TryToMatchSplat64 = false; + int64_t Imm; for (unsigned I = 0, E = Defs.size(); I != E; ++I) { const MachineOperand *Op = Defs[I].first; @@ -986,38 +986,75 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const { Imm = SubImm; continue; } -if (Imm != SubImm) + +if (Imm != SubImm) { + if (I == 1 && (E & 1) == 0) { +// If we have an even number of inputs, there's a chance this is a +// 64-bit element splat broken into 32-bit pieces. +TryToMatchSplat64 = true; +break; + } + return {}; // Can only fold splat constants +} + } + + if (!TryToMatchSplat64) +return {Defs[0].first->getImm(), SrcRC}; + + // Fallback to recognizing 64-bit splats broken into 32-bit pieces + // (i.e. recognize every other other element is 0 for 64-bit immediates) + int64_t SplatVal64; + for (unsigned I = 0, E = Defs.size(); I != E; I += 2) { +const MachineOperand *Op0 = Defs[I].first; +const MachineOperand *Op1 = Defs[I + 1].first; + +if (!Op0->isImm() || !Op1->isImm()) + return {}; + +unsigned SubReg0 = Defs[I].second; +unsigned SubReg1 = Defs[I + 1].second; + +// Assume we're going to generally encounter reg_sequences with sorted +// subreg indexes, so reject any that aren't consecutive. 
+if (TRI->getChannelFromSubReg(SubReg0) + 1 != +TRI->getChannelFromSubReg(SubReg1)) + return {}; + +int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm()); +if (I == 0) + SplatVal64 = MergedVal; +else if (SplatVal64 != MergedVal) + return {}; } - return {Defs[0].first, SrcRC}; + const TargetRegisterClass *RC64 = TRI->getSubRegisterClass( + MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1); + + return {SplatVal64, RC64}; } -MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat( -MachineInstr *UseMI, unsigned UseOpIdx, MachineOperand *SplatVal, +bool SIFoldOperandsImpl::tryFoldRegSeqSplat( +MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal, const TargetRegisterClass *SplatRC) const { const MCInstrDesc &Desc = UseMI->getDesc(); if (UseOpIdx >= Desc.getNumOperands()) -return nullptr; +return false; // Filter out unhandled pseudos. if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) -return nullptr; +return false; int16_t RCID = Desc.operands()[UseOpIdx].RegClass; if (RCID == -1) -return nullptr; +return false; + + const TargetRegisterClass *OpRC = TRI->getRegClass(RCID); // Special case 0/-1, since when interpreted as a 64-bit element both halves - // have the same bits. Effectively this code does not handle 64-bit element - // operands correctly, as the incoming 64-bit constants are already split into - // 32-bit sequence elements. - // - // TODO: We should try to figure out how to interpret the reg_sequence as a - /
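To make the commit message above concrete: the new matching path pairs consecutive 32-bit immediates of the reg_sequence into 64-bit values and only folds when every pair agrees. A hedged, standalone sketch of just that value check follows; the names are invented for illustration, and the real code additionally verifies that each pair's sub-register channels are consecutive and derives the 64-bit register class:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Standalone sketch of the splat check, operating on the already-extracted
// 32-bit immediates of a reg_sequence (element I carries channel I).
static std::optional<int64_t> matchSplat64(const std::vector<uint32_t> &Elts) {
  if (Elts.size() < 2 || (Elts.size() & 1) != 0)
    return std::nullopt; // need an even number of 32-bit pieces

  auto Make64 = [](uint32_t Hi, uint32_t Lo) {
    return static_cast<int64_t>((static_cast<uint64_t>(Hi) << 32) | Lo);
  };

  int64_t Splat = Make64(Elts[1], Elts[0]);
  for (size_t I = 2; I < Elts.size(); I += 2)
    if (Make64(Elts[I + 1], Elts[I]) != Splat)
      return std::nullopt; // pieces do not form a uniform 64-bit value
  return Splat;
}

// Example: {0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000} matches and
// yields 0x3ff0000000000000, i.e. a splat of the f64 constant 1.0.
```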
[llvm-branch-commits] [llvm] AMDGPU: Remove redundant operand folding checks (PR #140587)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140587 >From c3f0ed4891b6cc34dc808e8673da5ff86a903df0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 19 May 2025 20:02:54 +0200 Subject: [PATCH 1/2] AMDGPU: Remove redundant operand folding checks This was pre-filtering out a specific situation from being added to the fold candidate list. The operand legality will ultimately be checked with isOperandLegal before the fold is performed, so I don't see the plus in pre-filtering this one case. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 18 .../AMDGPU/fold-operands-frame-index.mir | 101 ++ 2 files changed, 101 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c45611582a53a..cc18d6b4aba10 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -777,24 +777,6 @@ bool SIFoldOperandsImpl::tryAddToFoldList( return true; } - // Check the case where we might introduce a second constant operand to a - // scalar instruction - if (TII->isSALU(MI->getOpcode())) { -const MCInstrDesc &InstDesc = MI->getDesc(); -const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - -// Fine if the operand can be encoded as an inline constant -if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) { - // Otherwise check for another constant - for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) { -auto &Op = MI->getOperand(i); -if (OpNo != i && !Op.isReg() && -!TII->isInlineConstant(Op, InstDesc.operands()[i])) - return false; - } -} - } - appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index 4417f205646ee..7fad2f466bc9f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -573,3 +573,104 @@ body: | S_ENDPGM 0, implicit %2 ... + +--- +name:no_fold_multiple_fi_s_cselect_b32 +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } + - { id: 1, size: 32, alignment: 4 } +body: | + bb.0: +; CHECK-LABEL: name: no_fold_multiple_fi_s_cselect_b32 +; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1 +; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_MOV_B32_]], %stack.0, implicit undef $scc +; CHECK-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] +%0:sreg_32 = S_MOV_B32 %stack.0 +%1:sreg_32 = S_MOV_B32 %stack.1 +%2:sreg_32 = S_CSELECT_B32 killed %1, killed %0, implicit undef $scc +S_ENDPGM 0, implicit %2 + +... 
+ +--- +name:no_fold_multiple_fi_v_cndmask_b32_e64 +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } + - { id: 1, size: 32, alignment: 4 } +body: | + bb.0: +liveins: $sgpr8_sgpr9 +; GFX9-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX9: liveins: $sgpr8_sgpr9 +; GFX9-NEXT: {{ $}} +; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec +; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX9-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_MOV_B32_e32_]], 0, killed [[V_MOV_B32_e32_1]], [[COPY]], implicit $exec +; GFX9-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +; +; GFX10-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX10: liveins: $sgpr8_sgpr9 +; GFX10-NEXT: {{ $}} +; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, %stack.0, 0, killed [[V_MOV_B32_e32_]], [[COPY]], implicit $exec +; GFX10-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +; +; GFX12-LABEL: name: no_fold_multiple_fi_v_cndmask_b32_e64 +; GFX12: liveins: $sgpr8_sgpr9 +; GFX12-NEXT: {{ $}} +; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 +; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +; GFX12-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, %stack.0, 0, killed [[V_MOV_B32_e32_]], [[COPY]], implicit $exec +; GFX12-NEXT: S_ENDPGM 0, implicit [[V_CNDMASK_B32_e64_]] +%0:sreg_64_xexec = COPY $sgpr8_sgpr9 +%1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec +%2:vgpr_32 = V_MOV_B32_e32 %stack.1, implicit $exec +%3:vgpr_32 = V_CNDMASK_B32_e64 0, killed %1, 0, killed %2, %0, implicit $exec +
[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for #139317 (PR #140607)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140607 >From 8a6cb7bba02c0c6638a9b1789cf0feccd229f8b3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 14 May 2025 08:50:59 +0200 Subject: [PATCH] AMDGPU: Add baseline tests for #139317 --- .../CodeGen/AMDGPU/fold-imm-copy-agpr.mir | 135 + llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir| 513 ++ .../AMDGPU/fold-short-64-bit-literals.mir | 392 - ...issue139317-bad-opsel-reg-sequence-fold.ll | 66 +++ .../si-fold-operands-subreg-imm.gfx942.mir| 202 +++ .../AMDGPU/si-fold-operands-subreg-imm.mir| 26 + 6 files changed, 1329 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir create mode 100644 llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir new file mode 100644 index 0..a079ee1296f41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir @@ -0,0 +1,135 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: v_mov_b64_pseudo_imm_0_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_imm_neg1_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_neg1_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -1, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO -1, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_literal_copy_to_areg_64 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_literal_copy_to_areg_64 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 999, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[V_MOV_B]] +; GCN-NEXT: $agpr0_agpr1 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 999, implicit $exec +%1:areg_64_align2 = COPY %0 +$agpr0_agpr1 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_imm_0_copy_sub0_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_sub0_to_agpr_32 +; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec +; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:agpr_32 = COPY %0.sub0 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... 
+ +--- +name: v_mov_b64_pseudo_imm_0_copy_sub1_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_imm_0_copy_sub1_to_agpr_32 +; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec +; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec +%1:agpr_32 = COPY %0.sub1 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_lit_copy_sub0_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub0_to_agpr_32 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub0 +; GCN-NEXT: $agpr0 = COPY [[COPY]] +; GCN-NEXT: S_ENDPGM 0 +%0:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +%1:agpr_32 = COPY %0.sub0 +$agpr0 = COPY %1 +S_ENDPGM 0 + +... + +--- +name: v_mov_b64_pseudo_lit_copy_sub1_to_agpr_32 +tracksRegLiveness: true +body: | + bb.0: +; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub1_to_agpr_32 +; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec +; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub1 +; GCN-NEXT: $agpr0 = COPY [[CO
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh edited https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
Pierre-vh wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests * **#141591** * **#141590** 👈 (View in Graphite) * **#141589** * **#141588** * `main` This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/141590 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
https://github.com/Pierre-vh ready_for_review https://github.com/llvm/llvm-project/pull/141590 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141540 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
@@ -975,6 +977,62 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { } } +static std::optional getControlTransferAddend(InputSection &is, +Relocation &r) { + // Identify a control transfer relocation for the branch-to-branch + // optimization. A "control transfer relocation" means a B or BL + // target but it also includes relative vtable relocations for example. + // + // We require the relocation type to be JUMP26, CALL26 or PLT32. With a + // relocation type of PLT32 the value may be assumed to be used for branching + // directly to the symbol and the addend is only used to produce the relocated + // value (hence the effective addend is always 0). This is because if a PLT is + // needed the addend will be added to the address of the PLT, and it doesn't + // make sense to branch into the middle of a PLT. For example, relative vtable + // relocations use PLT32 and 0 or a positive value as the addend but still are + // used to branch to the symbol. + // + // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero + // addend is that we are branching to symbol+addend so that becomes the + // effective addend. + if (r.type == R_AARCH64_PLT32) +return 0; + if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26) +return r.addend; + return std::nullopt; +} + +static std::pair getBranchInfo(InputSection &is, + uint64_t offset) { + auto *i = std::lower_bound( + is.relocations.begin(), is.relocations.end(), offset, + [](Relocation &r, uint64_t offset) { return r.offset < offset; }); + if (i != is.relocations.end() && i->offset == offset && + i->type == R_AARCH64_JUMP26) { +return {i, i->addend}; + } smithp35 wrote: Agree that BTI instructions should be in a separate patch. It would require disassembling to find one so may result in longer link times. Skipping over BTI with direct branches could apply even when the target wasn't another branch. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
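As a rough mental model of the optimization under review (not lld's actual data structures or API, which the diff above shows), the rewrite step can be summarized as: if the relocated branch lands exactly on another direct branch, retarget the original relocation at that branch's destination. A simplified C++ sketch with placeholder types:

```cpp
#include <cstdint>

// Placeholder types loosely modelled on lld's Relocation/Symbol; the field
// and member names here are illustrative, not lld's actual API.
struct Symbol;

struct Relocation {
  uint64_t offset; // where the branch instruction lives
  int64_t addend;
  Symbol *sym;     // current branch target
};

struct Symbol {
  // If a direct branch instruction sits at this symbol (plus addend), return
  // its relocation; the real pass looks this up in the target section's
  // sorted relocation list (e.g. via std::lower_bound, as in getBranchInfo).
  const Relocation *branchAt(int64_t addend) const {
    (void)addend;
    return nullptr; // stub for the sketch
  }
};

// One branch-to-branch rewrite step: if our branch lands on another direct
// branch, retarget the original relocation at that branch's destination so
// control transfers in a single hop.
inline void redirectOnce(Relocation &r) {
  if (const Relocation *inner = r.sym->branchAt(r.addend)) {
    r.sym = inner->sym;       // skip the intermediate branch
    r.addend = inner->addend; // and adopt its effective addend
  }
}
```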
[llvm-branch-commits] [clang] [KeyInstr][Clang] Assign matrix element atom (PR #134650)
https://github.com/SLTozer approved this pull request. https://github.com/llvm/llvm-project/pull/134650 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [lld] ELF: Add branch-to-branch optimization. (PR #138366)
smithp35 wrote: Thanks for the updates. I don't have any more comments. https://github.com/llvm/llvm-project/pull/138366 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Pierre van Houtryve (Pierre-vh) Changes This --- Patch is 38.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141591.diff 8 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+30-29) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+21-40) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+22-41) - (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+13-17) - (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+5-6) - (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+9-9) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+2-14) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; 
GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v2 -; GFX11-NEXT:v_lshrrev
[llvm-branch-commits] [llvm] [AMDGPU] Add BFX Formation Combines to RegBankCombiner (PR #141590)
llvmbot wrote: @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes They're relatively safe to use there I believe. The only new registers they may create are the constants for the BFX. For those, borrow the RC from the source register. Fixes #140040 --- Patch is 153.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141590.diff 9 Files Affected: - (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+29) - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+56-63) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+484-541) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+458-506) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+13-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+111-121) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+172-182) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+8-9) ``diff diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..8981b13dac7ed 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4629,10 +4629,17 @@ bool CombinerHelper::matchBitfieldExtractFromSExtInReg( if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits()) return false; + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto Cst1 = B.buildConstant(ExtractTy, ShiftImm); auto Cst2 = B.buildConstant(ExtractTy, Width); B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2); + +if (RB) { + MRI.setRegBank(Cst1.getReg(0), *RB); + MRI.setRegBank(Cst2.getReg(0), *RB); +} }; return true; } @@ -4667,10 +4674,18 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(MachineInstr &MI, return false; uint64_t Width = APInt(Size, AndImm).countr_one(); + + const RegisterBank *RB = getRegBank(ShiftSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto LSBCst = B.buildConstant(ExtractTy, LSBImm); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(LSBCst.getReg(0), *RB); +} }; return true; } @@ -4717,10 +4732,17 @@ bool CombinerHelper::matchBitfieldExtractFromShr( const int64_t Pos = ShrAmt - ShlAmt; const int64_t Width = Size - ShrAmt; + const RegisterBank *RB = getRegBank(ShlSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); +} }; return true; } @@ -4775,10 +4797,17 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size) return false; + const RegisterBank *RB = getRegBank(AndSrc); + MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto PosCst = B.buildConstant(ExtractTy, Pos); B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst}); + +if (RB) { + MRI.setRegBank(WidthCst.getReg(0), *RB); + MRI.setRegBank(PosCst.getReg(0), *RB); +} }; return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 94e1175b06b14..96be17c487130 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,5 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ff03cf1231d08..b0a239bef649e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -811,16 +811,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_lshr_b32 s2, s0, 16 -; GFX8-NEXT:s_sext_i32_i16 s0, s0 -; GFX8-NEXT:s_lshr_b32 s3, s1, 16 -; GFX8-NEXT:s_ashr_i32 s0, s0, s1 -; GFX8-NEXT:s_sext_i32_i16 s1, s2 -; GFX8-NEXT:s_ashr_i32 s1, s1, s3 -; GFX8-NEXT:s_and_b32 s1, 0x, s1 +; GFX8-NEXT:s_lshr_b32 s2, s1, 16 +; GFX8-NE
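For reference, the bitfield-extract patterns these combines form reduce to a small arithmetic identity. The sketch below is plain C++ showing the assumed reference semantics of UBFX/SBFX and how a shl-then-lshr pair maps to an offset and width; it is illustrative only and not taken from the combiner:

```cpp
#include <cstdint>

// Reference semantics of the unsigned/signed bitfield extracts that
// form_bitfield_extract produces; Offset and Width are the constants the
// combine materializes (and, per this patch, places in the same register
// bank as the source value).
static uint32_t ubfx32(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Mask = (Width >= 32) ? ~0u : ((1u << Width) - 1u);
  return (Src >> Offset) & Mask;
}

static int32_t sbfx32(uint32_t Src, unsigned Offset, unsigned Width) {
  // Move the field up against the sign bit, then arithmetic-shift it back
  // down so it comes out sign-extended.
  uint32_t Up = Src << (32u - Offset - Width);
  return static_cast<int32_t>(Up) >> (32u - Width);
}

// Example of the pattern matchBitfieldExtractFromShr rewrites on 32-bit
// values: (x << 8) >> 24 keeps original bits [23:16], i.e.
//   ubfx32(x, /*Offset=*/24 - 8, /*Width=*/32 - 24) == ubfx32(x, 16, 8).
```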
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes NFC --- Full diff: https://github.com/llvm/llvm-project/pull/141589.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+13-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+51) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+55-70) ``diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. + auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. 
+ auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingB
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh edited https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
Pierre-vh wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. Learn more: https://graphite.dev/docs/merge-pull-requests * **#141591** 👈 (View in Graphite) * **#141590** * **#141589** * **#141588** * `main` This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/ https://github.com/llvm/llvm-project/pull/141591 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao updated https://github.com/llvm/llvm-project/pull/141540 >From d5508cc217f413b3bbb7a301b2110cfc0c2c6cbc Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 03:24:26 + Subject: [PATCH 1/2] Format SpecialCaseList.h Created using spr 1.3.6 --- llvm/include/llvm/Support/SpecialCaseList.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index bce337f553a93..d54b242a9c501 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -17,8 +17,8 @@ #include "llvm/Support/GlobPattern.h" #include "llvm/Support/Regex.h" #include -#include #include +#include #include namespace llvm { >From b094fc2f5e3fe0d9b65f86a3f6eda04a6ab41e47 Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 14:01:19 + Subject: [PATCH 2/2] Remove irelevant format changes Created using spr 1.3.6 --- llvm/unittests/Support/SpecialCaseListTest.cpp | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/unittests/Support/SpecialCaseListTest.cpp b/llvm/unittests/Support/SpecialCaseListTest.cpp index da4c557e740e6..0fe6a427c0562 100644 --- a/llvm/unittests/Support/SpecialCaseListTest.cpp +++ b/llvm/unittests/Support/SpecialCaseListTest.cpp @@ -218,9 +218,8 @@ TEST_F(SpecialCaseListTest, NoTrigramsInARule) { } TEST_F(SpecialCaseListTest, RepetitiveRule) { - std::unique_ptr SCL = - makeSpecialCaseList("fun:*bar*bar*bar*bar*\n" - "fun:bar*\n"); + std::unique_ptr SCL = makeSpecialCaseList("fun:*bar*bar*bar*bar*\n" + "fun:bar*\n"); EXPECT_TRUE(SCL->inSection("", "fun", "bara")); EXPECT_FALSE(SCL->inSection("", "fun", "abara")); EXPECT_TRUE(SCL->inSection("", "fun", "barbarbarbar")); @@ -229,8 +228,7 @@ TEST_F(SpecialCaseListTest, RepetitiveRule) { } TEST_F(SpecialCaseListTest, SpecialSymbolRule) { - std::unique_ptr SCL = - makeSpecialCaseList("src:*c\\+\\+abi*\n"); + std::unique_ptr SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"); EXPECT_TRUE(SCL->inSection("", "src", "c++abi")); EXPECT_FALSE(SCL->inSection("", "src", "c\\+\\+abi")); } @@ -246,9 +244,8 @@ TEST_F(SpecialCaseListTest, PopularTrigram) { } TEST_F(SpecialCaseListTest, EscapedSymbols) { - std::unique_ptr SCL = - makeSpecialCaseList("src:*c\\+\\+abi*\n" - "src:*helloworld*\n"); + std::unique_ptr SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n" + "src:*helloworld*\n"); EXPECT_TRUE(SCL->inSection("", "src", "dir/c++abi")); EXPECT_FALSE(SCL->inSection("", "src", "dir/c\\+\\+abi")); EXPECT_FALSE(SCL->inSection("", "src", "c\\+\\+abi")); ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #136449)
mstorsjo wrote: So, this backport in itself should be clean, but running the libcxx CI on the 20.x release branch is broken and would require a few other backports that are entangled. So it's basically up to @ldionne whether he thinks it's ok to merge despite the unrelated CI failures. (It is tricky because those failures _are_ in the same area; the updated macOS changed its locale definitions, which would require changes similar to these, plus more, to get a green run there. 95d23f58b6bc8e31a5a2f027338c1f6ecab1a0f1 on git main just marks those tests as unsupported...) https://github.com/llvm/llvm-project/pull/136449 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [llvm] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #139468)
mstorsjo wrote: Closing this one. Backporting the extra changes turned out to be a bit messy and entangled. (It's not impossible to do though, but it would require buy-in from libcxx maintainers that we do want to fix up the CI on the 20.x branch.) The original backport in #136449 should be fine on its own though (while the CI has unrelated failures). https://github.com/llvm/llvm-project/pull/139468 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [libcxx] [llvm] release/20.x: [libcxx] Provide locale conversions to tests through lit substitution (#105651) (PR #139468)
https://github.com/mstorsjo closed https://github.com/llvm/llvm-project/pull/139468 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Change SpecialCaseList::inSectionBlame to return pair (FileIdx, LineNo). (PR #141540)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141540 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141591 >From 687bf11493d38ba323e90c1b40ae6919d48ed016 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 - .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 - llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 + 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; 
GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_and_b3
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141589 >From 150fe8c86c080a075fef344b20cd15b1097d3f29 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 11:16:16 +0200 Subject: [PATCH] [AMDGPU] Move S_BFE lowering into RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 14 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 51 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 125 -- 3 files changed, 119 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 9587fad1ecd63..94e1175b06b14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, + lower_uniform_sbfx, lower_uniform_ubfx]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index ee324a5e93f0f..2100900bb8eb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + bool lowerUniformBFX(MachineInstr &MI) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) + ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) + : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + // Ensure the high bits are clear to insert the offset. 
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftAmt = B.buildConstant(S32, 16); + auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + MRI.setRegBank(OffsetMask.getReg(0), *RB); + MRI.setRegBank(ClampOffset.getReg(0), *RB); + MRI.setRegBank(ShiftAmt.getReg(0), *RB); + MRI.setRegBank(ShiftWidth.getReg(0), *RB); + MRI.setRegBank(MergedInputs.getReg(0), *RB); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) +llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..0b7d64ee67c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Ta
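As a reading aid for the patch above (and for the review comments that follow), here is a standalone sketch of the operand packing that `lowerUniformBFX` builds with the AND/SHL/OR sequence. It is not part of the PR and the helper name is invented; it only mirrors the encoding the code comments describe, with the offset in bits [5:0] and the width in bits [22:16] of the second S_BFE source operand.

```cpp
#include <cstdint>

// Illustrative only: scalar equivalent of the MIR built by lowerUniformBFX.
uint32_t packSBFEOperand(uint32_t Offset, uint32_t Width) {
  uint32_t ClampedOffset = Offset & 0x3f; // maskTrailingOnes<uint32_t>(6)
  uint32_t ShiftedWidth = Width << 16;    // width lands in bits [22:16]
  // The shift already clears the low bits of the width term, so no extra
  // clamp is applied to Width (matching the comment in the patch).
  return ClampedOffset | ShiftedWidth;
}
```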
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl ]>; +// Early select of uniform BFX into S_BFE instructions. +// These instructions encode the offset/width in a way that requires using +// bitwise operations. Selecting these instructions early allow the combiner +// to potentially fold these. +class lower_uniform_bfx : GICombineRule< + (defs root:$bfx), + (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>; + +def lower_uniform_sbfx : lower_uniform_bfx; +def lower_uniform_ubfx : lower_uniform_bfx; arsenm wrote: This needs more elaboration; needs to be clear that this can't be a mandatory lowering performed in a combiner https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Move S_BFE lowering into RegBankCombiner (PR #141589)
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( MI.eraseFromParent(); } +bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_UBFX || + MI.getOpcode() == TargetOpcode::G_SBFX); + const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX); + + Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI); + assert(RB && "No RB?"); + if (RB->getID() != AMDGPU::SGPRRegBankID) +return false; + + Register SrcReg = MI.getOperand(1).getReg(); + Register OffsetReg = MI.getOperand(2).getReg(); + Register WidthReg = MI.getOperand(3).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(DstReg); + + const unsigned Opc = (Ty == S32) arsenm wrote: ```suggestion const unsigned Opc = Ty == S32 ``` https://github.com/llvm/llvm-project/pull/141589 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] Introduce CallGraphSection option (PR #117037)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/117037 >From 6a12be2c5b60a95a06875b0b2c4f14228d1fa882 Mon Sep 17 00:00:00 2001 From: prabhukr Date: Wed, 12 Mar 2025 23:30:01 + Subject: [PATCH] Fix EOF newlines. Created using spr 1.3.6-beta.1 --- clang/test/Driver/call-graph-section.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/call-graph-section.c b/clang/test/Driver/call-graph-section.c index 108446729d857..5832aa6754137 100644 --- a/clang/test/Driver/call-graph-section.c +++ b/clang/test/Driver/call-graph-section.c @@ -2,4 +2,4 @@ // RUN: %clang -### -S -fcall-graph-section -fno-call-graph-section %s 2>&1 | FileCheck --check-prefix=NO-CALL-GRAPH-SECTION %s // CALL-GRAPH-SECTION: "-fcall-graph-section" -// NO-CALL-GRAPH-SECTION-NOT: "-fcall-graph-section" \ No newline at end of file +// NO-CALL-GRAPH-SECTION-NOT: "-fcall-graph-section" ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/20.x: [MachO] Improve bounds check (#141083) (PR #141461)
https://github.com/JDevlieghere approved this pull request. https://github.com/llvm/llvm-project/pull/141461 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] callee_type metadata for indirect calls (PR #117036)
https://github.com/Prabhuk updated https://github.com/llvm/llvm-project/pull/117036 >From b7fbe09b32ff02d4f7c52d82fbf8b5cd28138852 Mon Sep 17 00:00:00 2001 From: prabhukr Date: Wed, 23 Apr 2025 04:05:47 + Subject: [PATCH] Address review comments. Created using spr 1.3.6-beta.1 --- clang/lib/CodeGen/CGCall.cpp| 8 clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/CodeGen/CodeGenModule.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 185ee1a970aac..d8ab7140f7943 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5780,19 +5780,19 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (callOrInvoke) { *callOrInvoke = CI; if (CGM.getCodeGenOpts().CallGraphSection) { - assert((TargetDecl && TargetDecl->getFunctionType() || - Callee.getAbstractInfo().getCalleeFunctionProtoType()) && - "cannot find callsite type"); QualType CST; if (TargetDecl && TargetDecl->getFunctionType()) CST = QualType(TargetDecl->getFunctionType(), 0); else if (const auto *FPT = Callee.getAbstractInfo().getCalleeFunctionProtoType()) CST = QualType(FPT, 0); + else +llvm_unreachable( +"Cannot find the callee type to generate callee_type metadata."); // Set type identifier metadata of indirect calls for call graph section. if (!CST.isNull()) -CGM.CreateCalleeTypeMetadataForIcall(CST, *callOrInvoke); +CGM.createCalleeTypeMetadataForIcall(CST, *callOrInvoke); } } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 43cd2405571cf..2fc99639a75cb 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2654,7 +2654,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, // Skip available_externally functions. They won't be codegen'ed in the // current module anyway. if (getContext().GetGVALinkageForFunction(FD) != GVA_AvailableExternally) -CreateFunctionTypeMetadataForIcall(FD, F); +createFunctionTypeMetadataForIcall(FD, F); } } @@ -2868,7 +2868,7 @@ static bool hasExistingGeneralizedTypeMD(llvm::Function *F) { return MD->hasGeneralizedMDString(); } -void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, +void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F) { if (CodeGenOpts.CallGraphSection && !hasExistingGeneralizedTypeMD(F) && (!F->hasLocalLinkage() || @@ -2898,7 +2898,7 @@ void CodeGenModule::CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId)); } -void CodeGenModule::CreateCalleeTypeMetadataForIcall(const QualType &QT, +void CodeGenModule::createCalleeTypeMetadataForIcall(const QualType &QT, llvm::CallBase *CB) { // Only if needed for call graph section and only for indirect calls. if (!CodeGenOpts.CallGraphSection || !CB->isIndirectCall()) @@ -2909,7 +2909,7 @@ void CodeGenModule::CreateCalleeTypeMetadataForIcall(const QualType &QT, getLLVMContext(), {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( llvm::Type::getInt64Ty(getLLVMContext()), 0)), TypeIdMD}); - llvm::MDTuple *MDN = llvm::MDNode::get(getLLVMContext(), { TypeTuple }); + llvm::MDTuple *MDN = llvm::MDNode::get(getLLVMContext(), {TypeTuple}); CB->setMetadata(llvm::LLVMContext::MD_callee_type, MDN); } @@ -3041,7 +3041,7 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F, // jump table. 
if (!CodeGenOpts.SanitizeCfiCrossDso || !CodeGenOpts.SanitizeCfiCanonicalJumpTables) -CreateFunctionTypeMetadataForIcall(FD, F); +createFunctionTypeMetadataForIcall(FD, F); if (LangOpts.Sanitize.has(SanitizerKind::KCFI)) setKCFIType(FD, F); diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index dfbe4388349dd..4b53f0f241b52 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1619,11 +1619,11 @@ class CodeGenModule : public CodeGenTypeCache { llvm::Metadata *CreateMetadataIdentifierGeneralized(QualType T); /// Create and attach type metadata to the given function. - void CreateFunctionTypeMetadataForIcall(const FunctionDecl *FD, + void createFunctionTypeMetadataForIcall(const FunctionDecl *FD, llvm::Function *F); /// Create and attach type metadata to the given call. - void CreateCalleeTypeMetadataForIcall(const QualType &QT, llvm::CallBase *CB); + void createCa
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
@@ -1019,8 +1019,14 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) { return C; }; - if (TIL.TheKind != TypeTestResolution::Unsat) -TIL.OffsetedGlobal = ImportGlobal("global_addr"); + if (TIL.TheKind != TypeTestResolution::Unsat) { +auto *GV = ImportGlobal("global_addr"); +// This is either a vtable (in .data.rel.ro) or a jump table (in .text). +// Either way it's expected to be in the low 2 GiB, so set the small code pcc wrote: Added comment with an explanation. https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] LowerTypeTests: Set small code model on imported globals. (PR #141324)
https://github.com/pcc updated https://github.com/llvm/llvm-project/pull/141324 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
@@ -33,8 +34,11 @@ PreservedAnalyses EmbedBitcodePass::run(Module &M, ModuleAnalysisManager &AM) { std::string Data; raw_string_ostream OS(Data); + // Clone the module with with Thin LTO, since ThinLTOBitcodeWriterPass changes nikic wrote: ```suggestion // Clone the module with Thin LTO, since ThinLTOBitcodeWriterPass changes ``` https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
https://github.com/nikic approved this pull request. Okay, let's go with this for now. Compile-time impact of cloning the module is about 0.2% when building clang with fat LTO: https://llvm-compile-time-tracker.com/compare.php?from=11a01e851a06188ae946ace1140f866d7a667221&to=46e037d763e7997a83ce78c9a602248fd67f0d44&stat=instructions:u https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
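For readers skimming the thread, a minimal sketch of the approach being approved here: clone the module before handing it to the ThinLTO bitcode writer so the original module is left unmodified. This is illustrative only — the function name and surrounding plumbing are invented, not the actual EmbedBitcodePass change — but it is the cloning step whose ~0.2% compile-time cost is quoted above.

```cpp
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <memory>
#include <string>

// Illustrative only: serialize ThinLTO bitcode without mutating M.
static std::string serializeThinLTOBitcodeCopy(const llvm::Module &M) {
  std::string Data;
  llvm::raw_string_ostream OS(Data);
  // ThinLTOBitcodeWriterPass may change the module it runs on, so run it
  // over a clone and keep M untouched.
  std::unique_ptr<llvm::Module> Cloned = llvm::CloneModule(M);
  // ... run ThinLTOBitcodeWriterPass(OS, /*ThinLinkOS=*/nullptr) on *Cloned ...
  return Data;
}
```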
[llvm-branch-commits] [llvm] [llvm][EmbedBitcodePass] Prevent modifying the module with ThinLTO (PR #139999)
https://github.com/nikic edited https://github.com/llvm/llvm-project/pull/13 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner (PR #141591)
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/141591 None >From d102621b16b8c893c4b56248d9c4cf59b3e1bf6e Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 27 May 2025 12:29:02 +0200 Subject: [PATCH] [AMDGPU] Add KnownBits simplification combines to RegBankCombiner --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 59 - .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 61 +++--- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 63 +++ llvm/test/CodeGen/AMDGPU/div_i128.ll | 30 - llvm/test/CodeGen/AMDGPU/itofp.i128.ll| 11 ++-- llvm/test/CodeGen/AMDGPU/lround.ll| 18 +++--- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 16 + 8 files changed, 104 insertions(+), 157 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 96be17c487130..df867aaa204b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -210,5 +210,6 @@ def AMDGPURegBankCombiner : GICombiner< fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines, - lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract]> { + lower_uniform_sbfx, lower_uniform_ubfx, form_bitfield_extract, + known_bits_simplifications]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 6baa10bb48621..cc0f45681a3e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1744,63 +1744,64 @@ define i65 @v_lshr_i65_33(i65 %value) { ; GFX6-LABEL: v_lshr_i65_33: ; GFX6: ; %bb.0: ; GFX6-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT:v_mov_b32_e32 v3, v1 -; GFX6-NEXT:v_mov_b32_e32 v0, 1 +; GFX6-NEXT:v_mov_b32_e32 v3, 1 +; GFX6-NEXT:v_mov_b32_e32 v4, 0 +; GFX6-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX6-NEXT:v_lshl_b64 v[2:3], v[3:4], 31 +; GFX6-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX6-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT:v_mov_b32_e32 v1, 0 -; GFX6-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX6-NEXT:v_lshl_b64 v[0:1], v[0:1], 31 -; GFX6-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX6-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT:v_mov_b32_e32 v2, 0 ; GFX6-NEXT:s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i65_33: ; GFX8: ; %bb.0: ; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT:v_mov_b32_e32 v3, v1 -; GFX8-NEXT:v_mov_b32_e32 v0, 1 +; GFX8-NEXT:v_mov_b32_e32 v3, 1 +; GFX8-NEXT:v_mov_b32_e32 v4, 0 +; GFX8-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX8-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX8-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX8-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT:v_mov_b32_e32 v1, 0 -; GFX8-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX8-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX8-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT:v_mov_b32_e32 v2, 0 ; GFX8-NEXT:s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_lshr_i65_33: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_mov_b32_e32 v3, v1 -; GFX9-NEXT:v_mov_b32_e32 v0, 1 +; GFX9-NEXT:v_mov_b32_e32 v3, 1 +; GFX9-NEXT:v_mov_b32_e32 v4, 0 +; GFX9-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX9-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX9-NEXT:v_lshrrev_b32_e32 v0, 1, v1 +; GFX9-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT:v_mov_b32_e32 v1, 0 -; GFX9-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX9-NEXT:v_lshrrev_b32_e32 v2, 1, 
v3 -; GFX9-NEXT:v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT:v_mov_b32_e32 v2, 0 ; GFX9-NEXT:s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i65_33: ; GFX10: ; %bb.0: ; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT:v_mov_b32_e32 v3, v1 -; GFX10-NEXT:v_mov_b32_e32 v0, 1 +; GFX10-NEXT:v_mov_b32_e32 v3, 1 +; GFX10-NEXT:v_mov_b32_e32 v4, 0 +; GFX10-NEXT:v_and_b32_e32 v3, 1, v2 +; GFX10-NEXT:v_lshrrev_b32_e32 v0, 1, v1 ; GFX10-NEXT:v_mov_b32_e32 v1, 0 -; GFX10-NEXT:v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT:v_lshrrev_b32_e32 v2, 1, v3 -; GFX10-NEXT:v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT:v_lshlrev_b64 v[2:3], 31, v[3:4] +; GFX10-NEXT:v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT:v_mov_b32_e32 v2, 0 ; GFX10-NEXT:s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_lshr_i65_33: ; GFX11: ; %bb.0: ; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT:v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, 1 -; GFX11-NEXT:v_dual_mov_b32 v1, 0 :: v_dual_
[llvm-branch-commits] X86: Add X86TargetLowering::isProfitableToHoist hook for immediate operands. (PR #141326)
pcc wrote: I'm fixing the code generation for the test cases that I'm adding (inhibit-zext-constant-hoist.ll) which were all extracted from a build of a large internal program built with CFI. Previously f1 looked like this where align was hoisted: ``` f1: # @f1 .cfi_startproc # %bb.0: movl$__typeid__ZTS1S_align, %eax movzbl %al, %ecx movb$64, %al subb%cl, %al movzbl %al, %eax testl %edi, %edi je .LBB0_3 # %bb.1: movq(%rsi), %r8 movl$__typeid__ZTS1S_global_addr, %edx movq%r8, %rdi subq%rdx, %rdi movq%rdi, %rdx # kill: def $cl killed $cl killed $rcx shrq%cl, %rdx movl%eax, %ecx shlq%cl, %rdi orq %rdx, %rdi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdi jbe .LBB0_4 .LBB0_2: ud1l2(%eax), %eax .LBB0_3: movq(%rdx), %r8 movl$__typeid__ZTS1S_global_addr, %esi movq%r8, %rdi subq%rsi, %rdi movq%rdi, %rsi # kill: def $cl killed $cl killed $rcx shrq%cl, %rsi movl%eax, %ecx shlq%cl, %rdi orq %rsi, %rdi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdi movq%rdx, %rsi ja .LBB0_2 .LBB0_4: movq%rsi, %rdi jmpq*(%r8) # TAILCALL .Lfunc_end0: .size f1, .Lfunc_end0-f1 .cfi_endproc ``` Now f1 looks like this: ``` f1: # @f1 .cfi_startproc # %bb.0: testl %edi, %edi je .LBB0_3 # %bb.1: movq(%rsi), %rax movl$__typeid__ZTS1S_global_addr, %ecx movq%rax, %rdx subq%rcx, %rdx rorq$__typeid__ZTS1S_align, %rdx cmpq$__typeid__ZTS1S_size_m1@ABS8, %rdx jbe .LBB0_4 .LBB0_2: ud1l2(%eax), %eax .LBB0_3: movq(%rdx), %rax movl$__typeid__ZTS1S_global_addr, %ecx movq%rax, %rsi subq%rcx, %rsi rorq$__typeid__ZTS1S_align, %rsi cmpq$__typeid__ZTS1S_size_m1@ABS8, %rsi movq%rdx, %rsi ja .LBB0_2 .LBB0_4: movq%rsi, %rdi jmpq*(%rax) # TAILCALL .Lfunc_end0: .size f1, .Lfunc_end0-f1 .cfi_endproc ``` The other cases look similar before my change. The poor codegen issue was introduced (I believe) by https://github.com/llvm/llvm-project/pull/71040 which removed the zext ConstantExprs that LowerTypeTests was using to keep everything in the same basic block for matching. I think that because it's the zext being hoisted and not the shifts it wouldn't make a difference to use fshl/fshr but I can check. https://github.com/llvm/llvm-project/pull/141326 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
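As a reference for the assembly listings above, here is a scalar sketch of the range check the generated code performs. It is not code from the PR, just the computation both the "before" and "after" listings implement; in the "after" version the rotate maps onto a single `rorq` once the zext of the alignment constant is no longer hoisted into a separate shift pair.

```cpp
#include <cstdint>

// Illustrative only: (Ptr - GlobalAddr) rotated right by Align bits must be
// no larger than SizeM1 for the pointer to pass the CFI type check.
bool passesTypeCheck(uint64_t Ptr, uint64_t GlobalAddr, unsigned Align,
                     uint64_t SizeM1) {
  uint64_t Diff = Ptr - GlobalAddr;
  uint64_t Rotated = Align ? (Diff >> Align) | (Diff << (64 - Align)) : Diff;
  return Rotated <= SizeM1;
}
```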
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: account for BRK when searching for auth oracles (PR #137975)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137975 >From ec1655b1fab18e3c2e13bc7b35ac1e151af4b615 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 30 Apr 2025 16:08:10 +0300 Subject: [PATCH] [BOLT] Gadget scanner: account for BRK when searching for auth oracles An authenticated pointer can be explicitly checked by the compiler via a sequence of instructions that executes BRK on failure. It is important to recognize such BRK instruction as checking every register (as it is expected to immediately trigger an abnormal program termination) to prevent false positive reports about authentication oracles: autia x2, x3 autia x0, x1 ; neither x0 nor x2 are checked at this point eor x16, x0, x0, lsl #1 tbz x16, #62, on_success ; marks x0 as checked ; end of BB: for x2 to be checked here, it must be checked in both ; successor basic blocks on_failure: brk 0xc470 on_success: ; x2 is checked ldr x1, [x2] ; marks x2 as checked --- bolt/include/bolt/Core/MCPlusBuilder.h| 14 ++ bolt/lib/Passes/PAuthGadgetScanner.cpp| 13 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 24 -- .../AArch64/gs-pauth-address-checks.s | 44 +-- .../AArch64/gs-pauth-authentication-oracles.s | 9 ++-- .../AArch64/gs-pauth-signing-oracles.s| 6 +-- 6 files changed, 75 insertions(+), 35 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index b233452985502..c8cbcaf33f4b5 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -707,6 +707,20 @@ class MCPlusBuilder { return false; } + /// Returns true if Inst is a trap instruction. + /// + /// Tests if Inst is an instruction that immediately causes an abnormal + /// program termination, for example when a security violation is detected + /// by a compiler-inserted check. + /// + /// @note An implementation of this method should likely return false for + /// calls to library functions like abort(), as it is possible that the + /// execution state is partially attacker-controlled at this point. + virtual bool isTrap(const MCInst &Inst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isBreakpoint(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index b539f1a211d4f..20d921728283e 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1053,6 +1053,15 @@ class DstSafetyAnalysis { dbgs() << ")\n"; }); +// If this instruction terminates the program immediately, no +// authentication oracles are possible past this point. +if (BC.MIB->isTrap(Point)) { + LLVM_DEBUG({ traceInst(BC, "Trap instruction found", Point); }); + DstState Next(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); + Next.CannotEscapeUnchecked.set(); + return Next; +} + // If this instruction is reachable by the analysis, a non-empty state will // be propagated to it sooner or later. Until then, skip computeNext(). if (Cur.empty()) { @@ -1160,8 +1169,8 @@ class DataflowDstSafetyAnalysis // // A basic block without any successors, on the other hand, can be // pessimistically initialized to everything-is-unsafe: this will naturally -// handle both return and tail call instructions and is harmless for -// internal indirect branch instructions (such as computed gotos). +// handle return, trap and tail call instructions. 
At the same time, it is +// harmless for internal indirect branch instructions, like computed gotos. if (BB.succ_empty()) return createUnsafeState(); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 9d5a578cfbdff..b669d32cc2032 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -386,10 +386,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // the list of successors of this basic block as appropriate. // Any of the above code sequences assume the fall-through basic block -// is a dead-end BRK instruction (any immediate operand is accepted). +// is a dead-end trap instruction. const BinaryBasicBlock *BreakBB = BB.getFallthrough(); -if (!BreakBB || BreakBB->empty() || -BreakBB->front().getOpcode() != AArch64::BRK) +if (!BreakBB || BreakBB->empty() || !isTrap(BreakBB->front())) return std::nullopt; // Iterate over the instructions of BB in reverse order, matching opcodes @@ -1751,6 +1750,25 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Inst.addOperand(MCOperand::createImm(0)); }
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect untrusted LR before tail call (PR #137224)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137224 >From 11094a446c4b193d5b5e3023cdd01de0e619 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 22 Apr 2025 21:43:14 +0300 Subject: [PATCH 1/2] [BOLT] Gadget scanner: detect untrusted LR before tail call Implement the detection of tail calls performed with untrusted link register, which violates the assumption made on entry to every function. Unlike other pauth gadgets, this one involves some amount of guessing which branch instructions should be checked as tail calls. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 94 ++- .../AArch64/gs-pacret-autiasp.s | 31 +- .../AArch64/gs-pauth-debug-output.s | 30 +- .../AArch64/gs-pauth-tail-calls.s | 597 ++ 4 files changed, 706 insertions(+), 46 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index fc6120e922baa..b539f1a211d4f 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -737,19 +737,14 @@ template class CFGUnawareAnalysis { // // Then, a function can be split into a number of disjoint contiguous sequences // of instructions without labels in between. These sequences can be processed -// the same way basic blocks are processed by data-flow analysis, assuming -// pessimistically that all registers are unsafe at the start of each sequence. +// the same way basic blocks are processed by data-flow analysis, with the same +// pessimistic estimation of the initial state at the start of each sequence +// (except the first instruction of the function). class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, public CFGUnawareAnalysis { using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -759,6 +754,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, } void run() override { +const SrcState DefaultState = computePessimisticState(BF); SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; @@ -773,7 +769,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, LLVM_DEBUG({ traceInst(BC, "Due to label, resetting the state before", Inst); }); -S = createUnsafeState(); +S = DefaultState; } // Attach the state *before* this instruction executes. @@ -1302,6 +1298,83 @@ shouldReportReturnGadget(const BinaryContext &BC, const MCInstReference &Inst, return make_gadget_report(RetKind, Inst, *RetReg); } +/// While BOLT already marks some of the branch instructions as tail calls, +/// this function tries to improve the coverage by including less obvious cases +/// when it is possible to do without introducing too many false positives. +static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, + const BinaryFunction &BF, + const MCInstReference &Inst) { + // Some BC.MIB->isXYZ(Inst) methods simply delegate to MCInstrDesc::isXYZ() + // (such as isBranch at the time of writing this comment), some don't (such + // as isCall). For that reason, call MCInstrDesc's methods explicitly when + // it is important. 
+ const MCInstrDesc &Desc = + BC.MII->get(static_cast(Inst).getOpcode()); + // Tail call should be a branch (but not necessarily an indirect one). + if (!Desc.isBranch()) +return false; + + // Always analyze the branches already marked as tail calls by BOLT. + if (BC.MIB->isTailCall(Inst)) +return true; + + // Try to also check the branches marked as "UNKNOWN CONTROL FLOW" - the + // below is a simplified condition from BinaryContext::printInstruction. + bool IsUnknownControlFlow = + BC.MIB->isIndirectBranch(Inst) && !BC.MIB->getJumpTable(Inst); + + if (BF.hasCFG() && IsUnknownControlFlow) +return true; + + return false; +} + +static std::optional> +shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, + const MCInstReference &Inst, const SrcState &S) { + static const GadgetKind UntrustedLRKind( + "untrusted link register found before tail call"); + + if (!shouldAnalyzeTailCallInst(BC, BF, Inst)) +return std::nullopt; + + // Not only the set of registers returned by getTrustedLiveInRegs() can be + // seen as a reasonable target-independent _approximation_ of "the LR", these + // are *exactly* those regis
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: prevent false positives due to jump tables (PR #138884)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138884 >From 65f42b50bc58d2e5b78946bf82be3db9f5d63230 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 6 May 2025 11:31:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: prevent false positives due to jump tables As part of PAuth hardening, AArch64 LLVM backend can use a special BR_JumpTable pseudo (enabled by -faarch64-jump-table-hardening Clang option) which is expanded in the AsmPrinter into a contiguous sequence without unsafe instructions in the middle. This commit adds another target-specific callback to MCPlusBuilder to make it possible to inhibit false positives for known-safe jump table dispatch sequences. Without special handling, the branch instruction is likely to be reported as a non-protected call (as its destination is not produced by an auth instruction, PC-relative address materialization, etc.) and possibly as a tail call being performed with unsafe link register (as the detection whether the branch instruction is a tail call is an heuristic). For now, only the specific instruction sequence used by the AArch64 LLVM backend is matched. --- bolt/include/bolt/Core/MCInstUtils.h | 9 + bolt/include/bolt/Core/MCPlusBuilder.h| 14 + bolt/lib/Core/MCInstUtils.cpp | 20 + bolt/lib/Passes/PAuthGadgetScanner.cpp| 10 + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 73 ++ .../AArch64/gs-pauth-jump-table.s | 703 ++ 6 files changed, 829 insertions(+) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-jump-table.s diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 50b7d56470c99..33d36cccbcfff 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -154,6 +154,15 @@ class MCInstReference { return nullptr; } + /// Returns the only preceding instruction, or std::nullopt if multiple or no + /// predecessors are possible. + /// + /// If CFG information is available, basic block boundary can be crossed, + /// provided there is exactly one predecessor. If CFG is not available, the + /// preceding instruction in the offset order is returned, unless this is the + /// first instruction of the function. + std::optional getSinglePredecessor(); + raw_ostream &print(raw_ostream &OS) const; }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index c8cbcaf33f4b5..3abf4d18e94da 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -14,6 +14,7 @@ #ifndef BOLT_CORE_MCPLUSBUILDER_H #define BOLT_CORE_MCPLUSBUILDER_H +#include "bolt/Core/MCInstUtils.h" #include "bolt/Core/MCPlus.h" #include "bolt/Core/Relocation.h" #include "llvm/ADT/ArrayRef.h" @@ -700,6 +701,19 @@ class MCPlusBuilder { return std::nullopt; } + /// Tests if BranchInst corresponds to an instruction sequence which is known + /// to be a safe dispatch via jump table. + /// + /// The target can decide which instruction sequences to consider "safe" from + /// the Pointer Authentication point of view, such as any jump table dispatch + /// sequence without function calls inside, any sequence which is contiguous, + /// or only some specific well-known sequences. 
+ virtual bool + isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 40f6edd59135c..b7c6d898988af 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -55,3 +55,23 @@ raw_ostream &MCInstReference::print(raw_ostream &OS) const { OS << ">"; return OS; } + +std::optional MCInstReference::getSinglePredecessor() { + if (const RefInBB *Ref = tryGetRefInBB()) { +if (Ref->It != Ref->BB->begin()) + return MCInstReference(Ref->BB, &*std::prev(Ref->It)); + +if (Ref->BB->pred_size() != 1) + return std::nullopt; + +BinaryBasicBlock *PredBB = *Ref->BB->pred_begin(); +assert(!PredBB->empty() && "Empty basic blocks are not supported yet"); +return MCInstReference(PredBB, &*PredBB->rbegin()); + } + + const RefInBF &Ref = getRefInBF(); + if (Ref.It == Ref.BF->instrs().begin()) +return std::nullopt; + + return MCInstReference(Ref.BF, std::prev(Ref.It)); +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index d8d14b50d9216..85c0f34af74a1 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1342,6 +1342,11 @@ shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, return std::nullopt; } + if (BC.MIB->isSafeJumpTableBranchForPtrAuth(Inst)) { +LL
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: improve handling of unreachable basic blocks (PR #136183)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/136183 >From 1bd54289ecbb513831c9b94adfc1822abf3deb73 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Thu, 17 Apr 2025 20:51:16 +0300 Subject: [PATCH 1/3] [BOLT] Gadget scanner: improve handling of unreachable basic blocks Instead of refusing to analyze an instruction completely, when it is unreachable according to the CFG reconstructed by BOLT, pessimistically assume all registers to be unsafe at the start of basic blocks without any predecessors. Nevertheless, unreachable basic blocks found in optimized code likely means imprecise CFG reconstruction, thus report a warning once per basic block without predecessors. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 46 ++- .../AArch64/gs-pacret-autiasp.s | 7 ++- .../binary-analysis/AArch64/gs-pauth-calls.s | 57 +++ 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index c419ff74992a7..0b0f8fb77b2fd 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -342,6 +342,12 @@ class SrcSafetyAnalysis { return S; } + /// Creates a state with all registers marked unsafe (not to be confused + /// with empty state). + SrcState createUnsafeState() const { +return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); + } + BitVector getClobberedRegs(const MCInst &Point) const { BitVector Clobbered(NumRegs); // Assume a call can clobber all registers, including callee-saved @@ -585,6 +591,13 @@ class DataflowSrcSafetyAnalysis if (BB.isEntryPoint()) return createEntryState(); +// If a basic block without any predecessors is found in an optimized code, +// this likely means that some CFG edges were not detected. Pessimistically +// assume all registers to be unsafe before this basic block and warn about +// this fact in FunctionAnalysis::findUnsafeUses(). +if (BB.pred_empty()) + return createUnsafeState(); + return SrcState(); } @@ -689,12 +702,6 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -1355,19 +1362,30 @@ void FunctionAnalysisContext::findUnsafeUses( BF.dump(); }); + if (BF.hasCFG()) { +// Warn on basic blocks being unreachable according to BOLT, as this +// likely means CFG is imprecise. +for (BinaryBasicBlock &BB : BF) { + if (!BB.pred_empty() || BB.isEntryPoint()) +continue; + // Arbitrarily attach the report to the first instruction of BB. + MCInst *InstToReport = BB.getFirstNonPseudoInstr(); + if (!InstToReport) +continue; // BB has no real instructions + + Reports.push_back( + make_generic_report(MCInstReference::get(InstToReport, BF), + "Warning: no predecessor basic blocks detected " + "(possibly incomplete CFG)")); +} + } + iterateOverInstrs(BF, [&](MCInstReference Inst) { if (BC.MIB->isCFI(Inst)) return; const SrcState &S = Analysis->getStateBefore(Inst); - -// If non-empty state was never propagated from the entry basic block -// to Inst, assume it to be unreachable and report a warning. 
-if (S.empty()) { - Reports.push_back( - make_generic_report(Inst, "Warning: unreachable instruction found")); - return; -} +assert(!S.empty() && "Instruction has no associated state"); if (auto Report = shouldReportReturnGadget(BC, Inst, S)) Reports.push_back(*Report); diff --git a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s index 284f0bea607a5..6559ba336e8de 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s +++ b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s @@ -215,12 +215,17 @@ f_callclobbered_calleesaved: .globl f_unreachable_instruction .type f_unreachable_instruction,@function f_unreachable_instruction: -// CHECK-LABEL: GS-PAUTH: Warning: unreachable instruction found in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address +// CHECK-LABEL: GS-PAUTH: Warning: no predecessor basic blocks detected (possibly incomplete CFG) in function f_unreachable_instruction, basic block {{[0-9a-zA-Z.]+}}, at address // CHECK-NEXT:The instruction is {{[0-9a-f]+}}: add x0, x1, x2 // CHECK-NOT: instructions that write t
[llvm-branch-commits] [llvm] [BOLT] Introduce helpers to match `MCInst`s one at a time (NFC) (PR #138883)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138883 >From daff3ce49a0272612af7d94d6be176a2c65e305c Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 7 May 2025 16:42:00 +0300 Subject: [PATCH] [BOLT] Introduce helpers to match `MCInst`s one at a time (NFC) Introduce matchInst helper function to capture and/or match the operands of MCInst. Unlike the existing `MCPlusBuilder::MCInstMatcher` machinery, matchInst is intended for the use cases when precise control over the instruction order is required. For example, when validating PtrAuth hardening, all registers are usually considered unsafe after a function call, even though callee-saved registers should preserve their old values *under normal operation*. --- bolt/include/bolt/Core/MCInstUtils.h | 128 ++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 90 +--- 2 files changed, 162 insertions(+), 56 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 69bf5e6159b74..50b7d56470c99 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -162,6 +162,134 @@ static inline raw_ostream &operator<<(raw_ostream &OS, return Ref.print(OS); } +/// Instruction-matching helpers operating on a single instruction at a time. +/// +/// Unlike MCPlusBuilder::MCInstMatcher, this matchInst() function focuses on +/// the cases where a precise control over the instruction order is important: +/// +/// // Bring the short names into the local scope: +/// using namespace MCInstMatcher; +/// // Declare the registers to capture: +/// Reg Xn, Xm; +/// // Capture the 0th and 1st operands, match the 2nd operand against the +/// // just captured Xm register, match the 3rd operand against literal 0: +/// if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)) +/// return AArch64::NoRegister; +/// // Match the 0th operand against Xm: +/// if (!matchInst(MaybeBr, AArch64::BR, Xm)) +/// return AArch64::NoRegister; +/// // Return the matched register: +/// return Xm.get(); +namespace MCInstMatcher { + +// The base class to match an operand of type T. +// +// The subclasses of OpMatcher are intended to be allocated on the stack and +// to only be used by passing them to matchInst() and by calling their get() +// function, thus the peculiar `mutable` specifiers: to make the calling code +// compact and readable, the templated matchInst() function has to accept both +// long-lived Imm/Reg wrappers declared as local variables (intended to capture +// the first operand's value and match the subsequent operands, whether inside +// a single instruction or across multiple instructions), as well as temporary +// wrappers around literal values to match, f.e. Imm(42) or Reg(AArch64::XZR). +template class OpMatcher { + mutable std::optional Value; + mutable std::optional SavedValue; + + // Remember/restore the last Value - to be called by matchInst. + void remember() const { SavedValue = Value; } + void restore() const { Value = SavedValue; } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +protected: + OpMatcher(std::optional ValueToMatch) : Value(ValueToMatch) {} + + bool matchValue(T OpValue) const { +// Check that OpValue does not contradict the existing Value. +bool MatchResult = !Value || *Value == OpValue; +// If MatchResult is false, all matchers will be reset before returning from +// matchInst, including this one, thus no need to assign conditionally. 
+Value = OpValue; + +return MatchResult; + } + +public: + /// Returns the captured value. + T get() const { +assert(Value.has_value()); +return *Value; + } +}; + +class Reg : public OpMatcher { + bool matches(const MCOperand &Op) const { +if (!Op.isReg()) + return false; + +return matchValue(Op.getReg()); + } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +public: + Reg(std::optional RegToMatch = std::nullopt) + : OpMatcher(RegToMatch) {} +}; + +class Imm : public OpMatcher { + bool matches(const MCOperand &Op) const { +if (!Op.isImm()) + return false; + +return matchValue(Op.getImm()); + } + + template + friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...); + +public: + Imm(std::optional ImmToMatch = std::nullopt) + : OpMatcher(ImmToMatch) {} +}; + +/// Tries to match Inst and updates Ops on success. +/// +/// If Inst has the specified Opcode and its operand list prefix matches Ops, +/// this function returns true and updates Ops, otherwise false is returned and +/// values of Ops are kept as before matchInst was called. +/// +/// Please note that while Ops are technically passed by a const reference to +/// make invocations like `matchInst(MI, Opcode, Imm(42))` possible, all their +/// fields are marked mut
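For readers unfamiliar with the new helpers, here is a minimal sketch of the intended call pattern, assembled only from the doc comment above; the function name `getBranchTargetReg`, the surrounding context, and the omitted namespace qualification are hypothetical and not part of this patch:

```cpp
#include "bolt/Core/MCInstUtils.h"

// Hypothetical helper: recognize "add xD, xN, xD, lsl #0" followed by
// "br xD" and return the register being branched to, or NoRegister.
static MCPhysReg getBranchTargetReg(const MCInst &MaybeAdd,
                                    const MCInst &MaybeBr) {
  using namespace MCInstMatcher;
  // Declare the registers to capture; both start out unset.
  Reg Xm, Xn;
  // Capture operands 0 and 1, require operand 2 to equal the just-captured
  // Xm, and require operand 3 to be the immediate 0.
  if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)))
    return AArch64::NoRegister;
  // The branch must use the register captured from the add.
  if (!matchInst(MaybeBr, AArch64::BR, Xm))
    return AArch64::NoRegister;
  // Return the captured register.
  return Xm.get();
}
```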
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: detect untrusted LR before tail call (PR #137224)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/137224 >From 11094a446c4b193d5b5e3023cdd01de0e619 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 22 Apr 2025 21:43:14 +0300 Subject: [PATCH 1/2] [BOLT] Gadget scanner: detect untrusted LR before tail call Implement the detection of tail calls performed with untrusted link register, which violates the assumption made on entry to every function. Unlike other pauth gadgets, this one involves some amount of guessing which branch instructions should be checked as tail calls. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 94 ++- .../AArch64/gs-pacret-autiasp.s | 31 +- .../AArch64/gs-pauth-debug-output.s | 30 +- .../AArch64/gs-pauth-tail-calls.s | 597 ++ 4 files changed, 706 insertions(+), 46 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index fc6120e922baa..b539f1a211d4f 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -737,19 +737,14 @@ template class CFGUnawareAnalysis { // // Then, a function can be split into a number of disjoint contiguous sequences // of instructions without labels in between. These sequences can be processed -// the same way basic blocks are processed by data-flow analysis, assuming -// pessimistically that all registers are unsafe at the start of each sequence. +// the same way basic blocks are processed by data-flow analysis, with the same +// pessimistic estimation of the initial state at the start of each sequence +// (except the first instruction of the function). class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, public CFGUnawareAnalysis { using SrcSafetyAnalysis::BC; BinaryFunction &BF; - /// Creates a state with all registers marked unsafe (not to be confused - /// with empty state). - SrcState createUnsafeState() const { -return SrcState(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters()); - } - public: CFGUnawareSrcSafetyAnalysis(BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId, @@ -759,6 +754,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, } void run() override { +const SrcState DefaultState = computePessimisticState(BF); SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; @@ -773,7 +769,7 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, LLVM_DEBUG({ traceInst(BC, "Due to label, resetting the state before", Inst); }); -S = createUnsafeState(); +S = DefaultState; } // Attach the state *before* this instruction executes. @@ -1302,6 +1298,83 @@ shouldReportReturnGadget(const BinaryContext &BC, const MCInstReference &Inst, return make_gadget_report(RetKind, Inst, *RetReg); } +/// While BOLT already marks some of the branch instructions as tail calls, +/// this function tries to improve the coverage by including less obvious cases +/// when it is possible to do without introducing too many false positives. +static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, + const BinaryFunction &BF, + const MCInstReference &Inst) { + // Some BC.MIB->isXYZ(Inst) methods simply delegate to MCInstrDesc::isXYZ() + // (such as isBranch at the time of writing this comment), some don't (such + // as isCall). For that reason, call MCInstrDesc's methods explicitly when + // it is important. 
+ const MCInstrDesc &Desc = + BC.MII->get(static_cast(Inst).getOpcode()); + // Tail call should be a branch (but not necessarily an indirect one). + if (!Desc.isBranch()) +return false; + + // Always analyze the branches already marked as tail calls by BOLT. + if (BC.MIB->isTailCall(Inst)) +return true; + + // Try to also check the branches marked as "UNKNOWN CONTROL FLOW" - the + // below is a simplified condition from BinaryContext::printInstruction. + bool IsUnknownControlFlow = + BC.MIB->isIndirectBranch(Inst) && !BC.MIB->getJumpTable(Inst); + + if (BF.hasCFG() && IsUnknownControlFlow) +return true; + + return false; +} + +static std::optional> +shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, + const MCInstReference &Inst, const SrcState &S) { + static const GadgetKind UntrustedLRKind( + "untrusted link register found before tail call"); + + if (!shouldAnalyzeTailCallInst(BC, BF, Inst)) +return std::nullopt; + + // Not only the set of registers returned by getTrustedLiveInRegs() can be + // seen as a reasonable target-independent _approximation_ of "the LR", these + // are *exactly* those regis
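The report itself (truncated above) boils down to checking whether every register that must be trusted on entry to the callee is still trusted right before the branch. A rough, hypothetical sketch of that predicate, assuming the `SrcState` bit vectors shown elsewhere in this series; this is an illustration, not code from the patch:

```cpp
// Illustrative only: a candidate tail call is reported when some register
// that the callee expects to be trusted on entry (LR on AArch64) is not
// trusted in the state computed just before the branch instruction.
static bool hasUntrustedLiveIn(const SrcState &S,
                               ArrayRef<MCPhysReg> TrustedLiveIns) {
  return llvm::any_of(TrustedLiveIns, [&](MCPhysReg Reg) {
    return !S.TrustedRegs[Reg];
  });
}
```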
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: optionally assume auth traps on failure (PR #139778)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/139778 >From d6536f26399b2c385821c65af779fc694e8eee72 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 13 May 2025 19:50:41 +0300 Subject: [PATCH] [BOLT] Gadget scanner: optionally assume auth traps on failure On AArch64 it is possible for an auth instruction to either return an invalid address value on failure (without FEAT_FPAC) or generate an error (with FEAT_FPAC). It thus may be possible to never emit explicit pointer checks, if the target CPU is known to support FEAT_FPAC. This commit implements an --auth-traps-on-failure command line option, which essentially makes "safe-to-dereference" and "trusted" register properties identical and disables scanning for authentication oracles completely. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 112 +++ .../binary-analysis/AArch64/cmdline-args.test | 1 + .../AArch64/gs-pauth-authentication-oracles.s | 6 +- .../binary-analysis/AArch64/gs-pauth-calls.s | 5 +- .../AArch64/gs-pauth-debug-output.s | 177 ++--- .../AArch64/gs-pauth-jump-table.s | 6 +- .../AArch64/gs-pauth-signing-oracles.s| 54 ++--- .../AArch64/gs-pauth-tail-calls.s | 184 +- 8 files changed, 318 insertions(+), 227 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 85c0f34af74a1..b7573c96d183e 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -14,6 +14,7 @@ #include "bolt/Passes/PAuthGadgetScanner.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Passes/DataflowAnalysis.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/MC/MCInst.h" @@ -26,6 +27,11 @@ namespace llvm { namespace bolt { namespace PAuthGadgetScanner { +static cl::opt AuthTrapsOnFailure( +"auth-traps-on-failure", +cl::desc("Assume authentication instructions always trap on failure"), +cl::cat(opts::BinaryAnalysisCategory)); + [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, const MCInst &MI) { dbgs() << " " << Label << ": "; @@ -364,6 +370,34 @@ class SrcSafetyAnalysis { return Clobbered; } + std::optional getRegMadeTrustedByChecking(const MCInst &Inst, + SrcState Cur) const { +// This functions cannot return multiple registers. This is never the case +// on AArch64. +std::optional RegCheckedByInst = +BC.MIB->getAuthCheckedReg(Inst, /*MayOverwrite=*/false); +if (RegCheckedByInst && Cur.SafeToDerefRegs[*RegCheckedByInst]) + return *RegCheckedByInst; + +auto It = CheckerSequenceInfo.find(&Inst); +if (It == CheckerSequenceInfo.end()) + return std::nullopt; + +MCPhysReg RegCheckedBySequence = It->second.first; +const MCInst *FirstCheckerInst = It->second.second; + +// FirstCheckerInst should belong to the same basic block (see the +// assertion in DataflowSrcSafetyAnalysis::run()), meaning it was +// deterministically processed a few steps before this instruction. +const SrcState &StateBeforeChecker = getStateBefore(*FirstCheckerInst); + +// The sequence checks the register, but it should be authenticated before. +if (!StateBeforeChecker.SafeToDerefRegs[RegCheckedBySequence]) + return std::nullopt; + +return RegCheckedBySequence; + } + // Returns all registers that can be treated as if they are written by an // authentication instruction. 
SmallVector getRegsMadeSafeToDeref(const MCInst &Point, @@ -386,18 +420,38 @@ class SrcSafetyAnalysis { Regs.push_back(DstAndSrc->first); } +// Make sure explicit checker sequence keeps register safe-to-dereference +// when the register would be clobbered according to the regular rules: +// +//; LR is safe to dereference here +//mov x16, x30 ; start of the sequence, LR is s-t-d right before +//xpaclri ; clobbers LR, LR is not safe anymore +//cmp x30, x16 +//b.eq 1f; end of the sequence: LR is marked as trusted +//brk 0x1234 +// 1: +//; at this point LR would be marked as trusted, +//; but not safe-to-dereference +// +// or even just +// +//; X1 is safe to dereference here +//ldr x0, [x1, #8]! +//; X1 is trusted here, but it was clobbered due to address write-back +if (auto CheckedReg = getRegMadeTrustedByChecking(Point, Cur)) + Regs.push_back(*CheckedReg); + return Regs; } // Returns all registers made trusted by this instruction. SmallVector getRegsMadeTrusted(const MCInst &Point, const SrcState &Cur) const { +assert(!AuthTrapsOnFailure &&
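In other words (an illustrative sketch, not code from this patch), the new option collapses the two per-register properties that the scanner tracks:

```cpp
// Sketch: with --auth-traps-on-failure an authentication instruction either
// produces a valid pointer or traps, so any value that is safe to
// dereference is also trusted; without the option the two sets differ.
static bool isTrustedAt(const SrcState &S, MCPhysReg Reg,
                        bool AuthTrapsOnFailure) {
  return AuthTrapsOnFailure ? S.SafeToDerefRegs[Reg] : S.TrustedRegs[Reg];
}
```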
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: prevent false positives due to jump tables (PR #138884)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138884 >From 65f42b50bc58d2e5b78946bf82be3db9f5d63230 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 6 May 2025 11:31:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: prevent false positives due to jump tables As part of PAuth hardening, AArch64 LLVM backend can use a special BR_JumpTable pseudo (enabled by -faarch64-jump-table-hardening Clang option) which is expanded in the AsmPrinter into a contiguous sequence without unsafe instructions in the middle. This commit adds another target-specific callback to MCPlusBuilder to make it possible to inhibit false positives for known-safe jump table dispatch sequences. Without special handling, the branch instruction is likely to be reported as a non-protected call (as its destination is not produced by an auth instruction, PC-relative address materialization, etc.) and possibly as a tail call being performed with unsafe link register (as the detection whether the branch instruction is a tail call is an heuristic). For now, only the specific instruction sequence used by the AArch64 LLVM backend is matched. --- bolt/include/bolt/Core/MCInstUtils.h | 9 + bolt/include/bolt/Core/MCPlusBuilder.h| 14 + bolt/lib/Core/MCInstUtils.cpp | 20 + bolt/lib/Passes/PAuthGadgetScanner.cpp| 10 + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 73 ++ .../AArch64/gs-pauth-jump-table.s | 703 ++ 6 files changed, 829 insertions(+) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-jump-table.s diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 50b7d56470c99..33d36cccbcfff 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -154,6 +154,15 @@ class MCInstReference { return nullptr; } + /// Returns the only preceding instruction, or std::nullopt if multiple or no + /// predecessors are possible. + /// + /// If CFG information is available, basic block boundary can be crossed, + /// provided there is exactly one predecessor. If CFG is not available, the + /// preceding instruction in the offset order is returned, unless this is the + /// first instruction of the function. + std::optional getSinglePredecessor(); + raw_ostream &print(raw_ostream &OS) const; }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index c8cbcaf33f4b5..3abf4d18e94da 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -14,6 +14,7 @@ #ifndef BOLT_CORE_MCPLUSBUILDER_H #define BOLT_CORE_MCPLUSBUILDER_H +#include "bolt/Core/MCInstUtils.h" #include "bolt/Core/MCPlus.h" #include "bolt/Core/Relocation.h" #include "llvm/ADT/ArrayRef.h" @@ -700,6 +701,19 @@ class MCPlusBuilder { return std::nullopt; } + /// Tests if BranchInst corresponds to an instruction sequence which is known + /// to be a safe dispatch via jump table. + /// + /// The target can decide which instruction sequences to consider "safe" from + /// the Pointer Authentication point of view, such as any jump table dispatch + /// sequence without function calls inside, any sequence which is contiguous, + /// or only some specific well-known sequences. 
+ virtual bool + isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const { +llvm_unreachable("not implemented"); +return false; + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 40f6edd59135c..b7c6d898988af 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -55,3 +55,23 @@ raw_ostream &MCInstReference::print(raw_ostream &OS) const { OS << ">"; return OS; } + +std::optional MCInstReference::getSinglePredecessor() { + if (const RefInBB *Ref = tryGetRefInBB()) { +if (Ref->It != Ref->BB->begin()) + return MCInstReference(Ref->BB, &*std::prev(Ref->It)); + +if (Ref->BB->pred_size() != 1) + return std::nullopt; + +BinaryBasicBlock *PredBB = *Ref->BB->pred_begin(); +assert(!PredBB->empty() && "Empty basic blocks are not supported yet"); +return MCInstReference(PredBB, &*PredBB->rbegin()); + } + + const RefInBF &Ref = getRefInBF(); + if (Ref.It == Ref.BF->instrs().begin()) +return std::nullopt; + + return MCInstReference(Ref.BF, std::prev(Ref.It)); +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index d8d14b50d9216..85c0f34af74a1 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1342,6 +1342,11 @@ shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF, return std::nullopt; } + if (BC.MIB->isSafeJumpTableBranchForPtrAuth(Inst)) { +LL
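For context, a simplified sketch of what a target-specific override could look like; the actual AArch64 implementation in this patch matches the exact contiguous sequence emitted for the BR_JumpTable pseudo, which is omitted here:

```cpp
// Simplified, hypothetical shape of the override. The real implementation
// walks the known-safe dispatch sequence instruction by instruction.
bool isSafeJumpTableBranchForPtrAuth(MCInstReference BranchInst) const override {
  // Step backwards only while the predecessor is unique; any label or CFG
  // merge point makes the sequence non-contiguous and thus not provably safe.
  std::optional<MCInstReference> Pred = BranchInst.getSinglePredecessor();
  if (!Pred)
    return false;
  // ... match each preceding instruction of the expected dispatch sequence
  // here (for example with the matchInst() helpers from PR #138883) ...
  return false; // conservative placeholder in this sketch
}
```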
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
https://github.com/atrosinenko created https://github.com/llvm/llvm-project/pull/141665 Perform trivial syntactical cleanups: * make use of structured binding declarations * use LLVM utility functions when appropriate * omit braces around single expression inside single-line LLVM_DEBUG() This patch is NFC aside from minor debug output changes. >From cbf7911e11d6d69dbab934a384004b8f886aad13 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 27 May 2025 21:06:03 +0300 Subject: [PATCH] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers Perform trivial syntactical cleanups: * make use of structured binding declarations * use LLVM utility functions when appropriate * omit braces around single expression inside single-line LLVM_DEBUG() This patch is NFC aside from minor debug output changes. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 67 +-- .../AArch64/gs-pauth-debug-output.s | 14 ++-- 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index b7573c96d183e..4d283482d9472 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -88,8 +88,8 @@ class TrackedRegisters { TrackedRegisters(ArrayRef RegsToTrack) : Registers(RegsToTrack), RegToIndexMapping(getMappingSize(RegsToTrack), NoIndex) { -for (unsigned I = 0; I < RegsToTrack.size(); ++I) - RegToIndexMapping[RegsToTrack[I]] = I; +for (auto [MappedIndex, Reg] : llvm::enumerate(RegsToTrack)) + RegToIndexMapping[Reg] = MappedIndex; } ArrayRef getRegisters() const { return Registers; } @@ -203,9 +203,9 @@ struct SrcState { SafeToDerefRegs &= StateIn.SafeToDerefRegs; TrustedRegs &= StateIn.TrustedRegs; -for (unsigned I = 0; I < LastInstWritingReg.size(); ++I) - for (const MCInst *J : StateIn.LastInstWritingReg[I]) -LastInstWritingReg[I].insert(J); +for (auto [ThisSet, OtherSet] : + llvm::zip_equal(LastInstWritingReg, StateIn.LastInstWritingReg)) + ThisSet.insert_range(OtherSet); return *this; } @@ -224,11 +224,9 @@ struct SrcState { static void printInstsShort(raw_ostream &OS, ArrayRef Insts) { OS << "Insts: "; - for (unsigned I = 0; I < Insts.size(); ++I) { -auto &Set = Insts[I]; + for (auto [I, PtrSet] : llvm::enumerate(Insts)) { OS << "[" << I << "]("; -for (const MCInst *MCInstP : Set) - OS << MCInstP << " "; +interleave(PtrSet, OS, " "); OS << ")"; } } @@ -416,8 +414,9 @@ class SrcSafetyAnalysis { // ... an address can be updated in a safe manner, producing the result // which is as trusted as the input address. if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) { - if (Cur.SafeToDerefRegs[DstAndSrc->second]) -Regs.push_back(DstAndSrc->first); + auto [DstReg, SrcReg] = *DstAndSrc; + if (Cur.SafeToDerefRegs[SrcReg]) +Regs.push_back(DstReg); } // Make sure explicit checker sequence keeps register safe-to-dereference @@ -469,8 +468,9 @@ class SrcSafetyAnalysis { // ... an address can be updated in a safe manner, producing the result // which is as trusted as the input address. 
if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) { - if (Cur.TrustedRegs[DstAndSrc->second]) -Regs.push_back(DstAndSrc->first); + auto [DstReg, SrcReg] = *DstAndSrc; + if (Cur.TrustedRegs[SrcReg]) +Regs.push_back(DstReg); } return Regs; @@ -845,9 +845,9 @@ struct DstState { return (*this = StateIn); CannotEscapeUnchecked &= StateIn.CannotEscapeUnchecked; -for (unsigned I = 0; I < FirstInstLeakingReg.size(); ++I) - for (const MCInst *J : StateIn.FirstInstLeakingReg[I]) -FirstInstLeakingReg[I].insert(J); +for (auto [ThisSet, OtherSet] : + llvm::zip_equal(FirstInstLeakingReg, StateIn.FirstInstLeakingReg)) + ThisSet.insert_range(OtherSet); return *this; } @@ -1012,8 +1012,7 @@ class DstSafetyAnalysis { // ... an address can be updated in a safe manner, or if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Inst)) { - MCPhysReg DstReg, SrcReg; - std::tie(DstReg, SrcReg) = *DstAndSrc; + auto [DstReg, SrcReg] = *DstAndSrc; // Note that *all* registers containing the derived values must be safe, // both source and destination ones. No temporaries are supported at now. if (Cur.CannotEscapeUnchecked[SrcReg] && @@ -1052,7 +1051,7 @@ class DstSafetyAnalysis { // If this instruction terminates the program immediately, no // authentication oracles are possible past this point. if (BC.MIB->isTrap(Point)) { - LLVM_DEBUG({ traceInst(BC, "Trap instruction found", Point); }); + LLVM_DEBUG(traceInst(BC, "Trap instruction found", Point));
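The ADT idioms this cleanup switches to are easy to demonstrate outside BOLT; the toy program below (not part of the patch) shows the three main replacements: llvm::enumerate for index loops, llvm::zip_equal for lockstep iteration, and llvm::interleave for separator-joined printing:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

int main() {
  std::vector<int> Regs = {3, 1, 4};
  // llvm::enumerate replaces a manual "for (unsigned I = 0; ...)" index loop
  // and pairs naturally with structured bindings.
  for (auto [Index, Reg] : llvm::enumerate(Regs))
    llvm::outs() << "RegToIndexMapping[" << Reg << "] = " << Index << "\n";

  std::vector<int> A = {1, 2, 3}, B = {10, 20, 30};
  // llvm::zip_equal walks two equally sized ranges in lockstep (asserting
  // that the sizes match), replacing an index-based merge loop.
  for (auto [X, Y] : llvm::zip_equal(A, B))
    X += Y; // X references an element of A, so A becomes {11, 22, 33}

  // llvm::interleave writes elements separated by a delimiter, as in the
  // printInstsShort() change above.
  llvm::interleave(A, llvm::outs(), " ");
  llvm::outs() << "\n";
}
```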
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
atrosinenko wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/141665
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#141665** 👈 (View in Graphite)
* **#139778**
* **#138884**
* **#138883**
* **#138655**
* **#137975**
* **#137224**
* **#136183**
* **#136151**
* **#135663**
* **#136147**
* **#135662**
* **#135661**
* **#134146**
* **#133461**
* **#135073**
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/141665 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (PR #141665)
https://github.com/atrosinenko ready_for_review https://github.com/llvm/llvm-project/pull/141665 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Gadget scanner: do not crash on debug-printing CFI instructions (PR #136151)
https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/136151 >From 25fda06fe3c11cd52ee67e0bbd42b6f8dc44921d Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 15 Apr 2025 21:47:18 +0300 Subject: [PATCH] [BOLT] Gadget scanner: do not crash on debug-printing CFI instructions Some instruction-printing code used under LLVM_DEBUG does not handle CFI instructions well. While CFI instructions seem to be harmless for the correctness of the analysis results, they do not convey any useful information to the analysis either, so skip them early. --- bolt/lib/Passes/PAuthGadgetScanner.cpp| 16 ++ .../AArch64/gs-pauth-debug-output.s | 32 +++ 2 files changed, 48 insertions(+) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 6a130e71c842e..c419ff74992a7 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -430,6 +430,9 @@ class SrcSafetyAnalysis { } SrcState computeNext(const MCInst &Point, const SrcState &Cur) { +if (BC.MIB->isCFI(Point)) + return Cur; + SrcStatePrinter P(BC); LLVM_DEBUG({ dbgs() << " SrcSafetyAnalysis::ComputeNext("; @@ -704,6 +707,8 @@ class CFGUnawareSrcSafetyAnalysis : public SrcSafetyAnalysis, SrcState S = createEntryState(); for (auto &I : BF.instrs()) { MCInst &Inst = I.second; + if (BC.MIB->isCFI(Inst)) +continue; // If there is a label before this instruction, it is possible that it // can be jumped-to, thus conservatively resetting S. As an exception, @@ -985,6 +990,9 @@ class DstSafetyAnalysis { } DstState computeNext(const MCInst &Point, const DstState &Cur) { +if (BC.MIB->isCFI(Point)) + return Cur; + DstStatePrinter P(BC); LLVM_DEBUG({ dbgs() << " DstSafetyAnalysis::ComputeNext("; @@ -1156,6 +1164,8 @@ class CFGUnawareDstSafetyAnalysis : public DstSafetyAnalysis, DstState S = createUnsafeState(); for (auto &I : llvm::reverse(BF.instrs())) { MCInst &Inst = I.second; + if (BC.MIB->isCFI(Inst)) +continue; // If Inst can change the control flow, we cannot be sure that the next // instruction (to be executed in analyzed program) is the one processed @@ -1346,6 +1356,9 @@ void FunctionAnalysisContext::findUnsafeUses( }); iterateOverInstrs(BF, [&](MCInstReference Inst) { +if (BC.MIB->isCFI(Inst)) + return; + const SrcState &S = Analysis->getStateBefore(Inst); // If non-empty state was never propagated from the entry basic block @@ -1409,6 +1422,9 @@ void FunctionAnalysisContext::findUnsafeDefs( }); iterateOverInstrs(BF, [&](MCInstReference Inst) { +if (BC.MIB->isCFI(Inst)) + return; + const DstState &S = Analysis->getStateAfter(Inst); if (auto Report = shouldReportAuthOracle(BC, Inst, S)) diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index 61aa84377b88e..5aec945621987 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -329,6 +329,38 @@ auth_oracle: // PAUTH-EMPTY: // PAUTH-NEXT: Attaching leakage info to: : autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// Gadget scanner should not crash on CFI instructions, including when debug-printing them. +// Note that the particular debug output is not checked, but BOLT should be +// compiled with assertions enabled to support -debug-only argument. 
+ +.globl cfi_inst_df +.type cfi_inst_df,@function +cfi_inst_df: +.cfi_startproc +sub sp, sp, #16 +.cfi_def_cfa_offset 16 +add sp, sp, #16 +.cfi_def_cfa_offset 0 +ret +.size cfi_inst_df, .-cfi_inst_df +.cfi_endproc + +.globl cfi_inst_nocfg +.type cfi_inst_nocfg,@function +cfi_inst_nocfg: +.cfi_startproc +sub sp, sp, #16 +.cfi_def_cfa_offset 16 + +adr x0, 1f +br x0 +1: +add sp, sp, #16 +.cfi_def_cfa_offset 0 +ret +.size cfi_inst_nocfg, .-cfi_inst_nocfg +.cfi_endproc + // CHECK-LABEL:Analyzing function main, AllocatorId = 1 .globl main .type main,@function ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [RISCV][Driver] Add riscv emulation mode to linker job of BareMetal toolchain (PR #134442)
@@ -534,8 +534,18 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-Bstatic"); - if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax)) -CmdArgs.push_back("--no-relax"); + if (Triple.isRISCV()) { +CmdArgs.push_back("-X"); +if (Args.hasArg(options::OPT_mno_relax)) + CmdArgs.push_back("--no-relax"); +if (const char *LDMOption = getLDMOption(TC.getTriple(), Args)) { + CmdArgs.push_back("-m"); + CmdArgs.push_back(LDMOption); +} else { + D.Diag(diag::err_target_unknown_triple) << Triple.str(); + return; +} quic-garvgupt wrote: Done in the latest patchset https://github.com/llvm/llvm-project/pull/134442 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
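For readers following the driver change: the `-m <emulation>` value comes from a `getLDMOption` helper. A rough sketch of the RISC-V part is below; it is an illustration only (simplified signature, and the helper in the patch also consults `Args`), with `elf32lriscv`/`elf64lriscv` being the GNU ld emulation names conventionally used for little-endian RISC-V:

```cpp
#include "llvm/TargetParser/Triple.h"

// Hypothetical, simplified lookup of the linker emulation for RISC-V
// baremetal targets; the real helper takes the driver Args as well.
static const char *getLDMOptionForRISCV(const llvm::Triple &T) {
  switch (T.getArch()) {
  case llvm::Triple::riscv32:
    return "elf32lriscv"; // 32-bit little-endian RISC-V emulation
  case llvm::Triple::riscv64:
    return "elf64lriscv"; // 64-bit little-endian RISC-V emulation
  default:
    return nullptr;       // caller reports err_target_unknown_triple
  }
}
```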
[llvm-branch-commits] [llvm] [X86] Add atomic vector tests for unaligned >1 sizes. (PR #120387)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120387 >From 7b4708f1ddcd76bd8ba94b0c85317e86bab36ef7 Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:40:32 -0500 Subject: [PATCH] [X86] Add atomic vector tests for unaligned >1 sizes. Unaligned atomic vectors with size >1 are lowered to calls. Adding their tests separately here. commit-id:a06a5cc6 --- llvm/test/CodeGen/X86/atomic-load-store.ll | 588 + 1 file changed, 588 insertions(+) diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 9fab8b98b4af0..3e7b73a65fe07 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -270,6 +270,82 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { ret <1 x i64> %ret } +define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_ptr: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AVX-O3-NEXT:popq %rcx +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_ptr: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:pushq %rax +; CHECK-O0-NEXT:movq %rdi, %rsi +; CHECK-O0-NEXT:movl $8, %edi +; CHECK-O0-NEXT:movq %rsp, %rdx +; CHECK-O0-NEXT:movl $2, %ecx +; CHECK-O0-NEXT:callq __atomic_load@PLT +; CHECK-O0-NEXT:movq (%rsp), %rax +; CHECK-O0-NEXT:popq %rcx +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:pushq %rax +; CHECK-SSE-O0-NEXT:movq %rdi, %rsi +; CHECK-SSE-O0-NEXT:movl $8, %edi +; CHECK-SSE-O0-NEXT:movq %rsp, %rdx +; CHECK-SSE-O0-NEXT:movl $2, %ecx +; CHECK-SSE-O0-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O0-NEXT:movq (%rsp), %rax +; CHECK-SSE-O0-NEXT:popq %rcx +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:pushq %rax +; CHECK-AVX-O0-NEXT:movq %rdi, %rsi +; CHECK-AVX-O0-NEXT:movl $8, %edi +; CHECK-AVX-O0-NEXT:movq %rsp, %rdx +; CHECK-AVX-O0-NEXT:movl $2, %ecx +; CHECK-AVX-O0-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O0-NEXT:movq (%rsp), %rax +; CHECK-AVX-O0-NEXT:popq %rcx +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 + ret <1 x ptr> %ret +} + define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_half: ; CHECK-O3: # %bb.0: @@ -386,3 +462,515 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { %ret = load atomic <1 x double>, ptr %x acquire, align 8 ret <1 x double> %ret } + +define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_i64: +; 
CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_i64: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_i64: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AV
[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432 >From 7ba3e69a759f59bf746cb14640ea8ea426fa09fd Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 31 Jan 2025 13:12:56 -0500 Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic vector. After splitting, all elements are created. The two components must be found by looking at the upper and lower half of the value. This change extends EltsFromConsecutiveLoads to understand AtomicSDNode so that unused elements can be removed. commit-id:b83937a8 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 61 +++ 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 87b6914f8a0ee..40550d96a5b3d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1873,7 +1873,7 @@ class SelectionDAG { /// chain to the token factor. This ensures that the new memory node will have /// the same relative memory dependency position as the old load. Returns the /// new merged load chain. - SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp); + SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp); /// Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e6a7d092b7b79..1c1445f9f44b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12236,7 +12236,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain, return TokenFactor; } -SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, +SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp) { assert(isa(NewMemOp.getNode()) && "Expected a memop node"); SDValue OldChain = SDValue(OldLoad, 1); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1fc50a36fda72..5ce8d83feb0dd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { - if (ISD::isNON_EXTLoad(Elt.getNode())) { -auto *BaseLd = cast(Elt); -if (!BaseLd->isSimple()) - return false; +static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) { + if (auto *BaseLd = dyn_cast(Elt)) { Ld = BaseLd; ByteOffset = 0; return true; - } + } else if (auto *BaseLd = dyn_cast(Elt)) +if (ISD::isNON_EXTLoad(Elt.getNode())) { + if (!BaseLd->isSimple()) +return false; + Ld = BaseLd; + ByteOffset = 0; + return true; +} switch (Elt.getOpcode()) { case ISD::BITCAST: @@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, APInt ZeroMask = APInt::getZero(NumElems); APInt UndefMask = APInt::getZero(NumElems); - SmallVector Loads(NumElems, nullptr); + SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an @@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); - LoadSDNode *LDBase = Loads[FirstLoadedElt]; + MemSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; @@ -7318,15 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. - auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { -LoadSDNode *Ld = Loads[EltIdx]; + auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) { +MemSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } -return DAG.areNo
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #120716)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120716 >From ac4069c8fa8e69173b203824f4db5fbd73ecb5a4 Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 20 Dec 2024 06:14:28 -0500 Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector AtomicExpand fails for aligned `load atomic ` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. commit-id:f430c1af --- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/Target/X86/X86InstrCompiler.td | 5 + llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 llvm/test/CodeGen/X86/atomic-load-store.ll| 94 +++ .../X86/expand-atomic-non-integer.ll | 263 -- 6 files changed, 360 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 406baa4f5fdaa..3b8a34ca0eb51 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1904,6 +1904,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index c376de877ac7d..70f59eafc6ecb 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -2066,9 +2066,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( I->replaceAllUsesWith(V); } else if (HasResult) { Value *V; -if (UseSizedLibcall) - V = Builder.CreateBitOrPointerCast(Result, I->getType()); -else { +if (UseSizedLibcall) { + // Add bitcasts from Result's scalar type to I's vector type + auto *PtrTy = dyn_cast(I->getType()->getScalarType()); + auto *VTy = dyn_cast(I->getType()); + if (VTy && PtrTy && !Result->getType()->isVectorTy()) { +unsigned AS = PtrTy->getAddressSpace(); +Value *BC = Builder.CreateBitCast( +Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS))); +V = Builder.CreateIntToPtr(BC, I->getType()); + } else +V = Builder.CreateBitOrPointerCast(Result, I->getType()); +} else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 26b76dd1ca83a..3143015b7ec66 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1211,6 +1211,11 @@ def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <2 x i64> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <4 x i32> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 560dfde356c29..eaa2ffd9b2731 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { store atomic double %val1, ptr %ptr seq_cst, align 8 ret void } + +define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { +; ARM-LABEL: atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT:ldr r0, [r0] +; ARM-NEXT:dmb ish +; ARM-NEXT:bx lr +; +; ARMOPTNONE-LABEL: atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT:ldr r0, [r0] +; ARMOPTNONE-NEXT:dmb ish +; ARMOPTNONE-NEXT:bx lr +; +; THUMBTWO-LABEL: atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT:ldr r0, [r0] +; THUMBTWO-NEXT:dmb ish +; THUMBTWO-NEXT:bx lr +; +; THUMBONE-LABEL: atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT:push {r7, lr} +; THUMBONE-NEXT:movs r1, #0 +; THUMBONE-NEXT:mov r2, r1 +; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT:
[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #138635)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/138635 >From 109bc6009d93645b42e0be8fbf858368770f49e7 Mon Sep 17 00:00:00 2001 From: jofernau_amdeng Date: Tue, 6 May 2025 01:48:11 -0400 Subject: [PATCH] [X86] Remove extra MOV after widening atomic load This change adds patterns to optimize out an extra MOV present after widening the atomic load. It also casts floats to ints in an atomic load during AtomicExpand to support 128 bit vectors in SSE/AVX. commit-id:45989503 --- llvm/lib/Target/X86/X86ISelLowering.cpp| 7 + llvm/lib/Target/X86/X86ISelLowering.h | 2 + llvm/lib/Target/X86/X86InstrCompiler.td| 7 + llvm/test/CodeGen/X86/atomic-load-store.ll | 249 +++-- 4 files changed, 102 insertions(+), 163 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a203c84630fe1..1fc50a36fda72 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32070,6 +32070,13 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } } +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const { + if (LI->getType()->getScalarType()->isFloatingPointTy()) +return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; +} + LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 359f24768b3da..14b79e8d726f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1838,6 +1838,8 @@ namespace llvm { shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const; +TargetLoweringBase::AtomicExpansionKind +shouldCastAtomicLoadInIR(LoadInst *LI) const override; void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 927b2c8b22f05..26b76dd1ca83a 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1204,6 +1204,13 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), (MOV16rm addr:$src)>; def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>; def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>; +def : Pat<(v4i32 (scalar_to_vector (i32 (zext (i16 (atomic_load_16 addr:$src)), + (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8> +def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, + (MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16> +def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, + (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index ff5391f44bbe3..535a87316e162 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -207,19 +207,19 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_bfloat: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT:movzwl (%rdi), %eax -; CHECK-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O3-NEXT:movd %eax, %xmm0 ; CHECK-O3-NEXT:retq ; ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT:movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT:retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT:retq ; ; CHECK-O0-LABEL: atomic_vec1_bfloat: @@ -227,8 +227,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-O0-NEXT:movw (%rdi), %cx ; CHECK-O0-NEXT:# implicit-def: $eax ; CHECK-O0-NEXT:movw %cx, %ax -; CHECK-O0-NEXT:# implicit-def: $xmm0 -; CHECK-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O0-NEXT:movd %eax, %xmm0 ; CHECK-O0-NEXT:retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat: @@ -236,8 +235,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx ; CHECK-SSE-O0-NEXT:# implicit-def: $eax ; CHECK-SSE-O0-NEXT:movw %cx, %ax -; CHECK-SSE-O0-NEXT:# implicit-def: $xm
[llvm-branch-commits] [llvm] [SelectionDAG][X86] Remove unused elements from atomic vector. (PR #125432)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/125432 >From 7ba3e69a759f59bf746cb14640ea8ea426fa09fd Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 31 Jan 2025 13:12:56 -0500 Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic vector. After splitting, all elements are created. The two components must be found by looking at the upper and lower half of the value. This change extends EltsFromConsecutiveLoads to understand AtomicSDNode so that unused elements can be removed. commit-id:b83937a8 --- llvm/include/llvm/CodeGen/SelectionDAG.h | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 61 +++ 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 87b6914f8a0ee..40550d96a5b3d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1873,7 +1873,7 @@ class SelectionDAG { /// chain to the token factor. This ensures that the new memory node will have /// the same relative memory dependency position as the old load. Returns the /// new merged load chain. - SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp); + SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp); /// Topological-sort the AllNodes list and a /// assign a unique node id for each node in the DAG based on their diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e6a7d092b7b79..1c1445f9f44b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -12236,7 +12236,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain, return TokenFactor; } -SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, +SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp) { assert(isa(NewMemOp.getNode()) && "Expected a memop node"); SDValue OldChain = SDValue(OldLoad, 1); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1fc50a36fda72..5ce8d83feb0dd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { - if (ISD::isNON_EXTLoad(Elt.getNode())) { -auto *BaseLd = cast(Elt); -if (!BaseLd->isSimple()) - return false; +static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) { + if (auto *BaseLd = dyn_cast(Elt)) { Ld = BaseLd; ByteOffset = 0; return true; - } + } else if (auto *BaseLd = dyn_cast(Elt)) +if (ISD::isNON_EXTLoad(Elt.getNode())) { + if (!BaseLd->isSimple()) +return false; + Ld = BaseLd; + ByteOffset = 0; + return true; +} switch (Elt.getOpcode()) { case ISD::BITCAST: @@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, APInt ZeroMask = APInt::getZero(NumElems); APInt UndefMask = APInt::getZero(NumElems); - SmallVector Loads(NumElems, nullptr); + SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an @@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); - LoadSDNode *LDBase = Loads[FirstLoadedElt]; + MemSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; @@ -7318,15 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. - auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { -LoadSDNode *Ld = Loads[EltIdx]; + auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) { +MemSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } -return DAG.areNo
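To sketch the situation this combine targets (a hypothetical example, not one of the tests in the patch), consider an atomic vector load where only one element is actually used. With `EltsFromConsecutiveLoads` now accepting `AtomicSDNode` via `MemSDNode`, the unused element no longer has to be materialized after the load is split:

```llvm
; Only element 0 of the atomically loaded pair is used; with this change the
; unused upper element is a candidate for removal during DAG combining.
define float @atomic_vec2_float_elt0(ptr %p) {
  %v = load atomic <2 x float>, ptr %p acquire, align 8
  %e = extractelement <2 x float> %v, i64 0
  ret float %e
}
```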
[llvm-branch-commits] [llvm] [SelectionDAG] Widen <2 x T> vector types for atomic load (PR #120598)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120598 >From 39f039e23b95204affb61d1ac004c562f08222c5 Mon Sep 17 00:00:00 2001 From: jofrn Date: Thu, 19 Dec 2024 11:19:39 -0500 Subject: [PATCH] [SelectionDAG] Widen <2 x T> vector types for atomic load Vector types of 2 elements must be widened. This change does this for vector types of atomic load in SelectionDAG so that it can translate aligned vectors of >1 size. commit-id:2894ccd1 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 97 -- llvm/test/CodeGen/X86/atomic-load-store.ll| 286 ++ 3 files changed, 361 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d24b4517a460d..b6e018ba0e454 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -1068,6 +1068,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); + SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d6cbf2211f053..42763aab5bb55 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4622,6 +4622,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: +Res = WidenVecRes_ATOMIC_LOAD(cast(N)); +break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; case ISD::STEP_VECTOR: case ISD::SPLAT_VECTOR: @@ -6003,6 +6006,74 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { N->getOperand(1), N->getOperand(2)); } +/// Either return the same load or provide appropriate casts +/// from the load and return that. 
+static SDValue coerceLoadedValue(SDValue LdOp, EVT FirstVT, EVT WidenVT, + TypeSize LdWidth, TypeSize FirstVTWidth, + SDLoc dl, SelectionDAG &DAG) { + assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth)); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + if (!FirstVT.isVector()) { +unsigned NumElts = +WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); +EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), FirstVT, NumElts); +SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); +return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); + } + assert(FirstVT == WidenVT); + return LdOp; +} + +static std::optional findMemType(SelectionDAG &DAG, + const TargetLowering &TLI, unsigned Width, + EVT WidenVT, unsigned Align, + unsigned WidenEx); + +SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *LD) { + EVT WidenVT = + TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0)); + EVT LdVT = LD->getMemoryVT(); + SDLoc dl(LD); + assert(LdVT.isVector() && WidenVT.isVector() && "Expected vectors"); + assert(LdVT.isScalableVector() == WidenVT.isScalableVector() && + "Must be scalable"); + assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType() && + "Expected equivalent element types"); + + // Load information + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + TypeSize LdWidth = LdVT.getSizeInBits(); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + TypeSize WidthDiff = WidenWidth - LdWidth; + + // Find the vector type that can load from. + std::optional FirstVT = + findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, /*LdAlign=*/0, + WidthDiff.getKnownMinValue()); + + if (!FirstVT) +return SDValue(); + + SmallVector MemVTs; + TypeSize FirstVTWidth = FirstVT->getSizeInBits(); + + SDValue LdOp = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, *FirstVT, *FirstVT, + Chain, BasePtr, LD->getMemOperand()); + + // Load the element with one instruction. + SDValue Result = coerceLoadedValue(LdOp, *FirstVT, WidenVT, LdWidth, + FirstVTWidth, dl, DAG); + + // Modified the chain - switch anything that used the old chain to use + // the new one. +
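For reference, the shape of IR this widening path handles looks like the following (the function name is invented; the series' own tests use the same pattern): a 2-element atomic load whose result type is widened, loaded with one suitably sized scalar access, and coerced back to the widened vector type by `coerceLoadedValue`.

```llvm
; A <2 x i32> atomic load: the result type widens to v4i32, the 64 bits are
; loaded atomically in one instruction, and the value is bitcast back to the
; widened vector type.
define <2 x i32> @atomic_vec2_i32_sketch(ptr %p) {
  %v = load atomic <2 x i32>, ptr %p acquire, align 8
  ret <2 x i32> %v
}
```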
[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #120640)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120640 >From f91634795e4dbfd6e081f0b096b872411051ff0f Mon Sep 17 00:00:00 2001 From: jofrn Date: Thu, 19 Dec 2024 16:25:55 -0500 Subject: [PATCH] [SelectionDAG] Split vector types for atomic load Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat,half. commit-id:3a045357 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 37 llvm/test/CodeGen/X86/atomic-load-store.ll| 86 +++ 3 files changed, 124 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b6e018ba0e454..b15f70cbec1cd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -965,6 +965,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 42763aab5bb55..af48a5e216803 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1161,6 +1161,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_STEP_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; + case ISD::ATOMIC_LOAD: +SplitVecRes_ATOMIC_LOAD(cast(N), Lo, Hi); +break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); break; @@ -1414,6 +1417,40 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SetSplitVector(SDValue(N, ResNo), Lo, Hi); } +void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Extended load during type legalization!"); + SDLoc dl(LD); + EVT VT = LD->getValueType(0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + EVT MemIntVT = + EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits()); + SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch, + Ptr, LD->getMemOperand()); + + EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); + EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); + SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD); + SDValue ExtractHi = + DAG.getNode(ISD::SRL, dl, IntVT, ALD, + DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl)); + ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi); + + Lo = DAG.getBitcast(LoVT, ExtractLo); + Hi = DAG.getBitcast(HiVT, ExtractHi); + + // Legalize the chain result - switch anything that used the old chain 
to + // use the new one. + ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1)); +} + void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI, SDValue &Ptr, uint64_t *ScaledOffset) { diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 535a87316e162..039edcbf83544 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -374,6 +374,74 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { ret <2 x float> %ret } +define <2 x half> @atomic_vec2_half(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_half: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT:retq +; +
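The new `atomic_vec2_half` test in this hunk exercises exactly the split path described above. A standalone sketch of that case (function name changed to avoid clashing with the test):

```llvm
; A <2 x half> atomic load: the value is loaded as one 32-bit integer
; ATOMIC_LOAD, the two halves are recovered with TRUNCATE and SRL, and each
; half is bitcast to its split subvector type.
define <2 x half> @atomic_vec2_half_example(ptr %p) {
  %v = load atomic <2 x half>, ptr %p acquire, align 4
  ret <2 x half> %v
}
```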
[llvm-branch-commits] [llvm] [SelectionDAG] Legalize <1 x T> vector types for atomic load (PR #120385)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120385 >From 85e5dc5e42cfe0f2f4875cb4db990f92b68295ed Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:37:17 -0500 Subject: [PATCH] [SelectionDAG] Legalize <1 x T> vector types for atomic load `load atomic <1 x T>` is not valid. This change legalizes vector types of atomic load via scalarization in SelectionDAG so that it can, for example, translate from `v1i32` to `i32`. commit-id:5c36cc8c --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 15 ++ llvm/test/CodeGen/X86/atomic-load-store.ll| 250 +- 3 files changed, 257 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index dd9af47da5287..d24b4517a460d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -879,6 +879,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); + SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 4d844f0036a75..d6cbf2211f053 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -65,6 +65,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { R = ScalarizeVecRes_UnaryOpWithExtraInput(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: +R = ScalarizeVecRes_ATOMIC_LOAD(cast(N)); +break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; @@ -455,6 +458,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { return Op; } +SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) { + SDValue Result = DAG.getAtomicLoad( + ISD::NON_EXTLOAD, SDLoc(N), N->getMemoryVT().getVectorElementType(), + N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(), + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); + return Result; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 45277ce3d26c4..4f5cb5a4e9247 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64| FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64| FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-6
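A minimal sketch of the scalarization case this patch handles (same shape as the `atomic_vec1_*` tests it updates; the function name here is made up):

```llvm
; A <1 x i32> atomic load is scalarized: the legalizer emits a plain i32 atomic
; load, rewires the chain to the new node, and the result is selected like any
; scalar acquire load.
define <1 x i32> @atomic_vec1_i32_sketch(ptr %p) {
  %v = load atomic <1 x i32>, ptr %p acquire, align 4
  ret <1 x i32> %v
}
```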
[llvm-branch-commits] [llvm] [X86] Manage atomic load of fp -> int promotion in DAG (PR #120386)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120386 >From 87d478c96fa8e64c4a5035c467cc800d0c55df2c Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:38:23 -0500 Subject: [PATCH] [X86] Manage atomic load of fp -> int promotion in DAG When lowering atomic <1 x T> vector types with floats, selection can fail since this pattern is unsupported. To support this, floats can be casted to an integer type of the same size. commit-id:f9d761c5 --- llvm/lib/Target/X86/X86ISelLowering.cpp| 4 + llvm/test/CodeGen/X86/atomic-load-store.ll | 117 + 2 files changed, 121 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 99a82cab384aa..a203c84630fe1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2653,6 +2653,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(Op, MVT::f32, Promote); } + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32); + setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64); + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 4f5cb5a4e9247..9fab8b98b4af0 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -269,3 +269,120 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { %ret = load atomic <1 x i64>, ptr %x acquire, align 8 ret <1 x i64> %ret } + +define <1 x half> @atomic_vec1_half(ptr %x) { +; CHECK-O3-LABEL: atomic_vec1_half: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movzwl (%rdi), %eax +; CHECK-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax +; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax +; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_half: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:movw (%rdi), %cx +; CHECK-O0-NEXT:# implicit-def: $eax +; CHECK-O0-NEXT:movw %cx, %ax +; CHECK-O0-NEXT:# implicit-def: $xmm0 +; CHECK-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:movw (%rdi), %cx +; CHECK-SSE-O0-NEXT:# implicit-def: $eax +; CHECK-SSE-O0-NEXT:movw %cx, %ax +; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:movw (%rdi), %cx +; CHECK-AVX-O0-NEXT:# implicit-def: $eax +; CHECK-AVX-O0-NEXT:movw %cx, %ax +; CHECK-AVX-O0-NEXT:# implicit-def: $xmm0 +; CHECK-AVX-O0-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x half>, ptr %x acquire, align 2 + ret <1 x half> %ret +} + +define <1 x float> @atomic_vec1_float(ptr %x) { +; CHECK-O3-LABEL: atomic_vec1_float: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_float: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_float: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_float: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_float: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_float: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x float>, ptr %x acquire, align 4 + ret <1 x float> %ret +} + +define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_double_align: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_double_align: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:movsd {{.*#+}} xmm0 = m
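A minimal sketch of what the promotion enables (it mirrors the `atomic_vec1_float` test added above, with the function renamed): with `ATOMIC_LOAD` of `f32` promoted to `i32`, the scalarized load can be selected instead of failing in instruction selection.

```llvm
; The <1 x float> load is scalarized to f32, the f32 ATOMIC_LOAD is promoted to
; i32, and selection succeeds (the CHECK lines above show a plain movss).
define <1 x float> @atomic_vec1_float_sketch(ptr %p) {
  %v = load atomic <1 x float>, ptr %p acquire, align 4
  ret <1 x float> %v
}
```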
[llvm-branch-commits] [clang] [UBSan] Support src:*=sanitize for multiple ignorelists. (PR #141640)
https://github.com/qinkunbao updated https://github.com/llvm/llvm-project/pull/141640 >From cddba024f55d52e30d9c74369b3707b5fce64a20 Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Tue, 27 May 2025 17:34:51 + Subject: [PATCH] Add some comments. Created using spr 1.3.6 --- clang/lib/Basic/NoSanitizeList.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Basic/NoSanitizeList.cpp b/clang/lib/Basic/NoSanitizeList.cpp index c58a67971dfb6..549bbda55e459 100644 --- a/clang/lib/Basic/NoSanitizeList.cpp +++ b/clang/lib/Basic/NoSanitizeList.cpp @@ -53,6 +53,8 @@ bool NoSanitizeList::containsFile(SanitizerMask Mask, StringRef FileName, // If we have two cases such as `src:a.cpp=sanitize` and `src:a.cpp`, the // current entry overrides the previous entry. if (SanLine > 0) +// std::pair uses lexicographic comparison. It will compare the file index +// first and then compare the line number. return std::make_pair(NoSanFileIdx, NoSanLine) > std::make_pair(SanFileIdx, SanLine); return true; ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [UBSan] Support src:*=sanitize for multiple ignorelists. (PR #141640)
https://github.com/qinkunbao edited https://github.com/llvm/llvm-project/pull/141640 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86] Add atomic vector tests for unaligned >1 sizes. (PR #120387)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120387 >From 7b4708f1ddcd76bd8ba94b0c85317e86bab36ef7 Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 18 Dec 2024 03:40:32 -0500 Subject: [PATCH] [X86] Add atomic vector tests for unaligned >1 sizes. Unaligned atomic vectors with size >1 are lowered to calls. Adding their tests separately here. commit-id:a06a5cc6 --- llvm/test/CodeGen/X86/atomic-load-store.ll | 588 + 1 file changed, 588 insertions(+) diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 9fab8b98b4af0..3e7b73a65fe07 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -270,6 +270,82 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { ret <1 x i64> %ret } +define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_ptr: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AVX-O3-NEXT:popq %rcx +; CHECK-AVX-O3-NEXT:retq +; +; CHECK-O0-LABEL: atomic_vec1_ptr: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT:pushq %rax +; CHECK-O0-NEXT:movq %rdi, %rsi +; CHECK-O0-NEXT:movl $8, %edi +; CHECK-O0-NEXT:movq %rsp, %rdx +; CHECK-O0-NEXT:movl $2, %ecx +; CHECK-O0-NEXT:callq __atomic_load@PLT +; CHECK-O0-NEXT:movq (%rsp), %rax +; CHECK-O0-NEXT:popq %rcx +; CHECK-O0-NEXT:retq +; +; CHECK-SSE-O0-LABEL: atomic_vec1_ptr: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT:pushq %rax +; CHECK-SSE-O0-NEXT:movq %rdi, %rsi +; CHECK-SSE-O0-NEXT:movl $8, %edi +; CHECK-SSE-O0-NEXT:movq %rsp, %rdx +; CHECK-SSE-O0-NEXT:movl $2, %ecx +; CHECK-SSE-O0-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O0-NEXT:movq (%rsp), %rax +; CHECK-SSE-O0-NEXT:popq %rcx +; CHECK-SSE-O0-NEXT:retq +; +; CHECK-AVX-O0-LABEL: atomic_vec1_ptr: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT:pushq %rax +; CHECK-AVX-O0-NEXT:movq %rdi, %rsi +; CHECK-AVX-O0-NEXT:movl $8, %edi +; CHECK-AVX-O0-NEXT:movq %rsp, %rdx +; CHECK-AVX-O0-NEXT:movl $2, %ecx +; CHECK-AVX-O0-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O0-NEXT:movq (%rsp), %rax +; CHECK-AVX-O0-NEXT:popq %rcx +; CHECK-AVX-O0-NEXT:retq + %ret = load atomic <1 x ptr>, ptr %x acquire, align 4 + ret <1 x ptr> %ret +} + define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-O3-LABEL: atomic_vec1_half: ; CHECK-O3: # %bb.0: @@ -386,3 +462,515 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind { %ret = load atomic <1 x double>, ptr %x acquire, align 8 ret <1 x double> %ret } + +define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec1_i64: +; 
CHECK-O3: # %bb.0: +; CHECK-O3-NEXT:pushq %rax +; CHECK-O3-NEXT:movq %rdi, %rsi +; CHECK-O3-NEXT:movq %rsp, %rdx +; CHECK-O3-NEXT:movl $8, %edi +; CHECK-O3-NEXT:movl $2, %ecx +; CHECK-O3-NEXT:callq __atomic_load@PLT +; CHECK-O3-NEXT:movq (%rsp), %rax +; CHECK-O3-NEXT:popq %rcx +; CHECK-O3-NEXT:retq +; +; CHECK-SSE-O3-LABEL: atomic_vec1_i64: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT:pushq %rax +; CHECK-SSE-O3-NEXT:movq %rdi, %rsi +; CHECK-SSE-O3-NEXT:movq %rsp, %rdx +; CHECK-SSE-O3-NEXT:movl $8, %edi +; CHECK-SSE-O3-NEXT:movl $2, %ecx +; CHECK-SSE-O3-NEXT:callq __atomic_load@PLT +; CHECK-SSE-O3-NEXT:movq (%rsp), %rax +; CHECK-SSE-O3-NEXT:popq %rcx +; CHECK-SSE-O3-NEXT:retq +; +; CHECK-AVX-O3-LABEL: atomic_vec1_i64: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT:pushq %rax +; CHECK-AVX-O3-NEXT:movq %rdi, %rsi +; CHECK-AVX-O3-NEXT:movq %rsp, %rdx +; CHECK-AVX-O3-NEXT:movl $8, %edi +; CHECK-AVX-O3-NEXT:movl $2, %ecx +; CHECK-AVX-O3-NEXT:callq __atomic_load@PLT +; CHECK-AVX-O3-NEXT:movq (%rsp), %rax +; CHECK-AV
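A minimal sketch of the under-aligned case these tests cover (hypothetical function name; the `atomic_vec1_i64` and `atomic_vec1_ptr` tests above use the same pattern): an 8-byte atomic load with only 4-byte alignment cannot be lowered inline, so it is expanded to a `__atomic_load` libcall, matching the CHECK lines above.

```llvm
; Under-aligned 8-byte atomic load: lowered to a __atomic_load call rather than
; a single instruction.
define <1 x i64> @atomic_vec1_i64_underaligned(ptr %p) {
  %v = load atomic <1 x i64>, ptr %p acquire, align 4
  ret <1 x i64> %v
}
```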
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #120716)
https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120716 >From ac4069c8fa8e69173b203824f4db5fbd73ecb5a4 Mon Sep 17 00:00:00 2001 From: jofrn Date: Fri, 20 Dec 2024 06:14:28 -0500 Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector AtomicExpand fails for aligned `load atomic ` because it does not find a compatible library call. This change adds appropriate bitcasts so that the call can be lowered. It also adds support for 128 bit lowering in tablegen to support SSE/AVX. commit-id:f430c1af --- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 15 +- llvm/lib/Target/X86/X86InstrCompiler.td | 5 + llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 llvm/test/CodeGen/X86/atomic-load-store.ll| 94 +++ .../X86/expand-atomic-non-integer.ll | 263 -- 6 files changed, 360 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 406baa4f5fdaa..3b8a34ca0eb51 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1904,6 +1904,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index c376de877ac7d..70f59eafc6ecb 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -2066,9 +2066,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall( I->replaceAllUsesWith(V); } else if (HasResult) { Value *V; -if (UseSizedLibcall) - V = Builder.CreateBitOrPointerCast(Result, I->getType()); -else { +if (UseSizedLibcall) { + // Add bitcasts from Result's scalar type to I's vector type + auto *PtrTy = dyn_cast(I->getType()->getScalarType()); + auto *VTy = dyn_cast(I->getType()); + if (VTy && PtrTy && !Result->getType()->isVectorTy()) { +unsigned AS = PtrTy->getAddressSpace(); +Value *BC = Builder.CreateBitCast( +Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS))); +V = Builder.CreateIntToPtr(BC, I->getType()); + } else +V = Builder.CreateBitOrPointerCast(Result, I->getType()); +} else { V = Builder.CreateAlignedLoad(I->getType(), AllocaResult, AllocaAlignment); Builder.CreateLifetimeEnd(AllocaResult, SizeVal64); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 26b76dd1ca83a..3143015b7ec66 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1211,6 +1211,11 @@ def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src, def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src, (MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <2 x i64> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>; // load atomic <4 x i32> + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 560dfde356c29..eaa2ffd9b2731 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { store atomic double %val1, ptr %ptr seq_cst, align 8 ret void } + +define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 { +; ARM-LABEL: atomic_vec1_ptr: +; ARM: @ %bb.0: +; ARM-NEXT:ldr r0, [r0] +; ARM-NEXT:dmb ish +; ARM-NEXT:bx lr +; +; ARMOPTNONE-LABEL: atomic_vec1_ptr: +; ARMOPTNONE: @ %bb.0: +; ARMOPTNONE-NEXT:ldr r0, [r0] +; ARMOPTNONE-NEXT:dmb ish +; ARMOPTNONE-NEXT:bx lr +; +; THUMBTWO-LABEL: atomic_vec1_ptr: +; THUMBTWO: @ %bb.0: +; THUMBTWO-NEXT:ldr r0, [r0] +; THUMBTWO-NEXT:dmb ish +; THUMBTWO-NEXT:bx lr +; +; THUMBONE-LABEL: atomic_vec1_ptr: +; THUMBONE: @ %bb.0: +; THUMBONE-NEXT:push {r7, lr} +; THUMBONE-NEXT:movs r1, #0 +; THUMBONE-NEXT:mov r2, r1 +; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4 +; THUMBONE-NEXT:
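As a rough sketch of what this patch aims at (my reading of it; the function name is invented and this is not one of its tests): a naturally aligned 16-byte atomic vector load can now be selected directly through the new `atomic_load_128_v2i64` pattern, while cases that still go through the sized `__atomic_load_16` libcall get the extra bitcasts in AtomicExpand so the integer result can be converted back to the vector (or vector-of-pointer) type.

```llvm
; Aligned 16-byte atomic vector load; with the new 128-bit patterns this can map
; to a single vmovapd on targets where that load is atomic.
define <2 x i64> @atomic_vec2_i64_aligned(ptr %p) {
  %v = load atomic <2 x i64>, ptr %p acquire, align 16
  ret <2 x i64> %v
}
```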