[llvm-branch-commits] [mlir] a11869c - Revert "[mlir] Make remove-dead-values pass remove blocks arguments first (#1…"

2025-11-06 Thread via llvm-branch-commits

Author: lonely eagle
Date: 2025-11-06T18:46:46+08:00
New Revision: a11869ccf13d99f3559a4b244dfe2c8593db783a

URL: https://github.com/llvm/llvm-project/commit/a11869ccf13d99f3559a4b244dfe2c8593db783a
DIFF: https://github.com/llvm/llvm-project/commit/a11869ccf13d99f3559a4b244dfe2c8593db783a.diff

LOG: Revert "[mlir] Make remove-dead-values pass remove blocks arguments first (#1…"

This reverts commit a928c61961004cc94c4cb37bc4c414f1537e7660.

Added: 


Modified: 
mlir/lib/Transforms/RemoveDeadValues.cpp
mlir/test/Transforms/remove-dead-values.mlir

Removed: 




diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 979b3965e4ba9..41f3f9d76a3b1 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -742,25 +742,7 @@ static void processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la,
 static void cleanUpDeadVals(RDVFinalCleanupList &list) {
   LDBG() << "Starting cleanup of dead values...";
 
-  // 1. Blocks
-  LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists";
-  for (auto &b : list.blocks) {
-// blocks that are accessed via multiple codepaths processed once
-if (b.b->getNumArguments() != b.nonLiveArgs.size())
-  continue;
-LDBG() << "Erasing " << b.nonLiveArgs.count()
-   << " non-live arguments from block: " << b.b;
-// it iterates backwards because erase invalidates all successor indexes
-for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
-  if (!b.nonLiveArgs[i])
-continue;
-  LDBG() << "  Erasing block argument " << i << ": " << b.b->getArgument(i);
-  b.b->getArgument(i).dropAllUses();
-  b.b->eraseArgument(i);
-}
-  }
-
-  // 2. Operations
+  // 1. Operations
   LDBG() << "Cleaning up " << list.operations.size() << " operations";
   for (auto &op : list.operations) {
 LDBG() << "Erasing operation: "
@@ -769,14 +751,14 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
 op->erase();
   }
 
-  // 3. Values
+  // 2. Values
   LDBG() << "Cleaning up " << list.values.size() << " values";
   for (auto &v : list.values) {
 LDBG() << "Dropping all uses of value: " << v;
 v.dropAllUses();
   }
 
-  // 4. Functions
+  // 3. Functions
   LDBG() << "Cleaning up " << list.functions.size() << " functions";
   // Record which function arguments were erased so we can shrink call-site
   // argument segments for CallOpInterface operations (e.g. ops using
@@ -798,7 +780,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
 (void)f.funcOp.eraseResults(f.nonLiveRets);
   }
 
-  // 5. Operands
+  // 4. Operands
   LDBG() << "Cleaning up " << list.operands.size() << " operand lists";
   for (OperationToCleanup &o : list.operands) {
// Handle call-specific cleanup only when we have a cached callee reference.
@@ -840,7 +822,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
 }
   }
 
-  // 6. Results
+  // 5. Results
   LDBG() << "Cleaning up " << list.results.size() << " result lists";
   for (auto &r : list.results) {
 LDBG() << "Erasing " << r.nonLive.count()
@@ -849,6 +831,24 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
 dropUsesAndEraseResults(r.op, r.nonLive);
   }
 
+  // 6. Blocks
+  LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists";
+  for (auto &b : list.blocks) {
+// blocks that are accessed via multiple codepaths processed once
+if (b.b->getNumArguments() != b.nonLiveArgs.size())
+  continue;
+LDBG() << "Erasing " << b.nonLiveArgs.count()
+   << " non-live arguments from block: " << b.b;
+// it iterates backwards because erase invalidates all successor indexes
+for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
+  if (!b.nonLiveArgs[i])
+continue;
+  LDBG() << "  Erasing block argument " << i << ": " << b.b->getArgument(i);
+  b.b->getArgument(i).dropAllUses();
+  b.b->eraseArgument(i);
+}
+  }
+
   // 7. Successor Operands
   LDBG() << "Cleaning up " << list.successorOperands.size()
  << " successor operand lists";

diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir
index 8b5ccdcf204dd..e7304505c809e 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -674,18 +674,3 @@ func.func @dead_value_loop_ivs_no_result(%lb: index, %ub: index, %step: index, %
   }
   return
 }
-
-// -----
-
-// CHECK-LABEL: func @op_block_have_dead_arg
-func.func @op_block_have_dead_arg(%arg0: index, %arg1: index, %arg2: index, %arg3: i1) {
-  scf.for %iv = %arg0 to %arg1 step %arg2 {
-scf.execute_region {
-  cf.cond_br %arg3, ^bb1(%arg0 : index), ^bb1(%arg1 : index)
-^bb1(%0: index):
-scf.yield
-}
-  }
-// CHECK-NEXT: return
-  r
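
The cleanup loop quoted in both hunks erases non-live block arguments from the highest index down, so each erase only shifts positions that have already been visited. A minimal standalone C++ sketch of that indexing hazard, using a plain std::vector and a boolean mask rather than the MLIR block/argument API:

```cpp
#include <cassert>
#include <vector>

// Erase the flagged elements without invalidating the indices still to be
// visited: iterating from the back means each erase only moves elements at
// higher positions, which have already been handled.
void eraseNonLive(std::vector<int> &args, const std::vector<bool> &nonLive) {
  assert(args.size() == nonLive.size());
  for (int i = static_cast<int>(nonLive.size()) - 1; i >= 0; --i)
    if (nonLive[i])
      args.erase(args.begin() + i);
}
```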

[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)

2025-11-06 Thread Juan Manuel Martinez Caamaño via llvm-branch-commits


@@ -5330,11 +5330,13 @@ static uint32_t 
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_U32:
 return std::numeric_limits::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_SUB_F32_e64: // +0.0

jmmartinez wrote:

I haven't thought about this, but why do we take `-0.0` if the reduction is a sub and `+0.0` if it is an add? Does it come from any specification?

https://github.com/llvm/llvm-project/pull/161815
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)

2025-11-06 Thread Juan Manuel Martinez Caamaño via llvm-branch-commits


@@ -5330,11 +5330,13 @@ static uint32_t 
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_U32:
 return std::numeric_limits::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_SUB_F32_e64: // +0.0

jmmartinez wrote:

This doesn't seem right.

Isn't `0b1000` the opposite, -0.0?

I'd feel reassured if you used a bitcast instead of a comment (comments tend to
diverge from the code eventually): `__builtin_bit_cast(uint32_t, +0.0f)`.

https://github.com/llvm/llvm-project/pull/161815
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
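
Both comments above turn on the IEEE-754 encodings of the two zeros: they differ only in the sign bit, so 0x00000000 is +0.0 and 0x80000000 is -0.0, and under round-to-nearest adding -0.0 or subtracting +0.0 leaves every value unchanged bit-for-bit, including both zeros. That is the usual argument for taking -0.0 as the fadd identity and +0.0 as the fsub identity. A small C++20 sketch of the bit patterns, using std::bit_cast rather than the __builtin spelling mentioned in the review:

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // The two zeros differ only in the sign bit.
  std::printf("+0.0f bits: 0x%08x\n", std::bit_cast<std::uint32_t>(+0.0f)); // 0x00000000
  std::printf("-0.0f bits: 0x%08x\n", std::bit_cast<std::uint32_t>(-0.0f)); // 0x80000000

  // Sign-of-zero is invisible to operator==, so compare bit patterns instead:
  // adding -0.0 preserves -0.0 exactly, while adding +0.0 turns it into +0.0.
  float negZero = -0.0f;
  std::printf("-0.0 + -0.0 -> 0x%08x\n", std::bit_cast<std::uint32_t>(negZero + -0.0f)); // 0x80000000
  std::printf("-0.0 + +0.0 -> 0x%08x\n", std::bit_cast<std::uint32_t>(negZero + +0.0f)); // 0x00000000
  return 0;
}
```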


[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)

2025-11-06 Thread Juan Manuel Martinez Caamaño via llvm-branch-commits

https://github.com/jmmartinez approved this pull request.


https://github.com/llvm/llvm-project/pull/161816
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoongArch] Initial implementation for `enableMemCmpExpansion` hook (PR #166526)

2025-11-06 Thread via llvm-branch-commits

zhaoqi5 wrote:

> How does this optimization affect the benchmark? For example 
> `llvm-test-suite/MicroBenchmarks/MemFunctions`. Add some test results to help 
> with comparison and code review. It might be necessary to test the data under 
> various conditions, including different vector widths and optimization levels 
> (e.g., O2 or Os).

Okay, I will try to test some benchmarks such as test-suite or spec cpu and add 
the results later. Thanks.

https://github.com/llvm/llvm-project/pull/166526
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)

2025-11-06 Thread via llvm-branch-commits

https://github.com/easyonaadit updated 
https://github.com/llvm/llvm-project/pull/161816

>From 0e9bcce2647a3adc91bc049dfc5761cbeefa19b1 Mon Sep 17 00:00:00 2001
From: Aaditya 
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp  | 8 
 2 files changed, 12 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..ebc0ac35f42d9 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", 
"nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
 
 
//===--===//
 // R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c54985a..4de722077c8e9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -301,18 +301,22 @@ static Intrinsic::ID 
getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
 llvm_unreachable("Unknown BuiltinID for wave reduction");
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
 return Intrinsic::amdgcn_wave_reduce_add;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
 return Intrinsic::amdgcn_wave_reduce_sub;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
 return Intrinsic::amdgcn_wave_reduce_min;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
 return Intrinsic::amdgcn_wave_reduce_umin;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
 return Intrinsic::amdgcn_wave_reduce_max;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -335,11 +339,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   llvm::SyncScope::ID SSID;
   switch (BuiltinID) {
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)

2025-11-06 Thread via llvm-branch-commits

https://github.com/skc7 edited https://github.com/llvm/llvm-project/pull/166731
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-exec-sync pass in pipeline (PR #165746)

2025-11-06 Thread via llvm-branch-commits

https://github.com/skc7 edited https://github.com/llvm/llvm-project/pull/165746
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)

2025-11-06 Thread via llvm-branch-commits

https://github.com/skc7 ready_for_review 
https://github.com/llvm/llvm-project/pull/166731
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)

2025-11-06 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)


Changes

This PR removes the named-barrier LDS lowering from the `amdgpu-lower-module-lds`
pass, since it is now handled by the `amdgpu-lower-exec-sync` pass.

This PR is the 3rd one in the stack.
PR1 : https://github.com/llvm/llvm-project/pull/165692
PR2 : https://github.com/llvm/llvm-project/pull/165746
-> PR3 : https://github.com/llvm/llvm-project/pull/166731

---
Full diff: https://github.com/llvm/llvm-project/pull/166731.diff


1 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (-126) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
 return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
-  NeedsReplacement = true;
-  break;
-}
-  }
-}
-if (!NeedsReplacement)
-  return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
-  U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
-  }
-}
-return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-  Module &M, LDSUsesInfoTy &LDSUsesInfo,
-  VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-  GlobalVariable *GV = K.first;
-  if (!isNamedBarrier(*GV))
-continue;
-  // give a module-absolute assignment if it is indirectly accessed by
-  // multiple kernels. This is not precise, but we don't want to duplicate
-  // a function when it is called by multiple kernels.
-  if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
-  } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-  }
-  LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
-  unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-  unsigned BarId = NumAbsolutes + 1;
-  unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-  NumAbsolutes += BarCnt;
-
-  // 4 bits for alignment, 5 bits for the barrier num,
-  // 3 bits for the barrier scope
-  unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-  recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
-  Function *F = K.first;
-  assert(isKernelLDS(F));
-  OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
-  for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
-  continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
-  // already assigned
-  continue;
-}
-OrderedGVs.push_back(GV);
-  }
-  OrderedGVs = sortByName(std::move(OrderedGVs));
-  for (GlobalVariable *GV : OrderedGVs) {
-// GV could also be used directly by other kernels. If so, we need to
-// create a new GV used only by this kernel and its function.
-auto NewGV = uniquifyGVPerKernel(M, GV, F);
-Changed |= (NewGV != GV);
-unsigned BarrierScope
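
The absolute-address formula in the removed code packs three fields into one constant, per its own comment: bits [3:0] carry alignment, bits [8:4] the barrier id, and bits [11:9] the barrier scope, OR'ed into the 0x802000 base. A small sketch of that packing (helper names here are illustrative, not from the pass):

```cpp
#include <cstdint>

// Mirror of `Offset = 0x802000u | BarrierScope << 9 | BarId << 4` from the
// removed lowerSpecialLDSVariables; the field widths follow the in-code
// comment (4 bits alignment, 5 bits barrier id, 3 bits scope).
constexpr std::uint32_t encodeNamedBarrier(std::uint32_t scope, std::uint32_t barId) {
  return 0x802000u | (scope << 9) | (barId << 4);
}

constexpr std::uint32_t barrierIdOf(std::uint32_t offset) { return (offset >> 4) & 0x1f; }
constexpr std::uint32_t barrierScopeOf(std::uint32_t offset) { return (offset >> 9) & 0x7; }

static_assert(barrierIdOf(encodeNamedBarrier(/*scope=*/0, /*barId=*/1)) == 1);
static_assert(barrierScopeOf(encodeNamedBarrier(/*scope=*/2, /*barId=*/3)) == 2);
```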

[llvm-branch-commits] [clang] [NFC][HIP] Add __builtin_*_load_lds type check test cases (PR #165388)

2025-11-06 Thread Rana Pratap Reddy via llvm-branch-commits

https://github.com/ranapratap55 approved this pull request.


https://github.com/llvm/llvm-project/pull/165388
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [HIP][AMDGPU] Remove 't' from all __builtin_*_load_lds builtins (PR #165389)

2025-11-06 Thread Rana Pratap Reddy via llvm-branch-commits

https://github.com/ranapratap55 approved this pull request.


https://github.com/llvm/llvm-project/pull/165389
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)

2025-11-06 Thread Robert Imschweiler via llvm-branch-commits

ro-i wrote:

wait, does github now finally have the reopen feature

https://github.com/llvm/llvm-project/pull/133907
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)

2025-11-06 Thread Robert Imschweiler via llvm-branch-commits

https://github.com/ro-i reopened 
https://github.com/llvm/llvm-project/pull/133907
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoongArch] Initial implementation for `enableMemCmpExpansion` hook (PR #166526)

2025-11-06 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)


Changes

After overriding `TargetTransformInfo::enableMemCmpExpansion` in this commit, the `MergeICmps` and `ExpandMemCmp` passes will be enabled on LoongArch.

---

Patch is 220.23 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/166526.diff


5 Files Affected:

- (modified) llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp (+24-1) 
- (modified) llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h (+2-1) 
- (modified) llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll (+1619-527) 
- (modified) llvm/test/CodeGen/LoongArch/expandmemcmp.ll (+2594-715) 
- (modified) llvm/test/CodeGen/LoongArch/memcmp.ll (+18-9) 


``diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp 
b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index f548a8dd0532b..f6637ef58cf9c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -111,4 +111,27 @@ bool LoongArchTTIImpl::shouldExpandReduction(const 
IntrinsicInst *II) const {
   }
 }
 
-// TODO: Implement more hooks to provide TTI machinery for LoongArch.
+LoongArchTTIImpl::TTI::MemCmpExpansionOptions
+LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+
+  if (!ST->hasUAL())
+return Options;
+
+  // TODO: Set same as the default value of MaxLoadsPerMemcmp or
+  // MaxLoadsPerMemcmpOptSize. May need more consideration?
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  Options.NumLoadsPerBlock = Options.MaxNumLoads;
+  Options.AllowOverlappingLoads = true;
+
+  // TODO: Support for vectors.
+  if (ST->is64Bit()) {
+Options.LoadSizes = {8, 4, 2, 1};
+Options.AllowedTailExpansions = {3, 5, 6};
+  } else {
+Options.LoadSizes = {4, 2, 1};
+Options.AllowedTailExpansions = {3};
+  }
+
+  return Options;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h 
b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
index e3f16c7804994..9b479f9dc0dc5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
@@ -55,7 +55,8 @@ class LoongArchTTIImpl : public 
BasicTTIImplBase {
 
   bool shouldExpandReduction(const IntrinsicInst *II) const override;
 
-  // TODO: Implement more hooks to provide TTI machinery for LoongArch.
+  TTI::MemCmpExpansionOptions
+  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll 
b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
index 82fe899bb795b..a6ed1f1db1678 100644
--- a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
+++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
@@ -38,260 +38,488 @@ entry:
 }
 
 define i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_1:
-; LA32:   # %bb.0: # %entry
-; LA32-NEXT:addi.w $sp, $sp, -16
-; LA32-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:ori $a2, $zero, 1
-; LA32-NEXT:bl bcmp
-; LA32-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:addi.w $sp, $sp, 16
-; LA32-NEXT:ret
+; LA32-UAL-LABEL: bcmp_size_1:
+; LA32-UAL:   # %bb.0: # %entry
+; LA32-UAL-NEXT:ld.bu $a0, $a0, 0
+; LA32-UAL-NEXT:ld.bu $a1, $a1, 0
+; LA32-UAL-NEXT:xor $a0, $a0, $a1
+; LA32-UAL-NEXT:sltu $a0, $zero, $a0
+; LA32-UAL-NEXT:ret
 ;
-; LA64-LABEL: bcmp_size_1:
-; LA64:   # %bb.0: # %entry
-; LA64-NEXT:addi.d $sp, $sp, -16
-; LA64-NEXT:st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT:ori $a2, $zero, 1
-; LA64-NEXT:pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT:jirl $ra, $ra, 0
-; LA64-NEXT:ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT:addi.d $sp, $sp, 16
-; LA64-NEXT:ret
+; LA64-UAL-LABEL: bcmp_size_1:
+; LA64-UAL:   # %bb.0: # %entry
+; LA64-UAL-NEXT:ld.bu $a0, $a0, 0
+; LA64-UAL-NEXT:ld.bu $a1, $a1, 0
+; LA64-UAL-NEXT:xor $a0, $a0, $a1
+; LA64-UAL-NEXT:sltu $a0, $zero, $a0
+; LA64-UAL-NEXT:ret
+;
+; LA32-NUAL-LABEL: bcmp_size_1:
+; LA32-NUAL:   # %bb.0: # %entry
+; LA32-NUAL-NEXT:addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT:ori $a2, $zero, 1
+; LA32-NUAL-NEXT:bl bcmp
+; LA32-NUAL-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT:addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT:ret
+;
+; LA64-NUAL-LABEL: bcmp_size_1:
+; LA64-NUAL:   # %bb.0: # %entry
+; LA64-NUAL-NEXT:addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT:st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT:ori $a2, $zero, 1
+; LA64-NUAL-NEXT:pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT:jirl $ra, $ra, 0
+; LA64-NUAL-NEXT:ld.d $ra, $sp, 8 #

[llvm-branch-commits] [compiler-rt][sanitizers] Mark three tests as unsupported on Android (PR #166639)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166639


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [compiler-rt][sanitizers] Mark three tests as unsupported on Android (PR #166639)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166639


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)

2025-11-06 Thread Robert Imschweiler via llvm-branch-commits

https://github.com/ro-i closed https://github.com/llvm/llvm-project/pull/133907
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)

2025-11-06 Thread Robert Imschweiler via llvm-branch-commits

ro-i wrote:

Hm, somehow github decided to automatically close this PR after I just merged 
the PR down the stack. Will reopen

https://github.com/llvm/llvm-project/pull/133907
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-exec-sync pass in pipeline (PR #165746)

2025-11-06 Thread via llvm-branch-commits

https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/165746

>From ca4b858851a2b6c2a0e81fe6d48618332d18ca15 Mon Sep 17 00:00:00 2001
From: skc7 
Date: Thu, 30 Oct 2025 22:42:33 +0530
Subject: [PATCH 1/4] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline

---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp   | 126 --
 llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp  |   6 +
 llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp   |   3 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  14 ++
 ...amdgpu-lower-special-lds-and-module-lds.ll | 119 +
 .../amdgpu-lower-special-lds-and-sw-lds.ll|  86 
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   6 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll  |   5 +
 .../test/CodeGen/AMDGPU/s-barrier-lowering.ll |   2 +-
 9 files changed, 236 insertions(+), 131 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll
 create mode 100644 
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
 return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
-  NeedsReplacement = true;
-  break;
-}
-  }
-}
-if (!NeedsReplacement)
-  return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
-  U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
-  }
-}
-return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-  Module &M, LDSUsesInfoTy &LDSUsesInfo,
-  VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-  GlobalVariable *GV = K.first;
-  if (!isNamedBarrier(*GV))
-continue;
-  // give a module-absolute assignment if it is indirectly accessed by
-  // multiple kernels. This is not precise, but we don't want to duplicate
-  // a function when it is called by multiple kernels.
-  if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
-  } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-  }
-  LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
-  unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-  unsigned BarId = NumAbsolutes + 1;
-  unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-  NumAbsolutes += BarCnt;
-
-  // 4 bits for alignment, 5 bits for the barrier num,
-  // 3 bits for the barrier scope
-  unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-  recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
-  Function *F = K.first;
-  assert(isKernelLDS(F));
-  OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
-  for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
-  continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
-  // already assigned
-  

[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)

2025-11-06 Thread via llvm-branch-commits

https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/166731

This PR removes the named-barrier LDS lowering from the `amdgpu-lower-module-lds`
pass, since it is now handled by the `amdgpu-lower-exec-sync` pass.

>From 0a2e9ee17ea82a7cb3fe191626ee84b05c37be83 Mon Sep 17 00:00:00 2001
From: skc7 
Date: Thu, 6 Nov 2025 14:29:17 +0530
Subject: [PATCH] [AMDGPU] Remove named-barrier LDS lowering logic from
 amdgpu-lower-module-lds

---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp   | 126 --
 1 file changed, 126 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
 return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
-  NeedsReplacement = true;
-  break;
-}
-  }
-}
-if (!NeedsReplacement)
-  return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
-  if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
-  U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
-  }
-}
-return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-  Module &M, LDSUsesInfoTy &LDSUsesInfo,
-  VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-  GlobalVariable *GV = K.first;
-  if (!isNamedBarrier(*GV))
-continue;
-  // give a module-absolute assignment if it is indirectly accessed by
-  // multiple kernels. This is not precise, but we don't want to duplicate
-  // a function when it is called by multiple kernels.
-  if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
-  } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-  }
-  LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
-  unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-  unsigned BarId = NumAbsolutes + 1;
-  unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-  NumAbsolutes += BarCnt;
-
-  // 4 bits for alignment, 5 bits for the barrier num,
-  // 3 bits for the barrier scope
-  unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-  recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
-  Function *F = K.first;
-  assert(isKernelLDS(F));
-  OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
-  for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
-  continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
-  // already assigned
-  continue;
-}
-OrderedGVs.push_back(GV);
-  }
-  OrderedGVs = sortByName(std::move(OrderedGVs));
-  for (GlobalVariable *GV : OrderedGVs) {
-// GV could also be used directly by other kernels. If so, we need to
-// create a new GV used only by this kernel and its function.
-auto NewGV = uniquifyGVPerKernel(M, GV, F);
-Changed |= (NewGV != GV);
-unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-uns

[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)

2025-11-06 Thread via llvm-branch-commits

https://github.com/easyonaadit updated 
https://github.com/llvm/llvm-project/pull/161816

>From 62867d1bcdb3d8d0eba2b04a78f61f98b92e7de6 Mon Sep 17 00:00:00 2001
From: Aaditya 
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |  4 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp  |  8 ++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl  | 84 
 3 files changed, 96 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..ebc0ac35f42d9 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", 
"nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
 
 
//===--===//
 // R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c54985a..4de722077c8e9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -301,18 +301,22 @@ static Intrinsic::ID 
getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
 llvm_unreachable("Unknown BuiltinID for wave reduction");
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
 return Intrinsic::amdgcn_wave_reduce_add;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
 return Intrinsic::amdgcn_wave_reduce_sub;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
 return Intrinsic::amdgcn_wave_reduce_min;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
 return Intrinsic::amdgcn_wave_reduce_umin;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
 return Intrinsic::amdgcn_wave_reduce_max;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -335,11 +339,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
   llvm::SyncScope::ID SSID;
   switch (BuiltinID) {
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 039d03237b530..a8856ab56a55d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -412,6 +412,13 @@ void test_wave_reduce_add_u64_default(global int* out, 
long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_default(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_add_u32_iterative
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
 void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -426,6 +433,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, 
long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(

[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)

2025-11-06 Thread via llvm-branch-commits

https://github.com/easyonaadit updated 
https://github.com/llvm/llvm-project/pull/161815

>From 27c0f126455f8249b7eda83b5ef900bc6d07de52 Mon Sep 17 00:00:00 2001
From: Aaditya 
Date: Mon, 29 Sep 2025 18:58:10 +0530
Subject: [PATCH] [AMDGPU] Add wave reduce intrinsics for float types - 2

Supported Ops: `fadd`, `fsub`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  40 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   2 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 949 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 967 ++
 4 files changed, 1955 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2f1598e25a621..ced967b73cba5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5330,11 +5330,13 @@ static uint32_t 
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_U32:
 return std::numeric_limits::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_SUB_F32_e64: // +0.0
 return std::numeric_limits::min();
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::V_ADD_F32_e64: // -0.0
 return std::numeric_limits::min();
   case AMDGPU::S_AND_B32:
 return std::numeric_limits::max();
@@ -5382,11 +5384,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
  Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
  Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
  Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
- Opc == AMDGPU::V_MAX_F32_e64;
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+ Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
-  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
+  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5433,8 +5437,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr 
&MI,
 case AMDGPU::S_XOR_B64:
 case AMDGPU::S_ADD_I32:
 case AMDGPU::S_ADD_U64_PSEUDO:
+case AMDGPU::V_ADD_F32_e64:
 case AMDGPU::S_SUB_I32:
-case AMDGPU::S_SUB_U64_PSEUDO: {
+case AMDGPU::S_SUB_U64_PSEUDO:
+case AMDGPU::V_SUB_F32_e64: {
   const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
   const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
   Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5589,6 +5595,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr 
&MI,
 .addImm(AMDGPU::sub1);
 break;
   }
+  case AMDGPU::V_ADD_F32_e64:
+  case AMDGPU::V_SUB_F32_e64: {
+Register ActiveLanesVreg =
+MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+// Get number of active lanes as a float val.
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+ActiveLanesVreg)
+.addReg(NewAccumulator->getOperand(0).getReg())
+.addImm(0)  // clamp
+.addImm(0); // output-modifier
+
+// Take negation of input for SUB reduction
+unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
+.addImm(srcMod) // src0 modifier
+.addReg(SrcReg)
+.addImm(0) // src1 modifier
+.addReg(ActiveLanesVreg)
+.addImm(0)  // clamp
+.addImm(0); // output-mod
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+.addReg(DstVreg);
+  }
   }
   RetBB = &BB;
 }
@@ -5833,10 +5863,14 @@ 
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstruc
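
For the fadd/fsub cases, the block above converts the active-lane count to a float and multiplies it by the source operand, which only makes sense when the input is wave-uniform: summing the same value v over n active lanes is n * v, and the fsub reduction is the same product with v negated (the source-modifier bit on the V_MUL_F32). A scalar model of that intent, not the ISA sequence itself:

```cpp
#include <cstdint>

// Scalar model of the uniform-input path quoted above: every active lane is
// assumed to contribute the same value v, so the fadd reduction is n * v and
// the fsub reduction negates the operand first (the srcMod on V_MUL_F32_e64).
float waveReduceFAddUniform(float v, std::uint32_t activeLanes) {
  return static_cast<float>(activeLanes) * v;
}

float waveReduceFSubUniform(float v, std::uint32_t activeLanes) {
  return static_cast<float>(activeLanes) * -v;
}
```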

[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)

2025-11-06 Thread via llvm-branch-commits

https://github.com/easyonaadit updated 
https://github.com/llvm/llvm-project/pull/161815

>From 27c0f126455f8249b7eda83b5ef900bc6d07de52 Mon Sep 17 00:00:00 2001
From: Aaditya 
Date: Mon, 29 Sep 2025 18:58:10 +0530
Subject: [PATCH] [AMDGPU] Add wave reduce intrinsics for float types - 2

Supported Ops: `fadd`, `fsub`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  40 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td  |   2 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll  | 949 +
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll  | 967 ++
 4 files changed, 1955 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2f1598e25a621..ced967b73cba5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5330,11 +5330,13 @@ static uint32_t 
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   case AMDGPU::S_MAX_U32:
 return std::numeric_limits::min();
   case AMDGPU::S_MAX_I32:
+  case AMDGPU::V_SUB_F32_e64: // +0.0
 return std::numeric_limits::min();
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_OR_B32:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::V_ADD_F32_e64: // -0.0
 return std::numeric_limits::min();
   case AMDGPU::S_AND_B32:
 return std::numeric_limits::max();
@@ -5382,11 +5384,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
  Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
  Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
  Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
- Opc == AMDGPU::V_MAX_F32_e64;
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+ Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
-  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
+  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
 }
 
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5433,8 +5437,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr 
&MI,
 case AMDGPU::S_XOR_B64:
 case AMDGPU::S_ADD_I32:
 case AMDGPU::S_ADD_U64_PSEUDO:
+case AMDGPU::V_ADD_F32_e64:
 case AMDGPU::S_SUB_I32:
-case AMDGPU::S_SUB_U64_PSEUDO: {
+case AMDGPU::S_SUB_U64_PSEUDO:
+case AMDGPU::V_SUB_F32_e64: {
   const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
   const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
   Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5589,6 +5595,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr 
&MI,
 .addImm(AMDGPU::sub1);
 break;
   }
+  case AMDGPU::V_ADD_F32_e64:
+  case AMDGPU::V_SUB_F32_e64: {
+Register ActiveLanesVreg =
+MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+// Get number of active lanes as a float val.
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+ActiveLanesVreg)
+.addReg(NewAccumulator->getOperand(0).getReg())
+.addImm(0)  // clamp
+.addImm(0); // output-modifier
+
+// Take negation of input for SUB reduction
+unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
+.addImm(srcMod) // src0 modifier
+.addReg(SrcReg)
+.addImm(0) // src1 modifier
+.addReg(ActiveLanesVreg)
+.addImm(0)  // clamp
+.addImm(0); // output-mod
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+.addReg(DstVreg);
+  }
   }
   RetBB = &BB;
 }
@@ -5833,10 +5863,14 @@ 
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
   case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
   case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstruc

[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread David Spickett via llvm-branch-commits

https://github.com/DavidSpickett edited 
https://github.com/llvm/llvm-project/pull/166605
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][vector] Simplify createReadOrMaskedRead (PR #163736)

2025-11-06 Thread Andrzej Warzyński via llvm-branch-commits

https://github.com/banach-space closed 
https://github.com/llvm/llvm-project/pull/163736
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)

2025-11-06 Thread via llvm-branch-commits

https://github.com/llvmbot milestoned 
https://github.com/llvm/llvm-project/pull/166783
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)

2025-11-06 Thread via llvm-branch-commits

https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/166783

Backport ff11b93bb8f5578c9eb7296160570ea001a1155f

Requested by: @topperc

>From c343ce6d630b0c5819fbe50fec76de0408789112 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Thu, 6 Nov 2025 07:09:52 -0800
Subject: [PATCH] [RISCV] Correct the CFA offsets for stack probing. (#166616)

We need to take into account that we may have already done a FirstSPAdjust.

Fixes #164805.

(cherry picked from commit ff11b93bb8f5578c9eb7296160570ea001a1155f)
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 6 --
 llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll | 8 
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp 
b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da80b932..c4f41b8d8e4d9 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -768,6 +768,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
 
   // Unroll the probe loop depending on the number of iterations.
   if (Offset < ProbeSize * 5) {
+uint64_t CFAAdjust = RealStackSize - Offset;
+
 uint64_t CurrentOffset = 0;
 while (CurrentOffset + ProbeSize <= Offset) {
   RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
@@ -781,7 +783,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
 
   CurrentOffset += ProbeSize;
   if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(CurrentOffset);
+CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
 }
 
 uint64_t Residual = Offset - CurrentOffset;
@@ -789,7 +791,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
   RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
 StackOffset::getFixed(-Residual), Flag, getStackAlign());
   if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(Offset);
+CFIBuilder.buildDefCFAOffset(RealStackSize);
 
   if (DynAllocation) {
 // s[d|w] zero, 0(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll 
b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index d666832cf6e0b..c79fb0f91b21f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 {
 ; RV64I-NEXT:lui a0, 1
 ; RV64I-NEXT:sub sp, sp, a0
 ; RV64I-NEXT:sd zero, 0(sp)
-; RV64I-NEXT:.cfi_def_cfa_offset 4096
+; RV64I-NEXT:.cfi_def_cfa_offset 6128
 ; RV64I-NEXT:addi sp, sp, -48
-; RV64I-NEXT:.cfi_def_cfa_offset 4144
+; RV64I-NEXT:.cfi_def_cfa_offset 6176
 ; RV64I-NEXT:lui a0, 1
 ; RV64I-NEXT:add a0, sp, a0
 ; RV64I-NEXT:call callee_stack_args
@@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 {
 ; RV32I-NEXT:lui a0, 1
 ; RV32I-NEXT:sub sp, sp, a0
 ; RV32I-NEXT:sw zero, 0(sp)
-; RV32I-NEXT:.cfi_def_cfa_offset 4096
+; RV32I-NEXT:.cfi_def_cfa_offset 6128
 ; RV32I-NEXT:addi sp, sp, -80
-; RV32I-NEXT:.cfi_def_cfa_offset 4176
+; RV32I-NEXT:.cfi_def_cfa_offset 6208
 ; RV32I-NEXT:lui a0, 1
 ; RV32I-NEXT:addi a0, a0, 36
 ; RV32I-NEXT:add a0, sp, a0

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
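
The numbers in the updated test fall out of the new `CFAAdjust` term: in the RV64 `reserved_call_frame` checks above, the probe loop allocates 4096 bytes of a 6128-byte `RealStackSize` (the remaining 2032 bytes being the earlier first SP adjustment, which is inferred from the diff rather than stated in it), so the CFA offset after the probe becomes 4096 + 2032 = 6128 and the later `addi sp, sp, -48` brings it to 6176. A tiny check of that arithmetic:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Values read off the updated RV64 reserved_call_frame check lines; the
  // 2032-byte prior adjustment is an inference, not stated in the patch.
  const std::uint64_t RealStackSize = 6128; // frame size the CFA must cover here
  const std::uint64_t Offset = 4096;        // bytes allocated by the probe loop
  const std::uint64_t CFAAdjust = RealStackSize - Offset; // == 2032
  assert(Offset + CFAAdjust == 6128);  // .cfi_def_cfa_offset after the probe
  assert(RealStackSize + 48 == 6176);  // after the extra addi sp, sp, -48
  return 0;
}
```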


[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)

2025-11-06 Thread via llvm-branch-commits

llvmbot wrote:

@kito-cheng What do you think about merging this PR to the release branch?

https://github.com/llvm/llvm-project/pull/166783
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)

2025-11-06 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-risc-v

Author: None (llvmbot)


Changes

Backport ff11b93bb8f5578c9eb7296160570ea001a1155f

Requested by: @topperc

---
Full diff: https://github.com/llvm/llvm-project/pull/166783.diff


2 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVFrameLowering.cpp (+4-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll (+4-4) 


``diff
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp 
b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da80b932..c4f41b8d8e4d9 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -768,6 +768,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
 
   // Unroll the probe loop depending on the number of iterations.
   if (Offset < ProbeSize * 5) {
+uint64_t CFAAdjust = RealStackSize - Offset;
+
 uint64_t CurrentOffset = 0;
 while (CurrentOffset + ProbeSize <= Offset) {
   RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
@@ -781,7 +783,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
 
   CurrentOffset += ProbeSize;
   if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(CurrentOffset);
+CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
 }
 
 uint64_t Residual = Offset - CurrentOffset;
@@ -789,7 +791,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock 
&MBB,
   RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
 StackOffset::getFixed(-Residual), Flag, getStackAlign());
   if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(Offset);
+CFIBuilder.buildDefCFAOffset(RealStackSize);
 
   if (DynAllocation) {
 // s[d|w] zero, 0(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll 
b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index d666832cf6e0b..c79fb0f91b21f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 {
 ; RV64I-NEXT:lui a0, 1
 ; RV64I-NEXT:sub sp, sp, a0
 ; RV64I-NEXT:sd zero, 0(sp)
-; RV64I-NEXT:.cfi_def_cfa_offset 4096
+; RV64I-NEXT:.cfi_def_cfa_offset 6128
 ; RV64I-NEXT:addi sp, sp, -48
-; RV64I-NEXT:.cfi_def_cfa_offset 4144
+; RV64I-NEXT:.cfi_def_cfa_offset 6176
 ; RV64I-NEXT:lui a0, 1
 ; RV64I-NEXT:add a0, sp, a0
 ; RV64I-NEXT:call callee_stack_args
@@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 {
 ; RV32I-NEXT:lui a0, 1
 ; RV32I-NEXT:sub sp, sp, a0
 ; RV32I-NEXT:sw zero, 0(sp)
-; RV32I-NEXT:.cfi_def_cfa_offset 4096
+; RV32I-NEXT:.cfi_def_cfa_offset 6128
 ; RV32I-NEXT:addi sp, sp, -80
-; RV32I-NEXT:.cfi_def_cfa_offset 4176
+; RV32I-NEXT:.cfi_def_cfa_offset 6208
 ; RV32I-NEXT:lui a0, 1
 ; RV32I-NEXT:addi a0, a0, 36
 ; RV32I-NEXT:add a0, sp, a0

``




https://github.com/llvm/llvm-project/pull/166783
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166828

>From bc870644188ae13da4141efdf75eab0137ddcc30 Mon Sep 17 00:00:00 2001
From: Aiden Grossman 
Date: Thu, 6 Nov 2025 19:05:09 +
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20change?=
 =?UTF-8?q?s=20to=20main=20this=20commit=20is=20based=20on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7

[skip ci]
---
 llvm/include/llvm/Support/thread.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/thread.h 
b/llvm/include/llvm/Support/thread.h
index ecde62d8368e7..51873e7d529bf 100644
--- a/llvm/include/llvm/Support/thread.h
+++ b/llvm/include/llvm/Support/thread.h
@@ -34,7 +34,7 @@ typedef PVOID HANDLE;
 
 namespace llvm {
 
-#if LLVM_ON_UNIX || _WIN32
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
 
 /// LLVM thread following std::thread interface with added constructor to
 /// specify stack size.
@@ -49,7 +49,7 @@ class thread {
   }
 
 public:
-#if LLVM_ON_UNIX
+#ifdef LLVM_ON_UNIX
   using native_handle_type = pthread_t;
   using id = pthread_t;
   using start_routine_type = void *(*)(void *);
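
A side note on the `#if` change above (my reading; the patch itself does not state the motivation): `LLVM_ON_UNIX` is the kind of macro that is either defined to 1 or not defined at all, so `#if LLVM_ON_UNIX` and `#if defined(LLVM_ON_UNIX)` select the same code. The `defined()` form simply avoids evaluating an undefined identifier inside `#if`, which keeps builds quiet under warning flags such as `-Wundef`, presumably relevant to the clang-cl warning cleanup in this stack. A minimal stand-alone illustration, with `FEATURE_FOO` as a hypothetical stand-in:

``cpp
// Compile with e.g.: clang++ -Wundef -c feature_check.cpp
// FEATURE_FOO is a made-up stand-in for a macro like LLVM_ON_UNIX that is
// either defined to 1 or left undefined entirely.

#if FEATURE_FOO            // when undefined, evaluates to 0 and -Wundef warns
constexpr bool FeatureViaPlainIf = true;
#endif

#if defined(FEATURE_FOO)   // explicit test; no warning when it is undefined
constexpr bool FeatureViaDefined = true;
#endif

int main() { return 0; }
``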

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

boomanaiden154 wrote:

> This thing is a perennial PITA. I'm stamping to unblock but can you also 
> try https://github.com/wjakob/nanobind/pull/868.

Yeah, it looks like it. It is interesting to see a project that is generally against 
disabling warnings, but also against accepting patches to fix the warnings that do 
come up. I'll land this as is and see whether I can get `NB_SUPPRESS_WARNINGS` to 
eliminate the need for the custom compile options.

https://github.com/llvm/llvm-project/pull/166828
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166828


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [ConstantTime][RISCV] Add comprehensive tests for ct.select (PR #166708)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer updated 
https://github.com/llvm/llvm-project/pull/166708

>From 7aec58aa6f8029c514857a755b5a381e6a6b22af Mon Sep 17 00:00:00 2001
From: wizardengineer 
Date: Wed, 5 Nov 2025 11:01:00 -0500
Subject: [PATCH] [ConstantTime][RISCV] Add comprehensive tests for ct.select

Add comprehensive test suite for RISC-V fallback implementation:
- Edge cases (zero conditions, large integers, sign extension)
- Pattern matching (nested selects, chains)
- Vector support with RVV extensions
- Side effects and memory operations

The basic fallback test is in the core infrastructure PR.
---
 .../RISCV/ctselect-fallback-edge-cases.ll | 214 +
 .../RISCV/ctselect-fallback-patterns.ll   | 383 +
 .../RISCV/ctselect-fallback-vector-rvv.ll | 804 ++
 .../CodeGen/RISCV/ctselect-side-effects.ll| 176 
 4 files changed, 1577 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-side-effects.ll

diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll 
b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
new file mode 100644
index 0..af1be0c8f3ddc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; RV64-LABEL: test_ctselect_i1:
+; RV64:   # %bb.0:
+; RV64-NEXT:and a1, a0, a1
+; RV64-NEXT:xori a0, a0, 1
+; RV64-NEXT:and a0, a0, a2
+; RV64-NEXT:or a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_i1:
+; RV32:   # %bb.0:
+; RV32-NEXT:and a1, a0, a1
+; RV32-NEXT:xori a0, a0, 1
+; RV32-NEXT:and a0, a0, a2
+; RV32-NEXT:or a0, a1, a0
+; RV32-NEXT:ret
+  %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+  ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; RV64-LABEL: test_ctselect_extremal_values:
+; RV64:   # %bb.0:
+; RV64-NEXT:andi a0, a0, 1
+; RV64-NEXT:lui a1, 524288
+; RV64-NEXT:subw a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_extremal_values:
+; RV32:   # %bb.0:
+; RV32-NEXT:andi a0, a0, 1
+; RV32-NEXT:lui a1, 524288
+; RV32-NEXT:addi a2, a0, -1
+; RV32-NEXT:neg a0, a0
+; RV32-NEXT:and a1, a2, a1
+; RV32-NEXT:slli a0, a0, 1
+; RV32-NEXT:srli a0, a0, 1
+; RV32-NEXT:or a0, a0, a1
+; RV32-NEXT:ret
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 
-2147483648)
+  ret i32 %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; RV64-LABEL: test_ctselect_null_ptr:
+; RV64:   # %bb.0:
+; RV64-NEXT:slli a0, a0, 63
+; RV64-NEXT:srai a0, a0, 63
+; RV64-NEXT:and a0, a0, a1
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_null_ptr:
+; RV32:   # %bb.0:
+; RV32-NEXT:slli a0, a0, 31
+; RV32-NEXT:srai a0, a0, 31
+; RV32-NEXT:and a0, a0, a1
+; RV32-NEXT:ret
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+  ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; RV64-LABEL: test_ctselect_function_ptr:
+; RV64:   # %bb.0:
+; RV64-NEXT:andi a0, a0, 1
+; RV64-NEXT:neg a3, a0
+; RV64-NEXT:addi a0, a0, -1
+; RV64-NEXT:and a1, a3, a1
+; RV64-NEXT:and a0, a0, a2
+; RV64-NEXT:or a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_function_ptr:
+; RV32:   # %bb.0:
+; RV32-NEXT:andi a0, a0, 1
+; RV32-NEXT:neg a3, a0
+; RV32-NEXT:addi a0, a0, -1
+; RV32-NEXT:and a1, a3, a1
+; RV32-NEXT:and a0, a0, a2
+; RV32-NEXT:or a0, a1, a0
+; RV32-NEXT:ret
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+  ret ptr %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; RV64-LABEL: test_ctselect_ptr_cmp:
+; RV64:   # %bb.0:
+; RV64-NEXT:xor a0, a0, a1
+; RV64-NEXT:snez a0, a0
+; RV64-NEXT:addi a0, a0, -1
+; RV64-NEXT:and a2, a0, a2
+; RV64-NEXT:not a0, a0
+; RV64-NEXT:and a0, a0, a3
+; RV64-NEXT:or a0, a2, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_ptr_cmp:
+; RV32:   # %bb.0:
+; RV32-NEXT:xor a0, a0, a1
+; RV32-NEXT:snez a0, a0
+; RV32-NEXT:addi a0, a0, -1
+; RV32-NEXT:and a2, a0, a2
+; RV32-NEXT:not a0, a0
+; RV32-NEXT:  

[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

nhaehnle wrote:

> Have you considered the case where the instructions inside the bundle have 
> two uses of RegB, but only one of them is tied with RegA? I think it is 
> almost impossible to handle that optimally given only the summarised 
> information that you get from the operands of the BUNDLE. It might be worth 
> adding a test case like that, just to check that we don't crash and still 
> generate well formed MIR.
> 
> The fundamental question here is, can `processTiedPairs` really operate at 
> the BUNDLE level (and then fix up the instructions inside)? Or is it going to 
> have to operate on the individual instructions (and then fix up the summary 
> information on the BUNDLE)?

Yes, I thought about it. I did not find a good answer to what tied operands 
inside of a pre-RA bundle really mean *in the general case*. Bundles mean 
different things to different people. The main use of bundles outside of AMDGPU 
is for VLIW. In AMDGPU so far, it is used for memory clauses (which could 
potentially have tied operands for atomicrmw, but we only form them post-RA) 
and for a few niche cases like keeping S_GETPC_B64 together with its uses for 
PC-relative addressing.

What we're working towards here is a new pre-RA use case that can be vaguely 
described as "decomposing a single instruction into virtual micro-ops during 
codegen for the purpose of avoiding combinatorial explosion in opcodes etc.". 
For that use case, the requirements on tied-operand support will be fairly 
restricted, so I'd rather make this change more conservative and restrictive, and 
not attempt to support something that we don't actually use and don't know how to 
test properly. We can then build on that if and when we know what else is needed.

https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits


@@ -1665,6 +1665,22 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 // by SubRegB is compatible with RegA with no subregister. So regardless of
 // whether the dest oper writes a subreg, the source oper should not.
 MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+  for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+  if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);

nhaehnle wrote:

I played around with this, but it got very confusing even in the restricted use 
case that I described in the other comment. I'd prefer to keep this more 
restrictive for now; if we later find that supporting subregisters is beneficial, 
we can relax it separately.

https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166213

From 69ff9c0f0dd1af8333d4b160003d7f8a6eea61aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
 instructions

This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.

commit-id:4a30cb78
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   |  9 ++-
 3 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 713ef779588cf..3ff6da2b6dc63 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
   if (!NewMI)
 return false;
 
-  LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
-  LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
-  // If the old instruction is debug value tracked, an update is required.
-  if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
-   std::make_pair(NewInstrNum, NewIdx));
-  }
-
-  MBB->erase(mi); // Nuke the old inst.
-
   for (MachineInstr &MI : MIS)
 DistanceMap.insert(std::make_pair(&MI, Dist++));
-  Dist--;
+
+  if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+  } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+  assert(mi->getNumExplicitDefs() == 1);
+  assert(NewMI->getNumExplicitDefs() == 1);
+
+  // Find the old and new def location.
+  unsigned OldIdx = mi->defs().begin()->getOperandNo();
+  unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+  // Record that one def has been replaced by the other.
+  unsigned NewInstrNum = NewMI->getDebugInstrNum();
+  MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+  }
+
   mi = NewMI;
   nmi = std::next(mi);
 
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 
   bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
 
+  // Give targets a chance to convert bundled instructions.
+  bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
   // If the instruction is convertible to 3 Addr, instead
   // of returning try 3 Addr transformation aggressively and
   // use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
   //   addl %esi, %edi
   //   movl %edi, %eax
   //   ret
-  if (Commuted && !MI.isConvertibleTo3Addr())
+  if (Commuted && !ConvertibleTo3Addr)
 return false;
 
   if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 regBKilled = isKilled(MI, regB, true);
   }
 
-  if (MI.isConvertibleTo3Addr()) {
+  if (ConvertibleTo3Addr) {
 // This instruction is potentially convertible to a true
 // three-address instruction.  Check if it is profitable.
 if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
  LiveVariables *LV,
  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineInstr *CandidateMI = &MI;
+
+  if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i

[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166212

From b6bf0c47fd34efff8a4df14df69eb1f06785 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
 two-address-instructions pass

If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.

The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.

commit-id:6760a9b7
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 12 
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   | 57 +++
 2 files changed, 69 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..713ef779588cf 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,18 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 // by SubRegB is compatible with RegA with no subregister. So regardless of
 // whether the dest oper writes a subreg, the source oper should not.
 MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+  for (MachineOperand &MO : mi_bundle_ops(*MI)) {
+if (MO.isReg() && MO.getReg() == RegB) {
+  assert(
+  MO.getSubReg() == 0 &&
+  "tied subregister uses in bundled instructions not supported");
+  MO.setReg(RegA);
+}
+  }
+}
   }
 
   if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir 
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction 
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the 
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed 
%2(tied-def 0), implicit $mode, implicit $exec {
+  %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit 
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be 
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], 
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit 
%2(tied-def 0), implicit $mode, implicit $exec {
+  %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, i

[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Jay Foad via llvm-branch-commits


@@ -1665,6 +1665,22 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 // by SubRegB is compatible with RegA with no subregister. So regardless of
 // whether the dest oper writes a subreg, the source oper should not.
 MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+  for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+  if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);

jayfoad wrote:

Then maybe assert that `MO.getSubReg() == 0 && SubRegB == 0`? Otherwise this 
code will replace a use of one register with a use of another register of a 
different size.
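
A minimal sketch of the stricter check being suggested (placement and names are 
taken from the patch hunk quoted above, so treat this as illustrative rather than 
a drop-in change):

``cpp
// Sketch only: refuse (via assert) to rewrite the inner use when either the
// bundled use or the tied source operand carries a subregister index, since
// replacing RegB with RegA could otherwise change the width of the use.
if (MO.isReg() && MO.getReg() == RegB) {
  assert(MO.getSubReg() == 0 && SubRegB == 0 &&
         "tied subregister uses in bundled instructions not supported");
  MO.setReg(RegA);
}
``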

https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff origin/main HEAD --extensions cpp -- 
llvm/lib/CodeGen/TwoAddressInstructionPass.cpp --diff_from_common_commit
``

:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:





View the diff from clang-format here.


``diff
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 713ef7795..264e6c866 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1670,9 +1670,8 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 if (MI->isBundle()) {
   for (MachineOperand &MO : mi_bundle_ops(*MI)) {
 if (MO.isReg() && MO.getReg() == RegB) {
-  assert(
-  MO.getSubReg() == 0 &&
-  "tied subregister uses in bundled instructions not supported");
+  assert(MO.getSubReg() == 0 &&
+ "tied subregister uses in bundled instructions not 
supported");
   MO.setReg(RegA);
 }
   }

``




https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Jay Foad via llvm-branch-commits

https://github.com/jayfoad approved this pull request.

Seems OK on the understanding that it is slightly experimental, and after some 
more experience we may need to change things and/or nail down the exact rules 
for what cases are and are not supported.

https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166213

From 2ca173d4a9a8a59304a5915e7b46ce46ea5c0bf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
 instructions

This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.

commit-id:4a30cb78
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   |  9 ++-
 3 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
   if (!NewMI)
 return false;
 
-  LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
-  LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
-  // If the old instruction is debug value tracked, an update is required.
-  if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
-   std::make_pair(NewInstrNum, NewIdx));
-  }
-
-  MBB->erase(mi); // Nuke the old inst.
-
   for (MachineInstr &MI : MIS)
 DistanceMap.insert(std::make_pair(&MI, Dist++));
-  Dist--;
+
+  if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+  } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+  assert(mi->getNumExplicitDefs() == 1);
+  assert(NewMI->getNumExplicitDefs() == 1);
+
+  // Find the old and new def location.
+  unsigned OldIdx = mi->defs().begin()->getOperandNo();
+  unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+  // Record that one def has been replaced by the other.
+  unsigned NewInstrNum = NewMI->getDebugInstrNum();
+  MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+  }
+
   mi = NewMI;
   nmi = std::next(mi);
 
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 
   bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
 
+  // Give targets a chance to convert bundled instructions.
+  bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
   // If the instruction is convertible to 3 Addr, instead
   // of returning try 3 Addr transformation aggressively and
   // use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
   //   addl %esi, %edi
   //   movl %edi, %eax
   //   ret
-  if (Commuted && !MI.isConvertibleTo3Addr())
+  if (Commuted && !ConvertibleTo3Addr)
 return false;
 
   if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 regBKilled = isKilled(MI, regB, true);
   }
 
-  if (MI.isConvertibleTo3Addr()) {
+  if (ConvertibleTo3Addr) {
 // This instruction is potentially convertible to a true
 // three-address instruction.  Check if it is profitable.
 if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
  LiveVariables *LV,
  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineInstr *CandidateMI = &MI;
+
+  if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i

[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166212

From 1224dba5fcb35911c3e80f0a734394d2ce0cd640 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
 two-address-instructions pass

If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.

The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.

commit-id:6760a9b7
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   | 57 +++
 2 files changed, 73 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 // by SubRegB is compatible with RegA with no subregister. So regardless of
 // whether the dest oper writes a subreg, the source oper should not.
 MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+  for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+  if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+  }
+}
+  }
+}
   }
 
   if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir 
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction 
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the 
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed 
%2(tied-def 0), implicit $mode, implicit $exec {
+  %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit 
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be 
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], 
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit

[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166212

From 1224dba5fcb35911c3e80f0a734394d2ce0cd640 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
 two-address-instructions pass

If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.

The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
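
For readers unfamiliar with bundle internals, the update described above boils
down to the small rewrite sketched here (an assumed, free-standing helper, not
the patch itself); the real loop in the hunk below additionally asserts that no
subregister uses are involved.

```cpp
#include "llvm/CodeGen/MachineInstr.h"

// Sketch: rewrite every plain (non-subreg) use of RegB to RegA in the
// instructions bundled under the BUNDLE header.
static void replaceUsesInBundle(llvm::MachineInstr &Bundle,
                                llvm::Register RegB, llvm::Register RegA) {
  if (!Bundle.isBundle())
    return;
  for (llvm::MachineInstr *I = &Bundle; I->isBundledWithSucc();) {
    I = I->getNextNode(); // next instruction inside the bundle
    for (llvm::MachineOperand &MO : I->operands())
      if (MO.isReg() && MO.isUse() && MO.getReg() == RegB && !MO.getSubReg())
        MO.setReg(RegA);
  }
}
```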

commit-id:6760a9b7
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   | 57 +++
 2 files changed, 73 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void 
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
 // by SubRegB is compatible with RegA with no subregister. So regardless of
 // whether the dest oper writes a subreg, the source oper should not.
 MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+  for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+  if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+  }
+}
+  }
+}
   }
 
   if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir 
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction 
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the 
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed 
%2(tied-def 0), implicit $mode, implicit $exec {
+  %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit 
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be 
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+  bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit 
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]], 
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], 
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit

[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166605

>From 06c030dcb4ee57be287beffd96d1b21ef1697dd4 Mon Sep 17 00:00:00 2001
From: Aiden Grossman 
Date: Wed, 5 Nov 2025 18:23:46 +
Subject: [PATCH] fix

Created using spr 1.3.7
---
 .ci/premerge_advisor_explain.py | 34 -
 .ci/utils.sh| 10 +-
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 4d840a33c3cf2..1d487af9e9ec7 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -31,22 +31,11 @@ def get_comment_id(platform: str, pr: 
github.PullRequest.PullRequest) -> int | N
 def get_comment(
 github_token: str,
 pr_number: int,
-junit_objects,
-ninja_logs,
-advisor_response,
-return_code,
+body: str,
 ) -> dict[str, str]:
 repo = github.Github(github_token).get_repo("llvm/llvm-project")
 pr = repo.get_issue(pr_number).as_pull_request()
-comment = {
-"body": generate_test_report_lib.generate_report(
-generate_test_report_lib.compute_platform_title(),
-return_code,
-junit_objects,
-ninja_logs,
-failure_explanations_list=advisor_response,
-)
-}
+comment = {"body": body}
 comment_id = get_comment_id(platform.system(), pr)
 if comment_id:
 comment["id"] = comment_id
@@ -59,6 +48,14 @@ def main(
 pr_number: int,
 return_code: int,
 ):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:
+json.dump([comment], comment_file_handle)
 junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
 build_log_files
 )
@@ -90,10 +87,13 @@ def main(
 get_comment(
 github_token,
 pr_number,
-junit_objects,
-ninja_logs,
-advisor_response.json(),
-return_code,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),
+),
 )
 ]
 with open("comment", "w") as comment_file_handle:
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 72f4b04f5bf3a..91c27319f3534 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,18 +33,18 @@ function at-exit {
   # If building fails there will be no results files.
   shopt -s nullglob
 
-  if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+  if [[ "$GITHUB_ACTIONS" != "" ]]; then
 python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
   $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log 
\
   >> $GITHUB_STEP_SUMMARY
+python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+  $(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
+  $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+  "${MONOREPO_ROOT}"/ninja*.log
   fi
 
   if [[ "$retcode" != "0" ]]; then
 if [[ "$GITHUB_ACTIONS" != "" ]]; then
-  python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-$(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
-$GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
-"${MONOREPO_ROOT}"/ninja*.log
   python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
 $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
 "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log

___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [CI] Make premerge upload/write comments (PR #166609)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166609


___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148900

>From 429dfd75797bfb24e208266cab1fbc14fc79c717 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector

AtomicExpand fails for an aligned atomic vector load (e.g. `load atomic <1 x ptr>`) because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128 bit lowering in tablegen to support SSE/AVX.
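
The cast selection the change introduces can be summarised by the sketch below
(an assumed helper for illustration, not the pass code): once the atomic load
has been rewritten as an integer load of the same width, pointer-typed results
go through inttoptr, everything else through a plain bitcast.

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// Sketch: pick the cast used to turn the integer-typed atomic load result
// back into the original (possibly pointer or pointer-vector) type.
static llvm::Value *castLoadedValue(llvm::IRBuilderBase &Builder,
                                    llvm::Value *IntLoad, llvm::Type *OrigTy) {
  bool PointerLike =
      OrigTy->isPointerTy() ||
      (OrigTy->isVectorTy() &&
       llvm::cast<llvm::VectorType>(OrigTy)->getElementType()->isPointerTy());
  return PointerLike ? Builder.CreateIntToPtr(IntLoad, OrigTy)
                     : Builder.CreateBitCast(IntLoad, OrigTy);
}
```
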
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 22 -
 llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
 .../X86/expand-atomic-non-integer.ll  | 66 ++
 4 files changed, 225 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..45cdc7980fdc6 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,12 @@ LoadInst 
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
   NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
   LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
 
-  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+  Value *NewVal =
+  LI->getType()->isPointerTy() ||
+  (LI->getType()->isVectorTy() &&
+   
cast(LI->getType())->getElementType()->isPointerTy())
+  ? Builder.CreateIntToPtr(NewLI, LI->getType())
+  : Builder.CreateBitCast(NewLI, LI->getType());
   LI->replaceAllUsesWith(NewVal);
   LI->eraseFromParent();
   return NewLI;
@@ -2093,9 +2098,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
 I->replaceAllUsesWith(V);
   } else if (HasResult) {
 Value *V;
-if (UseSizedLibcall)
-  V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+  // Add bitcasts from Result's scalar type to I's  vector type
+  auto *PtrTy = dyn_cast(I->getType()->getScalarType());
+  auto *VTy = dyn_cast(I->getType());
+  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+  } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
   V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
 AllocaAlignment);
   Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll 
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29..eaa2ffd9b2731 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double 
%val1) {
   store atomic double %val1, ptr %ptr seq_cst, align 8
   ret void
 }
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM:   @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE:   @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO:   @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE:   @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4:   @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6:   @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM:   @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219..867a4acb791bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr 
%x) {
   %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
   ret <2 x ptr addrspace(270)> %ret
 }
+define <2 x ptr> @atomic_ve

[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148899

>From 4d1cdadc3259ed811a186b049bb1589ebc4e5470 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats

This commit casts floats to ints in an atomic load during AtomicExpand to support
floating point types. It also is required to support 128 bit vectors in SSE/AVX.
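
Roughly, the hook added below amounts to the following sketch (assumptions
noted in the comments). One detail worth calling out: getScalarType() returns
the element type for vectors, so FP vectors such as <4 x float> are routed
through the integer cast as well, which is what the 128-bit SSE/AVX patterns
rely on.

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"

// Sketch of the predicate: cast atomic loads of FP scalars and FP vectors to
// integers; leave everything else alone. (Free-standing for illustration; in
// the patch this is an X86TargetLowering override.)
llvm::TargetLowering::AtomicExpansionKind
shouldCastAtomicLoadSketch(const llvm::LoadInst *LI) {
  if (LI->getType()->getScalarType()->isFloatingPointTy())
    return llvm::TargetLowering::AtomicExpansionKind::CastToInteger;
  return llvm::TargetLowering::AtomicExpansionKind::None;
}
```
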
---
 llvm/lib/Target/X86/X86ISelLowering.cpp|   7 +
 llvm/lib/Target/X86/X86ISelLowering.h  |   2 +
 llvm/lib/Target/X86/X86InstrCompiler.td|  15 +
 llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
 4 files changed, 122 insertions(+), 287 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@ 
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 LoadInst *
 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h 
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
 shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 TargetLoweringBase::AtomicExpansionKind
 shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
 void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 
addr:$src,
 def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
   (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
 
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
 // Floating point loads/stores.
 def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
   (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:   # %bb.0:
 ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-AVX-O3:   # %bb.0:
 ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
 ; CHECK-SSE-O0-NEXT:# implicit-def: $eax
 ; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
 ; CHECK-AVX-O0-NEXT:# implicit-def: $eax
 ; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp

[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/165818

>From 94119264a0fd461b3cb18d6dbd30337f274e403b Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load

Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with type bfloat,half.
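
At the bit level the split is just one wide atomic load followed by a truncate
for the low half and a shift-plus-truncate for the high half, matching the
TRUNCATE/SRL pair in the hunk below. A standalone illustration (plain C++, not
LLVM code; the concrete widths are only an example):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Stand-in for a 64-bit vector loaded with a single atomic load.
  uint64_t Loaded = 0x0123456789abcdefULL;
  uint32_t Lo = static_cast<uint32_t>(Loaded);       // ISD::TRUNCATE
  uint32_t Hi = static_cast<uint32_t>(Loaded >> 32); // ISD::SRL + TRUNCATE
  std::printf("lo=0x%08x hi=0x%08x\n", Lo, Hi);      // halves, then bitcast
  return 0;
}
```
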
---
 .../include/llvm/Target/TargetSelectionDAG.td |  14 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp  |  37 ++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
 4 files changed, 400 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td 
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
   let MemoryVT = i64;
 }
 
+def atomic_load_128_v2i64 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v4i32;
+}
+
 def atomic_load_nonext_8 :
   PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
   let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, 
unsigned ResNo) {
 SplitVecRes_STEP_VECTOR(N, Lo, Hi);
 break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast(N), Lo, Hi);
+break;
   case ISD::LOAD:
 SplitVecRes_LOAD(cast(N), Lo, Hi);
 break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, 
SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+   SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+  EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+  Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi =
+  DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+  DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
 SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-l

[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: Aiden Grossman (boomanaiden154)


Changes

We recently moved over to compiling with clang-cl on Windows. This ended
up causing a large increase in warnings, particularly due to how
warnings are handled in nanobind. cd91d0fff9293a904704784c92c28637bfebef45
initially set -Wall -Wextra and -Wpedantic while fixing another issue,
which is probably not what we want to do on third-party code. We also
need to disable -Wmissing-field-initializers to get things clean in this
configuration.


---
Full diff: https://github.com/llvm/llvm-project/pull/166828.diff


1 Files Affected:

- (modified) mlir/cmake/modules/AddMLIRPython.cmake (+2-2) 


```diff
diff --git a/mlir/cmake/modules/AddMLIRPython.cmake 
b/mlir/cmake/modules/AddMLIRPython.cmake
index fa6aec8a603a9..8196e2a2a3321 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -791,7 +791,6 @@ function(add_mlir_python_extension libname extname)
   get_property(NB_LIBRARY_TARGET_NAME TARGET ${libname} PROPERTY 
LINK_LIBRARIES)
   target_compile_options(${NB_LIBRARY_TARGET_NAME}
 PRIVATE
-  -Wall -Wextra -Wpedantic
   -Wno-c++98-compat-extra-semi
   -Wno-cast-qual
   -Wno-covered-switch-default
@@ -799,11 +798,11 @@ function(add_mlir_python_extension libname extname)
   -Wno-nested-anon-types
   -Wno-unused-parameter
   -Wno-zero-length-array
+  -Wno-missing-field-initializers
   ${eh_rtti_enable})
 
   target_compile_options(${libname}
 PRIVATE
-  -Wall -Wextra -Wpedantic
   -Wno-c++98-compat-extra-semi
   -Wno-cast-qual
   -Wno-covered-switch-default
@@ -811,6 +810,7 @@ function(add_mlir_python_extension libname extname)
   -Wno-nested-anon-types
   -Wno-unused-parameter
   -Wno-zero-length-array
+  -Wno-missing-field-initializers
   ${eh_rtti_enable})
 endif()
 

```




https://github.com/llvm/llvm-project/pull/166828
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 created 
https://github.com/llvm/llvm-project/pull/166828

We recently moved over to compiling with clang-cl on Windows. This ended
up causing a large increase in warnings, particularly due to how
warnings are handled in nanobind. cd91d0fff9293a904704784c92c28637bfebef45
initially set -Wall -Wextra and -Wpedantic while fixing another issue,
which is probably not what we want to do on third-party code. We also
need to disable -Wmissing-field-initializers to get things clean in this
configuration.



___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits


@@ -12,6 +12,94 @@ certifi==2025.8.3 \
 
--hash=sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407 \
 
--hash=sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5
 # via requests
+cffi==2.0.0 \
+
--hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \
+
--hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \
+
--hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \
+
--hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \
+
--hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \
+
--hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \
+
--hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \
+
--hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \
+
--hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \
+
--hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \
+
--hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \
+
--hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \
+
--hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \
+
--hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \
+
--hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \
+
--hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \
+
--hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \
+
--hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \
+
--hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \
+
--hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \
+
--hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \
+
--hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \
+
--hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \
+
--hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \
+
--hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \
+
--hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \
+
--hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \
+
--hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \
+
--hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \
+
--hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \
+
--hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \
+
--hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \
+
--hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \
+
--hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \
+
--hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \
+
--hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \
+
--hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \
+
--hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \
+
--hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \
+
--hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \
+
--hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \
+
--hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \
+
--hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \
+
--hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \
+
--hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \
+
--hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \
+
--hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \
+
--hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \
+
--hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \
+
--hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \
+
--hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \
+
--hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \
+
--hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \
+
--hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \
+
--hash=sha256:a05d0c237b3349096

[llvm-branch-commits] [llvm] [BOLT] Move call probe information to CallSiteInfo (PR #165490)

2025-11-06 Thread Rafael Auler via llvm-branch-commits

https://github.com/rafaelauler approved this pull request.


https://github.com/llvm/llvm-project/pull/165490
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits


@@ -45,13 +83,31 @@ def main(commit_sha: str, build_log_files: list[str]):
 )
 if advisor_response.status_code == 200:
 print(advisor_response.json())
+comments = [
+get_comment(
+github_token,
+pr_number,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),

boomanaiden154 wrote:

> I think these comments could get quite large, but then again we are only 
> leaving at most 1. You might have to add some more size limits for the 
> extreme cases but not worth doing until we see it happen in practice.

Yeah. We already have the size limit that will get applied here, but those can 
still be large. I agree it's probably not worth doing until we see it in 
practice. I don't think I've seen any PR with more than ~10 failures so far.

> I wonder if the premerge advisor content should also go to the build summary, 
> but perhaps A: it already does or B: the advisor runs at a point where we've 
> already submitted the build summary.

> Then again, having the build summary be very much "this is exactly what 
> happened" and the comments on the PR be more "human" and maybe speculative 
> makes some sense to me.

It doesn't currently go to the build summary. I think I'd like to keep the 
build summary as exactly what happened for now. We can revisit that decision 
based on user feedback. And yeah, a big part of the reason to surface the 
advisor findings in a comment is because a lot of people do not realize the 
summary view exists.

https://github.com/llvm/llvm-project/pull/166605
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [BOLT] Move call probe information to CallSiteInfo (PR #165490)

2025-11-06 Thread Amir Ayupov via llvm-branch-commits

https://github.com/aaupov closed 
https://github.com/llvm/llvm-project/pull/165490
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [lldb] release/21.x: [lldb] Implement DW_CFA_val_offset and DW_CFA_val_offset_sf (#150732) (PR #166611)

2025-11-06 Thread Jonas Devlieghere via llvm-branch-commits

https://github.com/JDevlieghere approved this pull request.


https://github.com/llvm/llvm-project/pull/166611
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Maksim Levental via llvm-branch-commits

https://github.com/makslevental approved this pull request.

This thing is a perennial PITA. I'm stamping to unblock, but can you also 
try [NB_SUPPRESS_WARNINGS](https://github.com/wjakob/nanobind/pull/868).

https://github.com/llvm/llvm-project/pull/166828
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)

2025-11-06 Thread Maksim Levental via llvm-branch-commits

makslevental wrote:

PS also this https://github.com/wjakob/nanobind/issues/994 🙂

https://github.com/llvm/llvm-project/pull/166828
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits

https://github.com/boomanaiden154 updated 
https://github.com/llvm/llvm-project/pull/166605

>From 06c030dcb4ee57be287beffd96d1b21ef1697dd4 Mon Sep 17 00:00:00 2001
From: Aiden Grossman 
Date: Wed, 5 Nov 2025 18:23:46 +
Subject: [PATCH 1/2] fix

Created using spr 1.3.7
---
 .ci/premerge_advisor_explain.py | 34 -
 .ci/utils.sh| 10 +-
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 4d840a33c3cf2..1d487af9e9ec7 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -31,22 +31,11 @@ def get_comment_id(platform: str, pr: 
github.PullRequest.PullRequest) -> int | N
 def get_comment(
 github_token: str,
 pr_number: int,
-junit_objects,
-ninja_logs,
-advisor_response,
-return_code,
+body: str,
 ) -> dict[str, str]:
 repo = github.Github(github_token).get_repo("llvm/llvm-project")
 pr = repo.get_issue(pr_number).as_pull_request()
-comment = {
-"body": generate_test_report_lib.generate_report(
-generate_test_report_lib.compute_platform_title(),
-return_code,
-junit_objects,
-ninja_logs,
-failure_explanations_list=advisor_response,
-)
-}
+comment = {"body": body}
 comment_id = get_comment_id(platform.system(), pr)
 if comment_id:
 comment["id"] = comment_id
@@ -59,6 +48,14 @@ def main(
 pr_number: int,
 return_code: int,
 ):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:
+json.dump([comment], comment_file_handle)
 junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
 build_log_files
 )
@@ -90,10 +87,13 @@ def main(
 get_comment(
 github_token,
 pr_number,
-junit_objects,
-ninja_logs,
-advisor_response.json(),
-return_code,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),
+),
 )
 ]
 with open("comment", "w") as comment_file_handle:
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 72f4b04f5bf3a..91c27319f3534 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,18 +33,18 @@ function at-exit {
   # If building fails there will be no results files.
   shopt -s nullglob
 
-  if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+  if [[ "$GITHUB_ACTIONS" != "" ]]; then
 python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
   $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log 
\
   >> $GITHUB_STEP_SUMMARY
+python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+  $(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
+  $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+  "${MONOREPO_ROOT}"/ninja*.log
   fi
 
   if [[ "$retcode" != "0" ]]; then
 if [[ "$GITHUB_ACTIONS" != "" ]]; then
-  python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-$(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
-$GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
-"${MONOREPO_ROOT}"/ninja*.log
   python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
 $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
 "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log

>From 7e44989fceaeec33405c5368e16d999f5701a7b2 Mon Sep 17 00:00:00 2001
From: Aiden Grossman 
Date: Thu, 6 Nov 2025 16:57:02 +
Subject: [PATCH 2/2] docs

Created using spr 1.3.7
---
 .ci/premerge_advisor_explain.py | 25 +
 1 file changed, 25 insertions(+)

diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 1d487af9e9ec7..08ccfb3d0e3d4 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -48,6 +48,31 @@ def main(
 pr_number: int,
 return_code: int,
 ):
+"""The main entrypoint for the script.
+
+This function parses failures from files, requests information from the
+premerge advisor, and may write a Github comment depending upon the output.
+There are four different scenarios:
+1. There has never been a previous failure and the job passes - We do not
+   create a comment. We write out an empty file to the comment path so the
+   issue-write workflow knows not to create anything.
+2. There has never been

[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits


@@ -4,20 +4,58 @@
 """Script for getting explanations from the premerge advisor."""
 
 import argparse
-import os
 import platform
 import sys
+import json
 
 import requests
+import github
+import github.PullRequest
 
 import generate_test_report_lib
 
 PREMERGE_ADVISOR_URL = (
 "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/explain";
 )
+COMMENT_TAG = ""
 
 
-def main(commit_sha: str, build_log_files: list[str]):
+def get_comment_id(platform: str, pr: github.PullRequest.PullRequest) -> int | 
None:
+platform_comment_tag = COMMENT_TAG.format(platform=platform)
+for comment in pr.as_issue().get_comments():
+if platform_comment_tag in comment.body:
+return comment.id
+return None
+
+
+def get_comment(
+github_token: str,
+pr_number: int,
+body: str,
+) -> dict[str, str]:
+repo = github.Github(github_token).get_repo("llvm/llvm-project")
+pr = repo.get_issue(pr_number).as_pull_request()
+comment = {"body": body}
+comment_id = get_comment_id(platform.system(), pr)
+if comment_id:
+comment["id"] = comment_id
+
+
+def main(
+commit_sha: str,
+build_log_files: list[str],
+github_token: str,
+pr_number: int,
+return_code: int,
+):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:

boomanaiden154 wrote:

Yeah, this matches the behavior of the formatter.

I've added a docstring enumerating the cases. The formatter helper script does 
not have good documentation on this.

https://github.com/llvm/llvm-project/pull/166605
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/165818

>From 1434bcf8d9be03eeabce92430d00e02b0e434069 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load

Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with type bfloat,half.
---
 .../include/llvm/Target/TargetSelectionDAG.td |  14 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp  |  37 ++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
 4 files changed, 400 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td 
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
   let MemoryVT = i64;
 }
 
+def atomic_load_128_v2i64 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v4i32;
+}
+
 def atomic_load_nonext_8 :
   PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
   let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0e2c8b2e3799..f3fb50be27f02 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, 
unsigned ResNo) {
 SplitVecRes_STEP_VECTOR(N, Lo, Hi);
 break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast(N), Lo, Hi);
+break;
   case ISD::LOAD:
 SplitVecRes_LOAD(cast(N), Lo, Hi);
 break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, 
SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+   SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+  EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT,
+  Ch, Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+  SDValue ExtractHi = DAG.getNode(
+  ISD::SRL, dl, IntVT, ALD,
+  DAG.getShiftAmountConstant(VT.getSizeInBits() / 2, IntVT, dl));
+  ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
 SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/a
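
A minimal, hypothetical IR sketch of a vector atomic load and of what the new
SplitVecRes_ATOMIC_LOAD path does with it when the type legalizer chooses to
split the result; the function name and the choice of <2 x half> are
illustrative and not taken from the patch's tests.

define <2 x half> @split_atomic_vec2_half(ptr %p) {
  ; Illustrative input; whether a given vector type is split rather than
  ; widened is decided by the target's type legalization.
  %v = load atomic <2 x half>, ptr %p acquire, align 4
  ret <2 x half> %v
}
; At the DAG level the split path then issues, roughly:
;   a single i32 ATOMIC_LOAD of %p, a TRUNCATE for the low half,
;   an SRL by 16 followed by a TRUNCATE for the high half, and a bitcast
;   of each half back to the corresponding half-width vector type.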

[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #148898)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148898

>From b92b6dac8913654dc0ba987ce328c47fa7330778 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:01:24 -0400
Subject: [PATCH] [X86] Remove extra MOV after widening atomic load

This change adds patterns to optimize out an extra MOV
present after widening the atomic load.
---
 llvm/lib/Target/X86/X86InstrCompiler.td| 16 +
 llvm/test/CodeGen/X86/atomic-load-store.ll | 72 --
 2 files changed, 40 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..ce429b5916280 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1204,6 +1204,22 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)), 
(MOV16rm addr:$src)>;
 def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>;
 def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>;
 
+// load atomic <2 x i16>
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src,
+  (MOVDI2PDIrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src,
+  (VMOVSSrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src,
+  (VMOVSSZrm addr:$src)>, Requires<[HasAVX512]>;
+
+// load atomic <2 x i32,float>
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
+  (MOV64toPQIrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
+  (VMOV64toPQIrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
+  (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
+
 // Floating point loads/stores.
 def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
   (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index fc32c3668d1dd..7e15b9303887f 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -202,26 +202,22 @@ define <2 x i8> @atomic_vec2_i8(ptr %x) {
 define <2 x i16> @atomic_vec2_i16(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec2_i16:
 ; CHECK-SSE-O3:   # %bb.0:
-; CHECK-SSE-O3-NEXT:movl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec2_i16:
 ; CHECK-AVX-O3:   # %bb.0:
-; CHECK-AVX-O3-NEXT:movl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
+; CHECK-AVX-O3-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec2_i16:
 ; CHECK-SSE-O0:   # %bb.0:
-; CHECK-SSE-O0-NEXT:movl (%rdi), %eax
-; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec2_i16:
 ; CHECK-AVX-O0:   # %bb.0:
-; CHECK-AVX-O0-NEXT:movl (%rdi), %eax
-; CHECK-AVX-O0-NEXT:vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-AVX-O0-NEXT:retq
   %ret = load atomic <2 x i16>, ptr %x acquire, align 4
   ret <2 x i16> %ret
@@ -230,26 +226,22 @@ define <2 x i16> @atomic_vec2_i16(ptr %x) {
 define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec2_ptr270:
 ; CHECK-SSE-O3:   # %bb.0:
-; CHECK-SSE-O3-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O3-NEXT:movq (%rdi), %xmm0
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec2_ptr270:
 ; CHECK-AVX-O3:   # %bb.0:
-; CHECK-AVX-O3-NEXT:movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT:vmovq %rax, %xmm0
+; CHECK-AVX-O3-NEXT:vmovq (%rdi), %xmm0
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec2_ptr270:
 ; CHECK-SSE-O0:   # %bb.0:
-; CHECK-SSE-O0-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O0-NEXT:movq (%rdi), %xmm0
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec2_ptr270:
 ; CHECK-AVX-O0:   # %bb.0:
-; CHECK-AVX-O0-NEXT:movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT:vmovq %rax, %xmm0
+; CHECK-AVX-O0-NEXT:vmovq (%rdi), %xmm0
 ; CHECK-AVX-O0-NEXT:retq
   %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
   ret <2 x ptr addrspace(270)> %ret
@@ -258,26 +250,22 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr 
%x) {
 define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align:
 ; CHECK-SSE-O3:   # %bb.0:
-; CHECK-SSE-O3-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O3-NEXT:movq (%rdi), %xmm0

[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148900

>From a657bd946e7be59892a00a447ca7018d0715c6a5 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector

AtomicExpand fails for aligned `load atomic ` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128 bit lowering in tablegen to support SSE/AVX.
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +++-
 llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
 .../X86/expand-atomic-non-integer.ll  | 66 ++
 4 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..8dc14bb416345 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,9 @@ LoadInst 
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
   NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
   LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
 
-  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+  Value *NewVal = LI->getType()->isPtrOrPtrVectorTy()
+  ? Builder.CreateIntToPtr(NewLI, LI->getType())
+  : Builder.CreateBitCast(NewLI, LI->getType());
   LI->replaceAllUsesWith(NewVal);
   LI->eraseFromParent();
   return NewLI;
@@ -2093,9 +2095,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
 I->replaceAllUsesWith(V);
   } else if (HasResult) {
 Value *V;
-if (UseSizedLibcall)
-  V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+  // Add bitcasts from Result's scalar type to I's  vector type
+  auto *PtrTy = dyn_cast(I->getType()->getScalarType());
+  auto *VTy = dyn_cast(I->getType());
+  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+  } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
   V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
 AllocaAlignment);
   Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll 
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29..eaa2ffd9b2731 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double 
%val1) {
   store atomic double %val1, ptr %ptr seq_cst, align 8
   ret void
 }
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM:   @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE:   @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO:   @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE:   @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4:   @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6:   @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM:   @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219..867a4acb791bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr 
%x) {
   %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
   ret <2 x ptr addrspace(270)> %ret
 }
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align:
+; CHECK-SSE2-O3:   
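
A hedged sketch of the libcall expansion this patch enables for a
vector-of-pointer atomic load, assuming a 32-bit target that takes the sized
__atomic_load_4 path (as in the ARMV4 check lines above); the exact IR emitted
by AtomicExpand may differ in detail.

define <1 x ptr> @atomic_vec1_ptr(ptr %x) {
  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
  ret <1 x ptr> %ret
}
; With the added casts this can be lowered to roughly:
;   %r  = call i32 @__atomic_load_4(ptr %x, i32 2)  ; sized libcall, acquire
;   %bc = bitcast i32 %r to <1 x i32>               ; integer result -> int vector
;   %v  = inttoptr <1 x i32> %bc to <1 x ptr>       ; int vector -> pointer vector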


[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148899

>From f9b99b992450687c7da5048c82e9ce38efc3ff1d Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats

This commit casts floats to ints in an atomic load during AtomicExpand to 
support
floating point types. It also is required to support 128 bit vectors in SSE/AVX.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp|   7 +
 llvm/lib/Target/X86/X86ISelLowering.h  |   2 +
 llvm/lib/Target/X86/X86InstrCompiler.td|  15 +
 llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
 4 files changed, 122 insertions(+), 287 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@ 
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 LoadInst *
 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h 
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
 shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 TargetLoweringBase::AtomicExpansionKind
 shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
 void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 
addr:$src,
 def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
   (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
 
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
 // Floating point loads/stores.
 def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
   (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:   # %bb.0:
 ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-AVX-O3:   # %bb.0:
 ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
 ; CHECK-SSE-O0-NEXT:# implicit-def: $eax
 ; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
 ; CHECK-AVX-O0-NEXT:# implicit-def: $eax
 ; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp
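
For illustration only (not part of the patch), the IR-level effect of the new
shouldCastAtomicLoadInIR hook on a floating-point vector load is roughly the
following; the function name is hypothetical.

define <1 x bfloat> @cast_atomic_vec1_bfloat(ptr %x) {
  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2
  ret <1 x bfloat> %ret
}
; Because the scalar element type is floating point, AtomicExpand rewrites the
; load through an integer of the same width before instruction selection:
;   %i   = load atomic i16, ptr %x acquire, align 2
;   %ret = bitcast i16 %i to <1 x bfloat>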


[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166213

From dd8c2ece4a1287580cec17fff56e8eaa314ffef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
 instructions

This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.

commit-id:4a30cb78
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   |  9 ++-
 3 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index b99e1c7f19b71..562a6a00045f5 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
   if (!NewMI)
 return false;
 
-  LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
-  LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
-  // If the old instruction is debug value tracked, an update is required.
-  if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
-   std::make_pair(NewInstrNum, NewIdx));
-  }
-
-  MBB->erase(mi); // Nuke the old inst.
-
   for (MachineInstr &MI : MIS)
 DistanceMap.insert(std::make_pair(&MI, Dist++));
-  Dist--;
+
+  if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+  } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+  assert(mi->getNumExplicitDefs() == 1);
+  assert(NewMI->getNumExplicitDefs() == 1);
+
+  // Find the old and new def location.
+  unsigned OldIdx = mi->defs().begin()->getOperandNo();
+  unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+  // Record that one def has been replaced by the other.
+  unsigned NewInstrNum = NewMI->getDebugInstrNum();
+  MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+  }
+
   mi = NewMI;
   nmi = std::next(mi);
 
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 
   bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
 
+  // Give targets a chance to convert bundled instructions.
+  bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
   // If the instruction is convertible to 3 Addr, instead
   // of returning try 3 Addr transformation aggressively and
   // use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
   //   addl %esi, %edi
   //   movl %edi, %eax
   //   ret
-  if (Commuted && !MI.isConvertibleTo3Addr())
+  if (Commuted && !ConvertibleTo3Addr)
 return false;
 
   if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 regBKilled = isKilled(MI, regB, true);
   }
 
-  if (MI.isConvertibleTo3Addr()) {
+  if (ConvertibleTo3Addr) {
 // This instruction is potentially convertible to a true
 // three-address instruction.  Check if it is profitable.
 if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
  LiveVariables *LV,
  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineInstr *CandidateMI = &MI;
+
+  if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i

[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/166213

From cc06ca25470188cc8e767eab72fcfe83958cf4b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= 
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
 instructions

This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.

commit-id:4a30cb78
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 58 +--
 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir   |  9 ++-
 3 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index b99e1c7f19b71..562a6a00045f5 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
   if (!NewMI)
 return false;
 
-  LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
-  LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
-  // If the old instruction is debug value tracked, an update is required.
-  if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
-   std::make_pair(NewInstrNum, NewIdx));
-  }
-
-  MBB->erase(mi); // Nuke the old inst.
-
   for (MachineInstr &MI : MIS)
 DistanceMap.insert(std::make_pair(&MI, Dist++));
-  Dist--;
+
+  if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+  } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+  assert(mi->getNumExplicitDefs() == 1);
+  assert(NewMI->getNumExplicitDefs() == 1);
+
+  // Find the old and new def location.
+  unsigned OldIdx = mi->defs().begin()->getOperandNo();
+  unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+  // Record that one def has been replaced by the other.
+  unsigned NewInstrNum = NewMI->getDebugInstrNum();
+  MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+  }
+
   mi = NewMI;
   nmi = std::next(mi);
 
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 
   bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
 
+  // Give targets a chance to convert bundled instructions.
+  bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
   // If the instruction is convertible to 3 Addr, instead
   // of returning try 3 Addr transformation aggressively and
   // use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
   //   addl %esi, %edi
   //   movl %edi, %eax
   //   ret
-  if (Commuted && !MI.isConvertibleTo3Addr())
+  if (Commuted && !ConvertibleTo3Addr)
 return false;
 
   if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
 regBKilled = isKilled(MI, regB, true);
   }
 
-  if (MI.isConvertibleTo3Addr()) {
+  if (ConvertibleTo3Addr) {
 // This instruction is potentially convertible to a true
 // three-address instruction.  Check if it is profitable.
 if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..1fde07190339c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
  LiveVariables *LV,
  LiveIntervals *LIS) const {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineInstr *CandidateMI = &MI;
+
+  if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+if

[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)

2025-11-06 Thread Nicolai Hähnle via llvm-branch-commits


@@ -4088,7 +4107,20 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
 LV->getVarInfo(DefReg).AliveBlocks.clear();
 }
 
-if (LIS) {
+if (MI.isBundle()) {
+  VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+  if (!VRI.Reads && !VRI.Writes) {
+for (MachineOperand &MO : MI.all_uses()) {
+  if (MO.isReg() && MO.getReg() == DefReg) {

nhaehnle wrote:

See the discussion with Jay in #166212 -- I looked into it and decided to simply 
forbid tied sub-registers on bundles pre-RA as the safer route, given the 
complexities involved.

I'm adding an assert to that effect here.

https://github.com/llvm/llvm-project/pull/166213
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #148898)

2025-11-06 Thread Simon Pilgrim via llvm-branch-commits

https://github.com/RKSimon approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/148898
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [CI][NFC] Refactor compute_platform_title into generate_test_report_lib (PR #166604)

2025-11-06 Thread Nathan Gauër via llvm-branch-commits

https://github.com/Keenuts approved this pull request.


https://github.com/llvm/llvm-project/pull/166604
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)

2025-11-06 Thread Nathan Gauër via llvm-branch-commits

https://github.com/Keenuts edited 
https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)

2025-11-06 Thread Nathan Gauër via llvm-branch-commits

https://github.com/Keenuts approved this pull request.

a small thing, otherwise LGTM (modulo the test coverage request by David)

https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)

2025-11-06 Thread Nathan Gauër via llvm-branch-commits


@@ -82,16 +93,29 @@ def find_failure_in_ninja_logs(ninja_logs: list[list[str]]) 
-> list[tuple[str, s
 return failures
 
 
-def _format_failures(failures: list[tuple[str, str]]) -> list[str]:
+def _format_failures(
+failures: list[tuple[str, str]], failure_explanations: dict[str, 
FailureExplanation]
+) -> list[str]:
 """Formats failures into summary views for the report."""
 output = []
 for build_failure in failures:
 failed_action, failure_message = build_failure
+failure_explanation = None
+if failed_action in failure_explanations:
+failure_explanation = failure_explanations[failed_action]
+output.append("")
+if failure_explanation:
+output.extend(
+[
+f"{failed_action} (Likely Already 
Failing)" "",

Keenuts wrote:

`html.escape` the build logs before embedding them in the XML?
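
A small sketch of the suggestion, assuming the report is assembled from f-strings 
as in the snippet above (the variable names and the surrounding tags are 
illustrative, not the script's real structure):

  import html

  output = []
  failed_action = "ninja: build stopped: subcommand failed"
  failure_message = "error: expected ';' before '<' token"

  # Escape the raw build output before interpolating it into HTML/XML so that
  # characters such as <, > and & cannot break the report markup.
  output.append(f"<pre>{html.escape(failure_message)}</pre>")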

https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [mlir] [mlir][vector] Simplify createReadOrMaskedRead (PR #163736)

2025-11-06 Thread Andrzej Warzyński via llvm-branch-commits

https://github.com/banach-space reopened 
https://github.com/llvm/llvm-project/pull/163736
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)

2025-11-06 Thread Paul Kirth via llvm-branch-commits

https://github.com/ilovepi approved this pull request.

LGTM. 

https://github.com/llvm/llvm-project/pull/166783
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)

2025-11-06 Thread Aiden Grossman via llvm-branch-commits


@@ -33,17 +33,18 @@ function at-exit {
   # If building fails there will be no results files.
   shopt -s nullglob
 
-  if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+  if [[ "$GITHUB_ACTIONS" != "" ]]; then

boomanaiden154 wrote:

This checks that we are running in a GitHub Actions workflow (as opposed to 
running locally or inside buildbot). We don't strictly need to change what we're 
looking for (`GITHUB_STEP_SUMMARY` is implied by `GITHUB_ACTIONS`), but I felt 
the change makes the intent clearer now that other scripts unrelated to the step 
summary also run inside this conditional.

https://github.com/llvm/llvm-project/pull/166605
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148900

>From 242cf54a6b527e573c4d30a3bea47e3a458fb8c1 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector

AtomicExpand fails for aligned `load atomic ` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128 bit lowering in tablegen to support SSE/AVX.
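
As a conceptual illustration (not taken from the patch or its tests) of the cast 
sequence this enables for a vector-of-pointers atomic load once the sized 
libcall has produced an integer result -- assuming a 64-bit target where the 
128-bit libcall returns an i128:

  define <2 x ptr> @cast_libcall_result(i128 %raw) {
    ; reinterpret the raw libcall result as two 64-bit lanes, then convert
    ; each lane back to a pointer
    %ints = bitcast i128 %raw to <2 x i64>
    %vec = inttoptr <2 x i64> %ints to <2 x ptr>
    ret <2 x ptr> %vec
  }
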
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +++-
 llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
 .../X86/expand-atomic-non-integer.ll  | 66 ++
 4 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d7..8dc14bb4163451 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,9 @@ LoadInst 
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
   NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
   LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
 
-  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+  Value *NewVal = LI->getType()->isPtrOrPtrVectorTy()
+  ? Builder.CreateIntToPtr(NewLI, LI->getType())
+  : Builder.CreateBitCast(NewLI, LI->getType());
   LI->replaceAllUsesWith(NewVal);
   LI->eraseFromParent();
   return NewLI;
@@ -2093,9 +2095,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
 I->replaceAllUsesWith(V);
   } else if (HasResult) {
 Value *V;
-if (UseSizedLibcall)
-  V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+  // Add bitcasts from Result's scalar type to I's vector type.
+  auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
+  auto *VTy = dyn_cast<VectorType>(I->getType());
+  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+  } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
   V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
 AllocaAlignment);
   Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll 
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29d..eaa2ffd9b27318 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double 
%val1) {
   store atomic double %val1, ptr %ptr seq_cst, align 8
   ret void
 }
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM:   @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE:   @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO:   @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE:   @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4:   @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6:   @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM:   @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219e..867a4acb791bca 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr 
%x) {
   %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
   ret <2 x ptr addrspace(270)> %ret
 }
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align:
+; CHECK-SSE2-

[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/165818

>From 8466578444bc27c0d8c5dc2ee95f074a96b5e47f Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load

Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with type bfloat,half.
---
 .../include/llvm/Target/TargetSelectionDAG.td |  14 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp  |  34 ++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
 4 files changed, 397 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td 
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
   let MemoryVT = i64;
 }
 
+def atomic_load_128_v2i64 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+  PatFrag<(ops node:$ptr),
+  (atomic_load node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = v4i32;
+}
+
 def atomic_load_nonext_8 :
   PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
   let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0e2c8b2e3799..fe89a4a9f9634 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, 
unsigned ResNo) {
 SplitVecRes_STEP_VECTOR(N, Lo, Hi);
 break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+break;
   case ISD::LOAD:
 SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
 break;
@@ -2202,6 +2205,37 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N, 
SDValue &Lo,
   Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+   SDValue &Hi) {
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+  SDLoc dl(LD);
+  EVT VT = LD->getValueType(0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  EVT MemIntVT =
+  EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+  SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT,
+  Ch, Ptr, LD->getMemOperand());
+
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  SDValue ExtractLo, ExtractHi;
+  SplitInteger(ALD, LoIntVT, HiIntVT, ExtractLo, ExtractHi);
+
+  Lo = DAG.getBitcast(LoVT, ExtractLo);
+  Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
 SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 7e15b9303887f..928dfef3143da 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/

[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148900

>From 242cf54a6b527e573c4d30a3bea47e3a458fb8c1 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector

AtomicExpand fails for aligned `load atomic ` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128 bit lowering in tablegen to support SSE/AVX.
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +++-
 llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
 llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
 .../X86/expand-atomic-non-integer.ll  | 66 ++
 4 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp 
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..8dc14bb416345 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,9 @@ LoadInst 
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
   NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
   LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
 
-  Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+  Value *NewVal = LI->getType()->isPtrOrPtrVectorTy()
+  ? Builder.CreateIntToPtr(NewLI, LI->getType())
+  : Builder.CreateBitCast(NewLI, LI->getType());
   LI->replaceAllUsesWith(NewVal);
   LI->eraseFromParent();
   return NewLI;
@@ -2093,9 +2095,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
 I->replaceAllUsesWith(V);
   } else if (HasResult) {
 Value *V;
-if (UseSizedLibcall)
-  V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+  // Add bitcasts from Result's scalar type to I's vector type.
+  auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
+  auto *VTy = dyn_cast<VectorType>(I->getType());
+  if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+  } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
   V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
 AllocaAlignment);
   Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll 
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29..eaa2ffd9b2731 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double 
%val1) {
   store atomic double %val1, ptr %ptr seq_cst, align 8
   ret void
 }
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM:   @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE:   @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO:   @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE:   @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4:   @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6:   @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM:   @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219..867a4acb791bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr 
%x) {
   %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
   ret <2 x ptr addrspace(270)> %ret
 }
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align:
+; CHECK-SSE2-O3:   

[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148899

>From 23fb9283f42bd418afb4d478dfaa7215c4d16093 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats

This commit casts floats to ints in an atomic load during AtomicExpand to 
support
floating point types. It also is required to support 128 bit vectors in SSE/AVX.
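
As a conceptual sketch (not from the patch) of what CastToInteger does at the IR 
level for a scalar float:

  ; before AtomicExpand: a floating-point atomic load
  define float @load_f32(ptr %p) {
    %v = load atomic float, ptr %p acquire, align 4
    ret float %v
  }

  ; after the expansion, conceptually: load the same width as an integer and
  ; bitcast back to the original floating-point type
  define float @load_f32_expanded(ptr %p) {
    %i = load atomic i32, ptr %p acquire, align 4
    %v = bitcast i32 %i to float
    ret float %v
  }
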
---
 llvm/lib/Target/X86/X86ISelLowering.cpp|   7 +
 llvm/lib/Target/X86/X86ISelLowering.h  |   2 +
 llvm/lib/Target/X86/X86InstrCompiler.td|  15 +
 llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
 4 files changed, 122 insertions(+), 287 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@ 
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 LoadInst *
 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h 
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
 shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 TargetLoweringBase::AtomicExpansionKind
 shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
 void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 
addr:$src,
 def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
   (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
 
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
 // Floating point loads/stores.
 def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
   (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:   # %bb.0:
 ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-AVX-O3:   # %bb.0:
 ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
 ; CHECK-SSE-O0-NEXT:# implicit-def: $eax
 ; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
 ; CHECK-AVX-O0-NEXT:# implicit-def: $eax
 ; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp

[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)

2025-11-06 Thread via llvm-branch-commits

https://github.com/jofrn updated 
https://github.com/llvm/llvm-project/pull/148899

>From 23fb9283f42bd418afb4d478dfaa7215c4d16093 Mon Sep 17 00:00:00 2001
From: jofrn 
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats

This commit casts floats to ints in an atomic load during AtomicExpand to 
support
floating point types. It also is required to support 128 bit vectors in SSE/AVX.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp|   7 +
 llvm/lib/Target/X86/X86ISelLowering.h  |   2 +
 llvm/lib/Target/X86/X86InstrCompiler.td|  15 +
 llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
 4 files changed, 122 insertions(+), 287 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03e..e15f17281b9585 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@ 
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 }
 
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 LoadInst *
 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h 
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4d..f9a8adbd7da0d1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
 shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 TargetLoweringBase::AtomicExpansionKind
 shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
 void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td 
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b59162805..3f542297fea196 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 
addr:$src,
 def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
   (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
 
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+  (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
 // Floating point loads/stores.
 def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
   (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll 
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da5..00310f6d1f219e 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:   # %bb.0:
 ; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O3-NEXT:retq
 ;
 ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-AVX-O3:   # %bb.0:
 ; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
 ; CHECK-AVX-O3-NEXT:retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
 ; CHECK-SSE-O0-NEXT:# implicit-def: $eax
 ; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
 ; CHECK-SSE-O0-NEXT:retq
 ;
 ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
 ; CHECK-AVX-O0-NEXT:# implicit-def: $eax
 ; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT: 

[llvm-branch-commits] [llvm] [ConstantTime][RISCV] Add comprehensive tests for ct.select (PR #166708)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer ready_for_review 
https://github.com/llvm/llvm-project/pull/166708
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [ConstantTime][WebAssembly] Add comprehensive tests for ct.select (PR #166709)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer ready_for_review 
https://github.com/llvm/llvm-project/pull/166709
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [ConstantTime][Clang] Add __builtin_ct_select for constant-time selection (PR #166703)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer ready_for_review 
https://github.com/llvm/llvm-project/pull/166703
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for ARM32 and Thumb (PR #166707)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer ready_for_review 
https://github.com/llvm/llvm-project/pull/166707
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)

2025-11-06 Thread Julius Alexandre via llvm-branch-commits

https://github.com/wizardengineer ready_for_review 
https://github.com/llvm/llvm-project/pull/166704
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


  1   2   >