[llvm-branch-commits] [mlir] a11869c - Revert "[mlir] Make remove-dead-values pass remove blocks arguments first (#1…"
Author: lonely eagle
Date: 2025-11-06T18:46:46+08:00
New Revision: a11869ccf13d99f3559a4b244dfe2c8593db783a
URL:
https://github.com/llvm/llvm-project/commit/a11869ccf13d99f3559a4b244dfe2c8593db783a
DIFF:
https://github.com/llvm/llvm-project/commit/a11869ccf13d99f3559a4b244dfe2c8593db783a.diff
LOG: Revert "[mlir] Make remove-dead-values pass remove blocks arguments first (#1…"
This reverts commit a928c61961004cc94c4cb37bc4c414f1537e7660.
Added:
Modified:
mlir/lib/Transforms/RemoveDeadValues.cpp
mlir/test/Transforms/remove-dead-values.mlir
Removed:
diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp
b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 979b3965e4ba9..41f3f9d76a3b1 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -742,25 +742,7 @@ static void processBranchOp(BranchOpInterface branchOp,
RunLivenessAnalysis &la,
static void cleanUpDeadVals(RDVFinalCleanupList &list) {
LDBG() << "Starting cleanup of dead values...";
-  // 1. Blocks
-  LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists";
-  for (auto &b : list.blocks) {
-    // blocks that are accessed via multiple codepaths processed once
-    if (b.b->getNumArguments() != b.nonLiveArgs.size())
-      continue;
-    LDBG() << "Erasing " << b.nonLiveArgs.count()
-           << " non-live arguments from block: " << b.b;
-    // it iterates backwards because erase invalidates all successor indexes
-    for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
-      if (!b.nonLiveArgs[i])
-        continue;
-      LDBG() << "  Erasing block argument " << i << ": " << b.b->getArgument(i);
-      b.b->getArgument(i).dropAllUses();
-      b.b->eraseArgument(i);
-    }
-  }
-
-  // 2. Operations
+  // 1. Operations
LDBG() << "Cleaning up " << list.operations.size() << " operations";
for (auto &op : list.operations) {
LDBG() << "Erasing operation: "
@@ -769,14 +751,14 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
op->erase();
}
- // 3. Values
+ // 2. Values
LDBG() << "Cleaning up " << list.values.size() << " values";
for (auto &v : list.values) {
LDBG() << "Dropping all uses of value: " << v;
v.dropAllUses();
}
- // 4. Functions
+ // 3. Functions
LDBG() << "Cleaning up " << list.functions.size() << " functions";
// Record which function arguments were erased so we can shrink call-site
// argument segments for CallOpInterface operations (e.g. ops using
@@ -798,7 +780,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
(void)f.funcOp.eraseResults(f.nonLiveRets);
}
- // 5. Operands
+ // 4. Operands
LDBG() << "Cleaning up " << list.operands.size() << " operand lists";
for (OperationToCleanup &o : list.operands) {
// Handle call-specific cleanup only when we have a cached callee
reference.
@@ -840,7 +822,7 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
}
}
- // 6. Results
+ // 5. Results
LDBG() << "Cleaning up " << list.results.size() << " result lists";
for (auto &r : list.results) {
LDBG() << "Erasing " << r.nonLive.count()
@@ -849,6 +831,24 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
dropUsesAndEraseResults(r.op, r.nonLive);
}
+  // 6. Blocks
+  LDBG() << "Cleaning up " << list.blocks.size() << " block argument lists";
+  for (auto &b : list.blocks) {
+    // blocks that are accessed via multiple codepaths processed once
+    if (b.b->getNumArguments() != b.nonLiveArgs.size())
+      continue;
+    LDBG() << "Erasing " << b.nonLiveArgs.count()
+           << " non-live arguments from block: " << b.b;
+    // it iterates backwards because erase invalidates all successor indexes
+    for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
+      if (!b.nonLiveArgs[i])
+        continue;
+      LDBG() << "  Erasing block argument " << i << ": " << b.b->getArgument(i);
+      b.b->getArgument(i).dropAllUses();
+      b.b->eraseArgument(i);
+    }
+  }
+
// 7. Successor Operands
LDBG() << "Cleaning up " << list.successorOperands.size()
<< " successor operand lists";
diff --git a/mlir/test/Transforms/remove-dead-values.mlir
b/mlir/test/Transforms/remove-dead-values.mlir
index 8b5ccdcf204dd..e7304505c809e 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -674,18 +674,3 @@ func.func @dead_value_loop_ivs_no_result(%lb: index, %ub:
index, %step: index, %
}
return
}
-
-// -
-
-// CHECK-LABEL: func @op_block_have_dead_arg
-func.func @op_block_have_dead_arg(%arg0: index, %arg1: index, %arg2: index, %arg3: i1) {
-  scf.for %iv = %arg0 to %arg1 step %arg2 {
-    scf.execute_region {
-      cf.cond_br %arg3, ^bb1(%arg0 : index), ^bb1(%arg1 : index)
-    ^bb1(%0: index):
-      scf.yield
-    }
-  }
-  // CHECK-NEXT: return
-  r
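For context, the backward iteration in the block-argument cleanup above is the usual erase-by-descending-index idiom. A minimal standalone sketch (plain `std::vector`, not the MLIR `Block` API) of why the remaining indices stay valid:

```cpp
#include <cassert>
#include <vector>

// Erase every flagged position, walking from the highest index down: erasing
// index i only shifts elements after i, which have already been visited, so
// the lower indices still refer to the same arguments.
void eraseFlagged(std::vector<int> &args, const std::vector<bool> &nonLive) {
  assert(args.size() == nonLive.size());
  for (int i = static_cast<int>(nonLive.size()) - 1; i >= 0; --i) {
    if (nonLive[i])
      args.erase(args.begin() + i);
  }
}
```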
[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)
@@ -5330,11 +5330,13 @@ static uint32_t
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
case AMDGPU::S_MAX_U32:
return std::numeric_limits::min();
case AMDGPU::S_MAX_I32:
+ case AMDGPU::V_SUB_F32_e64: // +0.0
jmmartinez wrote:
I haven't thought about this, but why do we take `-0.0` if the reduction is a sub and `+0.0` if it is an add? Does it come from any specification?
https://github.com/llvm/llvm-project/pull/161815
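For background on the signed-zero question, here is a minimal standalone sketch of the IEEE-754 behaviour that usually motivates `-0.0` as the neutral element of an fadd reduction (general floating-point context, not the patch's own rationale):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Under round-to-nearest, x + (-0.0f) == x for every x, including +0.0f,
  // whereas +0.0f + (-0.0f) yields +0.0f, so +0.0f is not neutral for fadd.
  std::printf("%d\n", std::signbit(+0.0f + -0.0f)); // 0: result is +0.0
  std::printf("%d\n", std::signbit(-0.0f + -0.0f)); // 1: result is -0.0
  return 0;
}
```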
[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)
@@ -5330,11 +5330,13 @@ static uint32_t
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
case AMDGPU::S_MAX_U32:
return std::numeric_limits::min();
case AMDGPU::S_MAX_I32:
+ case AMDGPU::V_SUB_F32_e64: // +0.0
jmmartinez wrote:
This doesn't seem right.
Isn't `0b1000` the opposite, -0.0?
I'd feel reassured if you used a bitcast instead of a comment (comments tend to diverge from the code eventually): `__builtin_bit_cast(+0.0f, uint32_t)`.
https://github.com/llvm/llvm-project/pull/161815
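A small sketch of the suggested bit-pattern approach using C++20 `std::bit_cast` (destination type first, value second; clang's `__builtin_bit_cast` takes its arguments in the same order):

```cpp
#include <bit>
#include <cstdint>

// The identity constants written as bit patterns instead of comments; the
// static_asserts document the encoding (-0.0f has only the sign bit set).
constexpr uint32_t PosZeroBits = std::bit_cast<uint32_t>(+0.0f);
constexpr uint32_t NegZeroBits = std::bit_cast<uint32_t>(-0.0f);
static_assert(PosZeroBits == 0x00000000u);
static_assert(NegZeroBits == 0x80000000u);
```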
[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)
https://github.com/jmmartinez approved this pull request. https://github.com/llvm/llvm-project/pull/161816
[llvm-branch-commits] [llvm] [LoongArch] Initial implementation for `enableMemCmpExpansion` hook (PR #166526)
zhaoqi5 wrote:
> How does this optimization affect the benchmark? For example `llvm-test-suite/MicroBenchmarks/MemFunctions`. Add some test results to help with comparison and code review. It might be necessary to test the data under various conditions, including different vector widths and optimization levels (e.g., O2 or Os).

Okay, I will try to test some benchmarks such as test-suite or spec cpu and add the results later. Thanks.
https://github.com/llvm/llvm-project/pull/166526
[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)
https://github.com/easyonaadit updated
https://github.com/llvm/llvm-project/pull/161816
>From 0e9bcce2647a3adc91bc049dfc5761cbeefa19b1 Mon Sep 17 00:00:00 2001
From: Aaditya
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8
2 files changed, 12 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..ebc0ac35f42d9 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi",
"nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
//===--===//
// R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c54985a..4de722077c8e9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -301,18 +301,22 @@ static Intrinsic::ID
getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
llvm_unreachable("Unknown BuiltinID for wave reduction");
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
return Intrinsic::amdgcn_wave_reduce_add;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
return Intrinsic::amdgcn_wave_reduce_sub;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
return Intrinsic::amdgcn_wave_reduce_min;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
return Intrinsic::amdgcn_wave_reduce_umin;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
return Intrinsic::amdgcn_wave_reduce_max;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -335,11 +339,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned
BuiltinID,
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)
https://github.com/skc7 edited https://github.com/llvm/llvm-project/pull/166731
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-exec-sync pass in pipeline (PR #165746)
https://github.com/skc7 edited https://github.com/llvm/llvm-project/pull/165746
[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)
https://github.com/skc7 ready_for_review https://github.com/llvm/llvm-project/pull/166731
[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Chaitanya (skc7)
Changes
This PR removes the named-barrier LDS lowering from the `amdgpu-lower-module-lds`
pass, since it is now handled by the `amdgpu-lower-exec-sync` pass.
This PR is the 3rd one in the stack.
PR1 : https://github.com/llvm/llvm-project/pull/165692
PR2 : https://github.com/llvm/llvm-project/pull/165746
-> PR3 : https://github.com/llvm/llvm-project/pull/166731
---
Full diff: https://github.com/llvm/llvm-project/pull/166731.diff
1 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (-126)
``diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
-}
- }
-}
-if (!NeedsReplacement)
- return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
- }
-}
-return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
-continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
- } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
- continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
-}
-OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
-// GV could also be used directly by other kernels. If so, we need to
-// create a new GV used only by this kernel and its function.
-auto NewGV = uniquifyGVPerKernel(M, GV, F);
-Changed |= (NewGV != GV);
-unsigned BarrierScope
[llvm-branch-commits] [clang] [NFC][HIP] Add __builtin_*_load_lds type check test cases (PR #165388)
https://github.com/ranapratap55 approved this pull request. https://github.com/llvm/llvm-project/pull/165388
[llvm-branch-commits] [clang] [HIP][AMDGPU] Remove 't' from all __builtin_*_load_lds builtins (PR #165389)
https://github.com/ranapratap55 approved this pull request. https://github.com/llvm/llvm-project/pull/165389
[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)
ro-i wrote: wait, does github now finally have the reopen feature https://github.com/llvm/llvm-project/pull/133907
[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)
https://github.com/ro-i reopened https://github.com/llvm/llvm-project/pull/133907
[llvm-branch-commits] [llvm] [LoongArch] Initial implementation for `enableMemCmpExpansion` hook (PR #166526)
llvmbot wrote:
@llvm/pr-subscribers-backend-loongarch
Author: ZhaoQi (zhaoqi5)
Changes
After overriding `TargetTransformInfo::enableMemCmpExpansion` in this commit,
`MergeICmps` and `ExpandMemCmp` passes will be enabled on LoongArch.
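Roughly, enabling these passes lets small fixed-size `memcmp`/`bcmp` calls be expanded inline into wide (possibly unaligned, hence the `hasUAL()` check below) loads and compares instead of a libcall. A hand-written C++ sketch of the `bcmp(s1, s2, 8)` shape on a 64-bit target, for illustration only:

```cpp
#include <cstdint>
#include <cstring>

// Models the expanded form: one 8-byte load per buffer plus an XOR test,
// rather than a call into libc. std::memcpy stands in for an unaligned load.
int bcmp8Expanded(const void *s1, const void *s2) {
  uint64_t a, b;
  std::memcpy(&a, s1, sizeof(a));
  std::memcpy(&b, s2, sizeof(b));
  return (a ^ b) != 0; // bcmp only needs zero / non-zero
}
```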
---
Patch is 220.23 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/166526.diff
5 Files Affected:
- (modified) llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp (+24-1)
- (modified) llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h (+2-1)
- (modified) llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll (+1619-527)
- (modified) llvm/test/CodeGen/LoongArch/expandmemcmp.ll (+2594-715)
- (modified) llvm/test/CodeGen/LoongArch/memcmp.ll (+18-9)
``diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index f548a8dd0532b..f6637ef58cf9c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -111,4 +111,27 @@ bool LoongArchTTIImpl::shouldExpandReduction(const
IntrinsicInst *II) const {
}
}
-// TODO: Implement more hooks to provide TTI machinery for LoongArch.
+LoongArchTTIImpl::TTI::MemCmpExpansionOptions
+LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+
+ if (!ST->hasUAL())
+return Options;
+
+ // TODO: Set same as the default value of MaxLoadsPerMemcmp or
+ // MaxLoadsPerMemcmpOptSize. May need more consideration?
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+ Options.AllowOverlappingLoads = true;
+
+ // TODO: Support for vectors.
+ if (ST->is64Bit()) {
+Options.LoadSizes = {8, 4, 2, 1};
+Options.AllowedTailExpansions = {3, 5, 6};
+ } else {
+Options.LoadSizes = {4, 2, 1};
+Options.AllowedTailExpansions = {3};
+ }
+
+ return Options;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
index e3f16c7804994..9b479f9dc0dc5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
@@ -55,7 +55,8 @@ class LoongArchTTIImpl : public
BasicTTIImplBase {
bool shouldExpandReduction(const IntrinsicInst *II) const override;
- // TODO: Implement more hooks to provide TTI machinery for LoongArch.
+ TTI::MemCmpExpansionOptions
+ enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
index 82fe899bb795b..a6ed1f1db1678 100644
--- a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
+++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
@@ -38,260 +38,488 @@ entry:
}
define i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_1:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT:addi.w $sp, $sp, -16
-; LA32-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:ori $a2, $zero, 1
-; LA32-NEXT:bl bcmp
-; LA32-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:addi.w $sp, $sp, 16
-; LA32-NEXT:ret
+; LA32-UAL-LABEL: bcmp_size_1:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT:ld.bu $a0, $a0, 0
+; LA32-UAL-NEXT:ld.bu $a1, $a1, 0
+; LA32-UAL-NEXT:xor $a0, $a0, $a1
+; LA32-UAL-NEXT:sltu $a0, $zero, $a0
+; LA32-UAL-NEXT:ret
;
-; LA64-LABEL: bcmp_size_1:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT:addi.d $sp, $sp, -16
-; LA64-NEXT:st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT:ori $a2, $zero, 1
-; LA64-NEXT:pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT:jirl $ra, $ra, 0
-; LA64-NEXT:ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT:addi.d $sp, $sp, 16
-; LA64-NEXT:ret
+; LA64-UAL-LABEL: bcmp_size_1:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT:ld.bu $a0, $a0, 0
+; LA64-UAL-NEXT:ld.bu $a1, $a1, 0
+; LA64-UAL-NEXT:xor $a0, $a0, $a1
+; LA64-UAL-NEXT:sltu $a0, $zero, $a0
+; LA64-UAL-NEXT:ret
+;
+; LA32-NUAL-LABEL: bcmp_size_1:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT:addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT:st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT:ori $a2, $zero, 1
+; LA32-NUAL-NEXT:bl bcmp
+; LA32-NUAL-NEXT:ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT:addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT:ret
+;
+; LA64-NUAL-LABEL: bcmp_size_1:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT:addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT:st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT:ori $a2, $zero, 1
+; LA64-NUAL-NEXT:pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT:jirl $ra, $ra, 0
+; LA64-NUAL-NEXT:ld.d $ra, $sp, 8 #
[llvm-branch-commits] [compiler-rt][sanitizers] Mark three tests as unsupported on Android (PR #166639)
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/166639
[llvm-branch-commits] [compiler-rt][sanitizers] Mark three tests as unsupported on Android (PR #166639)
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/166639
[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)
https://github.com/ro-i closed https://github.com/llvm/llvm-project/pull/133907
[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)
ro-i wrote: Hm, somehow github decided to automatically close this PR after I just merged the PR down the stack. Will reopen https://github.com/llvm/llvm-project/pull/133907
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-exec-sync pass in pipeline (PR #165746)
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/165746
>From ca4b858851a2b6c2a0e81fe6d48618332d18ca15 Mon Sep 17 00:00:00 2001
From: skc7
Date: Thu, 30 Oct 2025 22:42:33 +0530
Subject: [PATCH 1/4] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline
---
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 --
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 6 +
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 3 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 ++
...amdgpu-lower-special-lds-and-module-lds.ll | 119 +
.../amdgpu-lower-special-lds-and-sw-lds.ll| 86
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 +
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +-
9 files changed, 236 insertions(+), 131 deletions(-)
create mode 100644
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll
create mode 100644
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
-}
- }
-}
-if (!NeedsReplacement)
- return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
- }
-}
-return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
-continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
- } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
- continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
- // already assigned
-
[llvm-branch-commits] [llvm] [AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds (PR #166731)
https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/166731
This PR removes the named-barrier LDS lowering from the `amdgpu-lower-module-lds`
pass, since it is now handled by the `amdgpu-lower-exec-sync` pass.
>From 0a2e9ee17ea82a7cb3fe191626ee84b05c37be83 Mon Sep 17 00:00:00 2001
From: skc7
Date: Thu, 6 Nov 2025 14:29:17 +0530
Subject: [PATCH] [AMDGPU] Remove lowering named-barrier LDS logic from
amdgpu-lower-module-lds
---
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 --
1 file changed, 126 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
-}
- }
-}
-if (!NeedsReplacement)
- return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
- }
-}
-return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
-continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
- } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
- continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
- // already assigned
- continue;
-}
-OrderedGVs.push_back(GV);
- }
- OrderedGVs = sortByName(std::move(OrderedGVs));
- for (GlobalVariable *GV : OrderedGVs) {
-// GV could also be used directly by other kernels. If so, we need to
-// create a new GV used only by this kernel and its function.
-auto NewGV = uniquifyGVPerKernel(M, GV, F);
-Changed |= (NewGV != GV);
-unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-uns
[llvm-branch-commits] [clang] [AMDGPU] Add builtins for wave reduction intrinsics (PR #161816)
https://github.com/easyonaadit updated
https://github.com/llvm/llvm-project/pull/161816
>From 62867d1bcdb3d8d0eba2b04a78f61f98b92e7de6 Mon Sep 17 00:00:00 2001
From: Aaditya
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 84
3 files changed, 96 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..ebc0ac35f42d9 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi",
"nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
//===--===//
// R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c54985a..4de722077c8e9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -301,18 +301,22 @@ static Intrinsic::ID
getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
llvm_unreachable("Unknown BuiltinID for wave reduction");
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
return Intrinsic::amdgcn_wave_reduce_add;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
return Intrinsic::amdgcn_wave_reduce_sub;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
return Intrinsic::amdgcn_wave_reduce_min;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
return Intrinsic::amdgcn_wave_reduce_umin;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
return Intrinsic::amdgcn_wave_reduce_max;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -335,11 +339,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned
BuiltinID,
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 039d03237b530..a8856ab56a55d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -412,6 +412,13 @@ void test_wave_reduce_add_u64_default(global int* out,
long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_add_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+}
+
// CHECK-LABEL: @test_wave_reduce_add_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -426,6 +433,13 @@ void test_wave_reduce_add_u64_iterative(global int* out,
long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_add_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
[llvm-branch-commits] [llvm] [AMDGPU] Add wave reduce intrinsics for float types - 2 (PR #161815)
https://github.com/easyonaadit updated
https://github.com/llvm/llvm-project/pull/161815
>From 27c0f126455f8249b7eda83b5ef900bc6d07de52 Mon Sep 17 00:00:00 2001
From: Aaditya
Date: Mon, 29 Sep 2025 18:58:10 +0530
Subject: [PATCH] [AMDGPU] Add wave reduce intrinsics for float types - 2
Supported Ops: `fadd`, `fsub`
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 949 +
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 967 ++
4 files changed, 1955 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2f1598e25a621..ced967b73cba5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5330,11 +5330,13 @@ static uint32_t
getIdentityValueFor32BitWaveReduction(unsigned Opc) {
case AMDGPU::S_MAX_U32:
return std::numeric_limits::min();
case AMDGPU::S_MAX_I32:
+ case AMDGPU::V_SUB_F32_e64: // +0.0
return std::numeric_limits::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
+ case AMDGPU::V_ADD_F32_e64: // -0.0
return std::numeric_limits::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits::max();
@@ -5382,11 +5384,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
- Opc == AMDGPU::V_MAX_F32_e64;
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
+ Opc == AMDGPU::V_SUB_F32_e64;
}
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
- return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5433,8 +5437,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr
&MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
+case AMDGPU::V_ADD_F32_e64:
case AMDGPU::S_SUB_I32:
-case AMDGPU::S_SUB_U64_PSEUDO: {
+case AMDGPU::S_SUB_U64_PSEUDO:
+case AMDGPU::V_SUB_F32_e64: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5589,6 +5595,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr
&MI,
.addImm(AMDGPU::sub1);
break;
}
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_SUB_F32_e64: {
+Register ActiveLanesVreg =
+MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+// Get number of active lanes as a float val.
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+ActiveLanesVreg)
+.addReg(NewAccumulator->getOperand(0).getReg())
+.addImm(0) // clamp
+.addImm(0); // output-modifier
+
+// Take negation of input for SUB reduction
+unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
+.addImm(srcMod) // src0 modifier
+.addReg(SrcReg)
+.addImm(0) // src1 modifier
+.addReg(ActiveLanesVreg)
+.addImm(0) // clamp
+.addImm(0); // output-mod
+BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+.addReg(DstVreg);
+ }
}
RetBB = &BB;
}
@@ -5833,10 +5863,14 @@
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstruc
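For context on the `V_ADD_F32`/`V_SUB_F32` case added above: assuming this hunk is the wave-uniform fast path (every active lane carries the same source value), the reduction collapses to a single multiply. A rough sketch of the arithmetic the `V_CVT_F32_I32` + `V_MUL_F32` sequence performs:

```cpp
// n active lanes all holding the same value v: the add reduction is n * v
// and the sub reduction is -(n * v); the source-negation modifier on
// V_MUL_F32 supplies the minus sign, and V_READFIRSTLANE broadcasts the
// scalar result.
float waveReduceUniform(float v, unsigned activeLanes, bool isSub) {
  float n = static_cast<float>(activeLanes); // V_CVT_F32_I32
  return (isSub ? -v : v) * n;               // V_MUL_F32 (+ readfirstlane)
}
```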
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
https://github.com/DavidSpickett edited https://github.com/llvm/llvm-project/pull/166605
[llvm-branch-commits] [mlir] [mlir][vector] Simplify createReadOrMaskedRead (PR #163736)
https://github.com/banach-space closed https://github.com/llvm/llvm-project/pull/163736
[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)
https://github.com/llvmbot milestoned https://github.com/llvm/llvm-project/pull/166783
[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)
https://github.com/llvmbot created
https://github.com/llvm/llvm-project/pull/166783
Backport ff11b93bb8f5578c9eb7296160570ea001a1155f
Requested by: @topperc
>From c343ce6d630b0c5819fbe50fec76de0408789112 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 6 Nov 2025 07:09:52 -0800
Subject: [PATCH] [RISCV] Correct the CFA offsets for stack probing. (#166616)
We need to take into account that we may have already done a FirstSPAdjust.
Fixes #164805.
(cherry picked from commit ff11b93bb8f5578c9eb7296160570ea001a1155f)
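Reading the updated CHECK lines below, the unrolled probe loop in this test covers Offset = 4096 bytes while the frame's RealStackSize is 6128, so the fix folds CFAAdjust = RealStackSize - Offset = 2032 (presumably the earlier FirstSPAdjust) into each intermediate `.cfi_def_cfa_offset` and sets the final one to RealStackSize instead of Offset.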
---
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 6 --
llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll | 8
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da80b932..c4f41b8d8e4d9 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -768,6 +768,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
// Unroll the probe loop depending on the number of iterations.
if (Offset < ProbeSize * 5) {
+uint64_t CFAAdjust = RealStackSize - Offset;
+
uint64_t CurrentOffset = 0;
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
@@ -781,7 +783,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
CurrentOffset += ProbeSize;
if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(CurrentOffset);
+CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
}
uint64_t Residual = Offset - CurrentOffset;
@@ -789,7 +791,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-Residual), Flag, getStackAlign());
if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(Offset);
+CFIBuilder.buildDefCFAOffset(RealStackSize);
if (DynAllocation) {
// s[d|w] zero, 0(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index d666832cf6e0b..c79fb0f91b21f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV64I-NEXT:lui a0, 1
; RV64I-NEXT:sub sp, sp, a0
; RV64I-NEXT:sd zero, 0(sp)
-; RV64I-NEXT:.cfi_def_cfa_offset 4096
+; RV64I-NEXT:.cfi_def_cfa_offset 6128
; RV64I-NEXT:addi sp, sp, -48
-; RV64I-NEXT:.cfi_def_cfa_offset 4144
+; RV64I-NEXT:.cfi_def_cfa_offset 6176
; RV64I-NEXT:lui a0, 1
; RV64I-NEXT:add a0, sp, a0
; RV64I-NEXT:call callee_stack_args
@@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV32I-NEXT:lui a0, 1
; RV32I-NEXT:sub sp, sp, a0
; RV32I-NEXT:sw zero, 0(sp)
-; RV32I-NEXT:.cfi_def_cfa_offset 4096
+; RV32I-NEXT:.cfi_def_cfa_offset 6128
; RV32I-NEXT:addi sp, sp, -80
-; RV32I-NEXT:.cfi_def_cfa_offset 4176
+; RV32I-NEXT:.cfi_def_cfa_offset 6208
; RV32I-NEXT:lui a0, 1
; RV32I-NEXT:addi a0, a0, 36
; RV32I-NEXT:add a0, sp, a0
[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)
llvmbot wrote: @kito-cheng What do you think about merging this PR to the release branch? https://github.com/llvm/llvm-project/pull/166783
[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: None (llvmbot)
Changes
Backport ff11b93bb8f5578c9eb7296160570ea001a1155f
Requested by: @topperc
---
Full diff: https://github.com/llvm/llvm-project/pull/166783.diff
2 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVFrameLowering.cpp (+4-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll (+4-4)
``diff
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da80b932..c4f41b8d8e4d9 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -768,6 +768,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
// Unroll the probe loop depending on the number of iterations.
if (Offset < ProbeSize * 5) {
+uint64_t CFAAdjust = RealStackSize - Offset;
+
uint64_t CurrentOffset = 0;
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
@@ -781,7 +783,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
CurrentOffset += ProbeSize;
if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(CurrentOffset);
+CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
}
uint64_t Residual = Offset - CurrentOffset;
@@ -789,7 +791,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock
&MBB,
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-Residual), Flag, getStackAlign());
if (EmitCFI)
-CFIBuilder.buildDefCFAOffset(Offset);
+CFIBuilder.buildDefCFAOffset(RealStackSize);
if (DynAllocation) {
// s[d|w] zero, 0(sp)
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index d666832cf6e0b..c79fb0f91b21f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -460,9 +460,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV64I-NEXT:lui a0, 1
; RV64I-NEXT:sub sp, sp, a0
; RV64I-NEXT:sd zero, 0(sp)
-; RV64I-NEXT:.cfi_def_cfa_offset 4096
+; RV64I-NEXT:.cfi_def_cfa_offset 6128
; RV64I-NEXT:addi sp, sp, -48
-; RV64I-NEXT:.cfi_def_cfa_offset 4144
+; RV64I-NEXT:.cfi_def_cfa_offset 6176
; RV64I-NEXT:lui a0, 1
; RV64I-NEXT:add a0, sp, a0
; RV64I-NEXT:call callee_stack_args
@@ -485,9 +485,9 @@ define void @reserved_call_frame(i64 %n) #0 {
; RV32I-NEXT:lui a0, 1
; RV32I-NEXT:sub sp, sp, a0
; RV32I-NEXT:sw zero, 0(sp)
-; RV32I-NEXT:.cfi_def_cfa_offset 4096
+; RV32I-NEXT:.cfi_def_cfa_offset 6128
; RV32I-NEXT:addi sp, sp, -80
-; RV32I-NEXT:.cfi_def_cfa_offset 4176
+; RV32I-NEXT:.cfi_def_cfa_offset 6208
; RV32I-NEXT:lui a0, 1
; RV32I-NEXT:addi a0, a0, 36
; RV32I-NEXT:add a0, sp, a0
``
https://github.com/llvm/llvm-project/pull/166783
[llvm-branch-commits] [llvm] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166828
>From bc870644188ae13da4141efdf75eab0137ddcc30 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Thu, 6 Nov 2025 19:05:09 +
Subject: [PATCH] [𝘀𝗽𝗿] changes to main this commit is based on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
[skip ci]
---
llvm/include/llvm/Support/thread.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/Support/thread.h
b/llvm/include/llvm/Support/thread.h
index ecde62d8368e7..51873e7d529bf 100644
--- a/llvm/include/llvm/Support/thread.h
+++ b/llvm/include/llvm/Support/thread.h
@@ -34,7 +34,7 @@ typedef PVOID HANDLE;
namespace llvm {
-#if LLVM_ON_UNIX || _WIN32
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
/// LLVM thread following std::thread interface with added constructor to
/// specify stack size.
@@ -49,7 +49,7 @@ class thread {
}
public:
-#if LLVM_ON_UNIX
+#ifdef LLVM_ON_UNIX
using native_handle_type = pthread_t;
using id = pthread_t;
using start_routine_type = void *(*)(void *);
[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
boomanaiden154 wrote:
> This thing is a perennial PITA. I'm stamping to unblock but can you can also try https://github.com/wjakob/nanobind/pull/868.

Yeah, looks like it. Very interesting to see a project that is pretty against disabling warnings, but also against accepting patches to fix warnings that do come up. I'll land this as is and see if I can get `NB_SUPPRESS_WARNINGS` to eliminate the need for the custom compile options.
https://github.com/llvm/llvm-project/pull/166828
[llvm-branch-commits] [llvm] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/166828
[llvm-branch-commits] [llvm] [ConstantTime][RISCV] Add comprehensive tests for ct.select (PR #166708)
https://github.com/wizardengineer updated
https://github.com/llvm/llvm-project/pull/166708
>From 7aec58aa6f8029c514857a755b5a381e6a6b22af Mon Sep 17 00:00:00 2001
From: wizardengineer
Date: Wed, 5 Nov 2025 11:01:00 -0500
Subject: [PATCH] [ConstantTime][RISCV] Add comprehensive tests for ct.select
Add comprehensive test suite for RISC-V fallback implementation:
- Edge cases (zero conditions, large integers, sign extension)
- Pattern matching (nested selects, chains)
- Vector support with RVV extensions
- Side effects and memory operations
The basic fallback test is in the core infrastructure PR.
---
.../RISCV/ctselect-fallback-edge-cases.ll | 214 +
.../RISCV/ctselect-fallback-patterns.ll | 383 +
.../RISCV/ctselect-fallback-vector-rvv.ll | 804 ++
.../CodeGen/RISCV/ctselect-side-effects.ll| 176
4 files changed, 1577 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll
create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll
create mode 100644 llvm/test/CodeGen/RISCV/ctselect-side-effects.ll
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
new file mode 100644
index 0..af1be0c8f3ddc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; RV64-LABEL: test_ctselect_i1:
+; RV64: # %bb.0:
+; RV64-NEXT:and a1, a0, a1
+; RV64-NEXT:xori a0, a0, 1
+; RV64-NEXT:and a0, a0, a2
+; RV64-NEXT:or a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_i1:
+; RV32: # %bb.0:
+; RV32-NEXT:and a1, a0, a1
+; RV32-NEXT:xori a0, a0, 1
+; RV32-NEXT:and a0, a0, a2
+; RV32-NEXT:or a0, a1, a0
+; RV32-NEXT:ret
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; RV64-LABEL: test_ctselect_extremal_values:
+; RV64: # %bb.0:
+; RV64-NEXT:andi a0, a0, 1
+; RV64-NEXT:lui a1, 524288
+; RV64-NEXT:subw a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_extremal_values:
+; RV32: # %bb.0:
+; RV32-NEXT:andi a0, a0, 1
+; RV32-NEXT:lui a1, 524288
+; RV32-NEXT:addi a2, a0, -1
+; RV32-NEXT:neg a0, a0
+; RV32-NEXT:and a1, a2, a1
+; RV32-NEXT:slli a0, a0, 1
+; RV32-NEXT:srli a0, a0, 1
+; RV32-NEXT:or a0, a0, a1
+; RV32-NEXT:ret
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; RV64-LABEL: test_ctselect_null_ptr:
+; RV64: # %bb.0:
+; RV64-NEXT:slli a0, a0, 63
+; RV64-NEXT:srai a0, a0, 63
+; RV64-NEXT:and a0, a0, a1
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_null_ptr:
+; RV32: # %bb.0:
+; RV32-NEXT:slli a0, a0, 31
+; RV32-NEXT:srai a0, a0, 31
+; RV32-NEXT:and a0, a0, a1
+; RV32-NEXT:ret
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; RV64-LABEL: test_ctselect_function_ptr:
+; RV64: # %bb.0:
+; RV64-NEXT:andi a0, a0, 1
+; RV64-NEXT:neg a3, a0
+; RV64-NEXT:addi a0, a0, -1
+; RV64-NEXT:and a1, a3, a1
+; RV64-NEXT:and a0, a0, a2
+; RV64-NEXT:or a0, a1, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_function_ptr:
+; RV32: # %bb.0:
+; RV32-NEXT:andi a0, a0, 1
+; RV32-NEXT:neg a3, a0
+; RV32-NEXT:addi a0, a0, -1
+; RV32-NEXT:and a1, a3, a1
+; RV32-NEXT:and a0, a0, a2
+; RV32-NEXT:or a0, a1, a0
+; RV32-NEXT:ret
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; RV64-LABEL: test_ctselect_ptr_cmp:
+; RV64: # %bb.0:
+; RV64-NEXT:xor a0, a0, a1
+; RV64-NEXT:snez a0, a0
+; RV64-NEXT:addi a0, a0, -1
+; RV64-NEXT:and a2, a0, a2
+; RV64-NEXT:not a0, a0
+; RV64-NEXT:and a0, a0, a3
+; RV64-NEXT:or a0, a2, a0
+; RV64-NEXT:ret
+;
+; RV32-LABEL: test_ctselect_ptr_cmp:
+; RV32: # %bb.0:
+; RV32-NEXT:xor a0, a0, a1
+; RV32-NEXT:snez a0, a0
+; RV32-NEXT:addi a0, a0, -1
+; RV32-NEXT:and a2, a0, a2
+; RV32-NEXT:not a0, a0
+; RV32-NEXT:
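To make the pattern in the CHECK lines above explicit: the fallback lowers `llvm.ct.select` to a branchless mask-and-merge, so the selected value is computed with AND/OR against an all-ones or all-zeros mask instead of a conditional branch. A short C++ sketch of the equivalent computation, written from the semantics visible in these tests rather than from any target code:
```cpp
#include <cstdint>

// Branchless select matching what the RISC-V fallback emits for an i64
// ct.select: mask is all-ones when cond is true, all-zeros otherwise.
uint64_t ct_select_u64(bool cond, uint64_t a, uint64_t b) {
  uint64_t mask = -static_cast<uint64_t>(cond); // 0xFFFF...FF or 0x0
  return (a & mask) | (b & ~mask);              // picks a or b, no branch
}
```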
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
nhaehnle wrote:
> Have you considered the case where the instructions inside the bundle have
> two uses of RegB, but only one of them is tied with RegA? I think it is
> almost impossible to handle that optimally given only the summarised
> information that you get from the operands of the BUNDLE. It might be worth
> adding a test case like that, just to check that we don't crash and still
> generate well formed MIR.
>
> The fundamental question here is, can `processTiedPairs` really operate at
> the BUNDLE level (and then fix up the instructions inside)? Or is it going to
> have to operate on the individual instructions (and then fix up the summary
> information on the BUNDLE)?
Yes, I thought about it. I did not find a good answer to what tied operands
inside of a pre-RA bundle really mean *in the general case*. Bundles mean
different things to different people. The main use of bundles outside of AMDGPU
is for VLIW. In AMDGPU so far, it is used for memory clauses (which could
potentially have tied operands for atomicrmw, but we only form them post-RA)
and for a few niche cases like keeping S_GETPC_B64 together with its uses for
PC-relative addressing.
What we're working towards here is a new pre-RA use case that can be vaguely
described as "decomposing a single instruction into virtual micro-ops during
codegen for the purpose of avoiding combinatorial explosion in opcodes etc.".
For that use case, the requirements on tied operand support will be fairly
restricted, and so I'd rather make this change more conservative and
restrictive and not attempt to support something that we don't actually use and
don't know how to test properly. And then build on that if and when we actually
know what else might be needed.
https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
nhaehnle wrote:
I played around with this but it got very confusing to me even in the
restricted use case that I described in the other comment. I'd prefer to just
keep this more restrictive for now, and if we find later on that supporting
subregisters is beneficial we relax it separately.
https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From 69ff9c0f0dd1af8333d4b160003d7f8a6eea61aa Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 713ef779588cf..3ff6da2b6dc63 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From b6bf0c47fd34efff8a4df14df69eb1f06785 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 12
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 69 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..713ef779588cf 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,18 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineOperand &MO : mi_bundle_ops(*MI)) {
+if (MO.isReg() && MO.getReg() == RegB) {
+ assert(
+ MO.getSubReg() == 0 &&
+ "tied subregister uses in bundled instructions not supported");
+ MO.setReg(RegA);
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, i
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
jayfoad wrote:
Then maybe assert that `MO.getSubReg() == 0 && SubRegB == 0`? Otherwise this
code will replace a use of one reg with a use of another reg with a different
size.
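For concreteness, a sketch of the stronger check being suggested, assuming `SubRegB` is still in scope at this point of `processTiedPairs` (it is mentioned in the surrounding comments); this is a fragment of the existing loop body, not a standalone program:
```cpp
// Suggested tightening (sketch): only rewrite the bundled use when neither the
// use nor the tied source carries a subregister index, so a full-width RegA
// never replaces a differently-sized subregister use of RegB.
assert(MO.getSubReg() == 0 && SubRegB == 0 &&
       "tied subregister uses in bundled instructions not supported");
MO.setReg(RegA);
```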
https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
github-actions[bot] wrote:
:warning: C/C++ code formatter, clang-format found issues in your code. :warning:
You can test this locally with the following command:
```bash
git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp --diff_from_common_commit
```
:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:
View the diff from clang-format here.
```diff
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 713ef7795..264e6c866 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1670,9 +1670,8 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
if (MI->isBundle()) {
for (MachineOperand &MO : mi_bundle_ops(*MI)) {
if (MO.isReg() && MO.getReg() == RegB) {
- assert(
- MO.getSubReg() == 0 &&
- "tied subregister uses in bundled instructions not supported");
+ assert(MO.getSubReg() == 0 &&
+         "tied subregister uses in bundled instructions not supported");
MO.setReg(RegA);
}
}
```
https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/jayfoad approved this pull request.
Seems OK on the understanding that it is slightly experimental, and after some
more experience we may need to change things and/or nail down the exact rules
for what cases are and are not supported.
https://github.com/llvm/llvm-project/pull/166212
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From 2ca173d4a9a8a59304a5915e7b46ce46ea5c0bf7 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From 1224dba5fcb35911c3e80f0a734394d2ce0cd640 Mon Sep 17 00:00:00 2001
From: Nicolai Hähnle
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From 2ca173d4a9a8a59304a5915e7b46ce46ea5c0bf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From 1224dba5fcb35911c3e80f0a734394d2ce0cd640 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166605
>From 06c030dcb4ee57be287beffd96d1b21ef1697dd4 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Wed, 5 Nov 2025 18:23:46 +
Subject: [PATCH] fix
Created using spr 1.3.7
---
.ci/premerge_advisor_explain.py | 34 -
.ci/utils.sh| 10 +-
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 4d840a33c3cf2..1d487af9e9ec7 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -31,22 +31,11 @@ def get_comment_id(platform: str, pr:
github.PullRequest.PullRequest) -> int | N
def get_comment(
github_token: str,
pr_number: int,
-junit_objects,
-ninja_logs,
-advisor_response,
-return_code,
+body: str,
) -> dict[str, str]:
repo = github.Github(github_token).get_repo("llvm/llvm-project")
pr = repo.get_issue(pr_number).as_pull_request()
-comment = {
-"body": generate_test_report_lib.generate_report(
-generate_test_report_lib.compute_platform_title(),
-return_code,
-junit_objects,
-ninja_logs,
-failure_explanations_list=advisor_response,
-)
-}
+comment = {"body": body}
comment_id = get_comment_id(platform.system(), pr)
if comment_id:
comment["id"] = comment_id
@@ -59,6 +48,14 @@ def main(
pr_number: int,
return_code: int,
):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:
+json.dump([comment], comment_file_handle)
junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
build_log_files
)
@@ -90,10 +87,13 @@ def main(
get_comment(
github_token,
pr_number,
-junit_objects,
-ninja_logs,
-advisor_response.json(),
-return_code,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),
+),
)
]
with open("comment", "w") as comment_file_handle:
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 72f4b04f5bf3a..91c27319f3534 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,18 +33,18 @@ function at-exit {
# If building fails there will be no results files.
shopt -s nullglob
- if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+ if [[ "$GITHUB_ACTIONS" != "" ]]; then
python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
$retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
\
>> $GITHUB_STEP_SUMMARY
+python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+ $(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
+ $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+ "${MONOREPO_ROOT}"/ninja*.log
fi
if [[ "$retcode" != "0" ]]; then
if [[ "$GITHUB_ACTIONS" != "" ]]; then
- python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-$(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
-$GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
-"${MONOREPO_ROOT}"/ninja*.log
python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
$(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
"${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
[llvm-branch-commits] [CI] Make premerge upload/write comments (PR #166609)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166609
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148900
>From 429dfd75797bfb24e208266cab1fbc14fc79c717 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector
AtomicExpand fails for an aligned vector `load atomic` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128-bit lowering in tablegen to support SSE/AVX.
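As a rough sketch of what the added bitcasts look like, the following hypothetical helper rebuilds a vector-of-pointers result from the integer returned by a sized `__atomic_load_N` libcall; the function name and its exact placement in AtomicExpand are assumptions, but the cast chain (wide integer, then a vector of pointer-sized integers, then `inttoptr`) mirrors the idea of the patch.

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: convert the integer 'IntResult' produced by a sized
// atomic-load libcall back into the requested destination type 'DstTy'.
static Value *rebuildVectorOfPtrs(IRBuilderBase &B, Value *IntResult,
                                  Type *DstTy, const DataLayout &DL) {
  auto *VTy = dyn_cast<VectorType>(DstTy);
  if (!VTy || !VTy->getElementType()->isPointerTy())
    return B.CreateBitOrPointerCast(IntResult, DstTy); // non-pointer-vector case
  unsigned AS = cast<PointerType>(VTy->getElementType())->getAddressSpace();
  // First reinterpret the wide integer as a vector of pointer-sized integers,
  // then convert each lane to a pointer.
  Type *IntVecTy = VectorType::get(DL.getIntPtrType(B.getContext(), AS),
                                   VTy->getElementCount());
  Value *IntVec = B.CreateBitCast(IntResult, IntVecTy);
  return B.CreateIntToPtr(IntVec, DstTy);
}
```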
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 22 -
llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
.../X86/expand-atomic-non-integer.ll | 66 ++
4 files changed, 225 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..45cdc7980fdc6 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,12 @@ LoadInst
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
- Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ Value *NewVal =
+ LI->getType()->isPointerTy() ||
+ (LI->getType()->isVectorTy() &&
+ cast<VectorType>(LI->getType())->getElementType()->isPointerTy())
+ ? Builder.CreateIntToPtr(NewLI, LI->getType())
+ : Builder.CreateBitCast(NewLI, LI->getType());
LI->replaceAllUsesWith(NewVal);
LI->eraseFromParent();
return NewLI;
@@ -2093,9 +2098,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
I->replaceAllUsesWith(V);
} else if (HasResult) {
Value *V;
-if (UseSizedLibcall)
- V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+ // Add bitcasts from Result's scalar type to I's vector type
+ auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
+ auto *VTy = dyn_cast<VectorType>(I->getType());
+ if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+ } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
AllocaAlignment);
Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29..eaa2ffd9b2731 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double
%val1) {
store atomic double %val1, ptr %ptr seq_cst, align 8
ret void
}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM: @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219..867a4acb791bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr
%x) {
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
}
+define <2 x ptr> @atomic_ve
[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148899
>From 4d1cdadc3259ed811a186b049bb1589ebc4e5470 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats
This commit casts floats to ints in an atomic load during AtomicExpand to
support floating point types. It is also required to support 128-bit vectors
in SSE/AVX.
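For intuition, the sketch below shows with IRBuilder what `CastToInteger` amounts to for a scalar `load atomic float`: perform the load as an `i32` and bitcast the result back. The helper name is hypothetical and this is a simplified version of what AtomicExpand already does once the target requests the cast.

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper (assumed name): replace a floating-point atomic load
// with an equally sized integer atomic load followed by a bitcast, which is
// what AtomicExpand does once the target answers CastToInteger.
static LoadInst *loadAtomicAsInteger(LoadInst *LI, IRBuilderBase &B,
                                     const DataLayout &DL) {
  Type *IntTy =
      B.getIntNTy(DL.getTypeSizeInBits(LI->getType()).getFixedValue());
  LoadInst *IntLoad = B.CreateAlignedLoad(IntTy, LI->getPointerOperand(),
                                          LI->getAlign(), "widened");
  IntLoad->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
  // Reinterpret the loaded integer as the original floating-point type.
  Value *AsFP = B.CreateBitCast(IntLoad, LI->getType());
  LI->replaceAllUsesWith(AsFP);
  return IntLoad;
}
```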
---
llvm/lib/Target/X86/X86ISelLowering.cpp| 7 +
llvm/lib/Target/X86/X86ISelLowering.h | 2 +
llvm/lib/Target/X86/X86InstrCompiler.td| 15 +
llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
4 files changed, 122 insertions(+), 287 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+ if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
(VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
; CHECK-SSE-O3-NEXT:retq
;
; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
; CHECK-AVX-O3-NEXT:retq
;
; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
; CHECK-SSE-O0-NEXT:# implicit-def: $eax
; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
; CHECK-SSE-O0-NEXT:retq
;
; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
; CHECK-AVX-O0-NEXT:# implicit-def: $eax
; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp
[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/165818
>From 94119264a0fd461b3cb18d6dbd30337f274e403b Mon Sep 17 00:00:00 2001
From: jofrn
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load
Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with element types such as bfloat and half.
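The value-splitting step reduces to loading the whole vector as one wide integer and carving out the two halves; the helper below is a simplified, assumed-name sketch of that step (little-endian lane order, as on X86), not the patch's exact code.

```cpp
#include "llvm/CodeGen/SelectionDAG.h"
#include <utility>

using namespace llvm;

// Illustrative helper (assumed name): given the wide integer 'WideInt'
// produced by a single ATOMIC_LOAD, materialize the low and high vector
// halves LoVT/HiVT via truncate, shift, and bitcast.
static std::pair<SDValue, SDValue>
splitWideAtomicValue(SelectionDAG &DAG, const SDLoc &dl, SDValue WideInt,
                     EVT LoVT, EVT HiVT) {
  EVT WideIntVT = WideInt.getValueType();
  unsigned HalfBits = WideIntVT.getFixedSizeInBits() / 2;
  EVT HalfIntVT = EVT::getIntegerVT(*DAG.getContext(), HalfBits);
  // Low half: plain truncation of the wide integer.
  SDValue LoInt = DAG.getNode(ISD::TRUNCATE, dl, HalfIntVT, WideInt);
  // High half: shift the upper bits down, then truncate.
  SDValue HiShifted =
      DAG.getNode(ISD::SRL, dl, WideIntVT, WideInt,
                  DAG.getShiftAmountConstant(HalfBits, WideIntVT, dl));
  SDValue HiInt = DAG.getNode(ISD::TRUNCATE, dl, HalfIntVT, HiShifted);
  return {DAG.getBitcast(LoVT, LoInt), DAG.getBitcast(HiVT, HiInt)};
}
```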
---
.../include/llvm/Target/TargetSelectionDAG.td | 14 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 37 ++
llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
4 files changed, 400 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
let MemoryVT = i64;
}
+def atomic_load_128_v2i64 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v4i32;
+}
+
def atomic_load_nonext_8 :
PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 22f9fd548f52b..e34b9fa8e787c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N,
unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N,
SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+ SDLoc dl(LD);
+ EVT VT = LD->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ EVT MemIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+ SDValue ALD = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, MemIntVT, IntVT, Ch,
+ Ptr, LD->getMemOperand());
+
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+ SDValue ExtractHi =
+ DAG.getNode(ISD::SRL, dl, IntVT, ALD,
+ DAG.getIntPtrConstant(VT.getSizeInBits() / 2, dl));
+ ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+ Lo = DAG.getBitcast(LoVT, ExtractLo);
+ Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-l
[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
llvmbot wrote:
@llvm/pr-subscribers-mlir
Author: Aiden Grossman (boomanaiden154)
Changes
We recently moved over to compiling with clang-cl on Windows. This ended
up causing a large increase in warnings, particularly due to how
warnings are handled in nanobind. cd91d0fff9293a904704784c92c28637bfebef45
initially set -Wall -Wextra and -Wpedantic while fixing another issue,
which is probably not what we want to do on third-party code. We also
need to disable -Wmissing-field-initializers to get things clean in this
configuration.
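For context, `-Wmissing-field-initializers` fires whenever an aggregate initializer spells out only some of the fields, which is a common and harmless pattern in binding code; a minimal illustration (hypothetical code, not from nanobind):

```cpp
// Compile with -Wextra (or the equivalent clang-cl warning flags) to see
// "missing field initializer" diagnostics on the partial initializer below.
struct Options {
  int width;
  int height;
  int depth;
};

int main() {
  // 'height' and 'depth' are value-initialized to 0, but the warning still
  // flags the omitted fields, which is why the flag is disabled here.
  Options O = {640};
  return O.width;
}
```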
---
Full diff: https://github.com/llvm/llvm-project/pull/166828.diff
1 Files Affected:
- (modified) mlir/cmake/modules/AddMLIRPython.cmake (+2-2)
```diff
diff --git a/mlir/cmake/modules/AddMLIRPython.cmake
b/mlir/cmake/modules/AddMLIRPython.cmake
index fa6aec8a603a9..8196e2a2a3321 100644
--- a/mlir/cmake/modules/AddMLIRPython.cmake
+++ b/mlir/cmake/modules/AddMLIRPython.cmake
@@ -791,7 +791,6 @@ function(add_mlir_python_extension libname extname)
get_property(NB_LIBRARY_TARGET_NAME TARGET ${libname} PROPERTY
LINK_LIBRARIES)
target_compile_options(${NB_LIBRARY_TARGET_NAME}
PRIVATE
- -Wall -Wextra -Wpedantic
-Wno-c++98-compat-extra-semi
-Wno-cast-qual
-Wno-covered-switch-default
@@ -799,11 +798,11 @@ function(add_mlir_python_extension libname extname)
-Wno-nested-anon-types
-Wno-unused-parameter
-Wno-zero-length-array
+ -Wno-missing-field-initializers
${eh_rtti_enable})
target_compile_options(${libname}
PRIVATE
- -Wall -Wextra -Wpedantic
-Wno-c++98-compat-extra-semi
-Wno-cast-qual
-Wno-covered-switch-default
@@ -811,6 +810,7 @@ function(add_mlir_python_extension libname extname)
-Wno-nested-anon-types
-Wno-unused-parameter
-Wno-zero-length-array
+ -Wno-missing-field-initializers
${eh_rtti_enable})
endif()
```
https://github.com/llvm/llvm-project/pull/166828
[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
https://github.com/boomanaiden154 created
https://github.com/llvm/llvm-project/pull/166828
We recently moved over to compiling with clang-cl on Windows. This ended up causing a large increase in warnings, particularly due to how warnings are handled in nanobind. cd91d0fff9293a904704784c92c28637bfebef45 initially set -Wall -Wextra and -Wpedantic while fixing another issue, which is probably not what we want to do on third-party code. We also need to disable -Wmissing-field-initializers to get things clean in this configuration.
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
@@ -12,6 +12,94 @@ certifi==2025.8.3 \ --hash=sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407 \ --hash=sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5 # via requests +cffi==2.0.0 \ + --hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ + --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ + --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ + --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ + --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \ + --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \ + --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ + --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ + --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \ + --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ + --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \ + --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ + --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ + --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \ + --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ + --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ + --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ + --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ + --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ + --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ + --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ + --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ + --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ + --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \ + --hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \ + --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ + --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ + --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \ + --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ + --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \ + --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \ + --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ + --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \ + --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ + --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ + --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ + --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ + --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ + --hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ + --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ + 
--hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ + --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ + --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ + --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ + --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ + --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ + --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \ + --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \ + --hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ + --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ + --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \ + --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ + --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ + --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ + --hash=sha256:a05d0c237b3349096
[llvm-branch-commits] [llvm] [BOLT] Move call probe information to CallSiteInfo (PR #165490)
https://github.com/rafaelauler approved this pull request.
https://github.com/llvm/llvm-project/pull/165490
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
@@ -45,13 +83,31 @@ def main(commit_sha: str, build_log_files: list[str]):
)
if advisor_response.status_code == 200:
print(advisor_response.json())
+comments = [
+get_comment(
+github_token,
+pr_number,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),
boomanaiden154 wrote:
> I think these comments could get quite large, but then again we are only
> leaving at most 1. You might have to add some more size limits for the
> extreme cases but not worth doing until we see it happen in practice.
Yeah. We already have the size limit that will get applied here, but those can
still be large. I agree it's probably not worth doing until we see it in
practice. I don't think I've seen any PR with more than ~10 failures so far.
> I wonder if the premerge advisor content should also go to the build summary,
> but perhaps A: it already does or B: the advisor runs at a point where we've
> already submitted the build summary.
> Then again, having the build summary be very much "this is exactly what
> happened" and the comments on the PR be more "human" and maybe speculative
> makes some sense to me.
It doesn't currently go to the build summary. I think I'd like to keep the
build summary as exactly what happened for now. We can revisit that decision
based on user feedback. And yeah, a big part of the reason to surface the
advisor findings in a comment is because a lot of people do not realize the
summary view exists.
https://github.com/llvm/llvm-project/pull/166605
[llvm-branch-commits] [llvm] [BOLT] Move call probe information to CallSiteInfo (PR #165490)
https://github.com/aaupov closed
https://github.com/llvm/llvm-project/pull/165490
[llvm-branch-commits] [lldb] release/21.x: [lldb] Implement DW_CFA_val_offset and DW_CFA_val_offset_sf (#150732) (PR #166611)
https://github.com/JDevlieghere approved this pull request.
https://github.com/llvm/llvm-project/pull/166611
[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
https://github.com/makslevental approved this pull request.
This thing is a perennial PITA. I'm stamping to unblock, but can you also try [NB_SUPPRESS_WARNINGS](https://github.com/wjakob/nanobind/pull/868).
https://github.com/llvm/llvm-project/pull/166828
[llvm-branch-commits] [MLIR][Python] Update Nanobind Warnings List for clang-cl on Windows (PR #166828)
makslevental wrote:
PS also this https://github.com/wjakob/nanobind/issues/994 🙂
https://github.com/llvm/llvm-project/pull/166828
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166605
>From 06c030dcb4ee57be287beffd96d1b21ef1697dd4 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Wed, 5 Nov 2025 18:23:46 +
Subject: [PATCH 1/2] fix
Created using spr 1.3.7
---
.ci/premerge_advisor_explain.py | 34 -
.ci/utils.sh| 10 +-
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 4d840a33c3cf2..1d487af9e9ec7 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -31,22 +31,11 @@ def get_comment_id(platform: str, pr:
github.PullRequest.PullRequest) -> int | N
def get_comment(
github_token: str,
pr_number: int,
-junit_objects,
-ninja_logs,
-advisor_response,
-return_code,
+body: str,
) -> dict[str, str]:
repo = github.Github(github_token).get_repo("llvm/llvm-project")
pr = repo.get_issue(pr_number).as_pull_request()
-comment = {
-"body": generate_test_report_lib.generate_report(
-generate_test_report_lib.compute_platform_title(),
-return_code,
-junit_objects,
-ninja_logs,
-failure_explanations_list=advisor_response,
-)
-}
+comment = {"body": body}
comment_id = get_comment_id(platform.system(), pr)
if comment_id:
comment["id"] = comment_id
@@ -59,6 +48,14 @@ def main(
pr_number: int,
return_code: int,
):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:
+json.dump([comment], comment_file_handle)
junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
build_log_files
)
@@ -90,10 +87,13 @@ def main(
get_comment(
github_token,
pr_number,
-junit_objects,
-ninja_logs,
-advisor_response.json(),
-return_code,
+generate_test_report_lib.generate_report(
+generate_test_report_lib.compute_platform_title(),
+return_code,
+junit_objects,
+ninja_logs,
+failure_explanations_list=advisor_response.json(),
+),
)
]
with open("comment", "w") as comment_file_handle:
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 72f4b04f5bf3a..91c27319f3534 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -33,18 +33,18 @@ function at-exit {
# If building fails there will be no results files.
shopt -s nullglob
- if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+ if [[ "$GITHUB_ACTIONS" != "" ]]; then
python "${MONOREPO_ROOT}"/.ci/generate_test_report_github.py \
$retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
\
>> $GITHUB_STEP_SUMMARY
+python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
+ $(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
+ $GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
+ "${MONOREPO_ROOT}"/ninja*.log
fi
if [[ "$retcode" != "0" ]]; then
if [[ "$GITHUB_ACTIONS" != "" ]]; then
- python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \
-$(git rev-parse HEAD~1) $retcode ${{ secrets.GITHUB_TOKEN }} \
-$GITHUB_PR_NUMBER "${BUILD_DIR}"/test-results.*.xml \
-"${MONOREPO_ROOT}"/ninja*.log
python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \
$(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \
"${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log
>From 7e44989fceaeec33405c5368e16d999f5701a7b2 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Thu, 6 Nov 2025 16:57:02 +
Subject: [PATCH 2/2] docs
Created using spr 1.3.7
---
.ci/premerge_advisor_explain.py | 25 +
1 file changed, 25 insertions(+)
diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py
index 1d487af9e9ec7..08ccfb3d0e3d4 100644
--- a/.ci/premerge_advisor_explain.py
+++ b/.ci/premerge_advisor_explain.py
@@ -48,6 +48,31 @@ def main(
pr_number: int,
return_code: int,
):
+"""The main entrypoint for the script.
+
+This function parses failures from files, requests information from the
+premerge advisor, and may write a Github comment depending upon the output.
+There are four different scenarios:
+1. There has never been a previous failure and the job passes - We do not
+ create a comment. We write out an empty file to the comment path so the
+ issue-write workflow knows not to create anything.
+2. There has never been
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
@@ -4,20 +4,58 @@
"""Script for getting explanations from the premerge advisor."""
import argparse
-import os
import platform
import sys
+import json
import requests
+import github
+import github.PullRequest
import generate_test_report_lib
PREMERGE_ADVISOR_URL = (
"http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/explain";
)
+COMMENT_TAG = ""
-def main(commit_sha: str, build_log_files: list[str]):
+def get_comment_id(platform: str, pr: github.PullRequest.PullRequest) -> int |
None:
+platform_comment_tag = COMMENT_TAG.format(platform=platform)
+for comment in pr.as_issue().get_comments():
+if platform_comment_tag in comment.body:
+return comment.id
+return None
+
+
+def get_comment(
+github_token: str,
+pr_number: int,
+body: str,
+) -> dict[str, str]:
+repo = github.Github(github_token).get_repo("llvm/llvm-project")
+pr = repo.get_issue(pr_number).as_pull_request()
+comment = {"body": body}
+comment_id = get_comment_id(platform.system(), pr)
+if comment_id:
+comment["id"] = comment_id
+
+
+def main(
+commit_sha: str,
+build_log_files: list[str],
+github_token: str,
+pr_number: int,
+return_code: int,
+):
+if return_code == 0:
+with open("comment", "w") as comment_file_handle:
+comment = get_comment(
+":white_check_mark: With the latest revision this PR passed "
+"the premerge checks."
+)
+if comment["id"]:
boomanaiden154 wrote:
Yeah, this matches the behavior of the formatter.
I've added a docstring enumerating the cases. The formatter helper script does
not have good documentation on this.
https://github.com/llvm/llvm-project/pull/166605
[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/165818
>From 1434bcf8d9be03eeabce92430d00e02b0e434069 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load
Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with element types such as bfloat and half.
---
.../include/llvm/Target/TargetSelectionDAG.td | 14 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 37 ++
llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
4 files changed, 400 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
let MemoryVT = i64;
}
+def atomic_load_128_v2i64 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v4i32;
+}
+
def atomic_load_nonext_8 :
PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0e2c8b2e3799..f3fb50be27f02 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N,
unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,40 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N,
SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+ SDLoc dl(LD);
+ EVT VT = LD->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ EVT MemIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+ SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT,
+ Ch, Ptr, LD->getMemOperand());
+
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ SDValue ExtractLo = DAG.getNode(ISD::TRUNCATE, dl, LoIntVT, ALD);
+ SDValue ExtractHi = DAG.getNode(
+ ISD::SRL, dl, IntVT, ALD,
+ DAG.getShiftAmountConstant(VT.getSizeInBits() / 2, IntVT, dl));
+ ExtractHi = DAG.getNode(ISD::TRUNCATE, dl, HiIntVT, ExtractHi);
+
+ Lo = DAG.getBitcast(LoVT, ExtractLo);
+ Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/a
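For intuition, the Lo/Hi extraction in SplitVecRes_ATOMIC_LOAD above amounts to loading the whole vector as one wide integer and then slicing it. Below is a rough standalone C++ analogue of that arithmetic; the helper name and the 64/32-bit widths are illustrative only and are not taken from the patch.

#include <cstdint>
#include <utility>

// Split a 64-bit atomically loaded value into its low and high 32-bit halves,
// mirroring the TRUNCATE and SRL-by-half-width + TRUNCATE nodes the patch builds.
static std::pair<uint32_t, uint32_t> splitWideAtomicLoad(uint64_t Wide) {
  uint32_t Lo = static_cast<uint32_t>(Wide);        // ISD::TRUNCATE
  uint32_t Hi = static_cast<uint32_t>(Wide >> 32);  // ISD::SRL, then ISD::TRUNCATE
  return {Lo, Hi};
}

In the patch the two halves are then bitcast to the split vector types, which is what the DAG.getBitcast calls do.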
[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #148898)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148898
>From b92b6dac8913654dc0ba987ce328c47fa7330778 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:01:24 -0400
Subject: [PATCH] [X86] Remove extra MOV after widening atomic load
This change adds patterns to optimize out the extra MOV
that remains after widening the atomic load.
---
llvm/lib/Target/X86/X86InstrCompiler.td| 16 +
llvm/test/CodeGen/X86/atomic-load-store.ll | 72 --
2 files changed, 40 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..ce429b5916280 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1204,6 +1204,22 @@ def : Pat<(i16 (atomic_load_nonext_16 addr:$src)),
(MOV16rm addr:$src)>;
def : Pat<(i32 (atomic_load_nonext_32 addr:$src)), (MOV32rm addr:$src)>;
def : Pat<(i64 (atomic_load_nonext_64 addr:$src)), (MOV64rm addr:$src)>;
+// load atomic <2 x i16>
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+          (MOVDI2PDIrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+          (VMOVSSrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
+          (VMOVSSZrm addr:$src)>, Requires<[HasAVX512]>;
+
+// load atomic <2 x i32,float>
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+          (MOV64toPQIrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+          (VMOV64toPQIrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
+          (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index fc32c3668d1dd..7e15b9303887f 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -202,26 +202,22 @@ define <2 x i8> @atomic_vec2_i8(ptr %x) {
define <2 x i16> @atomic_vec2_i16(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec2_i16:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT:movl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O3-NEXT:retq
;
; CHECK-AVX-O3-LABEL: atomic_vec2_i16:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT:movl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
+; CHECK-AVX-O3-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O3-NEXT:retq
;
; CHECK-SSE-O0-LABEL: atomic_vec2_i16:
; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT:movl (%rdi), %eax
-; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-O0-NEXT:retq
;
; CHECK-AVX-O0-LABEL: atomic_vec2_i16:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT:movl (%rdi), %eax
-; CHECK-AVX-O0-NEXT:vmovd %eax, %xmm0
+; CHECK-AVX-O0-NEXT:vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O0-NEXT:retq
%ret = load atomic <2 x i16>, ptr %x acquire, align 4
ret <2 x i16> %ret
@@ -230,26 +226,22 @@ define <2 x i16> @atomic_vec2_i16(ptr %x) {
define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec2_ptr270:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O3-NEXT:movq (%rdi), %xmm0
; CHECK-SSE-O3-NEXT:retq
;
; CHECK-AVX-O3-LABEL: atomic_vec2_ptr270:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT:movq (%rdi), %rax
-; CHECK-AVX-O3-NEXT:vmovq %rax, %xmm0
+; CHECK-AVX-O3-NEXT:vmovq (%rdi), %xmm0
; CHECK-AVX-O3-NEXT:retq
;
; CHECK-SSE-O0-LABEL: atomic_vec2_ptr270:
; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O0-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O0-NEXT:movq (%rdi), %xmm0
; CHECK-SSE-O0-NEXT:retq
;
; CHECK-AVX-O0-LABEL: atomic_vec2_ptr270:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT:movq (%rdi), %rax
-; CHECK-AVX-O0-NEXT:vmovq %rax, %xmm0
+; CHECK-AVX-O0-NEXT:vmovq (%rdi), %xmm0
; CHECK-AVX-O0-NEXT:retq
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
@@ -258,26 +250,22 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr
%x) {
define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align:
; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT:movq (%rdi), %rax
-; CHECK-SSE-O3-NEXT:movq %rax, %xmm0
+; CHECK-SSE-O3-NEXT:movq (%rdi), %xmm0
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148900
>From a657bd946e7be59892a00a447ca7018d0715c6a5 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector
AtomicExpand fails for an aligned vector `load atomic` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128-bit lowering in tablegen to support SSE/AVX.
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +++-
llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
.../X86/expand-atomic-non-integer.ll | 66 ++
4 files changed, 222 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..8dc14bb416345 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,9 @@ LoadInst
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
- Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ Value *NewVal = LI->getType()->isPtrOrPtrVectorTy()
+ ? Builder.CreateIntToPtr(NewLI, LI->getType())
+ : Builder.CreateBitCast(NewLI, LI->getType());
LI->replaceAllUsesWith(NewVal);
LI->eraseFromParent();
return NewLI;
@@ -2093,9 +2095,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
I->replaceAllUsesWith(V);
} else if (HasResult) {
Value *V;
-if (UseSizedLibcall)
- V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+ // Add bitcasts from Result's scalar type to I's vector type
+ auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
+ auto *VTy = dyn_cast<VectorType>(I->getType());
+ if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+ } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
AllocaAlignment);
Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29..eaa2ffd9b2731 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double
%val1) {
store atomic double %val1, ptr %ptr seq_cst, align 8
ret void
}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM: @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219..867a4acb791bc 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr
%x) {
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
}
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align:
+; CHECK-SSE2-O3:
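The new UseSizedLibcall path above only changes how the libcall's integer result is reinterpreted when the original load produced a vector of pointers: bitcast the wide integer to a vector of pointer-sized integers, then inttoptr. A self-contained sketch of that reinterpretation follows, assuming a compiler with __int128 and 64-bit pointers; the helper name is made up for illustration and memcpy plays the role of the bitcast.

#include <array>
#include <cstdint>
#include <cstring>

// The sized libcall hands back the loaded bits as one 128-bit integer; rebuild
// the <2 x ptr> result by reinterpreting those bits as two 64-bit integers
// (the bitcast) and then converting each to a pointer (the inttoptr).
static std::array<void *, 2> rebuildPtrVector(unsigned __int128 LibcallResult) {
  std::array<uint64_t, 2> Ints;
  std::memcpy(Ints.data(), &LibcallResult, sizeof(Ints));
  return {reinterpret_cast<void *>(Ints[0]), reinterpret_cast<void *>(Ints[1])};
}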
[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148899
>From f9b99b992450687c7da5048c82e9ce38efc3ff1d Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats
This commit casts floats to ints in an atomic load during AtomicExpand to support
floating-point types. It is also required to support 128-bit vectors in SSE/AVX.
---
llvm/lib/Target/X86/X86ISelLowering.cpp| 7 +
llvm/lib/Target/X86/X86ISelLowering.h | 2 +
llvm/lib/Target/X86/X86InstrCompiler.td| 15 +
llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
4 files changed, 122 insertions(+), 287 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+ if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
 def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
           (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
; CHECK-SSE-O3-NEXT:retq
;
; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
; CHECK-AVX-O3-NEXT:retq
;
; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
; CHECK-SSE-O0-NEXT:# implicit-def: $eax
; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
; CHECK-SSE-O0-NEXT:retq
;
; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
; CHECK-AVX-O0-NEXT:# implicit-def: $eax
; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp
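The shouldCastAtomicLoadInIR hook above only redirects floating-point atomic loads through an integer of the same width; the value is recovered by reinterpreting the bits, not by a numeric conversion. A minimal standalone illustration is below: std::atomic and std::bit_cast (C++20) stand in for the expanded IR, and the function name is made up for the example.

#include <atomic>
#include <bit>
#include <cstdint>

// Perform the atomic load on a 32-bit integer slot and bitcast the result back
// to float, which is what the CastToInteger expansion boils down to.
static float atomicLoadFloatViaInt(const std::atomic<uint32_t> &Slot) {
  uint32_t Bits = Slot.load(std::memory_order_acquire);
  return std::bit_cast<float>(Bits);
}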
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From dd8c2ece4a1287580cec17fff56e8eaa314ffef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index b99e1c7f19b71..562a6a00045f5 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..deeb8beb04332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
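The tryInstructionTransform change above asks whether any instruction inside a bundle is convertible, instead of querying only the outer BUNDLE instruction. A toy sketch of that AnyInBundle-style query is below in plain C++; the struct and function names are illustrative and are not the MachineInstr API.

#include <vector>

struct Inst {
  bool ConvertibleTo3Addr = false;
};

// A bundle counts as convertible if any instruction inside it reports the
// property; this mirrors the MachineInstr::AnyInBundle query type used above.
static bool anyInBundleConvertibleTo3Addr(const std::vector<Inst> &Bundle) {
  for (const Inst &I : Bundle)
    if (I.ConvertibleTo3Addr)
      return true;
  return false;
}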
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From cc06ca25470188cc8e767eab72fcfe83958cf4b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 58 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 89 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index b99e1c7f19b71..562a6a00045f5 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..1fde07190339c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4047,10 +4047,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+if
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
@@ -4088,7 +4107,20 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(DefReg).AliveBlocks.clear();
}
-if (LIS) {
+if (MI.isBundle()) {
+ VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
+ if (!VRI.Reads && !VRI.Writes) {
+for (MachineOperand &MO : MI.all_uses()) {
+ if (MO.isReg() && MO.getReg() == DefReg) {
nhaehnle wrote:
See the discussion with Jay in #166212 -- I looked into it and decided to simply
forbid tied sub-registers on bundles pre-RA as the safer route, given the
complexities involved.
I'm adding an assert to that effect here.
https://github.com/llvm/llvm-project/pull/166213
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86] Remove extra MOV after widening atomic load (PR #148898)
https://github.com/RKSimon approved this pull request.
LGTM
https://github.com/llvm/llvm-project/pull/148898
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI][NFC] Refactor compute_platform_title into generate_test_report_lib (PR #166604)
https://github.com/Keenuts approved this pull request.
https://github.com/llvm/llvm-project/pull/166604
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)
https://github.com/Keenuts edited
https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)
https://github.com/Keenuts approved this pull request.
A small thing, otherwise LGTM (modulo the test coverage request by David).
https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [CI] Add Ability to Explain Failures (PR #166590)
@@ -82,16 +93,29 @@ def find_failure_in_ninja_logs(ninja_logs: list[list[str]]) -> list[tuple[str, s
     return failures
-def _format_failures(failures: list[tuple[str, str]]) -> list[str]:
+def _format_failures(
+    failures: list[tuple[str, str]], failure_explanations: dict[str, FailureExplanation]
+) -> list[str]:
     """Formats failures into summary views for the report."""
     output = []
     for build_failure in failures:
         failed_action, failure_message = build_failure
+        failure_explanation = None
+        if failed_action in failure_explanations:
+            failure_explanation = failure_explanations[failed_action]
+        output.append("")
+        if failure_explanation:
+            output.extend(
+                [
+                    f"{failed_action} (Likely Already Failing)" "",
Keenuts wrote:
`html.escape` the build logs before embedding in the xml?
https://github.com/llvm/llvm-project/pull/166590
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [mlir][vector] Simplify createReadOrMaskedRead (PR #163736)
https://github.com/banach-space reopened
https://github.com/llvm/llvm-project/pull/163736
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] release/21.x: [RISCV] Correct the CFA offsets for stack probing. (#166616) (PR #166783)
https://github.com/ilovepi approved this pull request.
LGTM.
https://github.com/llvm/llvm-project/pull/166783
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CI] Make premerge_advisor_explain write comments (PR #166605)
@@ -33,17 +33,18 @@ function at-exit {
# If building fails there will be no results files.
shopt -s nullglob
- if [[ "$GITHUB_STEP_SUMMARY" != "" ]]; then
+ if [[ "$GITHUB_ACTIONS" != "" ]]; then
boomanaiden154 wrote:
This checks that we are running in a GitHub Actions workflow (as opposed to
running locally or inside buildbot). We don't need to change what we're looking
for (`GITHUB_STEP_SUMMARY` is implied by `GITHUB_ACTIONS`), but I felt changing
it made things clearer, given that we now also run other scripts inside this
conditional that have nothing to do with the step summary.
https://github.com/llvm/llvm-project/pull/166605
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AtomicExpand] Add bitcasts when expanding load atomic vector (PR #148900)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148900
>From 242cf54a6b527e573c4d30a3bea47e3a458fb8c1 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:03:15 -0400
Subject: [PATCH] [AtomicExpand] Add bitcasts when expanding load atomic vector
AtomicExpand fails for an aligned vector `load atomic` because it
does not find a compatible library call. This change adds appropriate
bitcasts so that the call can be lowered. It also adds support for
128-bit lowering in tablegen to support SSE/AVX.
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 19 +++-
llvm/test/CodeGen/ARM/atomic-load-store.ll| 51 +++
llvm/test/CodeGen/X86/atomic-load-store.ll| 91 ++-
.../X86/expand-atomic-non-integer.ll | 66 ++
4 files changed, 222 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp
b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d7..8dc14bb4163451 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -483,7 +483,9 @@ LoadInst
*AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
- Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ Value *NewVal = LI->getType()->isPtrOrPtrVectorTy()
+ ? Builder.CreateIntToPtr(NewLI, LI->getType())
+ : Builder.CreateBitCast(NewLI, LI->getType());
LI->replaceAllUsesWith(NewVal);
LI->eraseFromParent();
return NewLI;
@@ -2093,9 +2095,18 @@ bool AtomicExpandImpl::expandAtomicOpToLibcall(
I->replaceAllUsesWith(V);
} else if (HasResult) {
Value *V;
-if (UseSizedLibcall)
- V = Builder.CreateBitOrPointerCast(Result, I->getType());
-else {
+if (UseSizedLibcall) {
+ // Add bitcasts from Result's scalar type to I's vector type
+ auto *PtrTy = dyn_cast<PointerType>(I->getType()->getScalarType());
+ auto *VTy = dyn_cast<VectorType>(I->getType());
+ if (VTy && PtrTy && !Result->getType()->isVectorTy()) {
+unsigned AS = PtrTy->getAddressSpace();
+Value *BC = Builder.CreateBitCast(
+Result, VTy->getWithNewType(DL.getIntPtrType(Ctx, AS)));
+V = Builder.CreateIntToPtr(BC, I->getType());
+ } else
+V = Builder.CreateBitOrPointerCast(Result, I->getType());
+} else {
V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
AllocaAlignment);
Builder.CreateLifetimeEnd(AllocaResult);
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll
b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 560dfde356c29d..eaa2ffd9b27318 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -983,3 +983,54 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double
%val1) {
store atomic double %val1, ptr %ptr seq_cst, align 8
ret void
}
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) #0 {
+; ARM-LABEL: atomic_vec1_ptr:
+; ARM: @ %bb.0:
+; ARM-NEXT:ldr r0, [r0]
+; ARM-NEXT:dmb ish
+; ARM-NEXT:bx lr
+;
+; ARMOPTNONE-LABEL: atomic_vec1_ptr:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT:ldr r0, [r0]
+; ARMOPTNONE-NEXT:dmb ish
+; ARMOPTNONE-NEXT:bx lr
+;
+; THUMBTWO-LABEL: atomic_vec1_ptr:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT:ldr r0, [r0]
+; THUMBTWO-NEXT:dmb ish
+; THUMBTWO-NEXT:bx lr
+;
+; THUMBONE-LABEL: atomic_vec1_ptr:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT:push {r7, lr}
+; THUMBONE-NEXT:movs r1, #0
+; THUMBONE-NEXT:mov r2, r1
+; THUMBONE-NEXT:bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:pop {r7, pc}
+;
+; ARMV4-LABEL: atomic_vec1_ptr:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT:push {r11, lr}
+; ARMV4-NEXT:mov r1, #2
+; ARMV4-NEXT:bl __atomic_load_4
+; ARMV4-NEXT:pop {r11, lr}
+; ARMV4-NEXT:mov pc, lr
+;
+; ARMV6-LABEL: atomic_vec1_ptr:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT:ldr r0, [r0]
+; ARMV6-NEXT:mov r1, #0
+; ARMV6-NEXT:mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:bx lr
+;
+; THUMBM-LABEL: atomic_vec1_ptr:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT:ldr r0, [r0]
+; THUMBM-NEXT:dmb sy
+; THUMBM-NEXT:bx lr
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 00310f6d1f219e..867a4acb791bca 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -244,6 +244,96 @@ define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr
%x) {
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
ret <2 x ptr addrspace(270)> %ret
}
+define <2 x ptr> @atomic_vec2_ptr_align(ptr %x) nounwind {
+; CHECK-SSE2-O3-LABEL: atomic_vec2_ptr_align:
+; CHECK-SSE2-
[llvm-branch-commits] [llvm] [SelectionDAG] Split vector types for atomic load (PR #165818)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/165818
>From 8466578444bc27c0d8c5dc2ee95f074a96b5e47f Mon Sep 17 00:00:00 2001
From: jofrn
Date: Thu, 30 Oct 2025 12:19:59 -0400
Subject: [PATCH] [SelectionDAG] Split vector types for atomic load
Vector types that aren't widened are split
so that a single ATOMIC_LOAD is issued for the entire vector at once.
This change utilizes the load vectorization infrastructure in
SelectionDAG in order to group the vectors. This enables SelectionDAG
to translate vectors with element types bfloat and half.
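For a rough sense of the inputs this targets (an assumed example, not copied from the updated tests), a vector atomic load such as the one below can now be legalized by emitting one integer ATOMIC_LOAD for the whole vector, splitting that integer with SplitInteger, and bitcasting each half back to the split vector types:
define <4 x half> @atomic_vec4_half_sketch(ptr %x) {
  %ret = load atomic <4 x half>, ptr %x acquire, align 8
  ret <4 x half> %ret
}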
---
.../include/llvm/Target/TargetSelectionDAG.td | 14 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 34 ++
llvm/test/CodeGen/X86/atomic-load-store.ll| 352 +-
4 files changed, 397 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td
b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..239fee8a3022d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1949,6 +1949,20 @@ def atomic_load_64 :
let MemoryVT = i64;
}
+def atomic_load_128_v2i64 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v2i64;
+}
+
+def atomic_load_128_v4i32 :
+ PatFrag<(ops node:$ptr),
+ (atomic_load node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v4i32;
+}
+
def atomic_load_nonext_8 :
PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ed2c30be7d71d..9028ff4d3401c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -978,6 +978,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0e2c8b2e3799..fe89a4a9f9634 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1226,6 +1226,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N,
unsigned ResNo) {
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD:
+SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi);
+break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
@@ -2202,6 +2205,37 @@ void DAGTypeLegalizer::SplitVecRes_VP_SPLAT(SDNode *N,
SDValue &Lo,
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, N->getOperand(0), MaskHi, EVLHi);
}
+void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+ "Extended load during type legalization!");
+ SDLoc dl(LD);
+ EVT VT = LD->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ EVT MemIntVT =
+ EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits());
+ SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT,
+ Ch, Ptr, LD->getMemOperand());
+
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ SDValue ExtractLo, ExtractHi;
+ SplitInteger(ALD, LoIntVT, HiIntVT, ExtractLo, ExtractHi);
+
+ Lo = DAG.getBitcast(LoVT, ExtractLo);
+ Hi = DAG.getBitcast(HiVT, ExtractHi);
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1));
+}
+
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 7e15b9303887f..928dfef3143da 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/
[llvm-branch-commits] [llvm] [X86] Cast atomic vectors in IR to support floats (PR #148899)
https://github.com/jofrn updated
https://github.com/llvm/llvm-project/pull/148899
>From 23fb9283f42bd418afb4d478dfaa7215c4d16093 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Tue, 15 Jul 2025 13:02:04 -0400
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats
This commit casts floats to ints in an atomic load during AtomicExpand to
support floating-point types. It is also required to support 128-bit vectors
in SSE/AVX.
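Concretely, returning CastToInteger makes AtomicExpand rewrite a floating-point atomic load into an integer atomic load followed by a bitcast, roughly as in the sketch below (an illustrative example; the type and ordering are assumptions):
define float @atomic_f32_acquire_sketch(ptr %p) {
  ; originally: %v = load atomic float, ptr %p acquire, align 4
  %i = load atomic i32, ptr %p acquire, align 4
  %v = bitcast i32 %i to float
  ret float %v
}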
---
llvm/lib/Target/X86/X86ISelLowering.cpp| 7 +
llvm/lib/Target/X86/X86ISelLowering.h | 2 +
llvm/lib/Target/X86/X86InstrCompiler.td| 15 +
llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++---
4 files changed, 122 insertions(+), 287 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49beadae63f03..e15f17281b958 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32136,6 +32136,13 @@
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+ if (LI->getType()->getScalarType()->isFloatingPointTy())
+return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h
b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..f9a8adbd7da0d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1841,6 +1841,8 @@ namespace llvm {
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+TargetLoweringBase::AtomicExpansionKind
+shouldCastAtomicLoadInIR(LoadInst *LI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td
b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce429b5916280..3f542297fea19 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1220,6 +1220,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64
addr:$src,
def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src,
(VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <2 x i64>
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// load atomic <4 x i32>
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>;
+def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
+ (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll
b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 928dfef3143da..00310f6d1f219 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT:movd %eax, %xmm0
; CHECK-SSE-O3-NEXT:retq
;
; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat:
; CHECK-AVX-O3: # %bb.0:
; CHECK-AVX-O3-NEXT:movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT:vmovd %eax, %xmm0
; CHECK-AVX-O3-NEXT:retq
;
; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
@@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-SSE-O0-NEXT:movw (%rdi), %cx
; CHECK-SSE-O0-NEXT:# implicit-def: $eax
; CHECK-SSE-O0-NEXT:movw %cx, %ax
-; CHECK-SSE-O0-NEXT:# implicit-def: $xmm0
-; CHECK-SSE-O0-NEXT:pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT:movd %eax, %xmm0
; CHECK-SSE-O0-NEXT:retq
;
; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat:
@@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-AVX-O0-NEXT:movw (%rdi), %cx
; CHECK-AVX-O0-NEXT:# implicit-def: $eax
; CHECK-AVX-O0-NEXT:movw %cx, %ax
-; CHECK-AVX-O0-NEXT:# imp
[llvm-branch-commits] [llvm] [ConstantTime][RISCV] Add comprehensive tests for ct.select (PR #166708)
https://github.com/wizardengineer ready_for_review
https://github.com/llvm/llvm-project/pull/166708
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [ConstantTime][WebAssembly] Add comprehensive tests for ct.select (PR #166709)
https://github.com/wizardengineer ready_for_review
https://github.com/llvm/llvm-project/pull/166709
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [ConstantTime][Clang] Add __builtin_ct_select for constant-time selection (PR #166703)
https://github.com/wizardengineer ready_for_review
https://github.com/llvm/llvm-project/pull/166703
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for ARM32 and Thumb (PR #166707)
https://github.com/wizardengineer ready_for_review
https://github.com/llvm/llvm-project/pull/166707
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)
https://github.com/wizardengineer ready_for_review
https://github.com/llvm/llvm-project/pull/166704
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
