[llvm-branch-commits] [flang] [flang][OpenMP] Semantic checks for DYN_GROUPPRIVATE (PR #166214)
llvmbot wrote:
@llvm/pr-subscribers-flang-semantics
@llvm/pr-subscribers-flang-openmp
Author: Krzysztof Parzyszek (kparzysz)
Changes
---
Full diff: https://github.com/llvm/llvm-project/pull/166214.diff
4 Files Affected:
- (modified) flang/include/flang/Semantics/openmp-modifiers.h (+2)
- (modified) flang/lib/Semantics/check-omp-structure.cpp (+33-1)
- (modified) flang/lib/Semantics/openmp-modifiers.cpp (+32)
- (added) flang/test/Semantics/OpenMP/dyn-groupprivate.f90 (+8)
``diff
diff --git a/flang/include/flang/Semantics/openmp-modifiers.h
b/flang/include/flang/Semantics/openmp-modifiers.h
index bfa3aa4939cb1..283bf2a4c895e 100644
--- a/flang/include/flang/Semantics/openmp-modifiers.h
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -67,6 +67,7 @@ template const OmpModifierDescriptor
&OmpGetDescriptor();
#define DECLARE_DESCRIPTOR(name) \
  template <> const OmpModifierDescriptor &OmpGetDescriptor<name>()
+DECLARE_DESCRIPTOR(parser::OmpAccessGroup);
DECLARE_DESCRIPTOR(parser::OmpAlignment);
DECLARE_DESCRIPTOR(parser::OmpAlignModifier);
DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier);
@@ -82,6 +83,7 @@ DECLARE_DESCRIPTOR(parser::OmpDependenceType);
DECLARE_DESCRIPTOR(parser::OmpDeviceModifier);
DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier);
DECLARE_DESCRIPTOR(parser::OmpExpectation);
+DECLARE_DESCRIPTOR(parser::OmpFallbackModifier);
DECLARE_DESCRIPTOR(parser::OmpInteropPreference);
DECLARE_DESCRIPTOR(parser::OmpInteropType);
DECLARE_DESCRIPTOR(parser::OmpIterator);
diff --git a/flang/lib/Semantics/check-omp-structure.cpp
b/flang/lib/Semantics/check-omp-structure.cpp
index d7db15dd37949..4c46aba7391d6 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -682,6 +682,13 @@ void OmpStructureChecker::Enter(const
parser::OmpClause::Hint &x) {
}
}
+void OmpStructureChecker::Enter(const parser::OmpClause::DynGroupprivate &x) {
+ CheckAllowedClause(llvm::omp::Clause::OMPC_dyn_groupprivate);
+ parser::CharBlock source{GetContext().clauseSource};
+
+ OmpVerifyModifiers(x.v, llvm::omp::OMPC_dyn_groupprivate, source, context_);
+}
+
void OmpStructureChecker::Enter(const parser::OmpDirectiveSpecification &x) {
// OmpDirectiveSpecification exists on its own only in METADIRECTIVE.
// In other cases it's a part of other constructs that handle directive
@@ -3316,6 +3323,32 @@ void OmpStructureChecker::Leave(const
parser::OmpClauseList &) {
}
}
+  // Default access-group for DYN_GROUPPRIVATE is "cgroup". On a given
+  // construct there can be at most one DYN_GROUPPRIVATE with a given
+  // access-group.
+  const parser::OmpClause
+      *accGrpClause[parser::OmpAccessGroup::Value_enumSize] = {nullptr};
+  for (auto [_, clause] :
+      FindClauses(llvm::omp::Clause::OMPC_dyn_groupprivate)) {
+    auto &wrapper{std::get<parser::OmpClause::DynGroupprivate>(clause->u)};
+    auto &modifiers{OmpGetModifiers(wrapper.v)};
+    auto accGrp{parser::OmpAccessGroup::Value::Cgroup};
+    if (auto *ag{OmpGetUniqueModifier<parser::OmpAccessGroup>(modifiers)}) {
+      accGrp = ag->v;
+    }
+    auto &firstClause{accGrpClause[llvm::to_underlying(accGrp)]};
+    if (firstClause) {
+      context_
+          .Say(clause->source,
+              "The access-group modifier can only occur on a single clause in a construct"_err_en_US)
+          .Attach(firstClause->source,
+              "Previous clause with access-group modifier"_en_US);
+      break;
+    } else {
+      firstClause = clause;
+    }
+  }
+
CheckRequireAtLeastOneOf();
}
@@ -5472,7 +5505,6 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default)
CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj)
CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type)
CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule)
-CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate)
CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive)
CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail)
CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter)
diff --git a/flang/lib/Semantics/openmp-modifiers.cpp
b/flang/lib/Semantics/openmp-modifiers.cpp
index 717fb0351ba5b..f191b4de2d579 100644
--- a/flang/lib/Semantics/openmp-modifiers.cpp
+++ b/flang/lib/Semantics/openmp-modifiers.cpp
@@ -74,6 +74,22 @@ unsigned OmpModifierDescriptor::since(llvm::omp::Clause id)
const {
// Note: The intent for these functions is to have them be automatically-
// generated in the future.
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAccessGroup>() {
+ static const OmpModifierDescriptor desc{
+ /*name=*/"access-group",
+ /*props=*/
+ {
+ {61, {OmpProperty::Unique}},
+ },
+ /*clauses=*/
+ {
+ {61, {Clause::OMPC_dyn_groupprivate}},
+ },
+ };
+ return desc;
+}
+
template <>
const OmpModifierDescriptor &OmpGetDescriptor() {
static const OmpModifierDescriptor desc{
@@ -321,6 +337,22 @@ const OmpModifierDescriptor
&OmpGetDescriptor() {
return desc;
}
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpFallbackModifier>() {
+ static co
[llvm-branch-commits] [flang] [flang][OpenMP] Semantic checks for DYN_GROUPPRIVATE (PR #166214)
https://github.com/kparzysz created
https://github.com/llvm/llvm-project/pull/166214
None
>From ebe00ba9ee15119b2ce127971ab4e038ddf62308 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Thu, 14 Aug 2025 13:26:23 -0500
Subject: [PATCH] [flang][OpenMP] Semantic checks for DYN_GROUPPRIVATE
---
.../flang/Semantics/openmp-modifiers.h| 2 ++
flang/lib/Semantics/check-omp-structure.cpp | 34 ++-
flang/lib/Semantics/openmp-modifiers.cpp | 32 +
.../Semantics/OpenMP/dyn-groupprivate.f90 | 8 +
4 files changed, 75 insertions(+), 1 deletion(-)
create mode 100644 flang/test/Semantics/OpenMP/dyn-groupprivate.f90
diff --git a/flang/include/flang/Semantics/openmp-modifiers.h
b/flang/include/flang/Semantics/openmp-modifiers.h
index bfa3aa4939cb1..283bf2a4c895e 100644
--- a/flang/include/flang/Semantics/openmp-modifiers.h
+++ b/flang/include/flang/Semantics/openmp-modifiers.h
@@ -67,6 +67,7 @@ template const OmpModifierDescriptor
&OmpGetDescriptor();
#define DECLARE_DESCRIPTOR(name) \
  template <> const OmpModifierDescriptor &OmpGetDescriptor<name>()
+DECLARE_DESCRIPTOR(parser::OmpAccessGroup);
DECLARE_DESCRIPTOR(parser::OmpAlignment);
DECLARE_DESCRIPTOR(parser::OmpAlignModifier);
DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier);
@@ -82,6 +83,7 @@ DECLARE_DESCRIPTOR(parser::OmpDependenceType);
DECLARE_DESCRIPTOR(parser::OmpDeviceModifier);
DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier);
DECLARE_DESCRIPTOR(parser::OmpExpectation);
+DECLARE_DESCRIPTOR(parser::OmpFallbackModifier);
DECLARE_DESCRIPTOR(parser::OmpInteropPreference);
DECLARE_DESCRIPTOR(parser::OmpInteropType);
DECLARE_DESCRIPTOR(parser::OmpIterator);
diff --git a/flang/lib/Semantics/check-omp-structure.cpp
b/flang/lib/Semantics/check-omp-structure.cpp
index d7db15dd37949..4c46aba7391d6 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -682,6 +682,13 @@ void OmpStructureChecker::Enter(const
parser::OmpClause::Hint &x) {
}
}
+void OmpStructureChecker::Enter(const parser::OmpClause::DynGroupprivate &x) {
+ CheckAllowedClause(llvm::omp::Clause::OMPC_dyn_groupprivate);
+ parser::CharBlock source{GetContext().clauseSource};
+
+ OmpVerifyModifiers(x.v, llvm::omp::OMPC_dyn_groupprivate, source, context_);
+}
+
void OmpStructureChecker::Enter(const parser::OmpDirectiveSpecification &x) {
// OmpDirectiveSpecification exists on its own only in METADIRECTIVE.
// In other cases it's a part of other constructs that handle directive
@@ -3316,6 +3323,32 @@ void OmpStructureChecker::Leave(const
parser::OmpClauseList &) {
}
}
+  // Default access-group for DYN_GROUPPRIVATE is "cgroup". On a given
+  // construct there can be at most one DYN_GROUPPRIVATE with a given
+  // access-group.
+  const parser::OmpClause
+      *accGrpClause[parser::OmpAccessGroup::Value_enumSize] = {nullptr};
+  for (auto [_, clause] :
+      FindClauses(llvm::omp::Clause::OMPC_dyn_groupprivate)) {
+    auto &wrapper{std::get<parser::OmpClause::DynGroupprivate>(clause->u)};
+    auto &modifiers{OmpGetModifiers(wrapper.v)};
+    auto accGrp{parser::OmpAccessGroup::Value::Cgroup};
+    if (auto *ag{OmpGetUniqueModifier<parser::OmpAccessGroup>(modifiers)}) {
+      accGrp = ag->v;
+    }
+    auto &firstClause{accGrpClause[llvm::to_underlying(accGrp)]};
+    if (firstClause) {
+      context_
+          .Say(clause->source,
+              "The access-group modifier can only occur on a single clause in a construct"_err_en_US)
+          .Attach(firstClause->source,
+              "Previous clause with access-group modifier"_en_US);
+      break;
+    } else {
+      firstClause = clause;
+    }
+  }
+
CheckRequireAtLeastOneOf();
}
@@ -5472,7 +5505,6 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default)
CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj)
CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type)
CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule)
-CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate)
CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive)
CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail)
CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter)
diff --git a/flang/lib/Semantics/openmp-modifiers.cpp
b/flang/lib/Semantics/openmp-modifiers.cpp
index 717fb0351ba5b..f191b4de2d579 100644
--- a/flang/lib/Semantics/openmp-modifiers.cpp
+++ b/flang/lib/Semantics/openmp-modifiers.cpp
@@ -74,6 +74,22 @@ unsigned OmpModifierDescriptor::since(llvm::omp::Clause id)
const {
// Note: The intent for these functions is to have them be automatically-
// generated in the future.
+template <>
+const OmpModifierDescriptor &OmpGetDescriptor<parser::OmpAccessGroup>() {
+ static const OmpModifierDescriptor desc{
+ /*name=*/"access-group",
+ /*props=*/
+ {
+ {61, {OmpProperty::Unique}},
+ },
+ /*clauses=*/
+ {
+ {61, {Clause::OMPC_dyn_groupprivate}},
+ },
+ };
+ return desc;
+}
+
template <>
const OmpModifierDescriptor &OmpGetDescriptor() {
static const OmpModifierDescriptor desc{
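The quoted patch is cut off above, but the core of the new check in Leave(OmpClauseList) is visible: keep one "first occurrence" slot per access-group value, treat a missing modifier as cgroup, and diagnose a second DYN_GROUPPRIVATE clause that lands in an occupied slot. Here is a self-contained sketch of that bookkeeping; the enum values, type names, and helpers below are simplified stand-ins for illustration, not flang's actual parse-tree API.

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

// Simplified stand-ins for flang's parse-tree types (assumptions, not the real API).
enum class AccessGroup { Cgroup, OtherGroup, Count };
struct DynGroupprivateClause {
  std::optional<AccessGroup> Modifier; // absent modifier defaults to cgroup
  int SourceLine;                      // stand-in for parser::CharBlock
};

// Returns the line of the first offending clause, or -1 if the list is valid.
int FindDuplicateAccessGroup(const std::vector<DynGroupprivateClause> &Clauses) {
  // One "first occurrence" slot per access-group value, as in the hunk above.
  std::array<const DynGroupprivateClause *,
             static_cast<std::size_t>(AccessGroup::Count)>
      FirstSeen{};
  for (const DynGroupprivateClause &Clause : Clauses) {
    AccessGroup Group = Clause.Modifier.value_or(AccessGroup::Cgroup);
    const DynGroupprivateClause *&Slot =
        FirstSeen[static_cast<std::size_t>(Group)];
    if (Slot) // a second clause with the same access-group
      return Clause.SourceLine;
    Slot = &Clause;
  }
  return -1;
}

int main() {
  // Both clauses resolve to cgroup (the first one implicitly), so the second is flagged.
  std::vector<DynGroupprivateClause> Clauses{{std::nullopt, 10},
                                             {AccessGroup::Cgroup, 11}};
  std::printf("duplicate access-group at line %d\n",
              FindDuplicateAccessGroup(Clauses));
}
```

In the real checker the slots are indexed with llvm::to_underlying of parser::OmpAccessGroup::Value, and the diagnostic is attached to the earlier clause's source range rather than returned.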
[llvm-branch-commits] [llvm] [IR] "modular-format" attribute for functions using format strings (PR #147429)
mysterymath wrote: Coming back from the US dev meeting, I wanted to ping this one again. Is there anything left for me to do before this is ready to land? https://github.com/llvm/llvm-project/pull/147429
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/paschalis-mpeis commented: Can you elaborate on how the >=10% threshold for the warning was chosen (instead of emitting it in all cases)? Maybe some of these details would also fit in the design doc, where you talk about inconsistencies. https://github.com/llvm/llvm-project/pull/165227
[llvm-branch-commits] [llvm] 90d4c6d - Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for cal…"
Author: Robert Imschweiler
Date: 2025-11-03T16:53:01+01:00
New Revision: 90d4c6d8fd81c2a47d227bd8f69d2c9455a8d104
URL:
https://github.com/llvm/llvm-project/commit/90d4c6d8fd81c2a47d227bd8f69d2c9455a8d104
DIFF:
https://github.com/llvm/llvm-project/commit/90d4c6d8fd81c2a47d227bd8f69d2c9455a8d104.diff
LOG: Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for
cal…"
This reverts commit 332f9b5eeef85dca29112018ba111bf64a75d27d.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
llvm/test/CodeGen/AMDGPU/infinite-loop.ll
llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
llvm/test/CodeGen/AMDGPU/update-phi.ll
Removed:
llvm/test/CodeGen/AMDGPU/callbr.ll
llvm/test/Transforms/StructurizeCFG/callbr.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 706237b906cc3..733c5d520fb23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,52 +181,14 @@ BasicBlock
*AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
return NewRetBlock;
}
-static BasicBlock *
-createDummyReturnBlock(Function &F,
- SmallVector &ReturningBlocks) {
- BasicBlock *DummyReturnBB =
- BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
- Type *RetTy = F.getReturnType();
- Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
- ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
- ReturningBlocks.push_back(DummyReturnBB);
- return DummyReturnBB;
-}
-
-/// Handle conditional branch instructions (-> 2 targets) and callbr
-/// instructions with N targets.
-static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
- BasicBlock *DummyReturnBB,
- std::vector &Updates) {
- SmallVector Successors(successors(BB));
-
- // Create a new transition block to hold the conditional branch.
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
- Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
- // 'Successors' become successors of TransitionBB instead of BB,
- // and TransitionBB becomes a single successor of BB.
- Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
- for (BasicBlock *Successor : Successors) {
-Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-Updates.emplace_back(DominatorTree::Delete, BB, Successor);
- }
-
- // Create a branch that will always branch to the transition block and
- // references DummyReturnBB.
- BB->getTerminator()->eraseFromParent();
- BranchInst::Create(TransitionBB, DummyReturnBB,
- ConstantInt::getTrue(F.getContext()), BB);
- Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-}
-
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
+ assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
+
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
- !isa(PDT.getRoot()->getTerminator(
+ !isa(PDT.getRoot()->getTerminator(
return false;
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -260,27 +222,46 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F,
DominatorTree *DT,
if (HasDivergentExitBlock)
UnreachableBlocks.push_back(BB);
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- if (!DummyReturnBB)
-DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+ ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+ if (DummyReturnBB == nullptr) {
+DummyReturnBB = BasicBlock::Create(F.getContext(),
+ "DummyReturnBlock", &F);
+Type *RetTy = F.getReturnType();
+Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+ReturningBlocks.push_back(DummyReturnBB);
+ }
if (BI->isUnconditional()) {
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
BI->eraseFromParent(); // Delete the unconditional branch.
// Add a new conditional branch with a dummy edge to the return block.
-BranchInst::Create(LoopHeaderBB, DummyReturnBB,
- ConstantInt::getTrue(F.getContext()), BB);
+BranchInst::Create(Loo
[llvm-branch-commits] [libc] e602aa8 - Revert "[libc] Add printf error handling (#162876)"
Author: Kewen Meng
Date: 2025-11-03T12:41:13-08:00
New Revision: e602aa8e24924e27c4f0965bd5d1f74554004042
URL:
https://github.com/llvm/llvm-project/commit/e602aa8e24924e27c4f0965bd5d1f74554004042
DIFF:
https://github.com/llvm/llvm-project/commit/e602aa8e24924e27c4f0965bd5d1f74554004042.diff
LOG: Revert "[libc] Add printf error handling (#162876)"
This reverts commit 0c707c9713f0657f7208b8f9a95a13af749d95c5.
Added:
Modified:
libc/src/stdio/CMakeLists.txt
libc/src/stdio/asprintf.cpp
libc/src/stdio/baremetal/CMakeLists.txt
libc/src/stdio/baremetal/printf.cpp
libc/src/stdio/baremetal/vprintf.cpp
libc/src/stdio/generic/CMakeLists.txt
libc/src/stdio/generic/fprintf.cpp
libc/src/stdio/generic/printf.cpp
libc/src/stdio/generic/vfprintf.cpp
libc/src/stdio/generic/vprintf.cpp
libc/src/stdio/printf_core/CMakeLists.txt
libc/src/stdio/printf_core/core_structs.h
libc/src/stdio/printf_core/printf_main.h
libc/src/stdio/printf_core/vasprintf_internal.h
libc/src/stdio/printf_core/vfprintf_internal.h
libc/src/stdio/printf_core/write_int_converter.h
libc/src/stdio/printf_core/writer.h
libc/src/stdio/snprintf.cpp
libc/src/stdio/sprintf.cpp
libc/src/stdio/vasprintf.cpp
libc/src/stdio/vsnprintf.cpp
libc/src/stdio/vsprintf.cpp
libc/src/stdlib/CMakeLists.txt
libc/src/stdlib/strfromd.cpp
libc/src/stdlib/strfromf.cpp
libc/src/stdlib/strfroml.cpp
libc/src/time/strftime_core/strftime_main.h
libc/test/src/stdio/CMakeLists.txt
libc/test/src/stdio/fprintf_test.cpp
libc/test/src/stdio/printf_core/converter_test.cpp
libc/test/src/stdio/printf_core/writer_test.cpp
libc/test/src/stdio/snprintf_test.cpp
libc/test/src/stdio/vfprintf_test.cpp
libc/test/src/stdlib/StrfromTest.h
Removed:
libc/src/stdio/printf_core/error_mapper.h
libc/src/stdio/printf_core/generic/CMakeLists.txt
libc/src/stdio/printf_core/generic/error_mapper.h
libc/src/stdio/printf_core/linux/CMakeLists.txt
libc/src/stdio/printf_core/linux/error_mapper.h
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index c75c8b11be2b5..b0a6ef1e291b5 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -125,10 +125,6 @@ add_entrypoint_object(
DEPENDS
libc.src.stdio.printf_core.printf_main
libc.src.stdio.printf_core.writer
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_entrypoint_object(
@@ -140,10 +136,6 @@ add_entrypoint_object(
DEPENDS
libc.src.stdio.printf_core.printf_main
libc.src.stdio.printf_core.writer
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_entrypoint_object(
@@ -154,10 +146,6 @@ add_entrypoint_object(
asprintf.h
DEPENDS
libc.src.stdio.printf_core.vasprintf_internal
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_entrypoint_object(
@@ -169,10 +157,6 @@ add_entrypoint_object(
DEPENDS
libc.src.stdio.printf_core.printf_main
libc.src.stdio.printf_core.writer
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_entrypoint_object(
@@ -184,10 +168,6 @@ add_entrypoint_object(
DEPENDS
libc.src.stdio.printf_core.printf_main
libc.src.stdio.printf_core.writer
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_entrypoint_object(
@@ -198,10 +178,6 @@ add_entrypoint_object(
vasprintf.h
DEPENDS
libc.src.stdio.printf_core.vasprintf_internal
-libc.src.stdio.printf_core.core_structs
-libc.src.stdio.printf_core.error_mapper
-libc.src.__support.libc_errno
-libc.src.__support.CPP.limits
)
add_subdirectory(printf_core)
diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp
index 083f40c1f19fa..f8cfb74ce48ea 100644
--- a/libc/src/stdio/asprintf.cpp
+++ b/libc/src/stdio/asprintf.cpp
@@ -7,12 +7,8 @@
//===--===//
#include "src/stdio/asprintf.h"
-#include "src/__support/CPP/limits.h"
#include "src/__support/arg_list.h"
-#include "src/__support/libc_errno.h"
#include "src/__support/macros/config.h"
-#include "src/stdio/printf_core/core_structs.h"
-#include "src/stdio/printf_core/error_mapper.h"
#include "src/stdio/printf_core/vasprintf_internal.h"
namespace LIBC_NAMESPACE_DECL {
@@ -26,18 +22,8 @@ LLVM_LIBC_FUNCTION(
[llvm-branch-commits] [clang] dfd74b8 - Revert "[UBSan] Improve error message when a misalignment is due to target de…"
Author: Matthew Nagy
Date: 2025-11-03T16:28:26Z
New Revision: dfd74b8397d5ddd9a723e4dd0d2de6da77581312
URL:
https://github.com/llvm/llvm-project/commit/dfd74b8397d5ddd9a723e4dd0d2de6da77581312
DIFF:
https://github.com/llvm/llvm-project/commit/dfd74b8397d5ddd9a723e4dd0d2de6da77581312.diff
LOG: Revert "[UBSan] Improve error message when a misalignment is due to target
de…"
This reverts commit 47c54d55c9fac5ea7c87881e00f96e8c12b18174.
Added:
Modified:
clang/lib/CodeGen/CGExprCXX.cpp
clang/lib/CodeGen/CodeGenFunction.h
compiler-rt/lib/ubsan/ubsan_checks.inc
compiler-rt/lib/ubsan/ubsan_handlers.cpp
compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp
Removed:
compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index f2dd22e9bed3b..14d8db32bafc6 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -18,9 +18,6 @@
#include "ConstantEmitter.h"
#include "TargetInfo.h"
#include "clang/Basic/CodeGenOptions.h"
-#include "clang/Basic/Sanitizers.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
#include "clang/CodeGen/CGFunctionInfo.h"
#include "llvm/IR/Intrinsics.h"
@@ -1752,17 +1749,6 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const
CXXNewExpr *E) {
allocator->isReservedGlobalPlacementOperator())
result = Builder.CreateLaunderInvariantGroup(result);
- // Check the default alignment of the type and why. Users may incorrectly
- // return misaligned memory from a replaced operator new without knowing
- // about default alignment.
- TypeCheckKind checkKind = CodeGenFunction::TCK_ConstructorCall;
- const TargetInfo &TI = getContext().getTargetInfo();
- unsigned DefaultTargetAlignment = TI.getNewAlign() / TI.getCharWidth();
- if (SanOpts.has(SanitizerKind::Alignment) &&
- (DefaultTargetAlignment >
- CGM.getContext().getTypeAlignInChars(allocType).getQuantity()))
-checkKind = CodeGenFunction::TCK_ConstructorCallMinimumAlign;
-
// Emit sanitizer checks for pointer value now, so that in the case of an
// array it was checked only once and not at each constructor call. We may
// have already checked that the pointer is non-null.
@@ -1770,9 +1756,10 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const
CXXNewExpr *E) {
// we'll null check the wrong pointer here.
SanitizerSet SkippedChecks;
SkippedChecks.set(SanitizerKind::Null, nullCheck);
- EmitTypeCheck(
- checkKind, E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
- result, allocType, result.getAlignment(), SkippedChecks, numElements);
+ EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall,
+E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
+result, allocType, result.getAlignment(), SkippedChecks,
+numElements);
EmitNewInitializer(*this, E, allocType, elementTy, result, numElements,
allocSizeWithoutCookie);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h
b/clang/lib/CodeGen/CodeGenFunction.h
index 047ca844c79de..8c4c1c8c2dc95 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3296,10 +3296,7 @@ class CodeGenFunction : public CodeGenTypeCache {
TCK_NonnullAssign,
/// Checking the operand of a dynamic_cast or a typeid expression. Must be
/// null or an object within its lifetime.
-TCK_DynamicOperation,
-/// Checking the 'this' poiner for a constructor call, including that the
-/// alignment is greater or equal to the targets minimum alignment
-TCK_ConstructorCallMinimumAlign
+TCK_DynamicOperation
};
/// Determine whether the pointer type check \p TCK permits null pointers.
diff --git a/compiler-rt/lib/ubsan/ubsan_checks.inc
b/compiler-rt/lib/ubsan/ubsan_checks.inc
index f8757d781afb8..b1d09a9024e7e 100644
--- a/compiler-rt/lib/ubsan/ubsan_checks.inc
+++ b/compiler-rt/lib/ubsan/ubsan_checks.inc
@@ -28,7 +28,6 @@ UBSAN_CHECK(NullptrAfterNonZeroOffset,
"nullptr-after-nonzero-offset",
UBSAN_CHECK(PointerOverflow, "pointer-overflow", "pointer-overflow")
UBSAN_CHECK(MisalignedPointerUse, "misaligned-pointer-use", "alignment")
UBSAN_CHECK(AlignmentAssumption, "alignment-assumption", "alignment")
-UBSAN_CHECK(MinumumAssumedAlignment, "minimum-assumed-alignment", "alignment")
UBSAN_CHECK(InsufficientObjectSize, "insufficient-object-size", "object-size")
UBSAN_CHECK(SignedIntegerOverflow, "signed-integer-overflow",
"signed-integer-overflow")
diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp
b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
index fc6063af4562b..63319f46734a4 100644
--- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
@@ -73,26 +73,14 @@ enum TypeCheckKind {
TCK
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/164622
From 77a0b64af37649b4ec4c0de34284a5f0c57b0a53 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Wed, 22 Oct 2025 12:44:37 +
Subject: [PATCH 1/3] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes
Original names were "working titles". After initial patches are merged,
I'd like to rename these passes to names that reflect their intent
better and show their relationship to each other:
InsertNegateRAStatePass renamed to PointerAuthCFIFixup,
MarkRAStates renamed to PointerAuthCFIAnalyzer.
---
bolt/docs/PacRetDesign.md | 23 ++---
...arkRAStates.h => PointerAuthCFIAnalyzer.h} | 14
...ateRAStatePass.h => PointerAuthCFIFixup.h} | 14
bolt/lib/Core/Exceptions.cpp | 8 ++---
bolt/lib/Passes/CMakeLists.txt| 4 +--
...AStates.cpp => PointerAuthCFIAnalyzer.cpp} | 16 +-
...AStatePass.cpp => PointerAuthCFIFixup.cpp} | 32 +--
bolt/lib/Rewrite/BinaryPassManager.cpp| 8 ++---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
bolt/test/AArch64/negate-ra-state.s | 8 ++---
bolt/test/AArch64/pacret-split-funcs.s| 4 +--
bolt/unittests/Passes/CMakeLists.txt | 2 +-
...ateRAState.cpp => PointerAuthCFIFixup.cpp} | 6 ++--
.../gn/secondary/bolt/lib/Passes/BUILD.gn | 4 +--
14 files changed, 73 insertions(+), 72 deletions(-)
rename bolt/include/bolt/Passes/{MarkRAStates.h => PointerAuthCFIAnalyzer.h}
(63%)
rename bolt/include/bolt/Passes/{InsertNegateRAStatePass.h =>
PointerAuthCFIFixup.h} (87%)
rename bolt/lib/Passes/{MarkRAStates.cpp => PointerAuthCFIAnalyzer.cpp} (91%)
rename bolt/lib/Passes/{InsertNegateRAStatePass.cpp =>
PointerAuthCFIFixup.cpp} (91%)
rename bolt/unittests/Passes/{InsertNegateRAState.cpp =>
PointerAuthCFIFixup.cpp} (97%)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index c7c76cac3a100..0de2da50f8fd6 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock
reordering.
## Solution design
The implementation introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the
CFIs
-in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on
+the CFI in the input binary
+2. `PointerAuthCFIFixup`: reads those assigned instruction RA states after
optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the
correct
places: wherever there is a state change between two consecutive
instructions
in the layout order.
@@ -129,7 +129,7 @@ instruction.
This special case is handled by adding an `initialRAState` bool to each
BinaryFunction.
If the `Offset` the CFI refers to is zero, we don't store an annotation, but
set
the `initialRAState` in `FillCFIInfoFor`. This information is then used in
-`MarkRAStates`.
+`PointerAuthCFIAnalyzer`.
### Binaries without DWARF info
@@ -146,7 +146,7 @@ In summary:
- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
negate-ra-state CFI.
-### MarkRAStates pass
+### PointerAuthCFIAnalyzer pass
This pass runs before optimizations reorder anything.
@@ -173,9 +173,9 @@ what we have before the pass, and after it.
| autiasp | negate-ra-state | signed |
| ret | | unsigned |
-# Error handling in MarkRAState Pass:
+# Error handling in PointerAuthCFIAnalyzer pass:
-Whenever the MarkRAStates pass finds inconsistencies in the current
+Whenever the PointerAuthCFIAnalyzer pass finds inconsistencies in the current
BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
will not optimize this function but will emit it unchanged in the original
section
(`.bolt.org.text`).
@@ -188,16 +188,17 @@ The inconsistencies are as follows:
Users will be informed about the number of ignored functions in the pass, the
exact functions ignored, and the found inconsistency.
-### InsertNegateRAStatePass
+### PointerAuthCFIFixup
-This pass runs after optimizations. It performns the _inverse_ of MarkRAState
pa s:
+This pass runs after optimizations. It performns the _inverse_ of
PointerAuthCFIAnalyzer
+pass:
1. it reads the RA state annotations attached to the instructions, and
2. whenever the state changes, it adds a PseudoInstruction that holds an
OpNegateRAState CFI.
# Covering newly generated instructions:
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+Some BOLT passes can add new Instructions. In PointerAuthCFIFixup, we have
to know what RA state these have.
> [!important]
@@ -230,7 +231,7 @@ freely. The only special case is function splitting. W
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
bgergely0 wrote: As the related issue states: > Example: when applying BOLT to llama.cpp, this is seen in the logs: Ignored 1633 functions (99.03%) because of CFI inconsistencies. The huge % of ignored functions makes BOLT useless in such cases. I think double-digit territory is a good boundary for starting to emit this warning. After all, we don't check whether this is actually the reason for the errors, but the higher the percentage, the more likely it is. Low-percentage issues can come from older compiler versions emitting CFIs incorrectly (as I've seen while testing this work). So the 10% is a "magic constant" and is up for discussion; I think it is a reasonable value at which to start emitting warnings. https://github.com/llvm/llvm-project/pull/165227
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/164622
From 77a0b64af37649b4ec4c0de34284a5f0c57b0a53 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Wed, 22 Oct 2025 12:44:37 +
Subject: [PATCH 1/2] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes
Original names were "working titles". After initial patches are merged,
I'd like to rename these passes to names that reflect their intent
better and show their relationship to each other:
InsertNegateRAStatePass renamed to PointerAuthCFIFixup,
MarkRAStates renamed to PointerAuthCFIAnalyzer.
---
bolt/docs/PacRetDesign.md | 23 ++---
...arkRAStates.h => PointerAuthCFIAnalyzer.h} | 14
...ateRAStatePass.h => PointerAuthCFIFixup.h} | 14
bolt/lib/Core/Exceptions.cpp | 8 ++---
bolt/lib/Passes/CMakeLists.txt| 4 +--
...AStates.cpp => PointerAuthCFIAnalyzer.cpp} | 16 +-
...AStatePass.cpp => PointerAuthCFIFixup.cpp} | 32 +--
bolt/lib/Rewrite/BinaryPassManager.cpp| 8 ++---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
bolt/test/AArch64/negate-ra-state.s | 8 ++---
bolt/test/AArch64/pacret-split-funcs.s| 4 +--
bolt/unittests/Passes/CMakeLists.txt | 2 +-
...ateRAState.cpp => PointerAuthCFIFixup.cpp} | 6 ++--
.../gn/secondary/bolt/lib/Passes/BUILD.gn | 4 +--
14 files changed, 73 insertions(+), 72 deletions(-)
rename bolt/include/bolt/Passes/{MarkRAStates.h => PointerAuthCFIAnalyzer.h}
(63%)
rename bolt/include/bolt/Passes/{InsertNegateRAStatePass.h =>
PointerAuthCFIFixup.h} (87%)
rename bolt/lib/Passes/{MarkRAStates.cpp => PointerAuthCFIAnalyzer.cpp} (91%)
rename bolt/lib/Passes/{InsertNegateRAStatePass.cpp =>
PointerAuthCFIFixup.cpp} (91%)
rename bolt/unittests/Passes/{InsertNegateRAState.cpp =>
PointerAuthCFIFixup.cpp} (97%)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index c7c76cac3a100..0de2da50f8fd6 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock
reordering.
## Solution design
The implementation introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the
CFIs
-in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on
+the CFI in the input binary
+2. `PointerAuthCFIFixup`: reads those assigned instruction RA states after
optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the
correct
places: wherever there is a state change between two consecutive
instructions
in the layout order.
@@ -129,7 +129,7 @@ instruction.
This special case is handled by adding an `initialRAState` bool to each
BinaryFunction.
If the `Offset` the CFI refers to is zero, we don't store an annotation, but
set
the `initialRAState` in `FillCFIInfoFor`. This information is then used in
-`MarkRAStates`.
+`PointerAuthCFIAnalyzer`.
### Binaries without DWARF info
@@ -146,7 +146,7 @@ In summary:
- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
negate-ra-state CFI.
-### MarkRAStates pass
+### PointerAuthCFIAnalyzer pass
This pass runs before optimizations reorder anything.
@@ -173,9 +173,9 @@ what we have before the pass, and after it.
| autiasp | negate-ra-state | signed |
| ret | | unsigned |
-# Error handling in MarkRAState Pass:
+# Error handling in PointerAuthCFIAnalyzer pass:
-Whenever the MarkRAStates pass finds inconsistencies in the current
+Whenever the PointerAuthCFIAnalyzer pass finds inconsistencies in the current
BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
will not optimize this function but will emit it unchanged in the original
section
(`.bolt.org.text`).
@@ -188,16 +188,17 @@ The inconsistencies are as follows:
Users will be informed about the number of ignored functions in the pass, the
exact functions ignored, and the found inconsistency.
-### InsertNegateRAStatePass
+### PointerAuthCFIFixup
-This pass runs after optimizations. It performns the _inverse_ of MarkRAState
pa s:
+This pass runs after optimizations. It performns the _inverse_ of
PointerAuthCFIAnalyzer
+pass:
1. it reads the RA state annotations attached to the instructions, and
2. whenever the state changes, it adds a PseudoInstruction that holds an
OpNegateRAState CFI.
# Covering newly generated instructions:
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+Some BOLT passes can add new Instructions. In PointerAuthCFIFixup, we have
to know what RA state these have.
> [!important]
@@ -230,7 +231,7 @@ freely. The only special case is function splitting. W
[llvm-branch-commits] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated https://github.com/llvm/llvm-project/pull/114104
[llvm-branch-commits] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated https://github.com/llvm/llvm-project/pull/114104
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated
https://github.com/llvm/llvm-project/pull/114104
>From 63c4928ed65fb2a83a4a25f3c098af7d931fc0af Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 3 Nov 2025 12:09:19 -0800
Subject: [PATCH] clang-format
Created using spr 1.3.7
---
llvm/tools/llvm-cas/llvm-cas.cpp | 11 +--
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp
index b1e4f606211b5..e59313eb808e8 100644
--- a/llvm/tools/llvm-cas/llvm-cas.cpp
+++ b/llvm/tools/llvm-cas/llvm-cas.cpp
@@ -175,13 +175,12 @@ int main(int Argc, char **Argv) {
return validateObject(*CAS, ID);
}
-static Expected>
-openBuffer(StringRef DataPath) {
+static Expected> openBuffer(StringRef DataPath) {
if (DataPath.empty())
return createStringError(inconvertibleErrorCode(), "--data missing");
- return errorOrToExpected(
- DataPath == "-" ? llvm::MemoryBuffer::getSTDIN()
- : llvm::MemoryBuffer::getFile(DataPath));
+ return errorOrToExpected(DataPath == "-"
+ ? llvm::MemoryBuffer::getSTDIN()
+ : llvm::MemoryBuffer::getFile(DataPath));
}
int dump(ObjectStore &CAS) {
@@ -311,7 +310,7 @@ int validateIfNeeded(StringRef Path, bool CheckHash, bool
Force,
Exec = ExecStorage;
}
ValidationResult Result =
ExitOnErr(validateOnDiskUnifiedCASDatabasesIfNeeded(
-Path, CheckHash, AllowRecovery, Force, Exec));
+ Path, CheckHash, AllowRecovery, Force, Exec));
switch (Result) {
case ValidationResult::Valid:
outs() << "validated successfully\n";
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
github-actions[bot] wrote:
:warning: Python code formatter, darker found issues in your code. :warning:
You can test this locally with the following command:
```bash
darker --check --diff -r origin/main...HEAD llvm/test/lit.cfg.py
```
:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:
View the diff from darker here.
```diff
--- lit.cfg.py 2025-11-03 20:07:41.00 +
+++ lit.cfg.py 2025-11-03 20:09:10.006801 +
@@ -787,11 +787,11 @@
if config.expensive_checks:
config.available_features.add("expensive_checks")
if config.have_ondisk_cas:
-config.available_features.add('ondisk_cas')
+config.available_features.add("ondisk_cas")
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
config.available_features.add("use_msan_with_origins")
```
https://github.com/llvm/llvm-project/pull/114104
[llvm-branch-commits] [clang] [clang] "modular_format" attribute for functions using format strings (PR #147431)
https://github.com/mysterymath updated
https://github.com/llvm/llvm-project/pull/147431
>From a9ac2282d609b7aaca4f7d733960301602e1637b Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh
Date: Tue, 10 Jun 2025 14:06:53 -0700
Subject: [PATCH 1/8] [clang] "modular_format" attribute for functions using
format strings
This provides a C language version of the new IR modular-format
attribute. This, in concert with the format attribute, allows a library
function to declare that a modular version of its implementation is
available.
See issue #146159 for context.
---
clang/include/clang/Basic/Attr.td | 11 +++
clang/include/clang/Basic/AttrDocs.td | 25 +
clang/lib/CodeGen/CGCall.cpp | 12
clang/lib/Sema/SemaDeclAttr.cpp | 27 +++
4 files changed, 75 insertions(+)
diff --git a/clang/include/clang/Basic/Attr.td
b/clang/include/clang/Basic/Attr.td
index 22e60aa9fe312..69f5bf5bba461 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -5290,3 +5290,14 @@ def NonString : InheritableAttr {
let Subjects = SubjectList<[Var, Field]>;
let Documentation = [NonStringDocs];
}
+
+def ModularFormat : InheritableAttr {
+ let Spellings = [Clang<"modular_format">];
+ let Args = [
+IdentifierArgument<"ModularImplFn">,
+StringArgument<"ImplName">,
+VariadicStringArgument<"Aspects">
+ ];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [ModularFormatDocs];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td
b/clang/include/clang/Basic/AttrDocs.td
index e0bbda083b5cf..ebf1a45dbbb50 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -9635,3 +9635,28 @@ silence diagnostics with code like:
__attribute__((nonstring)) char NotAStr[3] = "foo"; // Not diagnosed
}];
}
+
+def ModularFormatDocs : Documentation {
+ let Category = DocCatFunction;
+ let Content = [{
+The ``modular_format`` attribute can be applied to a function that bears the
+``format`` attribute to indicate that the implementation is modular on the
+format string argument. When the format argument for a given call is constant,
+the compiler may redirect the call to the symbol given as the first argument to
+the attribute (the modular implementation function).
+
+The second argument is an implementation name, and the remaining arguments are
+aspects of the format string for the compiler to report. If the compiler does
+not understand an aspect, it must summarily report that the format string has
+that aspect.
+
+The compiler reports an aspect by issuing a relocation for the symbol
+``<impl name>_<aspect>``. This arranges for code and data needed to support the
+aspect of the implementation to be brought into the link to satisfy weak
+references in the modular implementation function.
+
+The following aspects are currently supported:
+
+- ``float``: The call has a floating point argument
+ }];
+}
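Reading the Attr.td and AttrDocs.td hunks together, a declaration using the attribute would presumably look like the sketch below. The function names and the "float" aspect string are illustrative assumptions, not part of the patch.

```cpp
// Hypothetical usage of the proposed attribute, inferred from the hunks above:
// the first argument names the modular implementation function (an identifier),
// the second is the implementation name, and the rest are aspect strings.
int __printf_modular(const char *fmt, ...); // assumed modular implementation

__attribute__((format(printf, 1, 2),
               modular_format(__printf_modular, "printf", "float")))
int my_printf(const char *fmt, ...);
```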
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 741fa44713ac8..67765f7fab28b 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2557,6 +2557,18 @@ void CodeGenModule::ConstructAttributeList(StringRef
Name,
if (TargetDecl->hasAttr())
FuncAttrs.addAttribute("aarch64_pstate_sm_body");
+
+if (auto *ModularFormat = TargetDecl->getAttr<ModularFormatAttr>()) {
+ // TODO: Error checking
+ FormatAttr *Format = TargetDecl->getAttr<FormatAttr>();
+ std::string FormatIdx = std::to_string(Format->getFormatIdx());
+ std::string FirstArg = std::to_string(Format->getFirstArg());
+ SmallVector Args = {
+ FormatIdx, FirstArg, ModularFormat->getModularImplFn()->getName(),
+ ModularFormat->getImplName()};
+ llvm::append_range(Args, ModularFormat->aspects());
+ FuncAttrs.addAttribute("modular-format", llvm::join(Args, ","));
+}
}
// Attach "no-builtins" attributes to:
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e6f8748db7644..8fcfb38661a8f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6783,6 +6783,29 @@ static void handleVTablePointerAuthentication(Sema &S,
Decl *D,
CustomDiscriminationValue));
}
+static void handleModularFormat(Sema &S, Decl *D, const ParsedAttr &AL) {
+ StringRef ImplName;
+ if (!S.checkStringLiteralArgumentAttr(AL, 1, ImplName))
+return;
+ SmallVector Aspects;
+ for (unsigned I = 2, E = AL.getNumArgs(); I != E; ++I) {
+StringRef Aspect;
+if (!S.checkStringLiteralArgumentAttr(AL, I, Aspect))
+ return;
+Aspects.push_back(Aspect);
+ }
+
+ // Store aspects sorted and without duplicates.
+ llvm::sort(Aspects);
+ Aspects.erase(llvm::unique(Aspects), Aspects.end());
+
+ // TODO: Type checking on identifier
+ // TODO: Merge attributes
+ D->addAttr(::new (S.Context) ModularFormatAttr(
+ S.Context, AL, AL.getArgAsIdent(0)->getIdentifierInfo(), ImplName,
+
[llvm-branch-commits] [llvm] [BOLT] Match functions with pseudo probes (PR #100446)
https://github.com/aaupov edited https://github.com/llvm/llvm-project/pull/100446
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From 2e4cc91f665904310c721dfbfb6add32e2ccefe0 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 8 -
.../AArch64/pacret-synchronous-unwind.cpp | 32 +++
2 files changed, 39 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..cc28ca47c26b1 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,17 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..e90882833323d
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,32 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: PointerAuthCFIAnalyzer only supports asynchronous unwind tables
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From 61e2f72fbfc2e51bd8bfaa052001c05328a73ab7 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index cc28ca47c26b1..e4efb11356a3d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
if (BC.MIB->isPSignOnLR(Inst)) {
if (RAState) {
// RA signing instructions should only follow
[llvm-branch-commits] [llvm] [BOLT] Improve InsertNegateRAStatePass::inferUnknownStates (PR #163381)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/163381
From 5b0920828b645e54ede2525406696229ca935d88 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 7 Oct 2025 14:01:47 +
Subject: [PATCH 1/2] [BOLT] Improve
InsertNegateRAStatePass::inferUnknownStates
Previous implementation used a simple heuristic. This can be improved in
several ways:
- If a BasicBlock has instruction both with known RAState and unknown RAState,
use the known states to work out the unknown ones.
- If a BasicBlock only consists of instructions with unknown RAState,
use the last known RAState from its predecessors, or the first known
from its successors to set the RAStates in the BasicBlock. This includes
error checking: all predecessors/successors should have the same RAState.
- Some BasicBlocks may only contain instructions with unknown RAState,
and have no CFG neighbors. These already have incorrect unwind info.
For these, we copy the last known RAState based on the layout order.
Updated bolt/docs/PacRetDesign.md to reflect changes.
---
bolt/docs/PacRetDesign.md | 23 +-
.../bolt/Passes/InsertNegateRAStatePass.h | 34 ++-
bolt/lib/Passes/InsertNegateRAStatePass.cpp | 226 --
3 files changed, 255 insertions(+), 28 deletions(-)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index f3fe5fbd522cb..c7c76cac3a100 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -200,16 +200,29 @@ This pass runs after optimizations. It performns the
_inverse_ of MarkRAState pa
Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
to know what RA state these have.
-The current solution has the `inferUnknownStates` function to cover these,
using
-a fairly simple strategy: unknown states inherit the last known state.
-
-This will be updated to a more robust solution.
-
> [!important]
> As issue #160989 describes, unwind info is incorrect in stubs with multiple
> callers.
> For this same reason, we cannot generate correct pac-specific unwind info:
> the signess
> of the _incorrect_ return address is meaningless.
+Assignment of RAStates to newly generated instructions is done in
`inferUnknownStates`.
+We have three different cases to cover:
+
+1. If a BasicBlock has some instructions with known RA state, and some
without, we
+ can copy the RAState of known instructions to the unknown ones. As the
control
+ flow only changes between BasicBlocks, instructions in the same BasicBlock
have the
+ same return address.
+
+2. If all instructions in a BasicBlock are unknown, we can look at all CFG
neighbors
+ (that is predecessors/successors). The RAState should be the same as of the
+ neighboring blocks. Conflicting RAStates in neighbors indicate an error.
Such
+ functions should be ignored.
+
+3. If a BasicBlock has no CFG neighbors, we have to copy the RAState of the
previous
+BasicBlock in layout order.
+
+If any BasicBlocks remain with unknown instructions, the function will be
ignored.
+
### Optimizations requiring special attention
Marking states before optimizations ensure that instructions can be moved
around
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 836948bf5e9c0..b4b428207b657 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/InsertNegateRAStatePass.cpp
===//
+//===- bolt/Passes/InsertNegateRAStatePass.h
--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM
Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,9 +30,39 @@ class InsertNegateRAState : public BinaryFunctionPass {
private:
/// Because states are tracked as MCAnnotations on individual instructions,
/// newly inserted instructions do not have a state associated with them.
- /// New states are "inherited" from the last known state.
void inferUnknownStates(BinaryFunction &BF);
+ /// Simple case: copy RAStates to unknown insts from previous inst.
+ /// Account for signing and authenticating insts.
+ void fillUnknownStateInBB(BinaryContext &BC, BinaryBasicBlock &BB);
+
+ /// Fill unknown RAStates in BBs with no successors/predecessors. These are
+ /// Stubs inserted by LongJmp. As of #160989, we have to copy the RAState
from
+ /// the previous BB in the layout, because CFIs are already incorrect here.
+ void fillUnknownStubs(BinaryFunction &BF);
+
+ /// Fills unknowns RAStates of BBs with successors/predecessors. Uses
+ /// getRAStateByCFG to determine the RAState. Does more than one iteration if
+ /// needed. Reports an error, if it cannot find the RAState for all BBs with
+ /// predecessors/successors.
+ void fillUnknownBlocksInCFG(BinaryFunction &BF);
+
+ /// For
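The patch is cut off here; as a rough illustration of case 2 from the design-doc hunk above (inferring a fully-unknown block's RA state from its CFG neighbors, with disagreeing neighbors treated as an error), here is a simplified sketch using stand-in types rather than BOLT's BinaryBasicBlock API:

```cpp
#include <cstdio>
#include <optional>
#include <vector>

// Stand-in for a basic block: RA state true (signed) / false (unsigned),
// or nullopt when every instruction in the block has an unknown state.
struct Block {
  std::optional<bool> RAState;
  std::vector<Block *> Preds, Succs;
};

// Case 2 from the design doc: take the state from CFG neighbors; conflicting
// predecessor/successor states mean the unwind info was already inconsistent.
bool inferFromNeighbors(Block &B) {
  std::optional<bool> Inferred;
  auto Consider = [&](const Block *N) {
    if (!N->RAState)
      return true;                 // neighbor is itself unknown; skip it
    if (Inferred && *Inferred != *N->RAState)
      return false;                // neighbors disagree: report failure
    Inferred = N->RAState;
    return true;
  };
  for (const Block *P : B.Preds)
    if (!Consider(P))
      return false;
  for (const Block *S : B.Succs)
    if (!Consider(S))
      return false;
  if (!Inferred)
    return false;                  // no known neighbor; handled by other cases
  B.RAState = Inferred;
  return true;
}

int main() {
  Block A{true, {}, {}};
  Block C{true, {}, {}};
  Block U{std::nullopt, {&A}, {&C}};
  if (inferFromNeighbors(U))
    std::printf("inferred RA state: %s\n", *U.RAState ? "signed" : "unsigned");
}
```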
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/164622
From f2f3e86e2b3e4686b8ba522301235877725c3a86 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Wed, 22 Oct 2025 12:44:37 +
Subject: [PATCH] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes
Original names were "working titles". After initial patches are merged,
I'd like to rename these passes to names that reflect their intent
better and show their relationship to each other:
InsertNegateRAStatePass renamed to PointerAuthCFIFixup,
MarkRAStates renamed to PointerAuthCFIAnalyzer.
---
bolt/docs/PacRetDesign.md | 23 ++---
...arkRAStates.h => PointerAuthCFIAnalyzer.h} | 14
...ateRAStatePass.h => PointerAuthCFIFixup.h} | 14
bolt/lib/Core/Exceptions.cpp | 8 ++---
bolt/lib/Passes/CMakeLists.txt| 4 +--
...AStates.cpp => PointerAuthCFIAnalyzer.cpp} | 16 +-
...AStatePass.cpp => PointerAuthCFIFixup.cpp} | 32 +--
bolt/lib/Rewrite/BinaryPassManager.cpp| 8 ++---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
bolt/test/AArch64/negate-ra-state.s | 8 ++---
bolt/test/AArch64/pacret-split-funcs.s| 4 +--
bolt/unittests/Passes/CMakeLists.txt | 2 +-
...ateRAState.cpp => PointerAuthCFIFixup.cpp} | 6 ++--
.../gn/secondary/bolt/lib/Passes/BUILD.gn | 4 +--
14 files changed, 73 insertions(+), 72 deletions(-)
rename bolt/include/bolt/Passes/{MarkRAStates.h => PointerAuthCFIAnalyzer.h}
(63%)
rename bolt/include/bolt/Passes/{InsertNegateRAStatePass.h =>
PointerAuthCFIFixup.h} (87%)
rename bolt/lib/Passes/{MarkRAStates.cpp => PointerAuthCFIAnalyzer.cpp} (91%)
rename bolt/lib/Passes/{InsertNegateRAStatePass.cpp =>
PointerAuthCFIFixup.cpp} (91%)
rename bolt/unittests/Passes/{InsertNegateRAState.cpp =>
PointerAuthCFIFixup.cpp} (96%)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index c7c76cac3a100..0de2da50f8fd6 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock
reordering.
## Solution design
The implementation introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the
CFIs
-in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on
+the CFI in the input binary
+2. `PointerAuthCFIFixup`: reads those assigned instruction RA states after
optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the
correct
places: wherever there is a state change between two consecutive
instructions
in the layout order.
@@ -129,7 +129,7 @@ instruction.
This special case is handled by adding an `initialRAState` bool to each
BinaryFunction.
If the `Offset` the CFI refers to is zero, we don't store an annotation, but
set
the `initialRAState` in `FillCFIInfoFor`. This information is then used in
-`MarkRAStates`.
+`PointerAuthCFIAnalyzer`.
### Binaries without DWARF info
@@ -146,7 +146,7 @@ In summary:
- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
negate-ra-state CFI.
-### MarkRAStates pass
+### PointerAuthCFIAnalyzer pass
This pass runs before optimizations reorder anything.
@@ -173,9 +173,9 @@ what we have before the pass, and after it.
| autiasp | negate-ra-state | signed |
| ret | | unsigned |
-# Error handling in MarkRAState Pass:
+# Error handling in PointerAuthCFIAnalyzer pass:
-Whenever the MarkRAStates pass finds inconsistencies in the current
+Whenever the PointerAuthCFIAnalyzer pass finds inconsistencies in the current
BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
will not optimize this function but will emit it unchanged in the original
section
(`.bolt.org.text`).
@@ -188,16 +188,17 @@ The inconsistencies are as follows:
Users will be informed about the number of ignored functions in the pass, the
exact functions ignored, and the found inconsistency.
-### InsertNegateRAStatePass
+### PointerAuthCFIFixup
-This pass runs after optimizations. It performs the _inverse_ of MarkRAState
pass:
+This pass runs after optimizations. It performs the _inverse_ of
PointerAuthCFIAnalyzer
+pass:
1. it reads the RA state annotations attached to the instructions, and
2. whenever the state changes, it adds a PseudoInstruction that holds an
OpNegateRAState CFI.
# Covering newly generated instructions:
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+Some BOLT passes can add new Instructions. In PointerAuthCFIFixup, we have
to know what RA state these have.
> [!important]
@@ -230,7 +231,7 @@ freely. The only special case is function splitting. When
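As a rough illustration of the walk described in the design doc hunks above (and not BOLT's actual data structures or API), the self-contained C++ sketch below assumes a made-up `Instr` record and uses printf in place of inserting a `DW_CFA_AARCH64_negate_ra_state` pseudo: it scans the final layout and flags every point where the RA state of two consecutive instructions differs.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for an instruction annotated with its RA state
// (true = the return address is signed at this instruction).
struct Instr {
  const char *Text;
  bool RASigned;
};

// Walk instructions in final layout order and report each point where the
// RA state changes between two consecutive instructions; a real pass would
// insert a negate-ra-state CFI pseudo there instead of printing.
static void fixupRAStateCFIs(const std::vector<Instr> &Layout) {
  bool Cur = false; // assume the function is entered with RA unsigned
  for (const Instr &I : Layout) {
    if (I.RASigned != Cur) {
      std::printf("negate-ra-state CFI needed at: %s\n", I.Text);
      Cur = I.RASigned;
    }
  }
}

int main() {
  // Mirrors the table in the design doc: signed between paciasp and autiasp.
  fixupRAStateCFIs({{"paciasp", true},
                    {"stp x29, x30, [sp, #-16]!", true},
                    {"ldp x29, x30, [sp], #16", true},
                    {"autiasp", true},
                    {"ret", false}});
  return 0;
}
```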
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From 2e4cc91f665904310c721dfbfb6add32e2ccefe0 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 8 -
.../AArch64/pacret-synchronous-unwind.cpp | 32 +++
2 files changed, 39 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..cc28ca47c26b1 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,17 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..e90882833323d
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,32 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: PointerAuthCFIAnalyzer only supports asynchronous unwind tables
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From 61e2f72fbfc2e51bd8bfaa052001c05328a73ab7 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index cc28ca47c26b1..e4efb11356a3d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
if (BC.MIB->isPSignOnLR(Inst)) {
if (RAState) {
// RA signing instructions should only follow
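The 10% threshold introduced by this patch can be reproduced in isolation; the sketch below is a hedged approximation rather than BOLT code: `reportIgnoredFunctions` is a made-up helper, the messages are abbreviated, and the zero-total guard exists only so the standalone example is well defined.

```cpp
#include <cstdio>

// Made-up helper mirroring the pass summary: print the ignore ratio and warn
// once it reaches the 10% threshold used in the patch.
static void reportIgnoredFunctions(unsigned Total, unsigned Ignored) {
  // Guard the empty case so the standalone sketch never divides by zero.
  double IgnoredPercent = Total ? (100.0 * Ignored) / Total : 0.0;
  std::printf("INFO: ran on %u functions, ignored %u (%.2lf%%)\n", Total,
              Ignored, IgnoredPercent);
  if (IgnoredPercent >= 10.0)
    std::printf("WARNING: only asynchronous unwind tables are supported; "
                "consider -fasynchronous-unwind-tables\n");
}

int main() {
  reportIgnoredFunctions(3, 1);   // 33.33% -> warning, as in the lit test
  reportIgnoredFunctions(100, 2); // 2.00%  -> no warning
  return 0;
}
```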
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/164622
From f2f3e86e2b3e4686b8ba522301235877725c3a86 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Wed, 22 Oct 2025 12:44:37 +
Subject: [PATCH] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes
Original names were "working titles". After initial patches are merged,
I'd like to rename these passes to names that reflect their intent
better and show their relationship to each other:
InsertNegateRAStatePass renamed to PointerAuthCFIFixup,
MarkRAStates renamed to PointerAuthCFIAnalyzer.
---
bolt/docs/PacRetDesign.md | 23 ++---
...arkRAStates.h => PointerAuthCFIAnalyzer.h} | 14
...ateRAStatePass.h => PointerAuthCFIFixup.h} | 14
bolt/lib/Core/Exceptions.cpp | 8 ++---
bolt/lib/Passes/CMakeLists.txt| 4 +--
...AStates.cpp => PointerAuthCFIAnalyzer.cpp} | 16 +-
...AStatePass.cpp => PointerAuthCFIFixup.cpp} | 32 +--
bolt/lib/Rewrite/BinaryPassManager.cpp| 8 ++---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
bolt/test/AArch64/negate-ra-state.s | 8 ++---
bolt/test/AArch64/pacret-split-funcs.s| 4 +--
bolt/unittests/Passes/CMakeLists.txt | 2 +-
...ateRAState.cpp => PointerAuthCFIFixup.cpp} | 6 ++--
.../gn/secondary/bolt/lib/Passes/BUILD.gn | 4 +--
14 files changed, 73 insertions(+), 72 deletions(-)
rename bolt/include/bolt/Passes/{MarkRAStates.h => PointerAuthCFIAnalyzer.h}
(63%)
rename bolt/include/bolt/Passes/{InsertNegateRAStatePass.h =>
PointerAuthCFIFixup.h} (87%)
rename bolt/lib/Passes/{MarkRAStates.cpp => PointerAuthCFIAnalyzer.cpp} (91%)
rename bolt/lib/Passes/{InsertNegateRAStatePass.cpp =>
PointerAuthCFIFixup.cpp} (91%)
rename bolt/unittests/Passes/{InsertNegateRAState.cpp =>
PointerAuthCFIFixup.cpp} (96%)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index c7c76cac3a100..0de2da50f8fd6 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock
reordering.
## Solution design
The implementation introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the
CFIs
-in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on
+the CFI in the input binary
+2. `PointerAuthCFIFixup`: reads those assigned instruction RA states after
optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the
correct
places: wherever there is a state change between two consecutive
instructions
in the layout order.
@@ -129,7 +129,7 @@ instruction.
This special case is handled by adding an `initialRAState` bool to each
BinaryFunction.
If the `Offset` the CFI refers to is zero, we don't store an annotation, but
set
the `initialRAState` in `FillCFIInfoFor`. This information is then used in
-`MarkRAStates`.
+`PointerAuthCFIAnalyzer`.
### Binaries without DWARF info
@@ -146,7 +146,7 @@ In summary:
- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
negate-ra-state CFI.
-### MarkRAStates pass
+### PointerAuthCFIAnalyzer pass
This pass runs before optimizations reorder anything.
@@ -173,9 +173,9 @@ what we have before the pass, and after it.
| autiasp | negate-ra-state | signed |
| ret | | unsigned |
-# Error handling in MarkRAState Pass:
+# Error handling in PointerAuthCFIAnalyzer pass:
-Whenever the MarkRAStates pass finds inconsistencies in the current
+Whenever the PointerAuthCFIAnalyzer pass finds inconsistencies in the current
BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
will not optimize this function but will emit it unchanged in the original
section
(`.bolt.org.text`).
@@ -188,16 +188,17 @@ The inconsistencies are as follows:
Users will be informed about the number of ignored functions in the pass, the
exact functions ignored, and the found inconsistency.
-### InsertNegateRAStatePass
+### PointerAuthCFIFixup
-This pass runs after optimizations. It performs the _inverse_ of MarkRAState
pass:
+This pass runs after optimizations. It performs the _inverse_ of
PointerAuthCFIAnalyzer
+pass:
1. it reads the RA state annotations attached to the instructions, and
2. whenever the state changes, it adds a PseudoInstruction that holds an
OpNegateRAState CFI.
# Covering newly generated instructions:
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+Some BOLT passes can add new Instructions. In PointerAuthCFIFixup, we have
to know what RA state these have.
> [!important]
@@ -230,7 +231,7 @@ freely. The only special case is function splitting. When
[llvm-branch-commits] [lldb] bdf29ee - Revert "[lldb-dap] Use protocol types for exceptioninfo (#165858)"
Author: David Spickett
Date: 2025-11-03T13:12:02Z
New Revision: bdf29ee3e8d23a48aea02acd0f77dcba5bdf93db
URL:
https://github.com/llvm/llvm-project/commit/bdf29ee3e8d23a48aea02acd0f77dcba5bdf93db
DIFF:
https://github.com/llvm/llvm-project/commit/bdf29ee3e8d23a48aea02acd0f77dcba5bdf93db.diff
LOG: Revert "[lldb-dap] Use protocol types for exceptioninfo (#165858)"
This reverts commit 5f3f175a517a25ca9f2ef38ea5cda83fc7a8d0d6.
Added:
Modified:
lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp
lldb/tools/lldb-dap/Handler/RequestHandler.h
lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
lldb/unittests/DAP/CMakeLists.txt
lldb/unittests/DAP/ProtocolTypesTest.cpp
lldb/unittests/TestingSupport/TestUtilities.cpp
lldb/unittests/TestingSupport/TestUtilities.h
Removed:
lldb/unittests/DAP/ProtocolRequestsTest.cpp
diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp
b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp
index ddf55e6fb382d..c1c2adb32a510 100644
--- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp
@@ -7,75 +7,168 @@
//===--===//
#include "DAP.h"
-#include "DAPError.h"
-#include "Protocol/ProtocolRequests.h"
-#include "Protocol/ProtocolTypes.h"
+#include "EventHelper.h"
+#include "JSONUtils.h"
#include "RequestHandler.h"
#include "lldb/API/SBStream.h"
-using namespace lldb_dap::protocol;
-
namespace lldb_dap {
-/// Retrieves the details of the exception that caused this event to be raised.
-///
-/// Clients should only call this request if the corresponding capability
-/// `supportsExceptionInfoRequest` is true.
-llvm::Expected
-ExceptionInfoRequestHandler::Run(const ExceptionInfoArguments &args) const {
-
- lldb::SBThread thread = dap.GetLLDBThread(args.threadId);
- if (!thread.IsValid())
-return llvm::make_error(
-llvm::formatv("Invalid thread id: {}", args.threadId).str());
-
- ExceptionInfoResponseBody response;
- response.breakMode = eExceptionBreakModeAlways;
- const lldb::StopReason stop_reason = thread.GetStopReason();
- switch (stop_reason) {
- case lldb::eStopReasonSignal:
-response.exceptionId = "signal";
-break;
- case lldb::eStopReasonBreakpoint: {
-const ExceptionBreakpoint *exc_bp =
-dap.GetExceptionBPFromStopReason(thread);
-if (exc_bp) {
- response.exceptionId = exc_bp->GetFilter();
- response.description = exc_bp->GetLabel();
+// "ExceptionInfoRequest": {
+// "allOf": [ { "$ref": "#/definitions/Request" }, {
+// "type": "object",
+// "description": "Retrieves the details of the exception that
+// caused this event to be raised. Clients should only call this request if
+// the corresponding capability `supportsExceptionInfoRequest` is true.",
+// "properties": {
+// "command": {
+// "type": "string",
+// "enum": [ "exceptionInfo" ]
+// },
+// "arguments": {
+// "$ref": "#/definitions/ExceptionInfoArguments"
+// }
+// },
+// "required": [ "command", "arguments" ]
+// }]
+// },
+// "ExceptionInfoArguments": {
+// "type": "object",
+// "description": "Arguments for `exceptionInfo` request.",
+// "properties": {
+// "threadId": {
+// "type": "integer",
+// "description": "Thread for which exception information should be
+// retrieved."
+// }
+// },
+// "required": [ "threadId" ]
+// },
+// "ExceptionInfoResponse": {
+// "allOf": [ { "$ref": "#/definitions/Response" }, {
+// "type": "object",
+// "description": "Response to `exceptionInfo` request.",
+// "properties": {
+// "body": {
+// "type": "object",
+// "properties": {
+// "exceptionId": {
+// "type": "string",
+// "description": "ID of the exception that was thrown."
+// },
+// "description": {
+// "type": "string",
+// "description": "Descriptive text for the exception."
+// },
+// "breakMode": {
+// "$ref": "#/definitions/ExceptionBreakMode",
+//"description": "Mode that caused the exception notification to
+//be raised."
+// },
+// "details": {
+// "$ref": "#/definitions/ExceptionDetails",
+//"description": "Detailed information about the exception."
+// }
+// },
+// "required": [ "exceptionId", "breakMode" ]
+// }
+// },
+// "required": [ "body" ]
+// }]
+// }
+// "ExceptionDetails": {
+// "type": "object",
+// "description": "Detailed informa
[llvm-branch-commits] [openmp] 8e2cd28 - [OpenMP] Fix preprocessor mismatches between include and usages of hwloc (#158349)
Author: Peter Arzt
Date: 2025-11-03T09:35:47Z
New Revision: 8e2cd28cd4ba46613a46467b0c91b1cabead26cd
URL:
https://github.com/llvm/llvm-project/commit/8e2cd28cd4ba46613a46467b0c91b1cabead26cd
DIFF:
https://github.com/llvm/llvm-project/commit/8e2cd28cd4ba46613a46467b0c91b1cabead26cd.diff
LOG: [OpenMP] Fix preprocessor mismatches between include and usages of hwloc
(#158349)
Fix https://github.com/llvm/llvm-project/issues/156679
There is a mismatch between the preprocessor guards around the include
of `hwloc.h` and those protecting its usages, leading to build failures
on Darwin: https://github.com/spack/spack-packages/pull/1212
This change introduces `KMP_HWLOC_ENABLED` that reflects
whether hwloc is actually used.
(cherry picked from commit cd24d108a2c19c23c4ac80b501fa7361963cca3d)
Added:
Modified:
openmp/runtime/src/kmp.h
openmp/runtime/src/kmp_affinity.cpp
openmp/runtime/src/kmp_affinity.h
openmp/runtime/src/kmp_alloc.cpp
openmp/runtime/src/kmp_dispatch.h
openmp/runtime/src/kmp_global.cpp
openmp/runtime/src/kmp_settings.cpp
Removed:
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index f62cabee6ea84..197cf54765285 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -106,12 +106,15 @@ class kmp_stats_list;
// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc
#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC)
#include "hwloc.h"
+#define KMP_HWLOC_ENABLED 1
#ifndef HWLOC_OBJ_NUMANODE
#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
#endif
#ifndef HWLOC_OBJ_PACKAGE
#define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET
#endif
+#else
+#define KMP_HWLOC_ENABLED 0
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
@@ -692,10 +695,10 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE,
const GROUP_AFFINITY *,
extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#endif /* KMP_OS_WINDOWS */
-#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC)
+#if KMP_HWLOC_ENABLED
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
-#endif
+#endif // KMP_HWLOC_ENABLED
extern size_t __kmp_affin_mask_size;
#define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
@@ -804,10 +807,10 @@ class KMPAffinity {
static void destroy_api();
enum api_type {
NATIVE_OS
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
,
HWLOC
-#endif
+#endif // KMP_HWLOC_ENABLED
};
virtual api_type get_api_type() const {
KMP_ASSERT(0);
@@ -876,9 +879,9 @@ enum affinity_top_method {
affinity_top_method_group,
#endif /* KMP_GROUP_AFFINITY */
affinity_top_method_flat,
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
affinity_top_method_hwloc,
-#endif
+#endif // KMP_HWLOC_ENABLED
affinity_top_method_default
};
@@ -1145,9 +1148,9 @@ typedef struct kmp_allocator_t {
omp_alloctrait_value_t target_access;
omp_alloctrait_value_t atomic_scope;
size_t part_size;
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
omp_alloctrait_value_t membind;
-#endif
+#endif // KMP_HWLOC_ENABLED
} kmp_allocator_t;
extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
@@ -2107,12 +2110,12 @@ typedef struct dispatch_shared_info {
#if KMP_USE_HIER_SCHED
void *hier;
#endif
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// When linking with libhwloc, the ORDERED EPCC test slows down on big
// machines (> 48 cores). Performance analysis showed that a cache thrash
// was occurring and this padding helps alleviate the problem.
char padding[64];
-#endif
+#endif // KMP_HWLOC_ENABLED
} dispatch_shared_info_t;
typedef struct kmp_disp {
diff --git a/openmp/runtime/src/kmp_affinity.cpp
b/openmp/runtime/src/kmp_affinity.cpp
index a6065fe792d55..50389502d3b45 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -19,13 +19,13 @@
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
-#endif
+#endif // KMP_HWLOC_ENABLED
#include
// The machine topology
@@ -1438,7 +1438,7 @@ void KMPAffinity::pick_api() {
KMPAffinity *affinity_dispatch;
if (picked_api)
return;
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// Only use Hwloc if affinity isn't explicitly disabled and
// user requests Hwloc topology method
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
@@ -1446,7 +1446,7 @@ void KMPAffinity::pick_api() {
affinity_dispatch = new KMPHwlocAffinity();
__kmp_hwloc_available = true;
} else
-#endif
+#endif // KMP_HWLOC_ENABLED
{
affinity_dispatch = new KMPNativeAffinity();
}
@@ -1697,7 +1697,7 @@ kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
// Original
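The essence of the fix is deriving a single feature macro at the point where the optional header is (or is not) included, so every later guard tests exactly the condition the include used. A generic sketch of that pattern follows; the macro and header names are invented and are not the actual kmp.h symbols.

```cpp
#include <cstdio>

// Decide exactly once whether the optional library is usable. If each usage
// site re-derived the condition, the include guard and the usage guards could
// drift apart (the bug this patch fixes); a derived macro keeps them in sync.
#if defined(USE_OPTLIB) && defined(PLATFORM_SUPPORTED) && !defined(SKIP_OPTLIB)
// #include "optlib.h" // only pulled in when every condition holds
#define OPTLIB_ENABLED 1
#else
#define OPTLIB_ENABLED 0
#endif

int main() {
#if OPTLIB_ENABLED
  std::printf("optlib code path compiled in\n");
#else
  std::printf("optlib disabled; using the fallback path\n");
#endif
  return 0;
}
```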
[llvm-branch-commits] [openmp] release/21.x: [OpenMP] Fix preprocessor mismatches between include and usages of hwloc (#158349) (PR #163768)
https://github.com/c-rhodes closed https://github.com/llvm/llvm-project/pull/163768
[llvm-branch-commits] [openmp] release/21.x: [OpenMP] Fix preprocessor mismatches between include and usages of hwloc (#158349) (PR #163768)
https://github.com/c-rhodes updated
https://github.com/llvm/llvm-project/pull/163768
From 8e2cd28cd4ba46613a46467b0c91b1cabead26cd Mon Sep 17 00:00:00 2001
From: Peter Arzt
Date: Wed, 15 Oct 2025 10:58:41 +0200
Subject: [PATCH] [OpenMP] Fix preprocessor mismatches between include and
usages of hwloc (#158349)
Fix https://github.com/llvm/llvm-project/issues/156679
There is a mismatch between the preprocessor guards around the include
of `hwloc.h` and those protecting its usages, leading to build failures
on Darwin: https://github.com/spack/spack-packages/pull/1212
This change introduces `KMP_HWLOC_ENABLED` that reflects
whether hwloc is actually used.
(cherry picked from commit cd24d108a2c19c23c4ac80b501fa7361963cca3d)
---
openmp/runtime/src/kmp.h| 23 --
openmp/runtime/src/kmp_affinity.cpp | 24 +++
openmp/runtime/src/kmp_affinity.h | 6 +++---
openmp/runtime/src/kmp_alloc.cpp| 30 ++---
openmp/runtime/src/kmp_dispatch.h | 4 ++--
openmp/runtime/src/kmp_global.cpp | 4 ++--
openmp/runtime/src/kmp_settings.cpp | 20 +--
7 files changed, 57 insertions(+), 54 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index f62cabee6ea84..197cf54765285 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -106,12 +106,15 @@ class kmp_stats_list;
// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc
#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC)
#include "hwloc.h"
+#define KMP_HWLOC_ENABLED 1
#ifndef HWLOC_OBJ_NUMANODE
#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
#endif
#ifndef HWLOC_OBJ_PACKAGE
#define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET
#endif
+#else
+#define KMP_HWLOC_ENABLED 0
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
@@ -692,10 +695,10 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE,
const GROUP_AFFINITY *,
extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#endif /* KMP_OS_WINDOWS */
-#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC)
+#if KMP_HWLOC_ENABLED
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
-#endif
+#endif // KMP_HWLOC_ENABLED
extern size_t __kmp_affin_mask_size;
#define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
@@ -804,10 +807,10 @@ class KMPAffinity {
static void destroy_api();
enum api_type {
NATIVE_OS
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
,
HWLOC
-#endif
+#endif // KMP_HWLOC_ENABLED
};
virtual api_type get_api_type() const {
KMP_ASSERT(0);
@@ -876,9 +879,9 @@ enum affinity_top_method {
affinity_top_method_group,
#endif /* KMP_GROUP_AFFINITY */
affinity_top_method_flat,
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
affinity_top_method_hwloc,
-#endif
+#endif // KMP_HWLOC_ENABLED
affinity_top_method_default
};
@@ -1145,9 +1148,9 @@ typedef struct kmp_allocator_t {
omp_alloctrait_value_t target_access;
omp_alloctrait_value_t atomic_scope;
size_t part_size;
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
omp_alloctrait_value_t membind;
-#endif
+#endif // KMP_HWLOC_ENABLED
} kmp_allocator_t;
extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
@@ -2107,12 +2110,12 @@ typedef struct dispatch_shared_info {
#if KMP_USE_HIER_SCHED
void *hier;
#endif
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// When linking with libhwloc, the ORDERED EPCC test slows down on big
// machines (> 48 cores). Performance analysis showed that a cache thrash
// was occurring and this padding helps alleviate the problem.
char padding[64];
-#endif
+#endif // KMP_HWLOC_ENABLED
} dispatch_shared_info_t;
typedef struct kmp_disp {
diff --git a/openmp/runtime/src/kmp_affinity.cpp
b/openmp/runtime/src/kmp_affinity.cpp
index a6065fe792d55..50389502d3b45 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -19,13 +19,13 @@
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
-#endif
+#endif // KMP_HWLOC_ENABLED
#include
// The machine topology
@@ -1438,7 +1438,7 @@ void KMPAffinity::pick_api() {
KMPAffinity *affinity_dispatch;
if (picked_api)
return;
-#if KMP_USE_HWLOC
+#if KMP_HWLOC_ENABLED
// Only use Hwloc if affinity isn't explicitly disabled and
// user requests Hwloc topology method
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
@@ -1446,7 +1446,7 @@ void KMPAffinity::pick_api() {
affinity_dispatch = new KMPHwlocAffinity();
__kmp_hwloc_available = true;
} else
-#endif
+#endif // KMP_HWLOC_ENABLED
{
affinity_dispatch = new KMPNativeAffinity();
}
@@ -1697,7 +1697,7 @@ kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
[llvm-branch-commits] [openmp] release/21.x: [OpenMP] Fix preprocessor mismatches between include and usages of hwloc (#158349) (PR #163768)
github-actions[bot] wrote: @pearzt (or anyone else). If you would like to add a note about this fix in the release notes (completely optional). Please reply to this comment with a one or two sentence description of the fix. When you are done, please add the release:note label to this PR. https://github.com/llvm/llvm-project/pull/163768
[llvm-branch-commits] [flang] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (PR #165865)
https://github.com/Stylie777 approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/165865
[llvm-branch-commits] [flang] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (PR #165865)
https://github.com/Stylie777 edited https://github.com/llvm/llvm-project/pull/165865
[llvm-branch-commits] [flang] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (PR #165865)
@@ -2558,11 +2574,24 @@ bool OmpAttributeVisitor::Pre(const
parser::OpenMPThreadprivate &x) {
return true;
}
-bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) {
+bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) {
PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
- if (const auto &list{std::get>(x.t)}) {
-ResolveOmpObjectList(*list, Symbol::Flag::OmpDeclarativeAllocateDirective);
+ assert(!partStack_.empty() && "Misplaced directive");
+
+ auto ompFlag{partStack_.back() == PartKind::SpecificationPart
+ ? Symbol::Flag::OmpDeclarativeAllocateDirective
+ : Symbol::Flag::OmpExecutableAllocateDirective};
+
+ parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)};
+ for (const parser::OmpAllocateDirective *ad : info.dirs) {
+for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) {
+ if (auto *object{omp::GetArgumentObject(arg)}) {
+ResolveOmpObject(*object, ompFlag);
+ }
+}
}
+
+ PopContext();
Stylie777 wrote:
Makes sense, thanks for that.
https://github.com/llvm/llvm-project/pull/165865
[llvm-branch-commits] [flang] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (PR #165865)
@@ -239,33 +238,138 @@ class CanonicalizationOfOmp {
}
}
- void RewriteOmpAllocations(parser::ExecutionPart &body) {
-// Rewrite leading declarative allocations so they are nested
-// within their respective executable allocate directive
-//
-// Original:
-// ExecutionPartConstruct -> OpenMPDeclarativeAllocate
-// ExecutionPartConstruct -> OpenMPDeclarativeAllocate
-// ExecutionPartConstruct -> OpenMPExecutableAllocate
-//
-// After rewriting:
-// ExecutionPartConstruct -> OpenMPExecutableAllocate
-// ExecutionPartConstruct -> OpenMPDeclarativeAllocate
-// ExecutionPartConstruct -> OpenMPDeclarativeAllocate
-for (auto it = body.v.rbegin(); it != body.v.rend();) {
- if (auto *exec = GetOmpIf(*(it++))) {
-parser::OpenMPDeclarativeAllocate *decl;
-std::list subAllocates;
-while (it != body.v.rend() &&
-(decl = GetOmpIf(*it))) {
- subAllocates.push_front(std::move(*decl));
- it = decltype(it)(body.v.erase(std::next(it).base()));
+ // Canonicalization of allocate directives
+ //
+ // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative
+ // one or an executable one. As usual in such cases, this poses a problem
+ // when the directive appears at the boundary between the specification part
+ // and the execution part.
+ // The executable form can actually consist of several adjacent directives,
+ // whereas the declarative form is always standalone. Additionally, the
+ // executable form must be associated with an allocate statement.
+ //
+ // The parser tries to parse declarative statements first, so in the
+ // following case, the two directives will be declarative, even though
+ // they should be treated as a single executable form:
+ // integer, allocatable :: x, y ! Specification
+ // !$omp allocate(x)
+ // !$omp allocate(y)
+ // allocate(x, y) ! Execution
+ //
+ void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) {
+auto found = blockForSpec_.find(&spec);
+if (found == blockForSpec_.end()) {
+ // There is no corresponding execution part, so there is nothing to do.
+ return;
+}
+parser::Block &block = *found->second;
+
+auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) {
+ if (auto *ec = std::get_if(&epc.u)) {
+if (auto *as =
+std::get_if>(&ec->u)) {
+ return std::holds_alternative<
+ common::Indirection>(as->statement.u);
+}
+ }
+ return false;
+};
+
+if (!block.empty() && isAllocateStmt(block.front())) {
+ // There are two places where an OpenMP declarative construct can
+ // show up in the tuple in specification part:
+ // (1) in std::list, or
+ // (2) in std::list.
+ // The case (1) is only possible if the list (2) is empty.
+
+ auto &omps =
+ std::get>(spec.t);
+ auto &decls = std::get>(spec.t);
+
+ if (!decls.empty()) {
+MakeExecutableAllocateFromDecls(decls, block);
+ } else {
+MakeExecutableAllocateFromOmps(omps, block);
+ }
+}
+ }
+
+ parser::ExecutionPartConstruct EmbedInExec(
+ parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc)
{
+// Nest current epc inside the allocate directive.
+std::get(alo->t).push_front(std::move(epc));
+// Set the new epc to be the ExecutionPartConstruct made from
+// the allocate directive.
+parser::OpenMPConstruct opc(std::move(*alo));
+common::Indirection ind(std::move(opc));
+parser::ExecutableConstruct ec(std::move(ind));
+return parser::ExecutionPartConstruct(std::move(ec));
+ }
+
+ void MakeExecutableAllocateFromDecls(
+ std::list &decls, parser::Block &body) {
+using OpenMPDeclarativeConstruct =
+common::Indirection;
+
+auto getAllocate = [](parser::DeclarationConstruct *dc) {
+ if (auto *sc = std::get_if(&dc->u)) {
+if (auto *odc = std::get_if(&sc->u)) {
+ if (auto *alo =
+ std::get_if(&odc->value().u)) {
+return alo;
+ }
+}
+ }
+ return static_cast(nullptr);
+};
+
+std::list::reverse_iterator rlast = [&]() {
+ for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit)
{
Stylie777 wrote:
If it's matching what is already done, then that's ok with me!
https://github.com/llvm/llvm-project/pull/165865
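A toy model of the canonicalization discussed in this review, with strings standing in for flang's parse-tree nodes: trailing `!$omp allocate` directives at the end of the specification part are peeled off and folded, together with the leading ALLOCATE statement of the execution part, into one executable construct. All names below are invented for illustration; the real implementation works on the parser classes shown in the diff.

```cpp
#include <iostream>
#include <list>
#include <string>
#include <vector>

// One executable allocate construct: the adjacent directives plus the
// Fortran ALLOCATE statement they apply to.
struct ExecAllocate {
  std::vector<std::string> Directives;
  std::string Statement;
};

static bool isAllocateDirective(const std::string &S) {
  return S.rfind("!$omp allocate", 0) == 0;
}

int main() {
  std::list<std::string> Spec{"integer, allocatable :: x, y",
                              "!$omp allocate(x)", "!$omp allocate(y)"};
  std::list<std::string> Exec{"allocate(x, y)", "x = 1"};

  ExecAllocate Canonical;
  // Peel trailing allocate directives off the specification part...
  while (!Spec.empty() && isAllocateDirective(Spec.back())) {
    Canonical.Directives.insert(Canonical.Directives.begin(), Spec.back());
    Spec.pop_back();
  }
  // ...and attach them to the allocate statement they belong to.
  Canonical.Statement = Exec.front();
  Exec.pop_front();

  for (const std::string &D : Canonical.Directives)
    std::cout << D << "\n";
  std::cout << Canonical.Statement << "  ! now one executable construct\n";
  return 0;
}
```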
[llvm-branch-commits] [clang] [llvm] [AArch64][llvm] Add instructions for FEAT_MOPS_GO (PR #164913)
https://github.com/jthackray updated
https://github.com/llvm/llvm-project/pull/164913
From 7b8957b6ea8ece09d5b0822fdbab7d637e09bf4f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray
Date: Tue, 2 Sep 2025 16:26:53 +0100
Subject: [PATCH] [AArch64][llvm] Add instructions for FEAT_MOPS_GO
Add the following `FEAT_MOPS_GO` instructions:
* `SETGOP`, `SETGOM`, `SETGOE`
* `SETGOPN`, `SETGOMN`, `SETGOEN`
* `SETGOPT`, `SETGOMT`, `SETGOET`
* `SETGOPTN`, `SETGOMTN`, `SETGOETN`
as documented here:
https://developer.arm.com/documentation/109697/2025_09/Future-Architecture-Technologies
---
clang/test/Driver/aarch64-vfat.c | 4 +
.../print-supported-extensions-aarch64.c | 1 +
llvm/lib/Target/AArch64/AArch64Features.td| 3 +
.../lib/Target/AArch64/AArch64InstrFormats.td | 39 +---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 +++
.../AArch64/AsmParser/AArch64AsmParser.cpp| 15 +++-
.../test/MC/AArch64/arm-mops-go-diagnostics.s | 56
llvm/test/MC/AArch64/arm-mops-go.s| 89 +++
.../TargetParser/TargetParserTest.cpp | 3 +
9 files changed, 206 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/MC/AArch64/arm-mops-go-diagnostics.s
create mode 100644 llvm/test/MC/AArch64/arm-mops-go.s
diff --git a/clang/test/Driver/aarch64-vfat.c b/clang/test/Driver/aarch64-vfat.c
index fa268641a86e0..63096336ceb76 100644
--- a/clang/test/Driver/aarch64-vfat.c
+++ b/clang/test/Driver/aarch64-vfat.c
@@ -13,3 +13,7 @@
// RUN: %clang -target aarch64 -march=armv9.7a+btie -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-BTIE %s
// RUN: %clang -target aarch64 -march=armv9.7-a+btie -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-BTIE %s
// VFAT-BTIE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic"
"-target-feature" "+v9.7a"{{.*}} "-target-feature" "+btie"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+mops-go -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-MOPS-GO %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+mops-go -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-MOPS-GO %s
+// VFAT-MOPS-GO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu"
"generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mops-go"
diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c
b/clang/test/Driver/print-supported-extensions-aarch64.c
index d0c86c7065281..93373f41ad2cf 100644
--- a/clang/test/Driver/print-supported-extensions-aarch64.c
+++ b/clang/test/Driver/print-supported-extensions-aarch64.c
@@ -50,6 +50,7 @@
// CHECK-NEXT: lsuiFEAT_LSUI
Enable Armv9.6-A unprivileged load/store instructions
// CHECK-NEXT: lut FEAT_LUT
Enable Lookup Table instructions
// CHECK-NEXT: mopsFEAT_MOPS
Enable Armv8.8-A memcpy and memset acceleration instructions
+// CHECK-NEXT: mops-go FEAT_MOPS_GO
Enable memset acceleration granule only
// CHECK-NEXT: mpamv2 FEAT_MPAMv2
Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions
// CHECK-NEXT: memtag FEAT_MTE, FEAT_MTE2
Enable Memory Tagging Extension
// CHECK-NEXT: mtetc FEAT_MTETC
Enable Virtual Memory Tagging Extension
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td
b/llvm/lib/Target/AArch64/AArch64Features.td
index c4f6e000dff66..51e602ad7e0f2 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -638,6 +638,9 @@ def FeatureS1POE2: ExtensionWithMArch<"poe2", "POE2",
"FEAT_S1POE2",
def FeatureTEV: ExtensionWithMArch<"tev", "TEV", "FEAT_TEV",
"Enable TIndex Exception-like Vector instructions">;
+def FeatureMOPS_GO: ExtensionWithMArch<"mops-go", "MOPS_GO", "FEAT_MOPS_GO",
+ "Enable memset acceleration granule only">;
+
//===--===//
// Other Features
//===--===//
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index a39bb31e61196..37b9363d45083 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12604,7 +12604,7 @@ class MOPSMemoryMove opcode, bits<2> op1,
bits<2> op2, string asm>
: MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>;
class MOPSMemorySetBase opcode, bit op1, bit op2,
-string asm>
+bit op3, string asm>
: I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
asm, "\t[$Rd]!, $Rn!, $Rm"
[llvm-branch-commits] [clang] [llvm] [AArch64][llvm] Add instructions for FEAT_MOPS_GO (PR #164913)
https://github.com/jthackray updated
https://github.com/llvm/llvm-project/pull/164913
From 7b8957b6ea8ece09d5b0822fdbab7d637e09bf4f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray
Date: Tue, 2 Sep 2025 16:26:53 +0100
Subject: [PATCH] [AArch64][llvm] Add instructions for FEAT_MOPS_GO
Add the following `FEAT_MOPS_GO` instructions:
* `SETGOP`, `SETGOM`, `SETGOE`
* `SETGOPN`, `SETGOMN`, `SETGOEN`
* `SETGOPT`, `SETGOMT`, `SETGOET`
* `SETGOPTN`, `SETGOMTN`, `SETGOETN`
as documented here:
https://developer.arm.com/documentation/109697/2025_09/Future-Architecture-Technologies
---
clang/test/Driver/aarch64-vfat.c | 4 +
.../print-supported-extensions-aarch64.c | 1 +
llvm/lib/Target/AArch64/AArch64Features.td| 3 +
.../lib/Target/AArch64/AArch64InstrFormats.td | 39 +---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 +++
.../AArch64/AsmParser/AArch64AsmParser.cpp| 15 +++-
.../test/MC/AArch64/arm-mops-go-diagnostics.s | 56
llvm/test/MC/AArch64/arm-mops-go.s| 89 +++
.../TargetParser/TargetParserTest.cpp | 3 +
9 files changed, 206 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/MC/AArch64/arm-mops-go-diagnostics.s
create mode 100644 llvm/test/MC/AArch64/arm-mops-go.s
diff --git a/clang/test/Driver/aarch64-vfat.c b/clang/test/Driver/aarch64-vfat.c
index fa268641a86e0..63096336ceb76 100644
--- a/clang/test/Driver/aarch64-vfat.c
+++ b/clang/test/Driver/aarch64-vfat.c
@@ -13,3 +13,7 @@
// RUN: %clang -target aarch64 -march=armv9.7a+btie -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-BTIE %s
// RUN: %clang -target aarch64 -march=armv9.7-a+btie -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-BTIE %s
// VFAT-BTIE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic"
"-target-feature" "+v9.7a"{{.*}} "-target-feature" "+btie"
+
+// RUN: %clang -target aarch64 -march=armv9.7a+mops-go -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-MOPS-GO %s
+// RUN: %clang -target aarch64 -march=armv9.7-a+mops-go -### -c %s 2>&1 |
FileCheck -check-prefix=VFAT-MOPS-GO %s
+// VFAT-MOPS-GO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu"
"generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mops-go"
diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c
b/clang/test/Driver/print-supported-extensions-aarch64.c
index d0c86c7065281..93373f41ad2cf 100644
--- a/clang/test/Driver/print-supported-extensions-aarch64.c
+++ b/clang/test/Driver/print-supported-extensions-aarch64.c
@@ -50,6 +50,7 @@
// CHECK-NEXT: lsuiFEAT_LSUI
Enable Armv9.6-A unprivileged load/store instructions
// CHECK-NEXT: lut FEAT_LUT
Enable Lookup Table instructions
// CHECK-NEXT: mopsFEAT_MOPS
Enable Armv8.8-A memcpy and memset acceleration instructions
+// CHECK-NEXT: mops-go FEAT_MOPS_GO
Enable memset acceleration granule only
// CHECK-NEXT: mpamv2 FEAT_MPAMv2
Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions
// CHECK-NEXT: memtag FEAT_MTE, FEAT_MTE2
Enable Memory Tagging Extension
// CHECK-NEXT: mtetc FEAT_MTETC
Enable Virtual Memory Tagging Extension
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td
b/llvm/lib/Target/AArch64/AArch64Features.td
index c4f6e000dff66..51e602ad7e0f2 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -638,6 +638,9 @@ def FeatureS1POE2: ExtensionWithMArch<"poe2", "POE2",
"FEAT_S1POE2",
def FeatureTEV: ExtensionWithMArch<"tev", "TEV", "FEAT_TEV",
"Enable TIndex Exception-like Vector instructions">;
+def FeatureMOPS_GO: ExtensionWithMArch<"mops-go", "MOPS_GO", "FEAT_MOPS_GO",
+ "Enable memset acceleration granule only">;
+
//===--===//
// Other Features
//===--===//
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index a39bb31e61196..37b9363d45083 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -12604,7 +12604,7 @@ class MOPSMemoryMove opcode, bits<2> op1,
bits<2> op2, string asm>
: MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>;
class MOPSMemorySetBase opcode, bit op1, bit op2,
-string asm>
+bit op3, string asm>
: I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
(ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
asm, "\t[$Rd]!, $Rn!, $Rm"
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From 7f94312fa57230dd8bfa0874d2cc26b7cf86f1b3 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 9 +-
.../AArch64/pacret-synchronous-unwind.cpp | 32 +++
2 files changed, 40 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..01af88818a21d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,18 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables. For C compilers, see "
+ "-fasynchronous-unwind-tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..025075245efa0
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,32 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From 2164b391e7547538cc6f467253e3f5a1a1f1dd4c Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 01af88818a21d..5979d5fb01818 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(Bina
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
@@ -133,11 +140,17 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables.\n";
bgergely0 wrote:
updated
https://github.com/llvm/llvm-project/pull/165227
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From e2dd1e8c70375efd7701b0eb84e77ca1b9693ec4 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 9 -
.../AArch64/pacret-synchronous-unwind.cpp | 33 +++
2 files changed, 41 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..01af88818a21d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,18 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables. For C compilers, see "
+ "-fasynchronous-unwind-tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..b505dd6b81f1f
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,33 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
+// unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From 503fbba12c9b919561a465d20b33f11d0262b225 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 01af88818a21d..5979d5fb01818 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(Bi
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
github-actions[bot] wrote:
:warning: C/C++ code formatter, clang-format found issues in your code. :warning:
You can test this locally with the following command:
```bash
git-clang-format --diff origin/main HEAD --extensions cpp -- bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp --diff_from_common_commit
```
:warning: The reproduction instructions above might return results for more than one PR in a stack if you are using a stacked PR workflow. You can limit the results by changing `origin/main` to the base branch/commit you want to compare against.
:warning: View the diff from clang-format here.
```diff
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
index 025075245..b505dd6b8 100644
--- a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -15,7 +15,8 @@
 // CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
 // CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
 // CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
-// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
+// unwind tables. For C compilers, see -fasynchronous-unwind-tables.

 #include
 #include
```
https://github.com/llvm/llvm-project/pull/165227
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From 938fa78ff75cdea7580a45fad4b3d6d0dfe4a8de Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 9 -
.../AArch64/pacret-synchronous-unwind.cpp | 33 +++
2 files changed, 41 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..01af88818a21d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,18 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables. For C compilers, see "
+ "-fasynchronous-unwind-tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..1bfeeaed3715a
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,33 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
+// CHECK-SAME: unwind tables. For C compilers, see
-fasynchronous-unwind-tables.
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From 8583399d7b5b8fe645c1481a6ba4a14e5f1847c6 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 01af88818a21d..5979d5fb01818 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::run
[llvm-branch-commits] [llvm] AArch64: Enable terminal rule (PR #165959)
https://github.com/davemgreen approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/165959 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] ARM: Enable terminal rule (PR #165958)
https://github.com/davemgreen approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/165958 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
SamTebbs33 wrote:
Could you rename this test to something like `revert_add` and remove
`local_unnamed_addr #0`?
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void
tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
-return;
+return false;
AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
+ return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+ VPCostContext &Ctx) {
+ VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+ if (auto *PhiR = dyn_cast(Acc)) {
+PhiR->setVFScaleFactor(1);
+
+// We also need to update the scale factor of the reduction-start-vector
+// operand.
+VPValue *StartV, *IdentityV;
+if (!match(PhiR->getOperand(0),
+ m_VPInstruction(
+ m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue(
+ llvm_unreachable("Unexpected operand for a partial reduction");
+Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+cast(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+ }
+
+ if (auto *R = dyn_cast(Acc))
+if (R->getVFScaleFactor() != 1)
+ lowerPartialReduction(Plan, R, Ctx);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Lowering " << *Red
+ << " back to regular reduction, because it is not profitable\n");
+
+ // Lower the partial reduction to a regular binop.
+ VPBuilder Builder(Red);
+ VPInstruction *Add = Builder.createNaryOp(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ {Red->getChainOp(), Red->getVecOp()});
+ if (Red->isConditional())
+Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+ Red->replaceAllUsesWith(Add);
+ Red->eraseFromParent();
}
void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(
vp_depth_first_deep(Plan.getVectorLoopRegion( {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *Red = dyn_cast(&R))
-tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ auto *Red = dyn_cast(&R);
+ if (!Red)
+continue;
+
+ if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
+ isa(Red)) {
+// If there isn't a profitable VPExpression for a partial reduction,
+// then that suggests using a partial reduction is not profitable
+// for this VPlan. It seems better to resort to a regular (middle-block)
+// reduction, so that the this plan is still profitable to consider.
+// Otherwise, the plan might be discarded in favour of a smaller VF.
+//
+// FIXME: There's a lot to unpick when it comes to partial
+// reductions, but this should provide a temporary stop-gap until we
+// reimplement the logic for creating partial reductions.
+lowerPartialReduction(Plan, cast(Red), Ctx);
SamTebbs33 wrote:
I think we should only call this if the reduction actually is partial,
otherwise we'll waste some time essentially doing nothing in the lower function.
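A minimal sketch of the guarded call being suggested, reusing the recipe and helper names from the patch; the exact placement inside convertToAbstractRecipes is assumed here, not taken from the PR:
```cpp
// Sketch only: run the lowering solely for genuine partial reductions; a
// plain VPReductionRecipe has nothing to revert, so skip the call entirely.
if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range)) {
  if (auto *PartialRed = dyn_cast<VPPartialReductionRecipe>(Red))
    lowerPartialReduction(Plan, PartialRed, Ctx);
}
```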
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void
tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
-return;
+return false;
AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
+ return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+ VPCostContext &Ctx) {
+ VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+ if (auto *PhiR = dyn_cast(Acc)) {
+PhiR->setVFScaleFactor(1);
+
+// We also need to update the scale factor of the reduction-start-vector
+// operand.
+VPValue *StartV, *IdentityV;
+if (!match(PhiR->getOperand(0),
+ m_VPInstruction(
+ m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue(
+ llvm_unreachable("Unexpected operand for a partial reduction");
+Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+cast(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+ }
+
+ if (auto *R = dyn_cast(Acc))
+if (R->getVFScaleFactor() != 1)
+ lowerPartialReduction(Plan, R, Ctx);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Lowering " << *Red
+ << " back to regular reduction, because it is not profitable\n");
+
+ // Lower the partial reduction to a regular binop.
+ VPBuilder Builder(Red);
+ VPInstruction *Add = Builder.createNaryOp(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ {Red->getChainOp(), Red->getVecOp()});
+ if (Red->isConditional())
+Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+ Red->replaceAllUsesWith(Add);
+ Red->eraseFromParent();
}
void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(
vp_depth_first_deep(Plan.getVectorLoopRegion( {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *Red = dyn_cast(&R))
-tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ auto *Red = dyn_cast(&R);
+ if (!Red)
+continue;
+
+ if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
+ isa(Red)) {
+// If there isn't a profitable VPExpression for a partial reduction,
+// then that suggests using a partial reduction is not profitable
+// for this VPlan. It seems better to resort to a regular (middle-block)
+// reduction, so that the this plan is still profitable to consider.
SamTebbs33 wrote:
Extra `the` here.
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:[[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT:br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT:[[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]],
label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT:[[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT:[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]],
i64 [[INDEX]]
+; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 4
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 8
+; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 12
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT:[[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:[[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT:[[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT:[[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT:[[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:[[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT:[[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT:[[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:[[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:[[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:[[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT:[[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT:[[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x
i64> [[BIN_RDX9]])
+; CHECK-NEXT:[[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[CMP_N]], label %[[BB25:.*]], label
%[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT:br i1 [[MIN_EPILOG_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:[[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:[[N_VEC11:%.*]] = sub i64
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void
tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
-return;
+return false;
AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
+ return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+ VPCostContext &Ctx) {
+ VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+ if (auto *PhiR = dyn_cast(Acc)) {
+PhiR->setVFScaleFactor(1);
+
+// We also need to update the scale factor of the reduction-start-vector
+// operand.
+VPValue *StartV, *IdentityV;
+if (!match(PhiR->getOperand(0),
+ m_VPInstruction(
+ m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue(
+ llvm_unreachable("Unexpected operand for a partial reduction");
+Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+cast(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+ }
+
+ if (auto *R = dyn_cast(Acc))
+if (R->getVFScaleFactor() != 1)
+ lowerPartialReduction(Plan, R, Ctx);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Lowering " << *Red
+ << " back to regular reduction, because it is not profitable\n");
SamTebbs33 wrote:
I think we want an `a` before `regular`.
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s | FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
SamTebbs33 wrote:
Could you add some check statements that make sure that the reversion to a normal reduction happened? I think we'll also want a test with a subtract, as well as chained add and chained subtract.
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
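To make the request concrete, plausible source loops for the extra coverage would be a widening subtract reduction and a chained add/subtract into one accumulator; these are assumptions about the intended tests, not the ones actually added to the PR:
```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical input only: a widening subtract reduction ...
int64_t sub_reduce(const int32_t *A, std::size_t N) {
  int64_t S = 0;
  for (std::size_t I = 0; I < N; ++I)
    S -= static_cast<int64_t>(A[I]);
  return S;
}

// ... and a chained reduction that adds and subtracts into one accumulator.
int64_t chained_reduce(const int32_t *A, const int32_t *B, std::size_t N) {
  int64_t S = 0;
  for (std::size_t I = 0; I < N; ++I) {
    S += static_cast<int64_t>(A[I]);
    S -= static_cast<int64_t>(B[I]);
  }
  return S;
}
```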
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline (PR #165746)
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Chaitanya (skc7)
Changes
This PR enables the `amdgpu-lower-special-lds` pass in the AMDGPU pass
pipeline.
Also adds tests which validate the lowering of named-barrier globals in asan
and normal scenarios.
---
Patch is 39.90 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/165746.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp (-126)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp (+2-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+14)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll
(+122)
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll (+73)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll (+1-1)
``diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
-}
- }
-}
-if (!NeedsReplacement)
- return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
- }
-}
-return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
-continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
- } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
- continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
- // already
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline (PR #165746)
https://github.com/skc7 edited https://github.com/llvm/llvm-project/pull/165746 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline (PR #165746)
https://github.com/skc7 ready_for_review https://github.com/llvm/llvm-project/pull/165746 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline (PR #165746)
https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/165746
>From 32323922ae49ee0fe2ca9ab0572ba43a63336198 Mon Sep 17 00:00:00 2001
From: skc7
Date: Thu, 30 Oct 2025 22:42:33 +0530
Subject: [PATCH 1/2] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline
---
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 --
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 6 +
llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 3 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 ++
...amdgpu-lower-special-lds-and-module-lds.ll | 119 +
.../amdgpu-lower-special-lds-and-sw-lds.ll| 86
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 +
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +-
9 files changed, 236 insertions(+), 131 deletions(-)
create mode 100644
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll
create mode 100644
llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
- static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
- Function *KF) {
-bool NeedsReplacement = false;
-for (Use &U : GV->uses()) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (isKernelLDS(F) && F != KF) {
- NeedsReplacement = true;
- break;
-}
- }
-}
-if (!NeedsReplacement)
- return GV;
-// Create a new GV used only by this kernel and its function
-GlobalVariable *NewGV = new GlobalVariable(
-M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-NewGV->copyAttributesFrom(GV);
-for (Use &U : make_early_inc_range(GV->uses())) {
- if (auto *I = dyn_cast(U.getUser())) {
-Function *F = I->getFunction();
-if (!isKernelLDS(F) || F == KF) {
- U.getUser()->replaceUsesOfWith(GV, NewGV);
-}
- }
-}
-return NewGV;
- }
-
- bool lowerSpecialLDSVariables(
- Module &M, LDSUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-bool Changed = false;
-const DataLayout &DL = M.getDataLayout();
-// The 1st round: give module-absolute assignments
-int NumAbsolutes = 0;
-std::vector OrderedGVs;
-for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
-continue;
- // give a module-absolute assignment if it is indirectly accessed by
- // multiple kernels. This is not precise, but we don't want to duplicate
- // a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-OrderedGVs.push_back(GV);
- } else {
-// leave it to the 2nd round, which will give a kernel-relative
-// assignment if it is only indirectly accessed by one kernel
-LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
- }
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-}
-OrderedGVs = sortByName(std::move(OrderedGVs));
-for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
- unsigned BarId = NumAbsolutes + 1;
- unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
- NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
-}
-OrderedGVs.clear();
-
-// The 2nd round: give a kernel-relative assignment for GV that
-// either only indirectly accessed by single kernel or only directly
-// accessed by multiple kernels.
-std::vector OrderedKernels;
-for (auto &K : LDSUsesInfo.direct_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- OrderedKernels.push_back(F);
-}
-OrderedKernels = sortByName(std::move(OrderedKernels));
-
-llvm::DenseMap Kernel2BarId;
-for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-if (!isNamedBarrier(*GV))
- continue;
-
-LDSUsesInfo.direct_access[F].erase(GV);
-if (GV->isAbsoluteSymbolRef()) {
- // already assigned
-
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
https://github.com/sdesmalen-arm created
https://github.com/llvm/llvm-project/pull/166138
The way partial reductions currently work is as follows:
* Reductions are analysed to determine whether they are suitable partial reductions,
and if so a VPlan is constructed with partial reductions.
* When creating VPExpressions, the LV tries to see if it's beneficial to bundle
the operation into a VPExpression. If the cost of a partial reduction is too
high, then the answer is 'no' and it will remain unbundled. This means the LV
may end up calculating too high a cost for a partial reduction VPlan, because
it still includes the cost of the extends.
* When the cost of a VPlan with partial reductions is higher than the cost of a
VPlan without partial reductions, it will favour the plan without partial
reductions. But this is often a plan with a lower VF, because partial
reductions get the extends for free (and to do this for a full vector, it would
need a higher VF).
* This means that if the cost of a partial reduction is too high, it will pick
a lower VF, rather than trying to fall back onto a regular reduction (possibly
with the same VF).
This PR is a workaround and not the full solution, but there are so many things
to unpick with partial reductions, that I think this is a good intermediary
step before changing how we create partial reduction vplans.
The better solution would be to defer the decision on which style of
reduction to choose until the VPExpression costing has run, since that
analysis also determines what kind of expression it is and whether the
extends can be folded into the operation.
This aims to address the issue reported in #165226
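As a rough scalar model of that trade-off (illustrative only, not LV code): in one vector step over sixteen narrow inputs, a partial reduction folds four inputs into each of four wide accumulator lanes, so the extends are absorbed by the accumulate, while a regular reduction at the same VF needs one wide lane per input lane.
```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Partial-reduction style: 16 inputs feed 4 wide lanes (VF scale factor 4).
int64_t partialStyle(const int8_t *In, std::size_t N) {
  std::array<int64_t, 4> Acc{};
  for (std::size_t I = 0; I < N; ++I)
    Acc[I % 4] += In[I]; // the widening is folded into the accumulate
  return Acc[0] + Acc[1] + Acc[2] + Acc[3];
}

// Regular-reduction style at the same VF: one wide lane per input lane.
int64_t regularStyle(const int8_t *In, std::size_t N) {
  std::array<int64_t, 16> Acc{};
  for (std::size_t I = 0; I < N; ++I)
    Acc[I % 16] += static_cast<int64_t>(In[I]); // explicit extend, then add
  int64_t Sum = 0;
  for (int64_t V : Acc)
    Sum += V;
  return Sum;
}
```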
>From 04d8e7071b963652e77fdcc6847fc3173e655d8a Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Sun, 2 Nov 2025 21:58:55 +
Subject: [PATCH] [LV] Choose best reduction for VPlan
The way partial reductions currently work is as follows:
* Reductions are analysed if they are suitable partial reductions,
and if so a VPlan is constructed with partial reductions.
* When creating VPExpressions, the LV tries to see if it's beneficial
to bundle the operation into a VPExpression. If the cost of a partial
reduction is too high, then the answer is 'no' and it will remain
unbundled. This means the LV may end up calculating too high a cost
for a partial reduction VPlan, because it still includes the cost of
the extends.
* When the cost of a VPlan with partial reductions is higher than the
plan of a VPlan without partial reductions, it will favour the plan
without partial reductions. But this is often a plan with a lower VF,
because partial reductions get the extends for free (and to do this
for a full vector, it would need a higher VF).
* This means that if the cost of a partial reduction is too high, it
will pick a lower VF, rather than trying to fall back onto a regular
reduction (possibly with the same VF).
This PR is a workaround and not the full solution, but there are so
many things to unpick with partial reductions, that I think this is a
good intermediary step before changing how we create partial reduction
vplans.
The better solution would be to wait with the decision on which
style of reduction to choose, based on the cost of the VPExpressions
which also do the analysis to see what kind of expression it is, and
whether the extends can be folded into the operation.
This aims to address the issue reported in #165226
---
.../AArch64/AArch64TargetTransformInfo.cpp| 14 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 67 -
.../AArch64/partial-reduce-constant-ops.ll| 16 +--
...tial-reduce-lower-back-to-reguar-reduce.ll | 136 ++
.../LoopVectorize/AArch64/partial-reduce.ll | 40 +++---
6 files changed, 241 insertions(+), 34 deletions(-)
create mode 100644
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-lower-back-to-reguar-reduce.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e8352be692aaf..d454d4e98bfc1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
+ unsigned Ratio =
+ AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+
+ // A ratio of 1 would mean it's similar to a regular add, e.g.
+ // v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
+ // <=> add v4i64 %acc, %vec
+ if (Ratio == 1) {
+auto *T = VectorType::get(AccumType, VF);
+return getArithmeticInstrCost(Opcode, T, CostKind) +
+ (BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
+ }
+
if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
(!ST->isNeonAvailable() || !ST->hasDotProd()))
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
Changes
The way partial reductions currently work is as follows:
* Reductions are analysed if they are suitable partial reductions, and if so a
VPlan is constructed with partial reductions.
* When creating VPExpressions, the LV tries to see if it's beneficial to bundle
the operation into a VPExpression. If the cost of a partial reduction is too
high, then the answer is 'no' and it will remain unbundled. This means the LV
may end up calculating too high a cost for a partial reduction VPlan, because
it still includes the cost of the extends.
* When the cost of a VPlan with partial reductions is higher than the plan of a
VPlan without partial reductions, it will favour the plan without partial
reductions. But this is often a plan with a lower VF, because partial
reductions get the extends for free (and to do this for a full vector, it would
need a higher VF).
* This means that if the cost of a partial reduction is too high, it will pick
a lower VF, rather than trying to fall back onto a regular reduction (possibly
with the same VF).
This PR is a workaround and not the full solution, but there are so many things
to unpick with partial reductions, that I think this is a good intermediary
step before changing how we create partial reduction vplans.
The better solution would be to wait with the decision on which style of
reduction to choose, based on the cost of the VPExpressions which also do the
analysis to see what kind of expression it is, and whether the extends can be
folded into the operation.
This aims to address the issue reported in #165226
---
Patch is 29.44 KiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/166138.diff
6 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+12-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+63-4)
- (modified)
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll
(+8-8)
- (added)
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-lower-back-to-reguar-reduce.ll
(+136)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
(+20-20)
``diff
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e8352be692aaf..d454d4e98bfc1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
+ unsigned Ratio =
+ AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+
+ // A ratio of 1 would mean it's similar to a regular add, e.g.
+ // v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
+ // <=> add v4i64 %acc, %vec
+ if (Ratio == 1) {
+auto *T = VectorType::get(AccumType, VF);
+return getArithmeticInstrCost(Opcode, T, CostKind) +
+ (BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
+ }
+
if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
(!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
@@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (IsUSDot && !ST->hasMatMulInt8())
return Invalid;
- unsigned Ratio =
- AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h
b/llvm/lib/Transforms/Vectorize/VPlan.h
index aba6d351a8e5d..ac0bbb16b2334 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }
+ void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b45536869c5af..88df3d49d5b0c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,6 +40,8 @@
using namespace llvm;
using namespace VPlanPatternMatch;
+#define DEBUG_TYPE "loop-vectorize"
+
static cl::opt EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));
@@ -3761,7 +3763,7 @@
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
/// This function tries to create
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/164622
From 77a0b64af37649b4ec4c0de34284a5f0c57b0a53 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Wed, 22 Oct 2025 12:44:37 +
Subject: [PATCH] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes
Original names were "working titles". After initial patches are merged,
I'd like to rename these passes to names that reflect their intent
better and show their relationship to each other:
InsertNegateRAStatePass renamed to PointerAuthCFIFixup,
MarkRAStates renamed to PointerAuthCFIAnalyzer.
---
bolt/docs/PacRetDesign.md | 23 ++---
...arkRAStates.h => PointerAuthCFIAnalyzer.h} | 14
...ateRAStatePass.h => PointerAuthCFIFixup.h} | 14
bolt/lib/Core/Exceptions.cpp | 8 ++---
bolt/lib/Passes/CMakeLists.txt| 4 +--
...AStates.cpp => PointerAuthCFIAnalyzer.cpp} | 16 +-
...AStatePass.cpp => PointerAuthCFIFixup.cpp} | 32 +--
bolt/lib/Rewrite/BinaryPassManager.cpp| 8 ++---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
bolt/test/AArch64/negate-ra-state.s | 8 ++---
bolt/test/AArch64/pacret-split-funcs.s| 4 +--
bolt/unittests/Passes/CMakeLists.txt | 2 +-
...ateRAState.cpp => PointerAuthCFIFixup.cpp} | 6 ++--
.../gn/secondary/bolt/lib/Passes/BUILD.gn | 4 +--
14 files changed, 73 insertions(+), 72 deletions(-)
rename bolt/include/bolt/Passes/{MarkRAStates.h => PointerAuthCFIAnalyzer.h}
(63%)
rename bolt/include/bolt/Passes/{InsertNegateRAStatePass.h =>
PointerAuthCFIFixup.h} (87%)
rename bolt/lib/Passes/{MarkRAStates.cpp => PointerAuthCFIAnalyzer.cpp} (91%)
rename bolt/lib/Passes/{InsertNegateRAStatePass.cpp =>
PointerAuthCFIFixup.cpp} (91%)
rename bolt/unittests/Passes/{InsertNegateRAState.cpp =>
PointerAuthCFIFixup.cpp} (97%)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index c7c76cac3a100..0de2da50f8fd6 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock
reordering.
## Solution design
The implementation introduces two new passes:
-1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs
-in the input binary
-2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on
+the CFI in the input binary
+2. `PointerAuthCFIFixup`: reads those assigned instruction RA states after
optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the correct
places: wherever there is a state change between two consecutive instructions
in the layout order.
@@ -129,7 +129,7 @@ instruction.
This special case is handled by adding an `initialRAState` bool to each
BinaryFunction.
If the `Offset` the CFI refers to is zero, we don't store an annotation, but set
the `initialRAState` in `FillCFIInfoFor`. This information is then used in
-`MarkRAStates`.
+`PointerAuthCFIAnalyzer`.
### Binaries without DWARF info
@@ -146,7 +146,7 @@ In summary:
- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
negate-ra-state CFI.
-### MarkRAStates pass
+### PointerAuthCFIAnalyzer pass
This pass runs before optimizations reorder anything.
@@ -173,9 +173,9 @@ what we have before the pass, and after it.
| autiasp | negate-ra-state | signed |
| ret | | unsigned |
-# Error handling in MarkRAState Pass:
+# Error handling in PointerAuthCFIAnalyzer pass:
-Whenever the MarkRAStates pass finds inconsistencies in the current
+Whenever the PointerAuthCFIAnalyzer pass finds inconsistencies in the current
BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
will not optimize this function but will emit it unchanged in the original section
(`.bolt.org.text`).
@@ -188,16 +188,17 @@ The inconsistencies are as follows:
Users will be informed about the number of ignored functions in the pass, the
exact functions ignored, and the found inconsistency.
-### InsertNegateRAStatePass
+### PointerAuthCFIFixup
-This pass runs after optimizations. It performns the _inverse_ of MarkRAState
pa s:
+This pass runs after optimizations. It performns the _inverse_ of
PointerAuthCFIAnalyzer
+pass:
1. it reads the RA state annotations attached to the instructions, and
2. whenever the state changes, it adds a PseudoInstruction that holds an
OpNegateRAState CFI.
# Covering newly generated instructions:
-Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+Some BOLT passes can add new Instructions. In PointerAuthCFIFixup, we have
to know what RA state these have.
> [!important]
@@ -230,7 +231,7 @@ freely. The only special case is function splitting. When
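For readers of the digest, a self-contained sketch of the state-change walk that the PacRetDesign.md excerpt above describes for PointerAuthCFIFixup; the types are simplified stand-ins, not BOLT's BinaryFunction or MCPlus API:
```cpp
#include <vector>

enum class RAState { Unsigned, Signed };

struct Inst {
  RAState State;
  bool NegateCFIBefore = false; // where a DW_CFA_AARCH64_negate_ra_state goes
};

// Walk the final layout and request a negate-ra-state CFI wherever the RA
// state flips between two consecutive instructions.
void insertNegateCFIs(std::vector<Inst> &Layout, RAState InitialState) {
  RAState Prev = InitialState;
  for (Inst &I : Layout) {
    if (I.State != Prev)
      I.NegateCFIBefore = true;
    Prev = I.State;
  }
}
```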
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From b1a908146a5b0c5ff7b4f27b63395f4577867847 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 8 -
.../AArch64/pacret-synchronous-unwind.cpp | 32 +++
2 files changed, 39 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..cc28ca47c26b1 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,17 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..e90882833323d
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,32 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: PointerAuthCFIAnalyzer only supports asynchronous unwind tables
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From f32f7116eead18e9d2489321af239cce4b05f43c Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index cc28ca47c26b1..e4efb11356a3d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
if (BC.MIB->isPSignOnLR(Inst)) {
if (RAState) {
// RA signing instructions should only follow
[llvm-branch-commits] [llvm] [BOLT] Improve InsertNegateRAStatePass::inferUnknownStates (PR #163381)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/163381
From 5b0920828b645e54ede2525406696229ca935d88 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 7 Oct 2025 14:01:47 +
Subject: [PATCH 1/4] [BOLT] Improve
InsertNegateRAStatePass::inferUnknownStates
Previous implementation used a simple heuristic. This can be improved in
several ways:
- If a BasicBlock has instructions both with known RAState and unknown RAState,
use the known states to work out the unknown ones.
- If a BasicBlock only consists of instructions with unknown RAState,
use the last known RAState from its predecessors, or the first known
from its successors to set the RAStates in the BasicBlock. This includes
error checking: all predecessors/successors should have the same RAState.
- Some BasicBlocks may only contain instructions with unknown RAState,
and have no CFG neighbors. These already have incorrect unwind info.
For these, we copy the last known RAState based on the layout order.
Updated bolt/docs/PacRetDesign.md to reflect changes.
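A self-contained sketch of the three cases listed above; Block and its fields are illustrative stand-ins, not BOLT's BinaryBasicBlock or MCAnnotation machinery:
```cpp
#include <cstddef>
#include <optional>
#include <vector>

struct Block {
  std::vector<std::optional<bool>> RAState; // one state per instruction
  std::vector<const Block *> Neighbors;     // CFG predecessors and successors
};

// Case 1: propagate known states to unknown instructions inside one block.
void fillWithinBlock(Block &B) {
  std::optional<bool> Last;
  for (std::optional<bool> &S : B.RAState) {
    if (S)
      Last = S;
    else
      S = Last;
  }
}

// Cases 2 and 3: a fully unknown block takes the state its CFG neighbors
// agree on; an isolated block (a stub) copies the previous block in layout
// order. Returns false on conflicting neighbors, i.e. ignore the function.
bool fillAcrossBlocks(std::vector<Block> &Layout) {
  for (std::size_t I = 0; I < Layout.size(); ++I) {
    Block &B = Layout[I];
    bool AllUnknown = true;
    for (const std::optional<bool> &S : B.RAState)
      AllUnknown = AllUnknown && !S.has_value();
    if (!AllUnknown)
      continue;
    std::optional<bool> Agreed;
    for (const Block *N : B.Neighbors)
      for (const std::optional<bool> &S : N->RAState)
        if (S) {
          if (Agreed && *Agreed != *S)
            return false; // conflicting RAStates in CFG neighbors
          Agreed = S;
        }
    if (!Agreed && I > 0 && !Layout[I - 1].RAState.empty())
      Agreed = Layout[I - 1].RAState.back(); // stub: copy layout predecessor
    for (std::optional<bool> &S : B.RAState)
      S = Agreed;
  }
  return true;
}
```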
---
bolt/docs/PacRetDesign.md | 23 +-
.../bolt/Passes/InsertNegateRAStatePass.h | 34 ++-
bolt/lib/Passes/InsertNegateRAStatePass.cpp | 226 --
3 files changed, 255 insertions(+), 28 deletions(-)
diff --git a/bolt/docs/PacRetDesign.md b/bolt/docs/PacRetDesign.md
index f3fe5fbd522cb..c7c76cac3a100 100644
--- a/bolt/docs/PacRetDesign.md
+++ b/bolt/docs/PacRetDesign.md
@@ -200,16 +200,29 @@ This pass runs after optimizations. It performns the
_inverse_ of MarkRAState pa
Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
to know what RA state these have.
-The current solution has the `inferUnknownStates` function to cover these, using
-a fairly simple strategy: unknown states inherit the last known state.
-
-This will be updated to a more robust solution.
-
> [!important]
> As issue #160989 describes, unwind info is incorrect in stubs with multiple
> callers.
> For this same reason, we cannot generate correct pac-specific unwind info:
> the signess
> of the _incorrect_ return address is meaningless.
+Assignment of RAStates to newly generated instructions is done in
`inferUnknownStates`.
+We have three different cases to cover:
+
+1. If a BasicBlock has some instructions with known RA state, and some without, we
+ can copy the RAState of known instructions to the unknown ones. As the control
+ flow only changes between BasicBlocks, instructions in the same BasicBlock have the
+ same return address.
+
+2. If all instructions in a BasicBlock are unknown, we can look at all CFG neighbors
+ (that is predecessors/successors). The RAState should be the same as of the
+ neighboring blocks. Conflicting RAStates in neighbors indicate an error. Such
+ functions should be ignored.
+
+3. If a BasicBlock has no CFG neighbors, we have to copy the RAState of the previous
+BasicBlock in layout order.
+
+If any BasicBlocks remain with unknown instructions, the function will be ignored.
+
### Optimizations requiring special attention
Marking states before optimizations ensure that instructions can be moved around
diff --git a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
index 836948bf5e9c0..b4b428207b657 100644
--- a/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
+++ b/bolt/include/bolt/Passes/InsertNegateRAStatePass.h
@@ -1,4 +1,4 @@
-//===- bolt/Passes/InsertNegateRAStatePass.cpp
===//
+//===- bolt/Passes/InsertNegateRAStatePass.h
--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM
Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,9 +30,39 @@ class InsertNegateRAState : public BinaryFunctionPass {
private:
/// Because states are tracked as MCAnnotations on individual instructions,
/// newly inserted instructions do not have a state associated with them.
- /// New states are "inherited" from the last known state.
void inferUnknownStates(BinaryFunction &BF);
+ /// Simple case: copy RAStates to unknown insts from previous inst.
+ /// Account for signing and authenticating insts.
+ void fillUnknownStateInBB(BinaryContext &BC, BinaryBasicBlock &BB);
+
+ /// Fill unknown RAStates in BBs with no successors/predecessors. These are
+ /// Stubs inserted by LongJmp. As of #160989, we have to copy the RAState
from
+ /// the previous BB in the layout, because CFIs are already incorrect here.
+ void fillUnknownStubs(BinaryFunction &BF);
+
+ /// Fills unknowns RAStates of BBs with successors/predecessors. Uses
+ /// getRAStateByCFG to determine the RAState. Does more than one iteration if
+ /// needed. Reports an error, if it cannot find the RAState for all BBs with
+ /// predecessors/successors.
+ void fillUnknownBlocksInCFG(BinaryFunction &BF);
+
+ /// For
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
https://github.com/bgergely0 updated
https://github.com/llvm/llvm-project/pull/165227
From b1a908146a5b0c5ff7b4f27b63395f4577867847 Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Mon, 27 Oct 2025 09:29:54 +
Subject: [PATCH 1/2] [BOLT][PAC] Warn about synchronous unwind tables
BOLT currently ignores functions with synchronous PAuth DWARF info.
When more than 10% of functions get ignored for inconsistencies, we
should emit a warning to only use asynchronous unwind tables.
See also: #165215
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 8 -
.../AArch64/pacret-synchronous-unwind.cpp | 32 +++
2 files changed, 39 insertions(+), 1 deletion(-)
create mode 100644 bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index 91030544d2b88..cc28ca47c26b1 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -133,11 +133,17 @@ Error
PointerAuthCFIAnalyzer::runOnFunctions(BinaryContext &BC) {
ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "PointerAuthCFIAnalyzer");
+
+ float IgnoredPercent = (100.0 * FunctionsIgnored) / Total;
BC.outs() << "BOLT-INFO: PointerAuthCFIAnalyzer ran on " << Total
<< " functions. Ignored " << FunctionsIgnored << " functions "
-<< format("(%.2lf%%)", (100.0 * FunctionsIgnored) / Total)
+<< format("(%.2lf%%)", IgnoredPercent)
<< " because of CFI inconsistencies\n";
+ if (IgnoredPercent >= 10.0)
+BC.outs() << "BOLT-WARNING: PointerAuthCFIAnalyzer only supports "
+ "asynchronous unwind tables.\n";
+
return Error::success();
}
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
new file mode 100644
index 0..e90882833323d
--- /dev/null
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -0,0 +1,32 @@
+// Test to demonstrate that functions compiled with synchronous unwind tables
+// are ignored by the PointerAuthCFIAnalyzer.
+// Exception handling is needed to have _any_ unwind tables, otherwise the
+// PointerAuthCFIAnalyzer does not run on these functions, so it does not
ignore
+// any function.
+//
+// REQUIRES: system-linux,bolt-runtime
+//
+// RUN: %clangxx --target=aarch64-unknown-linux-gnu \
+// RUN: -mbranch-protection=pac-ret \
+// RUN: -fno-asynchronous-unwind-tables \
+// RUN: %s -o %t.exe -Wl,-q
+// RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
+//
+// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
+// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
+// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
+// CHECK-NEXT: PointerAuthCFIAnalyzer only supports asynchronous unwind tables
+
+#include
+#include
+
+void foo() { throw std::runtime_error("Exception from foo()."); }
+
+int main() {
+ try {
+foo();
+ } catch (const std::exception &e) {
+printf("Exception caught: %s\n", e.what());
+ }
+ return 0;
+}
From f32f7116eead18e9d2489321af239cce4b05f43c Mon Sep 17 00:00:00 2001
From: Gergely Balint
Date: Tue, 28 Oct 2025 09:23:08 +
Subject: [PATCH 2/2] [BOLT] Use opts::Verbosity in PointerAuthCFIAnalyzer
---
bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp| 27 ---
bolt/test/AArch64/negate-ra-state-incorrect.s | 2 +-
2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
index cc28ca47c26b1..e4efb11356a3d 100644
--- a/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
+++ b/bolt/lib/Passes/PointerAuthCFIAnalyzer.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace opts {
+extern llvm::cl::opt<unsigned> Verbosity;
+} // namespace opts
+
namespace llvm {
namespace bolt {
@@ -43,9 +47,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
// Not all functions have .cfi_negate_ra_state in them. But if one
does,
// we expect psign/pauth instructions to have the hasNegateRAState
// annotation.
-BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
- << BF.getPrintName()
- << ": ptr sign/auth inst without .cfi_negate_ra_state\n";
+if (opts::Verbosity >= 1)
+ BC.outs() << "BOLT-INFO: inconsistent RAStates in function "
+<< BF.getPrintName()
+<< ": ptr sign/auth inst without .cfi_negate_ra_state\n";
std::lock_guard Lock(IgnoreMutex);
BF.setIgnored();
return false;
@@ -65,9 +70,10 @@ bool PointerAuthCFIAnalyzer::runOnFunction(BinaryFunction
&BF) {
if (BC.MIB->isPSignOnLR(Inst)) {
if (RAState) {
// RA signing instructions should only follow
[llvm-branch-commits] [clang] Backport of #136412 to release/21.x (PR #165842)
https://github.com/AaronBallman edited https://github.com/llvm/llvm-project/pull/165842 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport of #136412 to release/21.x (PR #165842)
https://github.com/AaronBallman commented: Thanks! I'd like a bit more details on why this should be backported. (What's the need and why are these changes safe to land?) https://github.com/llvm/llvm-project/pull/165842 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport of #136412 to release/21.x (PR #165842)
@@ -0,0 +1,28 @@ +// Test HIPSPV static device library linking +// REQUIRES: system-linux +// UNSUPPORTED: system-windows AaronBallman wrote: Why is this unsupported on Windows? https://github.com/llvm/llvm-project/pull/165842 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Improve InsertNegateRAStatePass::inferUnknownStates (PR #163381)
@@ -91,44 +105,214 @@ void
InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
});
// If a function is already split in the input, the first FF can also start
// with Signed state. This covers that scenario as well.
- auto RAState = BC.MIB->getRAState(*(*FirstNonEmpty)->begin());
+ auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+ auto RAState = BC.MIB->getRAState(*II);
if (!RAState) {
BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
<< " in function " << BF.getPrintName() << "\n";
PassFailed = true;
return;
}
if (*RAState)
-BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
+BF.addCFIInstruction(*FirstNonEmpty, II,
MCCFIInstruction::createNegateRAState(nullptr));
}
-void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+std::optional<bool>
+InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+ for (const MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+auto RAStateOpt = BC.MIB->getRAState(Inst);
+if (RAStateOpt)
+ return RAStateOpt;
+ }
+ return std::nullopt;
+}
+
+void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+
+ auto First = BB.getFirstNonPseudo();
+ if (First == BB.end())
+return;
+ // If the first instruction has unknown RAState, we should copy the first
+ // known RAState.
+ auto RAStateOpt = BC.MIB->getRAState(*First);
+ if (!RAStateOpt) {
+auto FirstRAState = getFirstKnownRAState(BC, BB);
+if (!FirstRAState)
+ // We fill unknown BBs later.
+ return;
+
+BC.MIB->setRAState(*First, *FirstRAState);
+ }
+
+ // At this point we know the RAState of the first instruction,
+ // so we can propagate the RAStates to all subsequent unknown instructions.
+ MCInst Prev = *First;
+ for (auto It = BB.begin() + 1; It != BB.end(); ++It) {
+MCInst &Inst = *It;
+if (BC.MIB->isCFI(Inst))
+ continue;
+
+auto PrevRAState = BC.MIB->getRAState(Prev);
+if (!PrevRAState)
+ llvm_unreachable("Previous Instruction has no RAState.");
bgergely0 wrote:
TODO: change to fatal BOLT error, based on the discussion in #162820
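A minimal sketch of the direction this TODO suggests, assuming the pass keeps the `PassFailed` plus `BC.errs()` convention it already uses elsewhere; the exact error mechanism is still being settled in #162820:
```cpp
// Sketch only: fail the pass with a BOLT-ERROR instead of aborting the whole
// process via llvm_unreachable. PassFailed is the flag this pass already sets
// in coverFunctionFragmentStart.
auto PrevRAState = BC.MIB->getRAState(Prev);
if (!PrevRAState) {
  BC.errs() << "BOLT-ERROR: previous instruction has no RAState while "
               "propagating RAStates in a basic block\n";
  PassFailed = true;
  return;
}
```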
https://github.com/llvm/llvm-project/pull/163381
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Improve InsertNegateRAStatePass::inferUnknownStates (PR #163381)
@@ -91,44 +105,214 @@ void
InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
});
// If a function is already split in the input, the first FF can also start
// with Signed state. This covers that scenario as well.
- auto RAState = BC.MIB->getRAState(*(*FirstNonEmpty)->begin());
+ auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+ auto RAState = BC.MIB->getRAState(*II);
if (!RAState) {
BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
<< " in function " << BF.getPrintName() << "\n";
PassFailed = true;
return;
}
if (*RAState)
-BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
+BF.addCFIInstruction(*FirstNonEmpty, II,
MCCFIInstruction::createNegateRAState(nullptr));
}
-void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+std::optional<bool>
+InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+ for (const MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+auto RAStateOpt = BC.MIB->getRAState(Inst);
+if (RAStateOpt)
+ return RAStateOpt;
+ }
+ return std::nullopt;
+}
+
+void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+
+ auto First = BB.getFirstNonPseudo();
+ if (First == BB.end())
+return;
+ // If the first instruction has unknown RAState, we should copy the first
+ // known RAState.
+ auto RAStateOpt = BC.MIB->getRAState(*First);
+ if (!RAStateOpt) {
+auto FirstRAState = getFirstKnownRAState(BC, BB);
+if (!FirstRAState)
+ // We fill unknown BBs later.
+ return;
+
+BC.MIB->setRAState(*First, *FirstRAState);
+ }
+
+ // At this point we know the RAState of the first instruction,
+ // so we can propagate the RAStates to all subsequent unknown instructions.
+ MCInst Prev = *First;
+ for (auto It = BB.begin() + 1; It != BB.end(); ++It) {
+MCInst &Inst = *It;
+if (BC.MIB->isCFI(Inst))
+ continue;
+
+auto PrevRAState = BC.MIB->getRAState(Prev);
+if (!PrevRAState)
+ llvm_unreachable("Previous Instruction has no RAState.");
+
+auto RAState = BC.MIB->getRAState(Inst);
+if (!RAState) {
+ if (BC.MIB->isPSignOnLR(Prev))
+PrevRAState = true;
+ else if (BC.MIB->isPAuthOnLR(Prev))
+PrevRAState = false;
+ BC.MIB->setRAState(Inst, *PrevRAState);
+}
+Prev = Inst;
+ }
+}
+
+bool InsertNegateRAState::isUnknownBlock(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+ for (const MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+auto RAState = BC.MIB->getRAState(Inst);
+if (RAState)
+ return false;
+ }
+ return true;
+}
+
+void InsertNegateRAState::markUnknownBlock(BinaryContext &BC,
+ BinaryBasicBlock &BB, bool State) {
+ // If we call this when an Instruction has either kRASigned or kRAUnsigned
+ // annotation, setRASigned or setRAUnsigned would fail.
+ assert(isUnknownBlock(BC, BB) &&
+ "markUnknownBlock should only be called on unknown blocks");
+ for (MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+BC.MIB->setRAState(Inst, State);
+ }
+}
+
+std::optional<bool> InsertNegateRAState::getRAStateByCFG(BinaryBasicBlock &BB,
+ BinaryFunction &BF) {
BinaryContext &BC = BF.getBinaryContext();
- bool FirstIter = true;
- MCInst PrevInst;
- for (BinaryBasicBlock &BB : BF) {
-for (MCInst &Inst : BB) {
- if (BC.MIB->isCFI(Inst))
+
+ auto checkRAState = [&](std::optional<bool> &NeighborRAState, MCInst &Inst) {
+auto RAState = BC.MIB->getRAState(Inst);
+if (!RAState)
+ return;
+if (!NeighborRAState) {
+ NeighborRAState = *RAState;
+ return;
+}
+if (NeighborRAState != *RAState) {
+ BC.outs() << "BOLT-WARNING: Conflicting RAState found in function "
+<< BF.getPrintName() << ". Function will not be optimized.\n";
+ BF.setIgnored();
bgergely0 wrote:
TODO: decide if we want to `ignore + continue execution`, or create a fatal
error.
https://github.com/llvm/llvm-project/pull/163381
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT] Improve InsertNegateRAStatePass::inferUnknownStates (PR #163381)
@@ -91,44 +105,214 @@ void
InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
});
// If a function is already split in the input, the first FF can also start
// with Signed state. This covers that scenario as well.
- auto RAState = BC.MIB->getRAState(*(*FirstNonEmpty)->begin());
+ auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+ auto RAState = BC.MIB->getRAState(*II);
if (!RAState) {
BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
<< " in function " << BF.getPrintName() << "\n";
PassFailed = true;
return;
}
if (*RAState)
-BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
+BF.addCFIInstruction(*FirstNonEmpty, II,
MCCFIInstruction::createNegateRAState(nullptr));
}
-void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
+std::optional<bool>
+InsertNegateRAState::getFirstKnownRAState(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+ for (const MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+auto RAStateOpt = BC.MIB->getRAState(Inst);
+if (RAStateOpt)
+ return RAStateOpt;
+ }
+ return std::nullopt;
+}
+
+void InsertNegateRAState::fillUnknownStateInBB(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+
+ auto First = BB.getFirstNonPseudo();
+ if (First == BB.end())
+return;
+ // If the first instruction has unknown RAState, we should copy the first
+ // known RAState.
+ auto RAStateOpt = BC.MIB->getRAState(*First);
+ if (!RAStateOpt) {
+auto FirstRAState = getFirstKnownRAState(BC, BB);
+if (!FirstRAState)
+ // We fill unknown BBs later.
+ return;
+
+BC.MIB->setRAState(*First, *FirstRAState);
+ }
+
+ // At this point we know the RAState of the first instruction,
+ // so we can propagate the RAStates to all subsequent unknown instructions.
+ MCInst Prev = *First;
+ for (auto It = BB.begin() + 1; It != BB.end(); ++It) {
+MCInst &Inst = *It;
+if (BC.MIB->isCFI(Inst))
+ continue;
+
+auto PrevRAState = BC.MIB->getRAState(Prev);
+if (!PrevRAState)
+ llvm_unreachable("Previous Instruction has no RAState.");
+
+auto RAState = BC.MIB->getRAState(Inst);
+if (!RAState) {
+ if (BC.MIB->isPSignOnLR(Prev))
+PrevRAState = true;
+ else if (BC.MIB->isPAuthOnLR(Prev))
+PrevRAState = false;
+ BC.MIB->setRAState(Inst, *PrevRAState);
+}
+Prev = Inst;
+ }
+}
+
+bool InsertNegateRAState::isUnknownBlock(BinaryContext &BC,
+ BinaryBasicBlock &BB) {
+ for (const MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+auto RAState = BC.MIB->getRAState(Inst);
+if (RAState)
+ return false;
+ }
+ return true;
+}
+
+void InsertNegateRAState::markUnknownBlock(BinaryContext &BC,
+ BinaryBasicBlock &BB, bool State) {
+ // If we call this when an Instruction has either kRASigned or kRAUnsigned
+ // annotation, setRASigned or setRAUnsigned would fail.
+ assert(isUnknownBlock(BC, BB) &&
+ "markUnknownBlock should only be called on unknown blocks");
+ for (MCInst &Inst : BB) {
+if (BC.MIB->isCFI(Inst))
+ continue;
+BC.MIB->setRAState(Inst, State);
+ }
+}
+
+std::optional<bool> InsertNegateRAState::getRAStateByCFG(BinaryBasicBlock &BB,
+ BinaryFunction &BF) {
BinaryContext &BC = BF.getBinaryContext();
- bool FirstIter = true;
- MCInst PrevInst;
- for (BinaryBasicBlock &BB : BF) {
-for (MCInst &Inst : BB) {
- if (BC.MIB->isCFI(Inst))
+
+ auto checkRAState = [&](std::optional<bool> &NeighborRAState, MCInst &Inst) {
+auto RAState = BC.MIB->getRAState(Inst);
+if (!RAState)
+ return;
+if (!NeighborRAState) {
+ NeighborRAState = *RAState;
+ return;
+}
+if (NeighborRAState != *RAState) {
+ BC.outs() << "BOLT-WARNING: Conflicting RAState found in function "
+<< BF.getPrintName() << ". Function will not be optimized.\n";
+ BF.setIgnored();
+}
+ };
+
+ // Holds the first found RAState from CFG neighbors.
+ std::optional<bool> NeighborRAState = std::nullopt;
+ if (BB.pred_size() != 0) {
+for (BinaryBasicBlock *PredBB : BB.predecessors()) {
+ // find last inst of Predecessor with known RA State.
+ auto LI = PredBB->getLastNonPseudo();
+ if (LI == PredBB->rend())
+continue;
+ MCInst &LastInst = *LI;
+ checkRAState(NeighborRAState, LastInst);
+}
+ } else if (BB.succ_size() != 0) {
+for (BinaryBasicBlock *SuccBB : BB.successors()) {
+ // find first inst of Successor with known RA State.
+ auto FI = SuccBB->getFirstNonPseudo();
+ if (FI == SuccBB->end())
continue;
+ MCInst &FirstInst = *FI;
+ checkRAState(NeighborRAState, FirstInst);
+}
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/paschalis-mpeis approved this pull request. Looks good, thanks for the renaming, Gergely. Could you rework a bit the commit description? Since everything now falls under PointerAuth*, should we also rename the PacRetDesign.md for consistency? We could also consider prefixing tests with `pauth-*` to keep them close to each other? Regarding BUILD.gn: IIRC the last time I modified it, I was told ~ _it's better to let the syncbot do it, though a manual change shouldn't cause any issues_. https://github.com/llvm/llvm-project/pull/164622 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
https://github.com/paschalis-mpeis edited https://github.com/llvm/llvm-project/pull/164622 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
@@ -104,9 +104,9 @@ negate-ra-state CFIs will become invalid during BasicBlock reordering. ## Solution design The implementation introduces two new passes: -1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs -in the input binary -2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after +1. `PointerAuthCFIAnalyzer`: assigns the RA state to each instruction based on +the CFI in the input binary paschalis-mpeis wrote: Did you intend to drop plural here? https://github.com/llvm/llvm-project/pull/164622 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
@@ -188,16 +188,17 @@ The inconsistencies are as follows: Users will be informed about the number of ignored functions in the pass, the exact functions ignored, and the found inconsistency. -### InsertNegateRAStatePass +### PointerAuthCFIFixup -This pass runs after optimizations. It performns the _inverse_ of MarkRAState pa s: +This pass runs after optimizations. It performns the _inverse_ of PointerAuthCFIAnalyzer paschalis-mpeis wrote: typo: ```suggestion This pass runs after optimizations. It performs the _inverse_ of PointerAuthCFIAnalyzer ``` https://github.com/llvm/llvm-project/pull/164622 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [WIP] Handle guard insertion in callbacks to OpenMP runtime functions. (PR #164655)
https://github.com/abidh updated
https://github.com/llvm/llvm-project/pull/164655
>From 56037a64dbd5f73d2c020dd5d58d2c99758b35d0 Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Tue, 21 Oct 2025 20:53:46 +0100
Subject: [PATCH 1/6] Add callback metadata to runtime functions which take
callbacks.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 25
.../Frontend/OpenMPIRBuilderTest.cpp | 58 +++
2 files changed, 83 insertions(+)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c164d32f8f98c..312e119c4280d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -750,6 +750,31 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M,
RuntimeFunction FnID) {
*MDNode::get(Ctx, {MDB.createCallbackEncoding(
2, {-1, -1}, /* VarArgsArePassed */ true)}));
}
+
+} else if (FnID == OMPRTL___kmpc_distribute_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8u ||
+ FnID == OMPRTL___kmpc_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_for_static_loop_8u) {
+ if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
+LLVMContext &Ctx = Fn->getContext();
+MDBuilder MDB(Ctx);
+// Annotate the callback behavior of the runtime function:
+// - The callback callee is argument number 1.
+// - The first argument of the callback callee is unknown (-1).
+// - The second argument of the callback callee is argument number 2
+Fn->addMetadata(
+LLVMContext::MD_callback,
+*MDNode::get(Ctx, {MDB.createCallbackEncoding(
+ 1, {-1, 2}, /* VarArgsArePassed */ false)}));
+ }
}
LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d231a778a8a97..aca2153f85c26 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7957,4 +7957,62 @@ TEST_F(OpenMPIRBuilderTest, spliceBBWithEmptyBB) {
EXPECT_FALSE(Terminator->getDbgRecordRange().empty());
}
+TEST_F(OpenMPIRBuilderTest, callBackFunctions) {
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = true;
+ OMPBuilder.initialize();
+
+ // Test multiple runtime functions that should have callback metadata
+ std::vector<RuntimeFunction> CallbackFunctions = {
+OMPRTL___kmpc_distribute_static_loop_4,
+OMPRTL___kmpc_distribute_static_loop_4u,
+OMPRTL___kmpc_distribute_static_loop_8,
+OMPRTL___kmpc_distribute_static_loop_8u,
+OMPRTL___kmpc_distribute_for_static_loop_4,
+OMPRTL___kmpc_distribute_for_static_loop_4u,
+OMPRTL___kmpc_distribute_for_static_loop_8,
+OMPRTL___kmpc_distribute_for_static_loop_8u,
+OMPRTL___kmpc_for_static_loop_4,
+OMPRTL___kmpc_for_static_loop_4u,
+OMPRTL___kmpc_for_static_loop_8,
+OMPRTL___kmpc_for_static_loop_8u
+ };
+
+ for (RuntimeFunction RF : CallbackFunctions) {
+Function *Fn = OMPBuilder.getOrCreateRuntimeFunctionPtr(RF);
+ASSERT_NE(Fn, nullptr) << "Function should exist for runtime function";
+
+MDNode *CallbackMD = Fn->getMetadata(LLVMContext::MD_callback);
+EXPECT_NE(CallbackMD, nullptr) << "Function should have callback metadata";
+
+if (CallbackMD) {
+ // Should have at least one callback
+ EXPECT_GE(CallbackMD->getNumOperands(), 1U);
+
+ // Test first callback entry
+ MDNode *FirstCallback = cast<MDNode>(CallbackMD->getOperand(0));
+ EXPECT_EQ(FirstCallback->getNumOperands(), 4U);
+
+ // Callee index should be valid
+ auto *CalleeIdxCM = cast<ConstantAsMetadata>(FirstCallback->getOperand(0));
+ uint64_t CalleeIdx = cast<ConstantInt>(CalleeIdxCM->getValue())->getZExtValue();
+ EXPECT_EQ(CalleeIdx, 1u);
+
+ // Verify payload arguments are (-1, 2)
+ auto *Arg0CM = cast<ConstantAsMetadata>(FirstCallback->getOperand(1));
+ int64_t Arg0 = cast<ConstantInt>(Arg0CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg0, -1);
+ auto *Arg1CM = cast<ConstantAsMetadata>(FirstCallback->getOperand(2));
+ int64_t Arg1 = cast<ConstantInt>(Arg1CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg1, 2);
+
+ // Verify the varArgs is false.
+ auto *VarArgCM = cast<ConstantAsMetadata>(FirstCallback->getOperand(3));
+ uint64_t VarAr
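For reference, the callback encoding created above would surface in textual IR roughly as follows. The runtime function's prototype here is illustrative only and not taken from the patch, while the metadata shape matches what the unit test checks: callee index 1, payload arguments {-1, 2}, no varargs.
```llvm
; Hypothetical prototype; the real __kmpc_for_static_loop_4 signature may differ.
; !callback says: argument 1 is the callback callee, which is invoked with an
; unknown value (-1) and this call's argument 2.
declare !callback !0 void @__kmpc_for_static_loop_4(ptr, ptr, ptr, i32, i32)

!0 = !{!1}
!1 = !{i64 1, i64 -1, i64 2, i1 false}
```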
[llvm-branch-commits] [llvm] [IR] Add CallBr intrinsics support (PR #133907)
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/133907
>From adbca593dd83f8f74cbfc0d1ba9932e3beb4adb0 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler
Date: Tue, 1 Apr 2025 08:03:16 -0500
Subject: [PATCH] [IR] Add CallBr intrinsics support
This commit adds support for using intrinsics with callbr.
The uses of this will most of the time look like this example:
```llvm
callbr void @llvm.amdgcn.kill(i1 %c) to label %cont [label %kill]
kill:
unreachable
cont:
...
```
---
llvm/docs/LangRef.rst | 27 +++--
.../llvm/CodeGen/GlobalISel/IRTranslator.h| 2 +
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 35 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 68
.../SelectionDAG/SelectionDAGBuilder.h| 4 +-
llvm/lib/IR/Verifier.cpp | 33 +-
llvm/test/Assembler/callbr.ll | 20
llvm/test/CodeGen/AMDGPU/callbr-intrinsics.ll | 101 ++
llvm/test/Verifier/callbr-intrinsic.ll| 57 ++
9 files changed, 313 insertions(+), 34 deletions(-)
create mode 100644 llvm/test/Assembler/callbr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/callbr-intrinsics.ll
create mode 100644 llvm/test/Verifier/callbr-intrinsic.ll
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 3c089b5a0ba79..c17e1000f9e8c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -9788,8 +9788,12 @@ The '``callbr``' instruction causes control to transfer
to a specified
function, with the possibility of control flow transfer to either the
'``fallthrough``' label or one of the '``indirect``' labels.
-This instruction should only be used to implement the "goto" feature of gcc
-style inline assembly. Any other usage is an error in the IR verifier.
+This instruction can currently only be used
+
+#. to implement the "goto" feature of gcc style inline assembly or
+#. to call selected intrinsics.
+
+Any other usage is an error in the IR verifier.
Note that in order to support outputs along indirect edges, LLVM may need to
split critical edges, which may require synthesizing a replacement block for
@@ -9838,7 +9842,7 @@ This instruction requires several arguments:
indicates the function accepts a variable number of arguments, the
extra arguments can be specified.
#. '``fallthrough label``': the label reached when the inline assembly's
- execution exits the bottom.
+ execution exits the bottom / the intrinsic call returns.
#. '``indirect labels``': the labels reached when a callee transfers control
to a location other than the '``fallthrough label``'. Label constraints
refer to these destinations.
@@ -9856,9 +9860,12 @@ flow goes after the call.
The output values of a '``callbr``' instruction are available both in the
the '``fallthrough``' block, and any '``indirect``' blocks(s).
-The only use of this today is to implement the "goto" feature of gcc inline
-assembly where additional labels can be provided as locations for the inline
-assembly to jump to.
+The only current uses of this are:
+
+#. implement the "goto" feature of gcc inline assembly where additional
+ labels can be provided as locations for the inline assembly to jump to.
+#. support selected intrinsics which manipulate control flow and should
+ be chained to specific terminators, such as '``unreachable``'.
Example:
@@ -9873,6 +9880,14 @@ Example:
= callbr i32 asm "", "=r,r,!i"(i32 %x)
to label %fallthrough [label %indirect]
+ ; intrinsic which should be followed by unreachable (the order of the
+ ; blocks after the callbr instruction doesn't matter)
+callbr void @llvm.amdgcn.kill(i1 %c) to label %cont [label %kill]
+ cont:
+...
+ kill:
+unreachable
+
.. _i_resume:
'``resume``' Instruction
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 9d6038db4391f..5f5a6f5c72abf 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -317,6 +317,8 @@ class IRTranslator : public MachineFunctionPass {
bool translateInvoke(const User &U, MachineIRBuilder &MIRBuilder);
bool translateCallBr(const User &U, MachineIRBuilder &MIRBuilder);
+ bool translateCallBrIntrinsic(const CallBrInst &I,
+MachineIRBuilder &MIRBuilder);
bool translateLandingPad(const User &U, MachineIRBuilder &MIRBuilder);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 78a633f7a049d..5810c470e4d84 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2824,7 +2824,7 @@ bool IRTranslator::translateCall(const User &U,
MachineIRBuilder &MIRBuilder) {
IsTgtMemIntrinsic ? &Info : nullptr);
}
-//
[llvm-branch-commits] [llvm] 65976dd - fix formatting
Author: Robert Imschweiler
Date: 2025-11-03T09:58:09-06:00
New Revision: 65976dd55fc301a9c2ea14c995323e29fa29d5b4
URL:
https://github.com/llvm/llvm-project/commit/65976dd55fc301a9c2ea14c995323e29fa29d5b4
DIFF:
https://github.com/llvm/llvm-project/commit/65976dd55fc301a9c2ea14c995323e29fa29d5b4.diff
LOG: fix formatting
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
Removed:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 733c5d520fb23..ddf9a24eb5230 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -225,8 +225,8 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F,
DominatorTree *DT,
ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
if (DummyReturnBB == nullptr) {
-DummyReturnBB = BasicBlock::Create(F.getContext(),
- "DummyReturnBlock", &F);
+DummyReturnBB =
+BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated
https://github.com/llvm/llvm-project/pull/114104
>From 63c4928ed65fb2a83a4a25f3c098af7d931fc0af Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 3 Nov 2025 12:09:19 -0800
Subject: [PATCH 1/2] clang-format
Created using spr 1.3.7
---
llvm/tools/llvm-cas/llvm-cas.cpp | 11 +--
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp
index b1e4f606211b5..e59313eb808e8 100644
--- a/llvm/tools/llvm-cas/llvm-cas.cpp
+++ b/llvm/tools/llvm-cas/llvm-cas.cpp
@@ -175,13 +175,12 @@ int main(int Argc, char **Argv) {
return validateObject(*CAS, ID);
}
-static Expected<std::unique_ptr<MemoryBuffer>>
-openBuffer(StringRef DataPath) {
+static Expected<std::unique_ptr<MemoryBuffer>> openBuffer(StringRef DataPath) {
if (DataPath.empty())
return createStringError(inconvertibleErrorCode(), "--data missing");
- return errorOrToExpected(
- DataPath == "-" ? llvm::MemoryBuffer::getSTDIN()
- : llvm::MemoryBuffer::getFile(DataPath));
+ return errorOrToExpected(DataPath == "-"
+ ? llvm::MemoryBuffer::getSTDIN()
+ : llvm::MemoryBuffer::getFile(DataPath));
}
int dump(ObjectStore &CAS) {
@@ -311,7 +310,7 @@ int validateIfNeeded(StringRef Path, bool CheckHash, bool
Force,
Exec = ExecStorage;
}
ValidationResult Result =
ExitOnErr(validateOnDiskUnifiedCASDatabasesIfNeeded(
-Path, CheckHash, AllowRecovery, Force, Exec));
+ Path, CheckHash, AllowRecovery, Force, Exec));
switch (Result) {
case ValidationResult::Valid:
outs() << "validated successfully\n";
>From 76fbb642c630302353ae67a50df93db71e7f33cc Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 3 Nov 2025 12:13:40 -0800
Subject: [PATCH 2/2] darker check fix
Created using spr 1.3.7
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index b78dc10ff6ad5..bca196e80640b 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -789,7 +789,7 @@ def host_unwind_supports_jit():
config.available_features.add("expensive_checks")
if config.have_ondisk_cas:
-config.available_features.add('ondisk_cas')
+config.available_features.add("ondisk_cas")
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
config.available_features.add("use_msan_with_origins")
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 7a7a7f211a62ae2fa6bb6149b5b5e98b806f0e3b Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 82f4ee86a4d5b907783f71a17ba645ae6e598af2 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10, i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From d2a04866764eac8bcfdeda442b6a111fb0a3c49c Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [SLU][profcheck] create likely branch weights for guard->branch (PR #164271)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164271
>From e7d4aa95ce5417d232769b564863abb47bbf42dc Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 20 Oct 2025 08:21:26 -0700
Subject: [PATCH] [SLU][profcheck] create likely branch weights for
guard->branch
---
llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 +---
llvm/test/Transforms/SimpleLoopUnswitch/guards.ll | 10 --
2 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e85e1fd..86b2090081ed0 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst
*GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
-GI->getMetadata(LLVMContext::MD_prof), &DTU,
&LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index da4f4cc80d96f..812a183673bcc 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --check-globals all --version 5
; RUN: opt -passes='loop(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='simple-loop-unswitch'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s |
FileCheck %s
@@ -167,7 +167,7 @@ exit:
define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) {
; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) {
; CHECK-NEXT: entry:
-; CHECK-NEXT:br i1 %cond, label %entry.split, label %outer_loop.split
+; CHECK-NEXT:br i1 %cond, label %entry.split, label %outer_loop.split, !prof !1
; CHECK: entry.split:
; CHECK-NEXT:br i1 %arg, label %entry.split.split.us, label
%entry.split.split
; CHECK: entry.split.split.us:
@@ -337,3 +337,9 @@ exit:
declare void @may_throw(i32 %i)
declare i32 @__CxxFrameHandler3(...)
+
+!0 = !{!"function_entry_count", i32 10}
+;.
+; CHECK: !0 = !{!"function_entry_count", i32 10}
+; CHECK: !1 = !{!"branch_weights", i32 1048575, i32 1}
+;.
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 4e408eea7761b2979a1778a0a9bc9168a7fa65e7 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
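For readers following the profcheck changes above, a minimal illustrative IR sketch of the intended outcome (block names and weight values are made up; only the metadata shapes follow the patch): the injected invariant check gets explicitly-unknown weights, while the branch rebuilt from the original terminator keeps its !prof node.

  define void @sketch(i1 %injected.cond, i1 %bound.check) !prof !0 {
  entry:
    br i1 %injected.cond, label %fast, label %slow, !prof !1
  fast:
    br i1 %bound.check, label %exit, label %slow, !prof !2
  slow:
    br label %exit
  exit:
    ret void
  }
  !0 = !{!"function_entry_count", i32 10}
  !1 = !{!"unknown", !"simple-loop-unswitch"}   ; no known probability for the injected condition
  !2 = !{!"branch_weights", i32 100, i32 1}     ; reused from the original terminator (values illustrative)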
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 7a7a7f211a62ae2fa6bb6149b5b5e98b806f0e3b Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
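As a rough illustration of what the runtime-check branch looks like with this change (a sketch, not taken verbatim from the test; only the "unknown" node matches the CHECK line above, and it is what setExplicitlyUnknownBranchWeightsIfProfiled attaches when the function has profile data):

  define void @sketch(i1 %memcheck.conflict) !prof !0 {
  entry:
    br i1 %memcheck.conflict, label %orig.ph, label %versioned.ph, !prof !1
  orig.ph:
    br label %exit
  versioned.ph:
    br label %exit
  exit:
    ret void
  }
  !0 = !{!"function_entry_count", i32 10}
  !1 = !{!"unknown", !"loop-versioning"}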
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 82f4ee86a4d5b907783f71a17ba645ae6e598af2 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
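To make the intent of the DummyBranchWeights vector concrete, here is a small illustrative IR sketch (assuming, as the code above does, that the remaining vector entries stay value-initialized to zero): the default destination carries all of the weight and every dead case gets zero.

  define i32 @sketch(i32 %end) !prof !0 {
  preheader:
    switch i32 0, label %preheader.split [
      i32 1, label %dead
    ], !prof !1
  preheader.split:
    ret i32 0
  dead:
    ret i32 %end
  }
  !0 = !{!"function_entry_count", i32 10}
  !1 = !{!"branch_weights", i32 1, i32 0}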
[llvm-branch-commits] [llvm] [SLU][profcheck] create likely branch weights for guard->branch (PR #164271)
https://github.com/mtrofin edited
https://github.com/llvm/llvm-project/pull/164271
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [WIP] Handle guard insertion in callbacks to OpenMP runtime functions. (PR #164655)
https://github.com/abidh updated
https://github.com/llvm/llvm-project/pull/164655
>From 56037a64dbd5f73d2c020dd5d58d2c99758b35d0 Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Tue, 21 Oct 2025 20:53:46 +0100
Subject: [PATCH 1/5] Add callback metadata to runtime functions which take
callbacks.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 25
.../Frontend/OpenMPIRBuilderTest.cpp | 58 +++
2 files changed, 83 insertions(+)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c164d32f8f98c..312e119c4280d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -750,6 +750,31 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M,
RuntimeFunction FnID) {
*MDNode::get(Ctx, {MDB.createCallbackEncoding(
2, {-1, -1}, /* VarArgsArePassed */ true)}));
}
+
+} else if (FnID == OMPRTL___kmpc_distribute_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8u ||
+ FnID == OMPRTL___kmpc_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_for_static_loop_8u) {
+ if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
+LLVMContext &Ctx = Fn->getContext();
+MDBuilder MDB(Ctx);
+// Annotate the callback behavior of the runtime function:
+// - The callback callee is argument number 1.
+// - The first argument of the callback callee is unknown (-1).
+// - The second argument of the callback callee is argument number 2
+Fn->addMetadata(
+LLVMContext::MD_callback,
+*MDNode::get(Ctx, {MDB.createCallbackEncoding(
+ 1, {-1, 2}, /* VarArgsArePassed */ false)}));
+ }
}
LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d231a778a8a97..aca2153f85c26 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7957,4 +7957,62 @@ TEST_F(OpenMPIRBuilderTest, spliceBBWithEmptyBB) {
EXPECT_FALSE(Terminator->getDbgRecordRange().empty());
}
+TEST_F(OpenMPIRBuilderTest, callBackFunctions) {
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = true;
+ OMPBuilder.initialize();
+
+ // Test multiple runtime functions that should have callback metadata
+ std::vector<RuntimeFunction> CallbackFunctions = {
+OMPRTL___kmpc_distribute_static_loop_4,
+OMPRTL___kmpc_distribute_static_loop_4u,
+OMPRTL___kmpc_distribute_static_loop_8,
+OMPRTL___kmpc_distribute_static_loop_8u,
+OMPRTL___kmpc_distribute_for_static_loop_4,
+OMPRTL___kmpc_distribute_for_static_loop_4u,
+OMPRTL___kmpc_distribute_for_static_loop_8,
+OMPRTL___kmpc_distribute_for_static_loop_8u,
+OMPRTL___kmpc_for_static_loop_4,
+OMPRTL___kmpc_for_static_loop_4u,
+OMPRTL___kmpc_for_static_loop_8,
+OMPRTL___kmpc_for_static_loop_8u
+ };
+
+ for (RuntimeFunction RF : CallbackFunctions) {
+Function *Fn = OMPBuilder.getOrCreateRuntimeFunctionPtr(RF);
+ASSERT_NE(Fn, nullptr) << "Function should exist for runtime function";
+
+MDNode *CallbackMD = Fn->getMetadata(LLVMContext::MD_callback);
+EXPECT_NE(CallbackMD, nullptr) << "Function should have callback metadata";
+
+if (CallbackMD) {
+ // Should have at least one callback
+ EXPECT_GE(CallbackMD->getNumOperands(), 1U);
+
+ // Test first callback entry
+ MDNode *FirstCallback = cast<MDNode>(CallbackMD->getOperand(0));
+ EXPECT_EQ(FirstCallback->getNumOperands(), 4U);
+
+ // Callee index should be valid
+ auto *CalleeIdxCM =
cast<ConstantAsMetadata>(FirstCallback->getOperand(0));
+ uint64_t CalleeIdx =
cast<ConstantInt>(CalleeIdxCM->getValue())->getZExtValue();
+ EXPECT_EQ(CalleeIdx, 1u);
+
+ // Verify the payload arguments are (-1, 2).
+ auto *Arg0CM = cast<ConstantAsMetadata>(FirstCallback->getOperand(1));
+ int64_t Arg0 = cast<ConstantInt>(Arg0CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg0, -1);
+ auto *Arg1CM = cast<ConstantAsMetadata>(FirstCallback->getOperand(2));
+ int64_t Arg1 = cast<ConstantInt>(Arg1CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg1, 2);
+
+ // Verify that the varargs flag is false.
+ auto *VarArgCM = cast<ConstantAsMetadata>(FirstCallback->getOperand(3));
+ uint64_t VarAr
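For reference, the encoding exercised by this test corresponds to function-level !callback metadata of roughly the following shape (the runtime function's parameter list below is abbreviated and only illustrative; the operands i64 1, i64 -1, i64 2 and i1 false mirror the createCallbackEncoding(1, {-1, 2}, false) call in the patch):

  declare void @__kmpc_for_static_loop_4(ptr, ptr, ptr, i32, i32, i32) !callback !1
  !1 = !{!2}
  !2 = !{i64 1, i64 -1, i64 2, i1 false}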
[llvm-branch-commits] [llvm] [SLU][profcheck] create likely branch weights for guard->branch (PR #164271)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164271
>From 472b932477f8e50d0c3b3b8319b444b559c60b84 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 20 Oct 2025 08:21:26 -0700
Subject: [PATCH] [SLU][profcheck] create likely branch weights for
guard->branch
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 +-
.../Transforms/SimpleLoopUnswitch/guards.ll | 181 +-
2 files changed, 139 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e85e1fd..86b2090081ed0 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst
*GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
-GI->getMetadata(LLVMContext::MD_prof), &DTU,
&LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 533b1f691f5ad..e83047e397d3d 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,26 +1,34 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: -p --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: -p --check-globals all --version 5
; RUN: opt -passes='loop(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='simple-loop-unswitch'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s |
FileCheck %s
declare void @llvm.experimental.guard(i1, ...)
-define void @test_simple_case(i1 %cond, i32 %N) {
-; CHECK-LABEL: @test_simple_case(
+define void @test_simple_case(i1 %cond, i32 %N) !prof !0 {
+; CHECK-LABEL: define void @test_simple_case(i1 %cond, i32 %N) !prof !0 {
; CHECK-NEXT: entry:
-; CHECK-NEXT:br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label
[[ENTRY_SPLIT:%.*]]
+; CHECK-NEXT:br i1 %cond, label %entry.split.us, label %entry.split, !prof
!1
; CHECK: entry.split.us:
-; CHECK-NEXT:br label [[LOOP_US:%.*]]
+; CHECK-NEXT:br label %loop.us
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [
[[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
-; CHECK-NEXT:br label [[GUARDED_US]]
+; CHECK-NEXT:%iv.us = phi i32 [ 0, %entry.split.us ], [ %iv.next.us,
%guarded.us ]
+; CHECK-NEXT:br label %guarded.us
; CHECK: guarded.us:
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[EXIT_SPLIT_US:%.*]]
+; CHECK-NEXT:%iv.next.us = add i32 %iv.us, 1
+; CHECK-NEXT:%loop.cond.us = icmp slt i32 %iv.next.us, %N
+; CHECK-NEXT:br i1 %loop.cond.us, label %loop.us, label %exit.split.us
+; CHECK: exit.split.us:
+; CHECK-NEXT:br label %exit
+; CHECK: entry.split:
+; CHECK-NEXT:br label %loop
+; CHECK: loop:
+; CHECK-NEXT:br label %deopt
; CHECK: deopt:
; CHECK-NEXT:call void (i1, ...) @llvm.experimental.guard(i1 false) [
"deopt"() ]
; CHECK-NEXT:unreachable
+; CHECK: exit:
+; CHECK-NEXT:ret void
;
entry:
@@ -38,25 +46,39 @@ exit:
}
define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
-; CHECK-LABEL: @test_two_guards(
+; CHECK-LABEL: define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
; CHECK-NEXT: entry:
-; CHECK-NEXT:br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label
[[ENTRY_SPLIT:%.*]]
+; CHECK-NEXT:br i1 %cond1, label %entry.split.us, label %entry.split,
!prof !1
; CHECK:
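A minimal sketch of the transformed guard, for illustration only (the concrete weight values below are placeholders; the patch takes them from MDBuilder::createUnlikelyBranchWeights(), so only the shape and the fact that the deopt edge is the unlikely one are meaningful):

  define void @sketch(i1 %cond) !prof !0 {
  entry:
    br i1 %cond, label %guarded, label %deopt, !prof !1
  guarded:
    ret void
  deopt:
    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
    unreachable
  }
  declare void @llvm.experimental.guard(i1, ...)
  !0 = !{!"function_entry_count", i32 10}
  !1 = !{!"branch_weights", i32 1000, i32 1}   ; placeholder values; the deopt edge is the unlikely one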
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 7a5b8b055ec7b901339801d0c98f13560a51bf70 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From a5d8823aca32426e0cc98d920957182c2e2efb0b Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 64eaa8574dfa3343c192502349672dff24570c66 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From 648356dd4efbb775b55dff0f32a8f5df97cb4d96 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
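A small illustrative sketch of the weight reuse (values made up): when the rewritten exit check has the opposite polarity of the original latch (the InvertedCond case above), the two extracted weights are swapped before being reattached, so the backedge keeps the hot weight.

  define void @sketch(i1 %civ.check) !prof !0 {
  entry:
    br label %loop
  loop:
    ; Original (inverted) latch: br i1 %val.shifted.notzero, label %loop, label %end,
    ;                            !prof !{!"branch_weights", i32 99, i32 1}
    br i1 %civ.check, label %end, label %loop, !prof !1
  end:
    ret void
  }
  !0 = !{!"function_entry_count", i32 10}
  !1 = !{!"branch_weights", i32 1, i32 99}   ; swapped relative to the original latch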
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 64eaa8574dfa3343c192502349672dff24570c66 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
llvmbot wrote:
@llvm/pr-subscribers-backend-systemz
Author: Nicolai Hähnle (nhaehnle)
Changes
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
Earlier in the method, the same check is already applied at the instruction
level; this change merely extends it to each pair of MMOs.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO, which is no longer treated as
aliasing other loads. The PFD instructions come from llvm.prefetch calls
generated by loop-data-prefetch.
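For readability, this is roughly the shape of the patched pairwise loop in
MachineInstr::mayAlias, reconstructed from the diff below rather than copied
from the tree:

  for (auto *MMOa : memoperands()) {
    for (auto *MMOb : Other.memoperands()) {
      // A pair of reads can never conflict; only pairs where at least one
      // side writes memory need the (comparatively expensive) alias query.
      if (!MMOa->isStore() && !MMOb->isStore())
        continue;
      if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
        return true;
    }
  }
  return false;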
---
**Stack**:
- [5/5] #166213
- [4/5] #166212
- [3/5] #166211 ⬅
- [2/5] #166210
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
---
Full diff: https://github.com/llvm/llvm-project/pull/166211.diff
2 Files Affected:
- (modified) llvm/lib/CodeGen/MachineInstr.cpp (+6-2)
- (modified) llvm/test/CodeGen/SystemZ/vec-load-element.ll (+2-2)
``diff
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp
b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245a47684..37e5c517d24d8 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const
MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
-for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+}
+ }
return false;
}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed19546df..9bef279d7c0fa 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32 }
@Mem = external global [150 x %type0], align 4
``
https://github.com/llvm/llvm-project/pull/166211
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Nicolai Hähnle (nhaehnle)
Changes
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
---
**Stack**:
- [5/5] #166213 ⬅
- [4/5] #166212
- [3/5] #166211
- [2/5] #166210
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
---
Full diff: https://github.com/llvm/llvm-project/pull/166213.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/TwoAddressInstructionPass.cpp (+31-23)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+52-4)
- (modified) llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir (+4-5)
``diff
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..d6d23061be16e 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21c2d7f5..031ed90e0ad15 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4044,10 +4044,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Nicolai Hähnle (nhaehnle)
Changes
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
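Concretely, simplified from the new twoaddr-bundle.mir test below, the tied
register appears both on the BUNDLE header and inside the bundle body, so
rewriting only the header operand would leave the inner instruction referring
to a stale vreg:

  BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed %2(tied-def 0), implicit $mode, implicit $exec {
    %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit $mode, implicit $exec
  }

When processTiedPairs breaks the tie by copying into a fresh register and
rewriting the header's use of %2, the V_FMAC_F32_e32 inside the bundle must be
rewritten to the same register; the added loop over the bundled instructions
does exactly that.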
---
**Stack**:
- [5/5] #166213
- [4/5] #166212 ⬅
- [3/5] #166211
- [2/5] #166210
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
---
Full diff: https://github.com/llvm/llvm-project/pull/166212.diff
2 Files Affected:
- (modified) llvm/lib/CodeGen/TwoAddressInstructionPass.cpp (+16)
- (added) llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir (+57)
``diff
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, im
[llvm-branch-commits] [llvm] CodeGen: Record MMOs in finalizeBundle (PR #166210)
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Nicolai Hähnle (nhaehnle)
Changes
This allows more accurate alias analysis to apply at the bundle level.
This has a bunch of minor effects in post-RA scheduling that look mostly
beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic).
The pre-existing (and unchanged) test in
CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll verifies that MIR
containing a bundle with MMOs can be parsed successfully.
---
**Stack**:
- [5/5] #166213
- [4/5] #166212
- [3/5] #166211
- [2/5] #166210 ⬅
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
---
Patch is 2.73 MiB, truncated to 20.00 KiB below, full version:
https://github.com/llvm/llvm-project/pull/166210.diff
52 Files Affected:
- (modified) llvm/lib/CodeGen/MIRParser/MIParser.cpp (+2)
- (modified) llvm/lib/CodeGen/MachineInstrBundle.cpp (+5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
(+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+4839-5115)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+172-191)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+724-779)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+143-152)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+172-185)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+184-186)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+252-282)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+394-445)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+372-384)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+657-704)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+39-10)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+32-31)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+76-77)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir (+2-2)
- (modified)
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll (+13-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+64-92)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+105-109)
- (modified)
llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (+14-17)
- (modified)
llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
(+27-31)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/max.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+968-1117)
- (modified) llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+103-100)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
(+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+21-20)
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+1192-1178)
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
(+12-3)
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-5)
- (modified) llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir (+23-22)
``diff
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 4795d81e3f348..434a579c3be3f 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
MemOperands.push_back(MemOp);
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle created
https://github.com/llvm/llvm-project/pull/166213
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
---
**Stack**:
- [5/5] #166213 ⬅
- [4/5] #166212
- [3/5] #166211
- [2/5] #166210
- [1/5] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
From 4ce0cedd58f67a1ae8df40df6dd56a8093d12d97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
Pull Request: https://github.com/nhaehnle/llvm-project/pull/12
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..d6d23061be16e 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21c2d7f5..031ed90e0ad15 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4044,10 +4044,29 @@ Machi
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle created
https://github.com/llvm/llvm-project/pull/166212
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
---
**Stack**:
- [3/3] #166211
- [2/3] #166210
- [1/3] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
From 883e5633b33ec4571e2b59e6add9dbbc07cbe962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
Pull Request: https://github.com/nhaehnle/llvm-project/pull/11
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/nhaehnle created
https://github.com/llvm/llvm-project/pull/166211
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
Earlier in the method, the same check is already applied at the instruction
level; this change merely extends it to each pair of MMOs.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO, which is no longer treated as
aliasing other loads. The PFD instructions come from llvm.prefetch calls
generated by loop-data-prefetch.
---
**Stack**:
- [2/2] #166210
- [1/2] #166209
⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Merging
this PR using the GitHub UI may have unexpected results.*
From e64b3258066ef62b47f877586c6e17c18ecfea65 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Fri, 3 Oct 2025 18:20:22 -0700
Subject: [PATCH] CodeGen: More accurate mayAlias for instructions with
multiple MMOs
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
This check is applied at the instruction-level earlier in the method.
This change merely extends the check on a per-MMO basis.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO which is now no longer
treated as aliasing loads. The PFD instructions are from llvm.prefetch
generated by loop-data-prefetch.
commit-id:667859fc
Pull Request: https://github.com/nhaehnle/llvm-project/pull/10
---
llvm/lib/CodeGen/MachineInstr.cpp | 8 ++--
llvm/test/CodeGen/SystemZ/vec-load-element.ll | 4 ++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp
b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245a47684..37e5c517d24d8 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const
MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
-for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+}
+ }
return false;
}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed19546df..9bef279d7c0fa 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32 }
@Mem = external global [150 x %type0], align 4
