[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 79cfc198e89191126f2ef546c0a509c5a271e00e Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From fc8a630e3d1107ff1a61484080049ca322be07c6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From c5282b720c3e347617fbf48071af77a4ea0d5dfe Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
static cl::opt<bool> ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] CodeGen: Record tied virtual register operands in finalizeBundle (PR #166209)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/166209 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [X86][NewPM] Port lower-amx-intrinsics to NewPM (PR #165113)
@@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+: public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
boomanaiden154 wrote:
Yes. The comment at the top of the file:
```
/// To decouple the dependency of the shape, we transform amx intrinsics
/// to scalar operation, so that compiling doesn't fail. In long term, we
/// should improve fast register allocation to allocate amx register.
```
makes it seem like this might require some effort though.
https://github.com/llvm/llvm-project/pull/165113
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86][NewPM] Port X86PartialReduction to NewPM (PR #166048)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166048
>From 54ac616a28d1aa5544192a8a2cdbce30641fa22f Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 2 Nov 2025 09:07:51 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?=
=?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
[skip ci]
---
llvm/lib/Target/X86/X86.h | 13 -
llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48 +++
llvm/lib/Target/X86/X86PassRegistry.def | 2 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +-
.../AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3 +-
.../CodeGen/X86/AMX/amx-low-intrinsics.ll | 3 +-
6 files changed, 56 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a7a51d0..bdb43cfb4adb4 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+: public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &,
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 7f3393910da2c..662aec2c15241 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -23,12 +23,15 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -40,7 +43,7 @@
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "lower-amx-intrinsics"
+#define DEBUG_TYPE "x86-lower-amx-intrinsics"
#ifndef NDEBUG
static bool isV256I32Ty(Type *Ty) {
@@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() {
return C;
}
+namespace {
+bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
+ return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ TM->getOptLevel() == CodeGenOptLevel::None);
+}
+
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ X86LowerAMXIntrinsics LAT(F, DTU, LI);
+ return LAT.visit();
+}
+} // namespace
+
+PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &FAM)
{
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
+return PreservedAnalyses::all();
+
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
+ if (!Changed)
+return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
@@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public
FunctionPass {
X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
-if (!X86ScalarizeAMX)
- return false;
TargetMachine *TM =
&getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
-TM->getOptLevel() != CodeGenOptLevel::None)
+if (!shouldRunLowerAMXIntrinsics(F, TM))
return false;
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
-DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
-X86LowerAMXIntrinsics LAT(F, DTU, LI);
-return LAT.visit();
+return runLowerAMXIntrinsics(F, DT, LI);
}
StringRef getPassName() const override { return "Lower AMX intrinsics"; }
@@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_P
[llvm-branch-commits] [llvm] [X86][NewPM] Port X86PartialReduction to NewPM (PR #166048)
https://github.com/boomanaiden154 updated
https://github.com/llvm/llvm-project/pull/166048
>From 54ac616a28d1aa5544192a8a2cdbce30641fa22f Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 2 Nov 2025 09:07:51 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?=
=?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.7
[skip ci]
---
llvm/lib/Target/X86/X86.h | 13 -
llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48 +++
llvm/lib/Target/X86/X86PassRegistry.def | 2 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +-
.../AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3 +-
.../CodeGen/X86/AMX/amx-low-intrinsics.ll | 3 +-
6 files changed, 56 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 51b540a7a51d0..bdb43cfb4adb4 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+: public PassInfoMixin<X86LowerAMXIntrinsicsPass> {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXIntrinsicsLegacyPass();
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &,
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 7f3393910da2c..662aec2c15241 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -23,12 +23,15 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -40,7 +43,7 @@
using namespace llvm;
using namespace PatternMatch;
-#define DEBUG_TYPE "lower-amx-intrinsics"
+#define DEBUG_TYPE "x86-lower-amx-intrinsics"
#ifndef NDEBUG
static bool isV256I32Ty(Type *Ty) {
@@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() {
return C;
}
+namespace {
+bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) {
+ return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) ||
+ TM->getOptLevel() == CodeGenOptLevel::None);
+}
+
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ X86LowerAMXIntrinsics LAT(F, DTU, LI);
+ return LAT.visit();
+}
+} // namespace
+
+PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &FAM)
{
+ if (!shouldRunLowerAMXIntrinsics(F, TM))
+return PreservedAnalyses::all();
+
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ bool Changed = runLowerAMXIntrinsics(F, &DT, &LI);
+ if (!Changed)
+return PreservedAnalyses::all();
+
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
namespace {
class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
public:
@@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public
FunctionPass {
X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
-if (!X86ScalarizeAMX)
- return false;
TargetMachine *TM =
&getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
-TM->getOptLevel() != CodeGenOptLevel::None)
+if (!shouldRunLowerAMXIntrinsics(F, TM))
return false;
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
-DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
-X86LowerAMXIntrinsics LAT(F, DTU, LI);
-return LAT.visit();
+return runLowerAMXIntrinsics(F, DT, LI);
}
StringRef getPassName() const override { return "Lower AMX intrinsics"; }
@@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_P
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/166211 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Record MMOs in finalizeBundle (PR #166210)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/166210 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Record tied virtual register operands in finalizeBundle (PR #166209)
@@ -0,0 +1,61 @@ +//===- MachineInstrBundleTest.cpp -===// nhaehnle wrote: Indeed, that is a lot nicer https://github.com/llvm/llvm-project/pull/166209 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/166212 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/166213 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Record tied virtual register operands in finalizeBundle (PR #166209)
https://github.com/nhaehnle edited https://github.com/llvm/llvm-project/pull/166209 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 7142f46d62fdd2596c5f6876eed896c07af7ceca Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From c5282b720c3e347617fbf48071af77a4ea0d5dfe Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From fc8a630e3d1107ff1a61484080049ca322be07c6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 79cfc198e89191126f2ef546c0a509c5a271e00e Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 7142f46d62fdd2596c5f6876eed896c07af7ceca Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 057f2521ff54aa71f8068fd64d86fbb41e846fc0 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From 057f2521ff54aa71f8068fd64d86fbb41e846fc0 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From b09504467d9e8e0219d43f49301985f7e0c832cc Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From b09504467d9e8e0219d43f49301985f7e0c832cc Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 8f285601c9110fd4fe09fdb88dac12969c77c88f Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From c5c997dfb7d356422457291b53534ddc3f02cca6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From c5c997dfb7d356422457291b53534ddc3f02cca6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 8f285601c9110fd4fe09fdb88dac12969c77c88f Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From e0611e449d84c99d08b0bd6a5f338cb605740574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9460145d47111..857296459aaf7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4044,10 +4044,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From e0611e449d84c99d08b0bd6a5f338cb605740574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9460145d47111..857296459aaf7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4044,10 +4044,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166211
From f40a20f2d9f58bf0fe5445295794cc187c66c60b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Fri, 3 Oct 2025 18:20:22 -0700
Subject: [PATCH] CodeGen: More accurate mayAlias for instructions with
multiple MMOs
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
This check is applied at the instruction-level earlier in the method.
This change merely extends the check on a per-MMO basis.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO which is now no longer
treated as aliasing loads. The PFD instructions are from llvm.prefetch
generated by loop-data-prefetch.
commit-id:667859fc
---
llvm/lib/CodeGen/MachineInstr.cpp | 8 ++--
llvm/test/CodeGen/SystemZ/vec-load-element.ll | 4 ++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp
b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245a47684..37e5c517d24d8 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const
MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
-for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+}
+ }
return false;
}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed19546df..9bef279d7c0fa 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32 }
@Mem = external global [150 x %type0], align 4
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From 97427fa06abb642ffe0d0f5f892fdf326830d88f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From 97427fa06abb642ffe0d0f5f892fdf326830d88f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit
[llvm-branch-commits] [llvm] CodeGen: Handle bundled instructions in two-address-instructions pass (PR #166212)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166212
From 97427fa06abb642ffe0d0f5f892fdf326830d88f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 23 Sep 2025 19:08:52 -0700
Subject: [PATCH] CodeGen: Handle bundled instructions in
two-address-instructions pass
If the instruction with tied operands is a BUNDLE instruction and we
handle it by replacing an operand, then we need to update the
corresponding internal operands as well. Otherwise, the resulting MIR is
invalid.
The test case is degenerate in the sense that the bundle only contains a
single instruction, but it is sufficient to exercise this issue.
commit-id:6760a9b7
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 16 ++
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 57 +++
2 files changed, 73 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 414e414738b71..1f816b94cf56b 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1665,6 +1665,22 @@ void
TwoAddressInstructionImpl::processTiedPairs(MachineInstr *MI,
// by SubRegB is compatible with RegA with no subregister. So regardless of
// whether the dest oper writes a subreg, the source oper should not.
MO.setSubReg(0);
+
+// Update uses of RegB to uses of RegA inside the bundle.
+if (MI->isBundle()) {
+ for (MachineInstr *InnerMI = MI; InnerMI->isBundledWithSucc();) {
+InnerMI = InnerMI->getNextNode();
+
+for (MachineOperand &MO : InnerMI->all_uses()) {
+ if (MO.isReg() && MO.getReg() == RegB) {
+assert(
+MO.getSubReg() == 0 &&
+"tied subregister uses in bundled instructions not supported");
+MO.setReg(RegA);
+ }
+}
+ }
+}
}
if (AllUsesCopied) {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
new file mode 100644
index 0..696962a88c8b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 %s --passes=two-address-instruction
-verify-each -o - | FileCheck --check-prefixes=GCN %s
+
+# Exercise very basic handling of BUNDLE'd instructions by the
two-address-instruction pass.
+
+# This test is an example where it is best to keep the two-address instruction
+# and resolve the tie with a COPY that is expected to be coalesced.
+---
+name:test_fmac_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]],
[[COPY1]], 0, implicit $exec
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY2]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY2]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY2]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+%10:vgpr_32 = COPY $vgpr0
+%11:vgpr_32 = COPY $vgpr1
+%2:vgpr_32 = V_ADD_U32_e64 %10, %11, 0, implicit $exec
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit %0, implicit %1, implicit killed
%2(tied-def 0), implicit $mode, implicit $exec {
+ %3:vgpr_32 = V_FMAC_F32_e32 killed %0, killed %1, killed %2, implicit
$mode, implicit $exec
+}
+
+...
+
+# This test is an example where conversion to three-address form would be
beneficial.
+---
+name:test_fmac_reuse_bundle
+body: |
+ bb.0:
+
+; GCN-LABEL: name: test_fmac_reuse_bundle
+; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+; GCN-NEXT: BUNDLE implicit-def [[COPY1]], implicit [[DEF]], implicit
[[DEF1]], implicit [[COPY1]](tied-def 0), implicit $mode, implicit $exec {
+; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = V_FMAC_F32_e32 killed [[DEF]],
killed [[DEF1]], killed [[COPY1]], implicit $mode, implicit $exec
+; GCN-NEXT: }
+; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]],
[[COPY]], 0, implicit $exec
+%2:vgpr_32 = COPY $vgpr0
+%0:vgpr_32 = IMPLICIT_DEF
+%1:vgpr_32 = IMPLICIT_DEF
+BUNDLE implicit-def %3:vgpr_32, implicit
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166211
From f40a20f2d9f58bf0fe5445295794cc187c66c60b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Fri, 3 Oct 2025 18:20:22 -0700
Subject: [PATCH] CodeGen: More accurate mayAlias for instructions with
multiple MMOs
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
This check is applied at the instruction-level earlier in the method.
This change merely extends the check on a per-MMO basis.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO which is now no longer
treated as aliasing loads. The PFD instructions are from llvm.prefetch
generated by loop-data-prefetch.
commit-id:667859fc
---
llvm/lib/CodeGen/MachineInstr.cpp | 8 ++--
llvm/test/CodeGen/SystemZ/vec-load-element.ll | 4 ++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp
b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245a47684..37e5c517d24d8 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const
MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
-for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+}
+ }
return false;
}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed19546df..9bef279d7c0fa 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32 }
@Mem = external global [150 x %type0], align 4
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen/AMDGPU: Allow 3-address conversion of bundled instructions (PR #166213)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166213
From e0611e449d84c99d08b0bd6a5f338cb605740574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Tue, 7 Oct 2025 12:17:02 -0700
Subject: [PATCH] CodeGen/AMDGPU: Allow 3-address conversion of bundled
instructions
This is in preparation for future changes in AMDGPU that will make more
substantial use of bundles pre-RA. For now, simply test this with
degenerate (single-instruction) bundles.
commit-id:4a30cb78
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 54 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp| 56 +--
llvm/test/CodeGen/AMDGPU/twoaddr-bundle.mir | 9 ++-
3 files changed, 87 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 1f816b94cf56b..7056ced5385ed 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -794,29 +794,34 @@ bool TwoAddressInstructionImpl::convertInstTo3Addr(
if (!NewMI)
return false;
- LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
-
- // If the old instruction is debug value tracked, an update is required.
- if (auto OldInstrNum = mi->peekDebugInstrNum()) {
-assert(mi->getNumExplicitDefs() == 1);
-assert(NewMI->getNumExplicitDefs() == 1);
-
-// Find the old and new def location.
-unsigned OldIdx = mi->defs().begin()->getOperandNo();
-unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
-
-// Record that one def has been replaced by the other.
-unsigned NewInstrNum = NewMI->getDebugInstrNum();
-MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
- std::make_pair(NewInstrNum, NewIdx));
- }
-
- MBB->erase(mi); // Nuke the old inst.
-
for (MachineInstr &MI : MIS)
DistanceMap.insert(std::make_pair(&MI, Dist++));
- Dist--;
+
+ if (&*mi == NewMI) {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTED IN-PLACE TO 3-ADDR: " << *mi);
+ } else {
+LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+// If the old instruction is debug value tracked, an update is required.
+if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+}
+
+MBB->erase(mi); // Nuke the old inst.
+Dist--;
+ }
+
mi = NewMI;
nmi = std::next(mi);
@@ -1329,6 +1334,9 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+ // Give targets a chance to convert bundled instructions.
+ bool ConvertibleTo3Addr = MI.isConvertibleTo3Addr(MachineInstr::AnyInBundle);
+
// If the instruction is convertible to 3 Addr, instead
// of returning try 3 Addr transformation aggressively and
// use this variable to check later. Because it might be better.
@@ -1337,7 +1345,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
// addl %esi, %edi
// movl %edi, %eax
// ret
- if (Commuted && !MI.isConvertibleTo3Addr())
+ if (Commuted && !ConvertibleTo3Addr)
return false;
if (shouldOnlyCommute)
@@ -1357,7 +1365,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
- if (MI.isConvertibleTo3Addr()) {
+ if (ConvertibleTo3Addr) {
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9460145d47111..857296459aaf7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4044,10 +4044,29 @@ MachineInstr
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineInstr *CandidateMI = &MI;
+
+ if (MI.isBundle()) {
+// This is a temporary placeholder for bundle handling that enables us to
+// exercise the relevant code paths in the two-address instruction pass.
+i
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/nhaehnle updated
https://github.com/llvm/llvm-project/pull/166211
From f40a20f2d9f58bf0fe5445295794cc187c66c60b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Fri, 3 Oct 2025 18:20:22 -0700
Subject: [PATCH] CodeGen: More accurate mayAlias for instructions with
multiple MMOs
There can only be meaningful aliasing between the memory accesses of
different instructions if at least one of the accesses modifies memory.
This check is applied at the instruction-level earlier in the method.
This change merely extends the check on a per-MMO basis.
This affects a SystemZ test because PFD instructions are both mayLoad
and mayStore but may carry a load-only MMO which is now no longer
treated as aliasing loads. The PFD instructions are from llvm.prefetch
generated by loop-data-prefetch.
commit-id:667859fc
---
llvm/lib/CodeGen/MachineInstr.cpp | 8 ++--
llvm/test/CodeGen/SystemZ/vec-load-element.ll | 4 ++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp
b/llvm/lib/CodeGen/MachineInstr.cpp
index 8ad9245a47684..37e5c517d24d8 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1547,10 +1547,14 @@ bool MachineInstr::mayAlias(BatchAAResults *AA, const
MachineInstr &Other,
// Check each pair of memory operands from both instructions, which can't
// alias only if all pairs won't alias.
- for (auto *MMOa : memoperands())
-for (auto *MMOb : Other.memoperands())
+ for (auto *MMOa : memoperands()) {
+for (auto *MMOb : Other.memoperands()) {
+ if (!MMOa->isStore() && !MMOb->isStore())
+continue;
if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
return true;
+}
+ }
return false;
}
diff --git a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
index 2baaed19546df..9bef279d7c0fa 100644
--- a/llvm/test/CodeGen/SystemZ/vec-load-element.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-load-element.ll
@@ -5,8 +5,8 @@
; CHECK-LABEL: .LBB0_1:
; CHECK-NOT: l %r
; CHECK-NOT: vlvgf
-; CHECK: pfd
-; CHECK: vlef
+; CHECK-DAG: pfd
+; CHECK-DAG: vlef
%type0 = type { i32, [400 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32,
i32 }
@Mem = external global [150 x %type0], align 4
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Really use AV classes by default for vector classes (PR #166483)
https://github.com/shiltian approved this pull request. https://github.com/llvm/llvm-project/pull/166483 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] ExpandFp: Require RuntimeLibcallsInfo analysis (PR #165197)
https://github.com/arsenm updated
https://github.com/llvm/llvm-project/pull/165197
>From 7fd144cf1ef36cbc2c7f896fd23185292920816d Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 26 Oct 2025 02:44:00 +0900
Subject: [PATCH] ExpandFp: Require RuntimeLibcallsInfo analysis
Not sure I'm doing the new pass manager handling correctly. I do
not like needing to manually check if the cached module pass is
available and manually erroring in every pass.
---
llvm/lib/CodeGen/ExpandFp.cpp | 14 ++
llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll | 4 ++--
llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll | 2 +-
.../Transforms/ExpandFp/AMDGPU/missing-analysis.ll | 6 ++
.../Transforms/ExpandFp/AMDGPU/pass-parameters.ll | 8
5 files changed, 27 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index f44eb227133ae..9386ffe7791a3 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/RuntimeLibcallInfo.h"
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -1092,6 +1093,8 @@ class ExpandFpLegacyPass : public FunctionPass {
auto *TM = &getAnalysis().getTM();
auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
AssumptionCache *AC = nullptr;
+const RTLIB::RuntimeLibcallsInfo *Libcalls =
+&getAnalysis().getRTLCI(*F.getParent());
if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone())
AC = &getAnalysis().getAssumptionCache(F);
@@ -1104,6 +1107,7 @@ class ExpandFpLegacyPass : public FunctionPass {
AU.addRequired();
AU.addPreserved();
AU.addPreserved();
+AU.addRequired();
}
};
} // namespace
@@ -1126,6 +1130,15 @@ PreservedAnalyses ExpandFpPass::run(Function &F,
FunctionAnalysisManager &FAM) {
AssumptionCache *AC = nullptr;
if (OptLevel != CodeGenOptLevel::None)
AC = &FAM.getResult(F);
+
+ auto &MAMProxy = FAM.getResult(F);
+ const RTLIB::RuntimeLibcallsInfo *Libcalls =
+ MAMProxy.getCachedResult(*F.getParent());
+ if (!Libcalls) {
+F.getContext().emitError("'runtime-libcall-info' analysis required");
+return PreservedAnalyses::all();
+ }
+
return runImpl(F, TLI, AC) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
}
@@ -1133,6 +1146,7 @@ PreservedAnalyses ExpandFpPass::run(Function &F,
FunctionAnalysisManager &FAM) {
char ExpandFpLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
"Expand certain fp instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(RuntimeLibraryInfoWrapper)
INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) {
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
index f70f0d25f172d..4d302f63e1f0b 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK %s
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK,OPT1 %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK,OPT1 %s
; Check the handling of potentially infinite numerators in the frem
; expansion at different optimization levels and with different
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
index 4c0f9db147c96..56ccfb6bf454c 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
%in1,
; CHECK-LABEL: define amdgpu_kernel void @frem_f16(
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
new file mode 100644
index 0..5cad68e66d3ee
--- /dev/null
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -mtriple=amdgcn -passes=expand-fp -disable-output %s 2>&1 |
FileCheck %s
+
+; CHECK: 'runtime-libc
[llvm-branch-commits] [llvm] ExpandFp: Require RuntimeLibcallsInfo analysis (PR #165197)
https://github.com/arsenm updated
https://github.com/llvm/llvm-project/pull/165197
>From 7fd144cf1ef36cbc2c7f896fd23185292920816d Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 26 Oct 2025 02:44:00 +0900
Subject: [PATCH] ExpandFp: Require RuntimeLibcallsInfo analysis
Not sure I'm doing the new pass manager handling correctly. I do
not like needing to manually check if the cached module pass is
available and manually erroring in every pass.
---
llvm/lib/CodeGen/ExpandFp.cpp | 14 ++
llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll | 4 ++--
llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll | 2 +-
.../Transforms/ExpandFp/AMDGPU/missing-analysis.ll | 6 ++
.../Transforms/ExpandFp/AMDGPU/pass-parameters.ll | 8
5 files changed, 27 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index f44eb227133ae..9386ffe7791a3 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/RuntimeLibcallInfo.h"
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -1092,6 +1093,8 @@ class ExpandFpLegacyPass : public FunctionPass {
auto *TM = &getAnalysis().getTM();
auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
AssumptionCache *AC = nullptr;
+const RTLIB::RuntimeLibcallsInfo *Libcalls =
+&getAnalysis().getRTLCI(*F.getParent());
if (OptLevel != CodeGenOptLevel::None && !F.hasOptNone())
AC = &getAnalysis().getAssumptionCache(F);
@@ -1104,6 +1107,7 @@ class ExpandFpLegacyPass : public FunctionPass {
AU.addRequired();
AU.addPreserved();
AU.addPreserved();
+AU.addRequired();
}
};
} // namespace
@@ -1126,6 +1130,15 @@ PreservedAnalyses ExpandFpPass::run(Function &F,
FunctionAnalysisManager &FAM) {
AssumptionCache *AC = nullptr;
if (OptLevel != CodeGenOptLevel::None)
AC = &FAM.getResult(F);
+
+ auto &MAMProxy = FAM.getResult(F);
+ const RTLIB::RuntimeLibcallsInfo *Libcalls =
+ MAMProxy.getCachedResult(*F.getParent());
+ if (!Libcalls) {
+F.getContext().emitError("'runtime-libcall-info' analysis required");
+return PreservedAnalyses::all();
+ }
+
return runImpl(F, TLI, AC) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
}
@@ -1133,6 +1146,7 @@ PreservedAnalyses ExpandFpPass::run(Function &F,
FunctionAnalysisManager &FAM) {
char ExpandFpLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
"Expand certain fp instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(RuntimeLibraryInfoWrapper)
INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) {
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
index f70f0d25f172d..4d302f63e1f0b 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem-inf.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK %s
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK,OPT1 %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck
--check-prefixes CHECK,OPT1 %s
; Check the handling of potentially infinite numerators in the frem
; expansion at different optimization levels and with different
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
index 4c0f9db147c96..56ccfb6bf454c 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn -passes="expand-fp" %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn
-passes="require,expand-fp" %s -S -o - | FileCheck %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
%in1,
; CHECK-LABEL: define amdgpu_kernel void @frem_f16(
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
b/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
new file mode 100644
index 0..5cad68e66d3ee
--- /dev/null
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/missing-analysis.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -mtriple=amdgcn -passes=expand-fp -disable-output %s 2>&1 |
FileCheck %s
+
+; CHECK: 'runtime-libc
[llvm-branch-commits] [llvm] Analysis: Add RuntimeLibcall analysis pass (PR #165196)
https://github.com/arsenm updated
https://github.com/llvm/llvm-project/pull/165196
>From 0158564d168101f8ec0f6ecb8a0ac98e5f3e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 2 Jun 2025 18:32:22 +0200
Subject: [PATCH] Analysis: Add RuntimeLibcall analysis pass
Currently RuntimeLibcallsInfo is a hardcoded list based on the triple.
In the future the available libcall set should be dynamically modifiable
with module flags.
Note this isn't really used yet. TargetLowering is still constructing
its own copy, and untangling that to use this requires several more
steps.
---
.../llvm/Analysis/RuntimeLibcallInfo.h| 60 +++
llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1 +
llvm/include/llvm/IR/RuntimeLibcalls.h| 10
llvm/include/llvm/InitializePasses.h | 1 +
llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 +
llvm/lib/Analysis/Analysis.cpp| 1 +
llvm/lib/Analysis/CMakeLists.txt | 1 +
llvm/lib/Analysis/RuntimeLibcallInfo.cpp | 43 +
llvm/lib/IR/RuntimeLibcalls.cpp | 7 ++-
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/Target.cpp| 1 +
12 files changed, 129 insertions(+), 1 deletion(-)
create mode 100644 llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
create mode 100644 llvm/lib/Analysis/RuntimeLibcallInfo.cpp
diff --git a/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
new file mode 100644
index 0..a3e1014b417e5
--- /dev/null
+++ b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
@@ -0,0 +1,60 @@
+//===-- RuntimeLibcallInfo.h - Runtime library information --*- C++
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H
+#define LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H
+
+#include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class LLVM_ABI RuntimeLibraryAnalysis
+: public AnalysisInfoMixin<RuntimeLibraryAnalysis> {
+public:
+ using Result = RTLIB::RuntimeLibcallsInfo;
+
+ RuntimeLibraryAnalysis() = default;
+ RuntimeLibraryAnalysis(RTLIB::RuntimeLibcallsInfo &&BaselineInfoImpl)
+ : LibcallsInfo(std::move(BaselineInfoImpl)) {}
+ explicit RuntimeLibraryAnalysis(const Triple &T) : LibcallsInfo(T) {}
+
+ LLVM_ABI RTLIB::RuntimeLibcallsInfo run(const Module &M,
+ ModuleAnalysisManager &);
+
+private:
+ friend AnalysisInfoMixin<RuntimeLibraryAnalysis>;
+ LLVM_ABI static AnalysisKey Key;
+
+ RTLIB::RuntimeLibcallsInfo LibcallsInfo;
+};
+
+class LLVM_ABI RuntimeLibraryInfoWrapper : public ImmutablePass {
+ RuntimeLibraryAnalysis RTLA;
+ std::optional<RTLIB::RuntimeLibcallsInfo> RTLCI;
+
+public:
+ static char ID;
+ RuntimeLibraryInfoWrapper();
+ explicit RuntimeLibraryInfoWrapper(const Triple &T);
+ explicit RuntimeLibraryInfoWrapper(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+ const RTLIB::RuntimeLibcallsInfo &getRTLCI(const Module &M) {
+ModuleAnalysisManager DummyMAM;
+RTLCI = RTLA.run(M, DummyMAM);
+return *RTLCI;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+LLVM_ABI ModulePass *createRuntimeLibraryInfoWrapperPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index 5241a51dd8cd8..d7921c3eb3f7c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -46,6 +46,7 @@ class SelectionDAGISel {
public:
TargetMachine &TM;
const TargetLibraryInfo *LibInfo;
+ const RTLIB::RuntimeLibcallsInfo *RuntimeLibCallInfo;
std::unique_ptr FuncInfo;
std::unique_ptr SwiftError;
MachineFunction *MF;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 78e4b1723aafa..c822b6530a441 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -9,6 +9,8 @@
// This file implements a common interface to work with library calls into a
// runtime that may be emitted by a given backend.
//
+// FIXME: This should probably move to Analysis
+//
//===--===//
#ifndef LLVM_IR_RUNTIME_LIBCALLS_H
@@ -20,6 +22,7 @@
#include "llvm/ADT/StringTable.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -74,6 +77,8 @@ struct RuntimeLibcallsInfo {
public:
friend class llvm::LibcallLoweringInfo;
+ RuntimeLibcallsInfo() = default;
+
[llvm-branch-commits] [llvm] Analysis: Add RuntimeLibcall analysis pass (PR #165196)
https://github.com/arsenm updated
https://github.com/llvm/llvm-project/pull/165196
>From 0158564d168101f8ec0f6ecb8a0ac98e5f3e Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 2 Jun 2025 18:32:22 +0200
Subject: [PATCH] Analysis: Add RuntimeLibcall analysis pass
Currently RuntimeLibcallsInfo is a hardcoded list based on the triple.
In the future the available libcall set should be dynamically modifiable
with module flags.
Note this isn't really used yet. TargetLowering is still constructing
its own copy, and untangling that to use this requires several more
steps.
---
.../llvm/Analysis/RuntimeLibcallInfo.h| 60 +++
llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1 +
llvm/include/llvm/IR/RuntimeLibcalls.h| 10
llvm/include/llvm/InitializePasses.h | 1 +
llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 +
llvm/lib/Analysis/Analysis.cpp| 1 +
llvm/lib/Analysis/CMakeLists.txt | 1 +
llvm/lib/Analysis/RuntimeLibcallInfo.cpp | 43 +
llvm/lib/IR/RuntimeLibcalls.cpp | 7 ++-
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/Target.cpp| 1 +
12 files changed, 129 insertions(+), 1 deletion(-)
create mode 100644 llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
create mode 100644 llvm/lib/Analysis/RuntimeLibcallInfo.cpp
diff --git a/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
new file mode 100644
index 0..a3e1014b417e5
--- /dev/null
+++ b/llvm/include/llvm/Analysis/RuntimeLibcallInfo.h
@@ -0,0 +1,60 @@
+//===-- RuntimeLibcallInfo.h - Runtime library information --*- C++
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#ifndef LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H
+#define LLVM_ANALYSIS_RUNTIMELIBCALLINFO_H
+
+#include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class LLVM_ABI RuntimeLibraryAnalysis
+: public AnalysisInfoMixin<RuntimeLibraryAnalysis> {
+public:
+ using Result = RTLIB::RuntimeLibcallsInfo;
+
+ RuntimeLibraryAnalysis() = default;
+ RuntimeLibraryAnalysis(RTLIB::RuntimeLibcallsInfo &&BaselineInfoImpl)
+ : LibcallsInfo(std::move(BaselineInfoImpl)) {}
+ explicit RuntimeLibraryAnalysis(const Triple &T) : LibcallsInfo(T) {}
+
+ LLVM_ABI RTLIB::RuntimeLibcallsInfo run(const Module &M,
+ ModuleAnalysisManager &);
+
+private:
+ friend AnalysisInfoMixin<RuntimeLibraryAnalysis>;
+ LLVM_ABI static AnalysisKey Key;
+
+ RTLIB::RuntimeLibcallsInfo LibcallsInfo;
+};
+
+class LLVM_ABI RuntimeLibraryInfoWrapper : public ImmutablePass {
+ RuntimeLibraryAnalysis RTLA;
+ std::optional<RTLIB::RuntimeLibcallsInfo> RTLCI;
+
+public:
+ static char ID;
+ RuntimeLibraryInfoWrapper();
+ explicit RuntimeLibraryInfoWrapper(const Triple &T);
+ explicit RuntimeLibraryInfoWrapper(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+ const RTLIB::RuntimeLibcallsInfo &getRTLCI(const Module &M) {
+ModuleAnalysisManager DummyMAM;
+RTLCI = RTLA.run(M, DummyMAM);
+return *RTLCI;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+LLVM_ABI ModulePass *createRuntimeLibraryInfoWrapperPass();
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index 5241a51dd8cd8..d7921c3eb3f7c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -46,6 +46,7 @@ class SelectionDAGISel {
public:
TargetMachine &TM;
const TargetLibraryInfo *LibInfo;
+ const RTLIB::RuntimeLibcallsInfo *RuntimeLibCallInfo;
std::unique_ptr FuncInfo;
std::unique_ptr SwiftError;
MachineFunction *MF;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h
b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 78e4b1723aafa..c822b6530a441 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -9,6 +9,8 @@
// This file implements a common interface to work with library calls into a
// runtime that may be emitted by a given backend.
//
+// FIXME: This should probably move to Analysis
+//
//===--===//
#ifndef LLVM_IR_RUNTIME_LIBCALLS_H
@@ -20,6 +22,7 @@
#include "llvm/ADT/StringTable.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
@@ -74,6 +77,8 @@ struct RuntimeLibcallsInfo {
public:
friend class llvm::LibcallLoweringInfo;
+ RuntimeLibcallsInfo() = default;
+
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:[[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT:br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT:[[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]],
label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT:[[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT:[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]],
i64 [[INDEX]]
+; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 4
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 8
+; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 12
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT:[[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:[[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT:[[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT:[[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT:[[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:[[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT:[[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT:[[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:[[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:[[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:[[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT:[[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT:[[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x
i64> [[BIN_RDX9]])
+; CHECK-NEXT:[[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[CMP_N]], label %[[BB25:.*]], label
%[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT:br i1 [[MIN_EPILOG_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:[[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:[[N_VEC11:%.*]] = sub i64
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:[[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT:br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT:[[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]],
label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT:[[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT:[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]],
i64 [[INDEX]]
+; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 4
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 8
+; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 12
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT:[[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:[[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT:[[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT:[[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT:[[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:[[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT:[[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT:[[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:[[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:[[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:[[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT:[[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT:[[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x
i64> [[BIN_RDX9]])
+; CHECK-NEXT:[[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[CMP_N]], label %[[BB25:.*]], label
%[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT:br i1 [[MIN_EPILOG_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:[[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:[[N_VEC11:%.*]] = sub i64
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:[[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT:br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT:[[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]],
label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT:[[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT:[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]],
i64 [[INDEX]]
+; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 4
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 8
+; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 12
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT:[[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:[[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT:[[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT:[[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT:[[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:[[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT:[[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT:[[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:[[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:[[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT:[[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT:[[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT:[[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x
i64> [[BIN_RDX9]])
+; CHECK-NEXT:[[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[CMP_N]], label %[[BB25:.*]], label
%[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT:br i1 [[MIN_EPILOG_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:[[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]],
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:[[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:[[N_VEC11:%.*]] = sub i64
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s |
FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for "
+; vs "the VPlan with the best kind of reduction for ",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read)
uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef
%1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:[[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT:br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT:[[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT:[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK]], label
%[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:[[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT:br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]],
label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT:[[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT:[[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT:[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer,
%[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]],
i64 [[INDEX]]
+; CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 4
+; CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 8
+; CHECK-NEXT:[[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]],
i32 12
+; CHECK-NEXT:[[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT:[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT:[[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT:[[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:[[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT:[[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT:[[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT:[[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT:[[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT:[[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT:[[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT:[[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT:[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:[[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
sushgokh wrote:
you dont need to check post middle block. You can use something like
'--filter-out-after' when generating the test
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, AbstractR = ExtRed; // Cannot create abstract inloop reduction recipes. if (!AbstractR) -return; +return false; AbstractR->insertBefore(*VPBB, IP); Red->replaceAllUsesWith(AbstractR); + return true; sushgokh wrote: Need to add `Red->eraseFromParent();` before this ? https://github.com/llvm/llvm-project/pull/166138 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void
tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
-return;
+return false;
AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
+ return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+ VPCostContext &Ctx) {
+ VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+ if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
+PhiR->setVFScaleFactor(1);
+
+// We also need to update the scale factor of the reduction-start-vector
+// operand.
+VPValue *StartV, *IdentityV;
+if (!match(PhiR->getOperand(0),
+ m_VPInstruction<VPInstruction::ReductionStartVector>(
+ m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
+ llvm_unreachable("Unexpected operand for a partial reduction");
+Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+ }
+
+ if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
+if (R->getVFScaleFactor() != 1)
+ lowerPartialReduction(Plan, R, Ctx);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Lowering " << *Red
+ << " back to regular reduction, because it is not profitable\n");
+
+ // Lower the partial reduction to a regular binop.
+ VPBuilder Builder(Red);
+ VPInstruction *Add = Builder.createNaryOp(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ {Red->getChainOp(), Red->getVecOp()});
+ if (Red->isConditional())
+Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+ Red->replaceAllUsesWith(Add);
+ Red->eraseFromParent();
}
void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
-tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ auto *Red = dyn_cast<VPReductionRecipe>(&R);
+ if (!Red)
+continue;
+
+ if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
sushgokh wrote:
If 'Red' is converted to AbstractRecipe, Red should be null. Can you check for
`Red` instead of returning bool from the `tryToCreateAbstractReductionRecipe` ?
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
+ unsigned Ratio =
+ AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+
+ // A ratio of 1 would mean it's similar to a regular add, e.g.
+ // v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
+ // <=> add v4i64 %acc, %vec
+ if (Ratio == 1) {
sushgokh wrote:
rather than checking for ratio=1, you should either replace
1.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp#L323
with getArithmeticInstructionCost (I believe this line is simply checking for
the simple add reduction)
OR
2.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp#L3552
with getAirthmeticInstructionCost
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LV] Choose best reduction for VPlan (PR #166138)
@@ -3773,19 +3775,76 @@ static void
tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
-return;
+return false;
AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
+ return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+ VPCostContext &Ctx) {
+ VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
+PhiR->setVFScaleFactor(1);
+
+// We also need to update the scale factor of the reduction-start-vector
+// operand.
+VPValue *StartV, *IdentityV;
if (!match(PhiR->getOperand(0),
   m_VPInstruction<VPInstruction::ReductionStartVector>(
   m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
  llvm_unreachable("Unexpected operand for a partial reduction");
+Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+ }
+
if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
+if (R->getVFScaleFactor() != 1)
+ lowerPartialReduction(Plan, R, Ctx);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Lowering " << *Red
+ << " back to regular reduction, because it is not profitable\n");
+
+ // Lower the partial reduction to a regular binop.
+ VPBuilder Builder(Red);
+ VPInstruction *Add = Builder.createNaryOp(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+ {Red->getChainOp(), Red->getVecOp()});
+ if (Red->isConditional())
+Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+ Red->replaceAllUsesWith(Add);
+ Red->eraseFromParent();
}
void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(
vp_depth_first_deep(Plan.getVectorLoopRegion( {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
-tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+ auto *Red = dyn_cast<VPReductionRecipe>(&R);
+ if (!Red)
+continue;
+
+ if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
+ isa<VPPartialReductionRecipe>(Red)) {
+// If there isn't a profitable VPExpression for a partial reduction,
+// then that suggests using a partial reduction is not profitable
+// for this VPlan. It seems better to resort to a regular
(middle-block)
+// reduction, so that this plan is still profitable to consider.
+// Otherwise, the plan might be discarded in favour of a smaller VF.
+//
+// FIXME: There's a lot to unpick when it comes to partial
+// reductions, but this should provide a temporary stop-gap until we
+// reimplement the logic for creating partial reductions.
+lowerPartialReduction(Plan, cast(Red), Ctx);
sushgokh wrote:
it would be great if we can lower this
[here](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp#L8444),
basically after this
https://github.com/llvm/llvm-project/pull/166138
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT][NFC] Rename Pointer Auth DWARF rewriter passes (PR #164622)
Gergely =?utf-8?q?Bálint?= , Gergely =?utf-8?q?Bálint?= Message-ID: In-Reply-To: paschalis-mpeis wrote: > One piece of doubt that I have since I opened this PR: the term CFI is > overloaded, it can mean Call Frame Information and Control Flow Integrity. > This is even more problematic than usually in this case, where we talk about > the Call Frame Information related to a Control Flow Integrity mechanism, > namely Pointer Authentication. > > Another way to name these would be PointerAuthDwarf* (or capital DWARF*), to > signal that these passes take care of the DWARF CFIs in the binary. That is a good point. Generalizing to DWARF may help, but I don't have strong opinions. If the name feels too long, can consider `PAuth<..>`. It may be worth briefly describing both terms in the doc (as a hint / callout near the top) and expanding the CFI acronyms in both `*Analyzer` and `*Fixup` headers (ie [here](https://github.com/llvm/llvm-project/pull/164622/files#L10)). https://github.com/llvm/llvm-project/pull/164622 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Handle zeroing ZA and ZT0 in functions with ZT0 state (PR #166361)
MacDue wrote: This is a stacked PR. See other PRs below: 1. https://github.com/llvm/llvm-project/pull/166360 2. :point_right: https://github.com/llvm/llvm-project/pull/166361 3. https://github.com/llvm/llvm-project/pull/166362 https://github.com/llvm/llvm-project/pull/166361 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass (PR #166362)
llvmbot wrote: @llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) Changes This patch extends the MachineSMEABIPass to support ZT0. This is done with the addition of two new states: - `ACTIVE_ZT0_SAVED` * This is used when calling a function that shares ZA, but does share ZT0 (i.e., no ZT0 attributes). * This state indicates ZT0 must be saved to the save slot, but must remain on, with no lazy save setup - `LOCAL_COMMITTED` * This is used for saving ZT0 in functions without ZA state. * This state indicates ZA is off and ZT0 has been saved. * This state is general enough to support ZA, but those have not been implemented† To aid with readability, the state transitions have been reworked to a switch of `transitionFrom().to( )`, rather than nested ifs, which helps manage more transitions. † This could be implemented to handle some cases of undefined behavior better. --- Patch is 29.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166362.diff 7 Files Affected: - (modified) llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp (+1) - (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+8-3) - (modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+6) - (modified) llvm/lib/Target/AArch64/MachineSMEABIPass.cpp (+150-26) - (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (-4) - (modified) llvm/test/CodeGen/AArch64/sme-za-exceptions.ll (+96-28) - (modified) llvm/test/CodeGen/AArch64/sme-zt0-state.ll (+60-44) ``diff diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case 
AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30f961043e78b..20c1c6790b2fb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9457,6 +9457,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; +else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9576,7 +9578,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9589,7 +9592,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. - bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -10074,7 +10078,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs)); } - if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) + if (!UseNewSMEABILowering && + (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())) // Unconditionally resume ZA. 
Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 8f8f211c5fceb..2753a4561daae 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), let hasSideEffects = 1, isMeta = 1 in { def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; } def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; @@ -122,6 +123,11 @@ def AArch64_requires_za_sav
[llvm-branch-commits] [llvm] [AArch64][SME] Handle zeroing ZA and ZT0 in functions with ZT0 state (PR #166361)
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes
In the MachineSMEABIPass, if we have a function with ZT0 state, then there are
some additional cases where we need to zero ZA and ZT0.
If the function has a private ZA interface, i.e., new ZT0 (and new ZA if
present). Then ZT0/ZA must be zeroed when committing the incoming ZA save.
If the function has a shared ZA interface, e.g. new ZA and shared ZT0. Then ZA
must be zeroed on function entry (without a ZA save commit).
The logic in the ABI pass has been reworked to use an "ENTRY" state to handle
this (rather than the more specific "CALLER_DORMANT" state).
---
Full diff: https://github.com/llvm/llvm-project/pull/166361.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (-9)
- (modified) llvm/lib/Target/AArch64/MachineSMEABIPass.cpp (+57-42)
- (modified) llvm/test/CodeGen/AArch64/sme-zt0-state.ll (+11-18)
``diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..30f961043e78b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8735,15 +8735,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
}
- if (getTM().useNewSMEABILowering()) {
-// Clear new ZT0 state. TODO: Move this to the SME ABI pass.
-if (Attrs.isNewZT0())
- Chain = DAG.getNode(
- ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
- DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
- DAG.getTargetConstant(0, DL, MVT::i32));
- }
-
return Chain;
}
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 8f9aae944ad6d..bb4dfe8c60904 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -82,8 +82,8 @@ enum ZAState {
// A ZA save has been set up or committed (i.e. ZA is dormant or off)
LOCAL_SAVED,
- // ZA is off or a lazy save has been set up by the caller
- CALLER_DORMANT,
+ // The ZA/ZT0 state on entry to the function.
+ ENTRY,
// ZA is off
OFF,
@@ -200,7 +200,7 @@ StringRef getZAStateString(ZAState State) {
MAKE_CASE(ZAState::ANY)
MAKE_CASE(ZAState::ACTIVE)
MAKE_CASE(ZAState::LOCAL_SAVED)
-MAKE_CASE(ZAState::CALLER_DORMANT)
+MAKE_CASE(ZAState::ENTRY)
MAKE_CASE(ZAState::OFF)
default:
llvm_unreachable("Unexpected ZAState");
@@ -281,8 +281,8 @@ struct MachineSMEABI : public MachineFunctionPass {
void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
// Emission routines for private and shared ZA functions (using lazy saves).
- void emitNewZAPrologue(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ void emitSMEPrologue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
void emitRestoreLazySave(EmitContext &, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
@@ -395,9 +395,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs
SMEFnAttrs) {
if (MBB.isEntryBlock()) {
// Entry block:
- Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
- ? ZAState::CALLER_DORMANT
- : ZAState::ACTIVE;
+ Block.FixedEntryState = ZAState::ENTRY;
} else if (MBB.isEHPad()) {
// EH entry block:
Block.FixedEntryState = ZAState::LOCAL_SAVED;
@@ -815,32 +813,49 @@ void MachineSMEABI::emitAllocateLazySaveBuffer(
}
}
-void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111;
+
+void MachineSMEABI::emitSMEPrologue(MachineBasicBlock &MBB,
+MachineBasicBlock::iterator MBBI) {
auto *TLI = Subtarget->getTargetLowering();
DebugLoc DL = getDebugLoc(MBB, MBBI);
- // Get current TPIDR2_EL0.
- Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
- .addReg(TPIDR2EL0, RegState::Define)
- .addImm(AArch64SysReg::TPIDR2_EL0);
- // If TPIDR2_EL0 is non-zero, commit the lazy save.
- // NOTE: Functions that only use ZT0 don't need to zero ZA.
- bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
- auto CommitZASave =
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
- .addReg(TPIDR2EL0)
- .addImm(ZeroZA ? 1 : 0)
- .addImm(/*ZeroZT0=*/false)
- .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
- .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
- if (ZeroZA)
-CommitZASave.addDef(AArch6
[llvm-branch-commits] [llvm] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass (PR #166362)
https://github.com/MacDue ready_for_review https://github.com/llvm/llvm-project/pull/166362 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass (PR #166362)
MacDue wrote: This is a stacked PR. See other PRs below: 1. https://github.com/llvm/llvm-project/pull/166360 2. https://github.com/llvm/llvm-project/pull/166361 3. :point_right: https://github.com/llvm/llvm-project/pull/166362 https://github.com/llvm/llvm-project/pull/166362 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Handle zeroing ZA and ZT0 in functions with ZT0 state (PR #166361)
https://github.com/MacDue ready_for_review https://github.com/llvm/llvm-project/pull/166361 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [BOLT][PAC] Warn about synchronous unwind tables (PR #165227)
paschalis-mpeis wrote: Sounds good. So BOLT doesnt know whether inconsistencies are due to sync/async unwind tables, but instead makes an assumption to help the user? Could you summarise this with a comment in code around the point you emit the warning? Since you are on this, should it handle cases like the below? (we've started seeing these recently) > BOLT-INFO: MarkRAStates ran on 0 functions. Ignored 0 functions (nan%) > because of CFI inconsistencies https://github.com/llvm/llvm-project/pull/165227 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass (PR #166362)
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/166362 This patch extends the MachineSMEABIPass to support ZT0. This is done with the addition of two new states: - `ACTIVE_ZT0_SAVED` * This is used when calling a function that shares ZA, but does share ZT0 (i.e., no ZT0 attributes). * This state indicates ZT0 must be saved to the save slot, but must remain on, with no lazy save setup - `LOCAL_COMMITTED` * This is used for saving ZT0 in functions without ZA state. * This state indicates ZA is off and ZT0 has been saved. * This state is general enough to support ZA, but those have not been implemented† To aid with readability, the state transitions have been reworked to a switch of `transitionFrom().to()`, rather than nested ifs, which helps manage more transitions. † This could be implemented to handle some cases of undefined behavior better. >From dc41be430aa17616f431e0ce793e66f92df28881 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 3 Nov 2025 15:41:49 + Subject: [PATCH] [AArch64][SME] Support saving/restoring ZT0 in the MachineSMEABIPass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch extends the MachineSMEABIPass to support ZT0. This is done with the addition of two new states: - `ACTIVE_ZT0_SAVED` * This is used when calling a function that shares ZA, but does share ZT0 (i.e., no ZT0 attributes). * This state indicates ZT0 must be saved to the save slot, but must remain on, with no lazy save setup - `LOCAL_COMMITTED` * This is used for saving ZT0 in functions without ZA state. * This state indicates ZA is off and ZT0 has been saved. * This state is general enough to support ZA, but those have not been implemented† To aid with readability, the state transitions have been reworked to a switch of `transitionFrom().to()`, rather than nested ifs, which helps manage more transitions. † This could be implemented to handle some cases of undefined behavior better. 
Change-Id: I14be4a7f8b998fe667bfaade5088f88039515f91 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 1 + .../Target/AArch64/AArch64ISelLowering.cpp| 11 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 + llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 176 +++--- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 4 - .../test/CodeGen/AArch64/sme-za-exceptions.ll | 124 +--- llvm/test/CodeGen/AArch64/sme-zt0-state.ll| 104 ++- 7 files changed, 321 insertions(+), 105 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30f961043e78b..20c1c6790b2fb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9457,6 +9457,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; +else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9576,7 +9578,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + !UseNewSMEABILowering && 
CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9589,7 +9592,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. - bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -10074,7 +10078,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs));
[llvm-branch-commits] [llvm] [AArch64][SME] Handle zeroing ZA and ZT0 in functions with ZT0 state (PR #166361)
https://github.com/MacDue created
https://github.com/llvm/llvm-project/pull/166361
In the MachineSMEABIPass, if we have a function with ZT0 state, then there are
some additional cases where we need to zero ZA and ZT0.
If the function has a private ZA interface, i.e., new ZT0 (and new ZA if
present). Then ZT0/ZA must be zeroed when committing the incoming ZA save.
If the function has a shared ZA interface, e.g. new ZA and shared ZT0. Then ZA
must be zeroed on function entry (without a ZA save commit).
The logic in the ABI pass has been reworked to use an "ENTRY" state to handle
this (rather than the more specific "CALLER_DORMANT" state).
>From ae3ec416aaed38c254f0bbcef1c5b6671d1ce2a6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 3 Nov 2025 15:55:59 +
Subject: [PATCH] [AArch64][SME] Handle zeroing ZA and ZT0 in functions with
ZT0 state
In the MachineSMEABIPass, if we have a function with ZT0 state, then
there are some additional cases where we need to zero ZA and ZT0.
If the function has a private ZA interface, i.e., new ZT0 (and new ZA if
present). Then ZT0/ZA must be zeroed when committing the incoming ZA
save.
If the function has a shared ZA interface, e.g. new ZA and shared ZT0.
Then ZA must be zeroed on function entry (without a ZA save commit).
The logic in the ABI pass has been reworked to use an "ENTRY" state to
handle this (rather than the more specific "CALLER_DORMANT" state).
Change-Id: Ib91e9b13ffa4752320fe6a7a720afe919cf00198
---
.../Target/AArch64/AArch64ISelLowering.cpp| 9 --
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 99 +++
llvm/test/CodeGen/AArch64/sme-zt0-state.ll| 29 +++---
3 files changed, 68 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..30f961043e78b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8735,15 +8735,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
}
- if (getTM().useNewSMEABILowering()) {
-// Clear new ZT0 state. TODO: Move this to the SME ABI pass.
-if (Attrs.isNewZT0())
- Chain = DAG.getNode(
- ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
- DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
- DAG.getTargetConstant(0, DL, MVT::i32));
- }
-
return Chain;
}
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 8f9aae944ad6d..bb4dfe8c60904 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -82,8 +82,8 @@ enum ZAState {
// A ZA save has been set up or committed (i.e. ZA is dormant or off)
LOCAL_SAVED,
- // ZA is off or a lazy save has been set up by the caller
- CALLER_DORMANT,
+ // The ZA/ZT0 state on entry to the function.
+ ENTRY,
// ZA is off
OFF,
@@ -200,7 +200,7 @@ StringRef getZAStateString(ZAState State) {
MAKE_CASE(ZAState::ANY)
MAKE_CASE(ZAState::ACTIVE)
MAKE_CASE(ZAState::LOCAL_SAVED)
-MAKE_CASE(ZAState::CALLER_DORMANT)
+MAKE_CASE(ZAState::ENTRY)
MAKE_CASE(ZAState::OFF)
default:
llvm_unreachable("Unexpected ZAState");
@@ -281,8 +281,8 @@ struct MachineSMEABI : public MachineFunctionPass {
void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
// Emission routines for private and shared ZA functions (using lazy saves).
- void emitNewZAPrologue(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI);
+ void emitSMEPrologue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
void emitRestoreLazySave(EmitContext &, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
@@ -395,9 +395,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs
SMEFnAttrs) {
if (MBB.isEntryBlock()) {
// Entry block:
- Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
- ? ZAState::CALLER_DORMANT
- : ZAState::ACTIVE;
+ Block.FixedEntryState = ZAState::ENTRY;
} else if (MBB.isEHPad()) {
// EH entry block:
Block.FixedEntryState = ZAState::LOCAL_SAVED;
@@ -815,32 +813,49 @@ void MachineSMEABI::emitAllocateLazySaveBuffer(
}
}
-void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) {
+static constexpr unsigned ZERO_ALL_ZA_MASK = 0b11111111;
+
+void MachineSMEABI::emitSMEPrologue(MachineBasicBlock &MBB,
+MachineBasicBlock::iterator MBBI) {
auto *TLI = Subtarget->getTargetLowering();
DebugLoc DL = getDebugLoc(MBB, MBBI);
- // Get current TPIDR2_EL0.
- Register TPIDR2EL0 =
[llvm-branch-commits] [X86][NewPM] Port lower-amx-intrinsics to NewPM (PR #165113)
boomanaiden154 wrote: @arsenm Is this good to go or do you want more thought put into when this is supposed to run (which looks like it might require a lot of work on AMX lowering that I would prefer not to do). https://github.com/llvm/llvm-project/pull/165113 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan closed https://github.com/llvm/llvm-project/pull/114104 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [clang] "modular_format" attribute for functions using format strings (PR #147431)
https://github.com/mysterymath updated
https://github.com/llvm/llvm-project/pull/147431
>From 3ac748150a5c7caf8fed4d7c488770722d505068 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh
Date: Tue, 10 Jun 2025 14:06:53 -0700
Subject: [PATCH 01/12] [clang] "modular_format" attribute for functions using
format strings
This provides a C language version of the new IR modular-format
attribute. This, in concert with the format attribute, allows a library
function to declare that a modular version of its implementation is
available.
See issue #146159 for context.
---
clang/include/clang/Basic/Attr.td | 11 +++
clang/include/clang/Basic/AttrDocs.td | 25 +
clang/lib/CodeGen/CGCall.cpp | 12
clang/lib/Sema/SemaDeclAttr.cpp | 27 +++
4 files changed, 75 insertions(+)
diff --git a/clang/include/clang/Basic/Attr.td
b/clang/include/clang/Basic/Attr.td
index 749f531ec9ab1..8605032df2eee 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -5309,3 +5309,14 @@ def NonString : InheritableAttr {
let Subjects = SubjectList<[Var, Field]>;
let Documentation = [NonStringDocs];
}
+
+def ModularFormat : InheritableAttr {
+ let Spellings = [Clang<"modular_format">];
+ let Args = [
+IdentifierArgument<"ModularImplFn">,
+StringArgument<"ImplName">,
+VariadicStringArgument<"Aspects">
+ ];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [ModularFormatDocs];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td
b/clang/include/clang/Basic/AttrDocs.td
index 2fdd041c1b46e..7baee073b5cfd 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -9674,3 +9674,28 @@ silence diagnostics with code like:
__attribute__((nonstring)) char NotAStr[3] = "foo"; // Not diagnosed
}];
}
+
+def ModularFormatDocs : Documentation {
+ let Category = DocCatFunction;
+ let Content = [{
+The ``modular_format`` attribute can be applied to a function that bears the
+``format`` attribute to indicate that the implementation is modular on the
+format string argument. When the format argument for a given call is constant,
+the compiler may redirect the call to the symbol given as the first argument to
+the attribute (the modular implementation function).
+
+The second argument is an implementation name, and the remaining arguments are
+aspects of the format string for the compiler to report. If the compiler does
+not understand an aspect, it must summarily report that the format string has
+that aspect.
+
+The compiler reports an aspect by issuing a relocation for the symbol
+``<impl_name>_<aspect>``. This arranges for code and data needed to support the
+aspect of the implementation to be brought into the link to satisfy weak
+references in the modular implementation function.
+
+The following aspects are currently supported:
+
+- ``float``: The call has a floating point argument
+ }];
+}
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 465f3f4e670c2..cfff662757c78 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2557,6 +2557,18 @@ void CodeGenModule::ConstructAttributeList(StringRef
Name,
if (TargetDecl->hasAttr())
FuncAttrs.addAttribute("aarch64_pstate_sm_body");
+
+if (auto *ModularFormat = TargetDecl->getAttr<ModularFormatAttr>()) {
+ // TODO: Error checking
+ FormatAttr *Format = TargetDecl->getAttr<FormatAttr>();
+ std::string FormatIdx = std::to_string(Format->getFormatIdx());
+ std::string FirstArg = std::to_string(Format->getFirstArg());
+ SmallVector<StringRef> Args = {
+ FormatIdx, FirstArg, ModularFormat->getModularImplFn()->getName(),
+ ModularFormat->getImplName()};
+ llvm::append_range(Args, ModularFormat->aspects());
+ FuncAttrs.addAttribute("modular-format", llvm::join(Args, ","));
+}
}
// Attach "no-builtins" attributes to:
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index a9e7b44ac9d73..484e4ad921835 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6911,6 +6911,29 @@ static void handleVTablePointerAuthentication(Sema &S,
Decl *D,
CustomDiscriminationValue));
}
+static void handleModularFormat(Sema &S, Decl *D, const ParsedAttr &AL) {
+ StringRef ImplName;
+ if (!S.checkStringLiteralArgumentAttr(AL, 1, ImplName))
+return;
+ SmallVector<StringRef> Aspects;
+ for (unsigned I = 2, E = AL.getNumArgs(); I != E; ++I) {
+StringRef Aspect;
+if (!S.checkStringLiteralArgumentAttr(AL, I, Aspect))
+ return;
+Aspects.push_back(Aspect);
+ }
+
+ // Store aspects sorted and without duplicates.
+ llvm::sort(Aspects);
+ Aspects.erase(llvm::unique(Aspects), Aspects.end());
+
+ // TODO: Type checking on identifier
+ // TODO: Merge attributes
+ D->addAttr(::new (S.Context) ModularFormatAttr(
+ S.Context, AL, AL.getArgAsIdent(0)->getIdentifierInfo(), ImplName,
+
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From 66c5d03f0905ba41fa43b66c3d5cbd3e7b003c3b Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [llvm][mustache] Avoid redundant saves in accessor splitting (PR #159197)
https://github.com/ilovepi updated
https://github.com/llvm/llvm-project/pull/159197
>From 0cf7370c83c7fcfab8559ebb36092f48020b8c8f Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 16 Sep 2025 00:11:47 -0700
Subject: [PATCH] [llvm][mustache] Avoid redundant saves in accessor splitting
The splitMustacheString function was saving StringRefs that
were already backed by an arena-allocated string. This was
unnecessary work. This change removes the redundant
Ctx.Saver.save() call.
This optimization provides a small but measurable performance
improvement on top of the single-pass tokenizer, most notably
reducing branch misses.
Metric | Baseline | Optimized | Change
-- | | - | ---
Time (ms) | 35.77| 35.57 | -0.56%
Cycles | 35.16M | 34.91M| -0.71%
Instructions | 85.77M | 85.54M| -0.27%
Branch Misses | 113.9K | 111.9K| -1.76%
Cache Misses | 237.7K | 242.1K| +1.85%
---
llvm/lib/Support/Mustache.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 24e3105c5e8a9..012e1ffd534d2 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -51,7 +51,7 @@ static Accessor splitMustacheString(StringRef Str,
MustacheContext &Ctx) {
std::tie(Part, Str) = Str.split('.');
// Each part of the accessor needs to be saved to the arena
// to ensure it has a stable address.
- Tokens.push_back(Ctx.Saver.save(Part.trim()));
+ Tokens.push_back(Part.trim());
}
}
// Now, allocate memory for the array of StringRefs in the arena.
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][mustache] Avoid redundant saves in accessor splitting (PR #159197)
https://github.com/ilovepi updated
https://github.com/llvm/llvm-project/pull/159197
>From 0cf7370c83c7fcfab8559ebb36092f48020b8c8f Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 16 Sep 2025 00:11:47 -0700
Subject: [PATCH] [llvm][mustache] Avoid redundant saves in accessor splitting
The splitMustacheString function was saving StringRefs that
were already backed by an arena-allocated string. This was
unnecessary work. This change removes the redundant
Ctx.Saver.save() call.
This optimization provides a small but measurable performance
improvement on top of the single-pass tokenizer, most notably
reducing branch misses.
Metric | Baseline | Optimized | Change
-- | | - | ---
Time (ms) | 35.77| 35.57 | -0.56%
Cycles | 35.16M | 34.91M| -0.71%
Instructions | 85.77M | 85.54M| -0.27%
Branch Misses | 113.9K | 111.9K| -1.76%
Cache Misses | 237.7K | 242.1K| +1.85%
---
llvm/lib/Support/Mustache.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 24e3105c5e8a9..012e1ffd534d2 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -51,7 +51,7 @@ static Accessor splitMustacheString(StringRef Str,
MustacheContext &Ctx) {
std::tie(Part, Str) = Str.split('.');
// Each part of the accessor needs to be saved to the arena
// to ensure it has a stable address.
- Tokens.push_back(Ctx.Saver.save(Part.trim()));
+ Tokens.push_back(Part.trim());
}
}
// Now, allocate memory for the array of StringRefs in the arena.
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][mustache] Optimize accessor splitting with a single pass (PR #159198)
https://github.com/ilovepi updated
https://github.com/llvm/llvm-project/pull/159198
>From d96bab0fb7af58cef733922dbe42a4bcdf5b4953 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 16 Sep 2025 00:24:43 -0700
Subject: [PATCH] [llvm][mustache] Optimize accessor splitting with a single
pass
The splitMustacheString function previously used a loop of
StringRef::split and StringRef::trim. This was inefficient as
it scanned each segment of the accessor string multiple times.
This change introduces a custom splitAndTrim function that
performs both operations in a single pass over the string,
reducing redundant work and improving performance, most notably
in the number of CPU cycles executed.
Metric | Baseline | Optimized | Change
-- | | - | ---
Time (ms) | 35.57| 35.36 | -0.59%
Cycles | 34.91M | 34.26M| -1.86%
Instructions | 85.54M | 85.24M| -0.35%
Branch Misses | 111.9K | 112.2K| +0.27%
Cache Misses | 242.1K | 239.9K| -0.91%
---
llvm/lib/Support/Mustache.cpp | 33 ++---
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 012e1ffd534d2..9eb1ec2b8425c 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) {
return isFalsey(*V);
}
+static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) {
+ size_t CurrentPos = 0;
+ while (CurrentPos < Str.size()) {
+// Find the next delimiter.
+size_t DelimiterPos = Str.find('.', CurrentPos);
+
+// If no delimiter is found, process the rest of the string.
+if (DelimiterPos == StringRef::npos)
+ DelimiterPos = Str.size();
+
+// Get the current part, which may have whitespace.
+StringRef Part = Str.slice(CurrentPos, DelimiterPos);
+
+// Manually trim the part without creating a new string object.
+size_t Start = Part.find_first_not_of(" \t\r\n");
+if (Start != StringRef::npos) {
+ size_t End = Part.find_last_not_of(" \t\r\n");
+ Tokens.push_back(Part.slice(Start, End + 1));
+}
+
+// Move past the delimiter for the next iteration.
+CurrentPos = DelimiterPos + 1;
+ }
+}
+
static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// We split the mustache string into an accessor.
// For example:
@@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str,
MustacheContext &Ctx) {
// It's a literal, so it doesn't need to be saved.
Tokens.push_back(".");
} else {
-while (!Str.empty()) {
- StringRef Part;
- std::tie(Part, Str) = Str.split('.');
- // Each part of the accessor needs to be saved to the arena
- // to ensure it has a stable address.
- Tokens.push_back(Part.trim());
-}
+splitAndTrim(Str, Tokens);
}
// Now, allocate memory for the array of StringRefs in the arena.
StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size());
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][mustache] Optimize accessor splitting with a single pass (PR #159198)
https://github.com/ilovepi updated
https://github.com/llvm/llvm-project/pull/159198
>From d96bab0fb7af58cef733922dbe42a4bcdf5b4953 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 16 Sep 2025 00:24:43 -0700
Subject: [PATCH] [llvm][mustache] Optimize accessor splitting with a single
pass
The splitMustacheString function previously used a loop of
StringRef::split and StringRef::trim. This was inefficient as
it scanned each segment of the accessor string multiple times.
This change introduces a custom splitAndTrim function that
performs both operations in a single pass over the string,
reducing redundant work and improving performance, most notably
in the number of CPU cycles executed.
Metric | Baseline | Optimized | Change
-- | | - | ---
Time (ms) | 35.57| 35.36 | -0.59%
Cycles | 34.91M | 34.26M| -1.86%
Instructions | 85.54M | 85.24M| -0.35%
Branch Misses | 111.9K | 112.2K| +0.27%
Cache Misses | 242.1K | 239.9K| -0.91%
---
llvm/lib/Support/Mustache.cpp | 33 ++---
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 012e1ffd534d2..9eb1ec2b8425c 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) {
return isFalsey(*V);
}
+static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) {
+ size_t CurrentPos = 0;
+ while (CurrentPos < Str.size()) {
+// Find the next delimiter.
+size_t DelimiterPos = Str.find('.', CurrentPos);
+
+// If no delimiter is found, process the rest of the string.
+if (DelimiterPos == StringRef::npos)
+ DelimiterPos = Str.size();
+
+// Get the current part, which may have whitespace.
+StringRef Part = Str.slice(CurrentPos, DelimiterPos);
+
+// Manually trim the part without creating a new string object.
+size_t Start = Part.find_first_not_of(" \t\r\n");
+if (Start != StringRef::npos) {
+ size_t End = Part.find_last_not_of(" \t\r\n");
+ Tokens.push_back(Part.slice(Start, End + 1));
+}
+
+// Move past the delimiter for the next iteration.
+CurrentPos = DelimiterPos + 1;
+ }
+}
+
static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// We split the mustache string into an accessor.
// For example:
@@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str,
MustacheContext &Ctx) {
// It's a literal, so it doesn't need to be saved.
Tokens.push_back(".");
} else {
-while (!Str.empty()) {
- StringRef Part;
- std::tie(Part, Str) = Str.split('.');
- // Each part of the accessor needs to be saved to the arena
- // to ensure it has a stable address.
- Tokens.push_back(Part.trim());
-}
+splitAndTrim(Str, Tokens);
}
// Now, allocate memory for the array of StringRefs in the arena.
StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size());
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][mustache] Avoid extra allocations in parseSection (PR #159199)
https://github.com/ilovepi updated https://github.com/llvm/llvm-project/pull/159199 >From 5a028c4e11ef7fe1543e31f58fc76a74d5362e0c Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Tue, 16 Sep 2025 09:40:04 -0700 Subject: [PATCH] [llvm][mustache] Avoid extra allocations in parseSection We don't need to have extra allocations when concatenating raw bodies. --- llvm/lib/Support/Mustache.cpp | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 9eb1ec2b8425c..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -599,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) +RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [llvm][mustache] Avoid extra allocations in parseSection (PR #159199)
https://github.com/ilovepi updated https://github.com/llvm/llvm-project/pull/159199 >From 5a028c4e11ef7fe1543e31f58fc76a74d5362e0c Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Tue, 16 Sep 2025 09:40:04 -0700 Subject: [PATCH] [llvm][mustache] Avoid extra allocations in parseSection We don't need to have extra allocations when concatenating raw bodies. --- llvm/lib/Support/Mustache.cpp | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 9eb1ec2b8425c..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -599,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) +RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
@@ -0,0 +1 @@ +content ilovepi wrote: lol. wow. I'm apparently *very* observant :facepalm: https://github.com/llvm/llvm-project/pull/114104 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated
https://github.com/llvm/llvm-project/pull/114104
>From 63c4928ed65fb2a83a4a25f3c098af7d931fc0af Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 3 Nov 2025 12:09:19 -0800
Subject: [PATCH 1/2] clang-format
Created using spr 1.3.7
---
llvm/tools/llvm-cas/llvm-cas.cpp | 11 +--
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/tools/llvm-cas/llvm-cas.cpp b/llvm/tools/llvm-cas/llvm-cas.cpp
index b1e4f606211b5..e59313eb808e8 100644
--- a/llvm/tools/llvm-cas/llvm-cas.cpp
+++ b/llvm/tools/llvm-cas/llvm-cas.cpp
@@ -175,13 +175,12 @@ int main(int Argc, char **Argv) {
return validateObject(*CAS, ID);
}
-static Expected<std::unique_ptr<MemoryBuffer>>
-openBuffer(StringRef DataPath) {
+static Expected<std::unique_ptr<MemoryBuffer>> openBuffer(StringRef DataPath) {
if (DataPath.empty())
return createStringError(inconvertibleErrorCode(), "--data missing");
- return errorOrToExpected(
- DataPath == "-" ? llvm::MemoryBuffer::getSTDIN()
- : llvm::MemoryBuffer::getFile(DataPath));
+ return errorOrToExpected(DataPath == "-"
+ ? llvm::MemoryBuffer::getSTDIN()
+ : llvm::MemoryBuffer::getFile(DataPath));
}
int dump(ObjectStore &CAS) {
@@ -311,7 +310,7 @@ int validateIfNeeded(StringRef Path, bool CheckHash, bool
Force,
Exec = ExecStorage;
}
ValidationResult Result =
ExitOnErr(validateOnDiskUnifiedCASDatabasesIfNeeded(
-Path, CheckHash, AllowRecovery, Force, Exec));
+ Path, CheckHash, AllowRecovery, Force, Exec));
switch (Result) {
case ValidationResult::Valid:
outs() << "validated successfully\n";
>From 76fbb642c630302353ae67a50df93db71e7f33cc Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 3 Nov 2025 12:13:40 -0800
Subject: [PATCH 2/2] darker check fix
Created using spr 1.3.7
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index b78dc10ff6ad5..bca196e80640b 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -789,7 +789,7 @@ def host_unwind_supports_jit():
config.available_features.add("expensive_checks")
if config.have_ondisk_cas:
-config.available_features.add('ondisk_cas')
+config.available_features.add("ondisk_cas")
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
config.available_features.add("use_msan_with_origins")
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [CAS] Add llvm-cas tools to inspect on-disk LLVMCAS (PR #114104)
https://github.com/cachemeifyoucan updated
https://github.com/llvm/llvm-project/pull/114104
Unicorn! · GitHub
body {
background-color: #f1f1f1;
margin: 0;
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
}
.container { margin: 50px auto 40px auto; width: 600px; text-align:
center; }
a { color: #4183c4; text-decoration: none; }
a:hover { text-decoration: underline; }
h1 { letter-spacing: -1px; line-height: 60px; font-size: 60px;
font-weight: 100; margin: 0px; text-shadow: 0 1px 0 #fff; }
p { color: rgba(0, 0, 0, 0.5); margin: 10px 0 10px; font-size: 18px;
font-weight: 200; line-height: 1.6em;}
ul { list-style: none; margin: 25px 0; padding: 0; }
li { display: table-cell; font-weight: bold; width: 1%; }
.logo { display: inline-block; margin-top: 35px; }
.logo-img-2x { display: none; }
@media
only screen and (-webkit-min-device-pixel-ratio: 2),
only screen and ( min--moz-device-pixel-ratio: 2),
only screen and ( -o-min-device-pixel-ratio: 2/1),
only screen and (min-device-pixel-ratio: 2),
only screen and (min-resolution: 192dpi),
only screen and (min-resolution: 2dppx) {
.logo-img-1x { display: none; }
.logo-img-2x { display: inline-block; }
}
#suggestions {
margin-top: 35px;
color: #ccc;
}
#suggestions a {
color: #666;
font-weight: 200;
font-size: 14px;
margin: 0 10px;
}
No server is currently available to service your
request.
Sorry about that. Please try refreshing and contact us if the problem
persists.
<a href="https://github.com/contact">Contact Support</a> —
<a href="https://www.githubstatus.com">GitHub Status</a> —
<a href="https://twitter.com/githubstatus">@githubstatus</a>
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Merge all sincos_stret emission code into legalizer (PR #166295)
https://github.com/ilovepi approved this pull request. https://github.com/llvm/llvm-project/pull/166295 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From 0037b3ee964df164b8215b0b6107a212b00b33ae Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [SLU][profcheck] create likely branch weights for guard->branch (PR #164271)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164271
>From 0a768f01ce410e4d1ee377442246a5f7f0442494 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 20 Oct 2025 08:21:26 -0700
Subject: [PATCH] [SLU][profcheck] create likely branch weights for
guard->branch
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 ++---
.../Transforms/SimpleLoopUnswitch/guards.ll | 26 ---
2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e85e1fd..86b2090081ed0 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst
*GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
-GI->getMetadata(LLVMContext::MD_prof), &DTU,
&LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 706b49df14749..42b32e769d8d7 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,15 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals all --version 5
; RUN: opt -passes='loop(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='simple-loop-unswitch'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s |
FileCheck %s
declare void @llvm.experimental.guard(i1, ...)
-define void @test_simple_case(i1 %cond, i32 %N) {
+define void @test_simple_case(i1 %cond, i32 %N) !prof !0 {
; CHECK-LABEL: define void @test_simple_case(
-; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT:br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]]
; CHECK: [[ENTRY_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US:.*]]
; CHECK: [[LOOP_US]]:
@@ -50,9 +50,9 @@ define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
; CHECK-LABEL: define void @test_two_guards(
; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT:br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US]]:
-; CHECK-NEXT:br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label
%[[ENTRY_SPLIT_US_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label
%[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US_US:.*]]
; CHECK: [[LOOP_US_US]]:
@@ -108,7 +108,7 @@ define void @test_conditional_guards(i1 %cond, i32 %N) {
; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT:[[COND_FR:%.*]] = freeze i1 [[COND]]
-; CHECK-NEXT:br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US:.*]]
; CHECK: [[LOOP_US]]:
@@ -171,7 +171,7 @@ define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) {
; CHECK-LABEL: define void @test_nested_loo
[llvm-branch-commits] [llvm] [LIR][profcheck] Reuse the loop's exit condition profile (PR #164523)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164523
>From 0037b3ee964df164b8215b0b6107a212b00b33ae Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 17:24:49 -0700
Subject: [PATCH] [LIR][profcheck] Reuse the loop's exit condition profile
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 40 +--
.../LoopIdiom/X86/preserve-profile.ll | 70 +++
2 files changed, 106 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 019536ca91ae0..9070d252ae09f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -72,6 +72,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -105,6 +106,7 @@ STATISTIC(
STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero'
idiom");
+namespace llvm {
bool DisableLIRP::All;
static cl::opt<bool>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -163,6 +165,10 @@ static cl::opt<bool> ForceMemsetPatternIntrinsic(
cl::desc("Use memset.pattern intrinsic whenever possible"),
cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+
+} // namespace llvm
+
namespace {
class LoopIdiomRecognize {
@@ -3199,7 +3205,21 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// The loop trip count check.
auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
CurLoop->getName() + ".ivcheck");
- Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights,
+ /*IsExpected=*/false);
+ }
+
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
@@ -3368,10 +3388,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop,
ScalarEvolution *SE,
/// %start = <...>
/// %extraoffset = <...>
/// <...>
-/// br label %for.cond
+/// br label %loop
///
/// loop:
-/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
+/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
/// %nbits = add nsw i8 %iv, %extraoffset
/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
@@ -3533,7 +3553,19 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// The loop terminator.
Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
- Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ SmallVector<uint32_t> BranchWeights;
+ const bool HasBranchWeights =
+ !ProfcheckDisableMetadataFixes &&
+ extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
+
+ auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
+ if (HasBranchWeights) {
+if (InvertedCond)
+ std::swap(BranchWeights[0], BranchWeights[1]);
+// We're not changing the loop profile, so we can reuse the original loop's
+// profile.
+setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
+ }
LoopHeaderBB->getTerminator()->eraseFromParent();
// Populate the IV PHI.
diff --git a/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
new file mode 100644
index 0..d01bb748d9422
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/X86/preserve-profile.ll
@@ -0,0 +1,70 @@
+; RUN: opt
-passes="module(print),function(loop(loop-idiom)),module(print)"
-mtriple=x86_64 -mcpu=core-avx2 %s -disable-output 2>&1 | FileCheck
--check-prefix=PROFILE %s
+
+declare void @escape_inner(i8, i8, i8, i1, i8)
+declare void @escape_outer(i8, i8, i8, i1, i8)
+
+declare i8 @gen.i8()
+
+; Most basic pattern; Note that iff the shift amount is offset, said offsetting
+; must not cause an overflow, but `add nsw` is fine.
+define i8 @p0(i8 %val, i8 %start, i8 %extraoffset) mustprogress {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
+ %nbits = add nsw i8 %iv, %extraoffset
+ %val.shifted = ashr i8 %val, %nbits
+ %val.shifted.iszero = icmp eq i8 %val.shifted, 0
+ %iv.next = add i8 %iv, 1
+
+ call void @escap
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 84a4031a304acf8092dfadc815ec8863f0790ec7 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From b1dfb82c180e4cde2fb5da7328a90cdaaab29935 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [SLU][profcheck] create likely branch weights for guard->branch (PR #164271)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164271
>From 0a768f01ce410e4d1ee377442246a5f7f0442494 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 20 Oct 2025 08:21:26 -0700
Subject: [PATCH] [SLU][profcheck] create likely branch weights for
guard->branch
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 ++---
.../Transforms/SimpleLoopUnswitch/guards.ll | 26 ---
2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 239526e85e1fd..86b2090081ed0 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
@@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst
*GI, Loop &L,
MSSAU->getMemorySSA()->verifyMemorySSA();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true,
-GI->getMetadata(LLVMContext::MD_prof), &DTU,
&LI);
+ // llvm.experimental.guard doesn't have branch weights. We can assume,
+ // however, that the deopt path is unlikely.
+ Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen(
+ GI->getArgOperand(0), GI, true,
+ !ProfcheckDisableMetadataFixes && EstimateProfile
+ ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
+ : nullptr,
+ &DTU, &LI);
BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 706b49df14749..42b32e769d8d7 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,15 +1,15 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals all --version 5
; RUN: opt -passes='loop(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='simple-loop-unswitch'
-simple-loop-unswitch-guards -S < %s | FileCheck %s
; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify'
-simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s |
FileCheck %s
declare void @llvm.experimental.guard(i1, ...)
-define void @test_simple_case(i1 %cond, i32 %N) {
+define void @test_simple_case(i1 %cond, i32 %N) !prof !0 {
; CHECK-LABEL: define void @test_simple_case(
-; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT:br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]]
; CHECK: [[ENTRY_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US:.*]]
; CHECK: [[LOOP_US]]:
@@ -50,9 +50,9 @@ define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
; CHECK-LABEL: define void @test_two_guards(
; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT:br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US]]:
-; CHECK-NEXT:br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label
%[[ENTRY_SPLIT_US_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label
%[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US_US:.*]]
; CHECK: [[LOOP_US_US]]:
@@ -108,7 +108,7 @@ define void @test_conditional_guards(i1 %cond, i32 %N) {
; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT:[[COND_FR:%.*]] = freeze i1 [[COND]]
-; CHECK-NEXT:br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]]
+; CHECK-NEXT:br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label
%[[ENTRY_SPLIT:.*]], !prof [[PROF1]]
; CHECK: [[ENTRY_SPLIT_US]]:
; CHECK-NEXT:br label %[[LOOP_US:.*]]
; CHECK: [[LOOP_US]]:
@@ -171,7 +171,7 @@ define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) {
; CHECK-LABEL: define void @test_nested_loo
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From b54f064b74cf7fb7bead465878439446e27c0fd8 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [LVer][profcheck] explicitly set unknown branch weights for the versioned/unversioned selector (PR #164507)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164507
>From b1dfb82c180e4cde2fb5da7328a90cdaaab29935 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 15:20:14 -0700
Subject: [PATCH] [LVer][profcheck] explicitly set unknown branch weights for
the versioned/unversioned selector
---
llvm/lib/Transforms/Utils/LoopVersioning.cpp | 10 --
.../Transforms/LoopDistribute/basic-with-memchecks.ll | 5 +++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..4786819d18fa4 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,13 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
Builder.SetInsertPoint(OrigTerm);
- Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader());
+ auto *BI =
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
+ // We don't know what the probability of executing the versioned vs the
+ // unversioned variants is.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *BI, *BI->getParent()->getParent(), DEBUG_TYPE);
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
@E = common global ptr null, align 8
; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
entry:
%a = load ptr, ptr @A, align 8
%b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
; CHECK: = icmp
; CHECK-NOT: = icmp
-; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1
+; CHECK: br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label
%for.body.ph.ldist1, !prof ![[PROF1:[0-9]]]
; The non-distributed loop that the memchecks fall back on.
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [LSCFG][profcheck] Add dummy branch weights for the dummy switch to dead exits (PR #164714)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164714
>From b54f064b74cf7fb7bead465878439446e27c0fd8 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 22 Oct 2025 14:34:31 -0700
Subject: [PATCH] [LSCFG][profcheck] Add dummy branch weights for the dummy
switch to dead exits
---
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12 ++
.../LoopSimplifyCFG/constant-fold-branch.ll | 104 +-
2 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b9546c5fa236b..e902b71776973 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
++NumLoopExitsDeleted;
}
+// We don't really need to add branch weights to DummySwitch, because all
+// but one branches are just a temporary artifact - see the comment on top
+// of this function. But, it's easy to estimate the weights, and it helps
+// maintain a property of the overall compiler - that the branch weights
+// don't "just get dropped" accidentally (i.e. profcheck)
+if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+ SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+ // default. 100% probability, the rest are dead.
+ DummyBranchWeights[0] = 1;
+ setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+}
assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
index 1ec212f0bb5ea..46b6209986fed 100644
--- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
+++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals
; REQUIRES: asserts
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa
< %s | FileCheck %s
; RUN: opt -S -enable-loop-simplifycfg-term-folding=true
-passes='require,loop(loop-simplifycfg)' -verify-loop-info
-verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
@@ -59,7 +59,7 @@ define i32 @dead_backedge_test_switch_loop(i32 %end) {
; CHECK: dead_backedge:
; CHECK-NEXT:[[I_2]] = add i32 [[I_1]], 10
; CHECK-NEXT:switch i32 1, label [[EXIT:%.*]] [
-; CHECK-NEXT:i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT: i32 0, label [[HEADER_BACKEDGE]]
; CHECK-NEXT:]
; CHECK: exit:
; CHECK-NEXT:[[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
@@ -233,12 +233,12 @@ exit:
; Check that we preserve static reachibility of a dead exit block while
deleting
; a branch.
-define i32 @dead_exit_test_branch_loop(i32 %end) {
+define i32 @dead_exit_test_branch_loop(i32 %end) !prof
!{!"function_entry_count", i32 10} {
; CHECK-LABEL: @dead_exit_test_branch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
-; CHECK-NEXT:]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT:], !prof [[PROF1:![0-9]+]]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
; CHECK: header:
@@ -262,7 +262,7 @@ preheader:
header:
%i = phi i32 [0, %preheader], [%i.inc, %backedge]
- br i1 true, label %backedge, label %dead
+ br i1 true, label %backedge, label %dead, !prof !{!"branch_weights", i32 10,
i32 1}
dead:
br label %dummy
@@ -286,7 +286,7 @@ define i32 @dead_exit_test_switch_loop(i32 %end) {
; CHECK-LABEL: @dead_exit_test_switch_loop(
; CHECK-NEXT: preheader:
; CHECK-NEXT:switch i32 0, label [[PREHEADER_SPLIT:%.*]] [
-; CHECK-NEXT:i32 1, label [[DEAD:%.*]]
+; CHECK-NEXT: i32 1, label [[DEAD:%.*]]
; CHECK-NEXT:]
; CHECK: preheader.split:
; CHECK-NEXT:br label [[HEADER:%.*]]
@@ -383,9 +383,9 @@ define i32 @dead_loop_test_switch_loop(i32 %end) {
; CHECK: header:
; CHECK-NEXT:[[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [
[[I_INC:%.*]], [[BACKEDGE:%.*]] ]
; CHECK-NEXT:switch i32 1, label [[DEAD:%.*]] [
-; CHECK-NEXT:i32 0, label [[DEAD]]
-; CHECK-NEXT:i32 1, label [[BACKEDGE]]
-; CHECK-NEXT:i32 2, lab
[llvm-branch-commits] [llvm] [SLU][profcheck] Propagate profile for branches on injected conditions. (PR #164476)
https://github.com/mtrofin updated
https://github.com/llvm/llvm-project/pull/164476
>From 84a4031a304acf8092dfadc815ec8863f0790ec7 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Tue, 21 Oct 2025 11:22:01 -0700
Subject: [PATCH] [SLU][profcheck] Propagate profile for branches on injected
conditions.
---
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +-
.../inject-invariant-conditions.ll| 142 +-
2 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 86b2090081ed0..0577ddbd2353c 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3203,10 +3203,15 @@
injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
Builder.SetInsertPoint(TI);
auto *InvariantBr =
Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+ // We don't know anything about the relation between the limits.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
Builder.SetInsertPoint(CheckBlock);
- Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0),
- TI->getSuccessor(1));
+ Builder.CreateCondBr(
+ TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1),
+ !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof)
+ : nullptr);
TI->eraseFromParent();
// Fixup phis.
diff --git
a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
index 536e0c6a0e74a..3c84dea2a0672 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -2,40 +2,40 @@
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop(simple-loop-unswitch),simplifycfg" | FileCheck %s
; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true
-passes="loop-mssa(simple-loop-unswitch),simplifycfg"
-verify-memoryssa | FileCheck %s
-define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) {
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr
noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} {
; CHECK-LABEL: @test_01(
; CHECK-NEXT: entry:
-; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:[[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef
[[META1:![0-9]+]]
; CHECK-NEXT:[[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]]
-; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]]
+; CHECK-NEXT:br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label
[[LOOP:%.*]], !prof [[PROF2:![0-9]+]]
; CHECK: loop.us:
-; CHECK-NEXT:[[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]],
[[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:[[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32
[[IV_US]]
-; CHECK-NEXT:[[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
-; CHECK-NEXT:[[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
-; CHECK-NEXT:br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]]
], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:[[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label
[[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK: guarded.us:
-; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]]
-; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL_US]]
-; CHECK-NEXT:store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
-; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:[[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]]
+; CHECK-NEXT:[[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32
[[EL]]
+; CHECK-NEXT:store i32 [[IV]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:[[IV_NEXT_US]] = add i32 [[IV]], 1
; CHECK-NEXT:[[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
; CHECK-NEXT:br i1 [[LOOP_COND_US]], label [[LOOP_US]], label
[[COMMON_RET]]
; CHECK: loop:
-; CHECK-NEXT:[[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [
0, [[ENTRY]] ]
-; CHECK-NEXT:[[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
-; CHECK-NEXT:[[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
-; CHECK-NEXT:[[BOUND_CHEC
[llvm-branch-commits] [llvm] Backport #164873 and #166067 to `release/21.x` (PR #166409)
https://github.com/nga888 created
https://github.com/llvm/llvm-project/pull/166409
Backport the following commits to `release/21.x`:
0341fb63: [ThinLTO] Avoid creating map entries on lookup (NFCI) (#164873)
564c3de6: [ThinLTO][NFC] Improve performance of `addThinLTO` (#166067)
>From 0b5b2ad793aa4384b58eb9c0fd917499f50a44a4 Mon Sep 17 00:00:00 2001
From: Teresa Johnson
Date: Thu, 23 Oct 2025 19:40:42 -0700
Subject: [PATCH 1/2] [ThinLTO] Avoid creating map entries on lookup (NFCI)
(#164873)
We could inadvertently create new entries in the PrevailingModuleForGUID
map during lookup, which was always using operator[]. In most cases we
will have one for external symbols, but not in cases where the
prevailing copy is in a native object. Or if this happened to be looked
up for a local.
Make the map private and create and use accessors.
(cherry picked from commit 0341fb63f2abe2ce98434c45fef8826718f9198c)
---
llvm/include/llvm/LTO/LTO.h | 13 +
llvm/lib/LTO/LTO.cpp| 12 ++--
2 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index d8e632b5a49d5..d5e7d2ede7e9b 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -464,6 +464,19 @@ class LTO {
ModuleMapType ModuleMap;
// The bitcode modules to compile, if specified by the LTO Config.
std::optional ModulesToCompile;
+
+void setPrevailingModuleForGUID(GlobalValue::GUID GUID, StringRef Module) {
+ PrevailingModuleForGUID[GUID] = Module;
+}
+bool isPrevailingModuleForGUID(GlobalValue::GUID GUID,
+ StringRef Module) const {
+ auto It = PrevailingModuleForGUID.find(GUID);
+ return It != PrevailingModuleForGUID.end() && It->second == Module;
+}
+
+ private:
+// Make this private so all accesses must go through above accessor methods
+// to avoid inadvertently creating new entries on lookups.
DenseMap<GlobalValue::GUID, StringRef> PrevailingModuleForGUID;
} ThinLTO;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 73e79c08a56ca..523ece12d1666 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1045,15 +1045,15 @@ Error LTO::addThinLTO(BitcodeModule BM,
ArrayRef Syms,
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing)
-ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier());
}
}
if (Error Err =
BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
[&](GlobalValue::GUID GUID) {
- return ThinLTO.PrevailingModuleForGUID[GUID] ==
- BM.getModuleIdentifier();
+ return ThinLTO.isPrevailingModuleForGUID(
+ GUID, BM.getModuleIdentifier());
}))
return Err;
LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
@@ -1067,8 +1067,8 @@ Error LTO::addThinLTO(BitcodeModule BM,
ArrayRef Syms,
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing) {
-assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
- BM.getModuleIdentifier());
+assert(
+ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()));
// For linker redefined symbols (via --wrap or --defsym) we want to
// switch the linkage to `weak` to prevent IPOs from happening.
@@ -1974,7 +1974,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache
Cache,
LocalWPDTargetsMap);
auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S)
{
-return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
+return ThinLTO.isPrevailingModuleForGUID(GUID, S->modulePath());
};
if (EnableMemProfContextDisambiguation) {
MemProfContextDisambiguation ContextDisambiguation;
>From 115ae08777c6022d8a148d0806d250c859313244 Mon Sep 17 00:00:00 2001
From: Andrew Ng
Date: Mon, 3 Nov 2025 11:10:44 +
Subject: [PATCH 2/2] [ThinLTO][NFC] Improve performance of `addThinLTO`
(#166067)
Avoid the construction of `GUID` when not required. This improves the
performance of a LLD `--thinlto-index-only` link of `clang` by ~4-5% on
both Windows and Linux.
(cherry picked from commit 564c3de67d20d578d05678b49045378fdcf5ccaa)
---
llvm/lib/LTO/LTO.cpp | 40 ++--
1 file changed, 18 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 523ece12d1666..3c6951d8ec5fe 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1035,63 +1035,59 @@ Error LTO::linkRegularLTO(RegularLTOState::
[llvm-branch-commits] [llvm] Backport #164873 and #166067 to `release/21.x` (PR #166409)
llvmbot wrote:
@llvm/pr-subscribers-lto
Author: Andrew Ng (nga888)
Changes
Backport the following commits to `release/21.x`:
0341fb63: [ThinLTO] Avoid creating map entries on lookup (NFCI) (#164873)
564c3de6: [ThinLTO][NFC] Improve performance of `addThinLTO` (#166067)
---
Full diff: https://github.com/llvm/llvm-project/pull/166409.diff
2 Files Affected:
- (modified) llvm/include/llvm/LTO/LTO.h (+13)
- (modified) llvm/lib/LTO/LTO.cpp (+19-23)
``diff
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index d8e632b5a49d5..d5e7d2ede7e9b 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -464,6 +464,19 @@ class LTO {
ModuleMapType ModuleMap;
// The bitcode modules to compile, if specified by the LTO Config.
std::optional ModulesToCompile;
+
+void setPrevailingModuleForGUID(GlobalValue::GUID GUID, StringRef Module) {
+ PrevailingModuleForGUID[GUID] = Module;
+}
+bool isPrevailingModuleForGUID(GlobalValue::GUID GUID,
+ StringRef Module) const {
+ auto It = PrevailingModuleForGUID.find(GUID);
+ return It != PrevailingModuleForGUID.end() && It->second == Module;
+}
+
+ private:
+// Make this private so all accesses must go through above accessor methods
+// to avoid inadvertently creating new entries on lookups.
DenseMap<GlobalValue::GUID, StringRef> PrevailingModuleForGUID;
} ThinLTO;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 73e79c08a56ca..3c6951d8ec5fe 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1035,63 +1035,59 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule
Mod,
Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
+ const auto BMID = BM.getModuleIdentifier();
const SymbolResolution *ResITmp = ResI;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResITmp != ResE);
SymbolResolution Res = *ResITmp++;
-if (!Sym.getIRName().empty()) {
+if (!Sym.getIRName().empty() && Res.Prevailing) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
- if (Res.Prevailing)
-ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+ ThinLTO.setPrevailingModuleForGUID(GUID, BMID);
}
}
- if (Error Err =
- BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
- [&](GlobalValue::GUID GUID) {
- return ThinLTO.PrevailingModuleForGUID[GUID] ==
- BM.getModuleIdentifier();
- }))
+ if (Error Err = BM.readSummary(
+ ThinLTO.CombinedIndex, BMID, [&](GlobalValue::GUID GUID) {
+return ThinLTO.isPrevailingModuleForGUID(GUID, BMID);
+ }))
return Err;
- LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Module " << BMID << "\n");
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
-if (!Sym.getIRName().empty()) {
+if (!Sym.getIRName().empty() &&
+(Res.Prevailing || Res.FinalDefinitionInLinkageUnit)) {
auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
GlobalValue::getGlobalIdentifier(Sym.getIRName(),
GlobalValue::ExternalLinkage, ""));
if (Res.Prevailing) {
-assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
- BM.getModuleIdentifier());
+assert(ThinLTO.isPrevailingModuleForGUID(GUID, BMID));
// For linker redefined symbols (via --wrap or --defsym) we want to
// switch the linkage to `weak` to prevent IPOs from happening.
// Find the summary in the module for this very GV and record the new
// linkage so that we can switch it when we import the GV.
if (Res.LinkerRedefined)
- if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
- GUID, BM.getModuleIdentifier()))
+ if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID))
S->setLinkage(GlobalValue::WeakAnyLinkage);
}
// If the linker resolved the symbol to a local definition then mark it
// as local in the summary for the module we are adding.
if (Res.FinalDefinitionInLinkageUnit) {
-if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
-GUID, BM.getModuleIdentifier())) {
+if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(GUID, BMID)) {
S->setDSOLocal(true);
}
}
}
}
- if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
+ if (!ThinLTO.ModuleMap.insert({BMID, BM}).second)
[llvm-branch-commits] [llvm] Backport #164873 and #166067 to `release/21.x` (PR #166409)
https://github.com/nga888 milestoned https://github.com/llvm/llvm-project/pull/166409 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] [Backport][MLIR] Properties.td fix from main commit 77f2560 (PR #165768)
yu810226 wrote: Thanks for the review! This is a clean backport of the fix from main. As pointed out, the issue is indirectly tested in the original commit (via the ptr op), so I didn’t backport the tests here. If anyone familiar with the backport testing process could comment on the test coverage question (raised above) or help with merging, that’d be much appreciated. I don’t have write access myself. https://github.com/llvm/llvm-project/pull/165768 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] Backport #164873 and #166067 to `release/21.x` (PR #166409)
https://github.com/teresajohnson approved this pull request. lgtm thanks https://github.com/llvm/llvm-project/pull/166409 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [LifetimeSafety] Optimize fact storage with IDs and vector-based lookup (PR #165963)
https://github.com/usx95 updated
https://github.com/llvm/llvm-project/pull/165963
>From 66119a726e96dca212860e47a06f40b5af6717fe Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena
Date: Sat, 1 Nov 2025 03:16:20 +
Subject: [PATCH] Avoid using DenseMap for CFGBlock and program points
---
.../Analysis/Analyses/LifetimeSafety/Facts.h | 31 ++-
clang/lib/Analysis/LifetimeSafety/Dataflow.h | 14 ++---
clang/lib/Analysis/LifetimeSafety/Facts.cpp | 13 +++-
.../LifetimeSafety/LifetimeSafety.cpp | 1 +
4 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
index 063cb5c2d42ab..b9cad5340c940 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
@@ -16,6 +16,7 @@
#include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
#include "clang/Analysis/AnalysisDeclContext.h"
#include "clang/Analysis/CFG.h"
#include "llvm/ADT/SmallVector.h"
@@ -23,6 +24,9 @@
#include
namespace clang::lifetimes::internal {
+
+using FactID = utils::ID;
+
/// An abstract base class for a single, atomic lifetime-relevant event.
class Fact {
@@ -48,6 +52,7 @@ class Fact {
private:
Kind K;
+ FactID ID;
protected:
Fact(Kind K) : K(K) {}
@@ -56,6 +61,9 @@ class Fact {
virtual ~Fact() = default;
Kind getKind() const { return K; }
+ void setID(FactID ID) { this->ID = ID; }
+ FactID getID() const { return ID; }
+
template const T *getAs() const {
if (T::classof(this))
return static_cast(this);
@@ -183,22 +191,26 @@ class TestPointFact : public Fact {
class FactManager {
public:
+ void init(const CFG &Cfg) {
+assert(BlockToFacts.empty() && "FactManager already initialized");
+BlockToFacts.resize(Cfg.getNumBlockIDs());
+ }
+
llvm::ArrayRef getFacts(const CFGBlock *B) const {
-auto It = BlockToFactsMap.find(B);
-if (It != BlockToFactsMap.end())
- return It->second;
-return {};
+return BlockToFacts[B->getBlockID()];
}
void addBlockFacts(const CFGBlock *B, llvm::ArrayRef NewFacts) {
if (!NewFacts.empty())
- BlockToFactsMap[B].assign(NewFacts.begin(), NewFacts.end());
+ BlockToFacts[B->getBlockID()].assign(NewFacts.begin(), NewFacts.end());
}
template
FactType *createFact(Args &&...args) {
void *Mem = FactAllocator.Allocate();
-return new (Mem) FactType(std::forward(args)...);
+FactType *Res = new (Mem) FactType(std::forward(args)...);
+Res->setID(NextFactID++);
+return Res;
}
void dump(const CFG &Cfg, AnalysisDeclContext &AC) const;
@@ -214,16 +226,19 @@ class FactManager {
/// \note This is intended for testing only.
llvm::StringMap getTestPoints() const;
+ unsigned getNumFacts() const { return NextFactID.Value; }
+
LoanManager &getLoanMgr() { return LoanMgr; }
const LoanManager &getLoanMgr() const { return LoanMgr; }
OriginManager &getOriginMgr() { return OriginMgr; }
const OriginManager &getOriginMgr() const { return OriginMgr; }
private:
+ FactID NextFactID{0};
LoanManager LoanMgr;
OriginManager OriginMgr;
- llvm::DenseMap>
- BlockToFactsMap;
+ /// Facts for each CFG block, indexed by block ID.
+ llvm::SmallVector<llvm::SmallVector<const Fact *>> BlockToFacts;
llvm::BumpPtrAllocator FactAllocator;
};
} // namespace clang::lifetimes::internal
diff --git a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
index 2f7bcb6e5dc81..de821bb17eb6b 100644
--- a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
+++ b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
@@ -67,10 +67,10 @@ class DataflowAnalysis {
llvm::DenseMap InStates;
/// The dataflow state after a basic block is processed.
llvm::DenseMap OutStates;
- /// The dataflow state at a Program Point.
+ /// Dataflow state at each program point, indexed by Fact ID.
/// In a forward analysis, this is the state after the Fact at that point has
/// been applied, while in a backward analysis, it is the state before.
- llvm::DenseMap PerPointStates;
+ llvm::SmallVector PointToState;
static constexpr bool isForward() { return Dir == Direction::Forward; }
@@ -86,6 +86,8 @@ class DataflowAnalysis {
Derived &D = static_cast(*this);
llvm::TimeTraceScope Time(D.getAnalysisName());
+PointToState.resize(FactMgr.getNumFacts());
+
using Worklist =
std::conditional_t;
@@ -116,7 +118,9 @@ class DataflowAnalysis {
}
protected:
- Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); }
+ Lattice getState(ProgramPoint P) const {
+return PointToState[P->getID().Value];
+ }
std::optional getInState(const CFGBlock *B) const {
auto It = InStates.find(B);
@@ -144
[llvm-branch-commits] [clang] [LifetimeSafety] Optimize fact storage with IDs and vector-based lookup (PR #165963)
https://github.com/usx95 updated
https://github.com/llvm/llvm-project/pull/165963
>From 66119a726e96dca212860e47a06f40b5af6717fe Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena
Date: Sat, 1 Nov 2025 03:16:20 +
Subject: [PATCH] Avoid using DenseMap for CFGBlock and program points
---
.../Analysis/Analyses/LifetimeSafety/Facts.h | 31 ++-
clang/lib/Analysis/LifetimeSafety/Dataflow.h | 14 ++---
clang/lib/Analysis/LifetimeSafety/Facts.cpp | 13 +++-
.../LifetimeSafety/LifetimeSafety.cpp | 1 +
4 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
index 063cb5c2d42ab..b9cad5340c940 100644
--- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
+++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h
@@ -16,6 +16,7 @@
#include "clang/Analysis/Analyses/LifetimeSafety/Loans.h"
#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h"
+#include "clang/Analysis/Analyses/LifetimeSafety/Utils.h"
#include "clang/Analysis/AnalysisDeclContext.h"
#include "clang/Analysis/CFG.h"
#include "llvm/ADT/SmallVector.h"
@@ -23,6 +24,9 @@
#include
namespace clang::lifetimes::internal {
+
+using FactID = utils::ID;
+
/// An abstract base class for a single, atomic lifetime-relevant event.
class Fact {
@@ -48,6 +52,7 @@ class Fact {
private:
Kind K;
+ FactID ID;
protected:
Fact(Kind K) : K(K) {}
@@ -56,6 +61,9 @@ class Fact {
virtual ~Fact() = default;
Kind getKind() const { return K; }
+ void setID(FactID ID) { this->ID = ID; }
+ FactID getID() const { return ID; }
+
template const T *getAs() const {
if (T::classof(this))
return static_cast(this);
@@ -183,22 +191,26 @@ class TestPointFact : public Fact {
class FactManager {
public:
+ void init(const CFG &Cfg) {
+assert(BlockToFacts.empty() && "FactManager already initialized");
+BlockToFacts.resize(Cfg.getNumBlockIDs());
+ }
+
llvm::ArrayRef getFacts(const CFGBlock *B) const {
-auto It = BlockToFactsMap.find(B);
-if (It != BlockToFactsMap.end())
- return It->second;
-return {};
+return BlockToFacts[B->getBlockID()];
}
void addBlockFacts(const CFGBlock *B, llvm::ArrayRef NewFacts) {
if (!NewFacts.empty())
- BlockToFactsMap[B].assign(NewFacts.begin(), NewFacts.end());
+ BlockToFacts[B->getBlockID()].assign(NewFacts.begin(), NewFacts.end());
}
template
FactType *createFact(Args &&...args) {
void *Mem = FactAllocator.Allocate();
-return new (Mem) FactType(std::forward(args)...);
+FactType *Res = new (Mem) FactType(std::forward(args)...);
+Res->setID(NextFactID++);
+return Res;
}
void dump(const CFG &Cfg, AnalysisDeclContext &AC) const;
@@ -214,16 +226,19 @@ class FactManager {
/// \note This is intended for testing only.
llvm::StringMap getTestPoints() const;
+ unsigned getNumFacts() const { return NextFactID.Value; }
+
LoanManager &getLoanMgr() { return LoanMgr; }
const LoanManager &getLoanMgr() const { return LoanMgr; }
OriginManager &getOriginMgr() { return OriginMgr; }
const OriginManager &getOriginMgr() const { return OriginMgr; }
private:
+ FactID NextFactID{0};
LoanManager LoanMgr;
OriginManager OriginMgr;
- llvm::DenseMap>
- BlockToFactsMap;
+ /// Facts for each CFG block, indexed by block ID.
+ llvm::SmallVector<llvm::SmallVector<const Fact *>> BlockToFacts;
llvm::BumpPtrAllocator FactAllocator;
};
} // namespace clang::lifetimes::internal
diff --git a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
index 2f7bcb6e5dc81..de821bb17eb6b 100644
--- a/clang/lib/Analysis/LifetimeSafety/Dataflow.h
+++ b/clang/lib/Analysis/LifetimeSafety/Dataflow.h
@@ -67,10 +67,10 @@ class DataflowAnalysis {
llvm::DenseMap InStates;
/// The dataflow state after a basic block is processed.
llvm::DenseMap OutStates;
- /// The dataflow state at a Program Point.
+ /// Dataflow state at each program point, indexed by Fact ID.
/// In a forward analysis, this is the state after the Fact at that point has
/// been applied, while in a backward analysis, it is the state before.
- llvm::DenseMap PerPointStates;
+ llvm::SmallVector PointToState;
static constexpr bool isForward() { return Dir == Direction::Forward; }
@@ -86,6 +86,8 @@ class DataflowAnalysis {
Derived &D = static_cast(*this);
llvm::TimeTraceScope Time(D.getAnalysisName());
+PointToState.resize(FactMgr.getNumFacts());
+
using Worklist =
std::conditional_t;
@@ -116,7 +118,9 @@ class DataflowAnalysis {
}
protected:
- Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); }
+ Lattice getState(ProgramPoint P) const {
+return PointToState[P->getID().Value];
+ }
std::optional getInState(const CFGBlock *B) const {
auto It = InStates.find(B);
@@ -144
[llvm-branch-commits] [llvm] CodeGen: Record MMOs in finalizeBundle (PR #166210)
@@ -200,6 +201,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + +append_range(MMOs, MII->memoperands()); arsenm wrote: It might be worth uniquing these, which cloneMergedMemRefs seems to do for you https://github.com/llvm/llvm-project/pull/166210 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: Record MMOs in finalizeBundle (PR #166210)
@@ -32,7 +32,6 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 {
; GCN-NEXT:buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT:s_waitcnt vmcnt(0)
; GCN-NEXT:s_setpc_b64 s[30:31]
-; GCN: ; ScratchSize: 144
arsenm wrote:
Lost all of these ScratchSize checks
https://github.com/llvm/llvm-project/pull/166210
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [WIP] Handle guard insertion in callbacks to OpenMP runtime functions. (PR #164655)
https://github.com/abidh updated
https://github.com/llvm/llvm-project/pull/164655
>From 56037a64dbd5f73d2c020dd5d58d2c99758b35d0 Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Tue, 21 Oct 2025 20:53:46 +0100
Subject: [PATCH 1/9] Add callback metadata to runtime functions which take
callbacks.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 25
.../Frontend/OpenMPIRBuilderTest.cpp | 58 +++
2 files changed, 83 insertions(+)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c164d32f8f98c..312e119c4280d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -750,6 +750,31 @@ OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M,
RuntimeFunction FnID) {
*MDNode::get(Ctx, {MDB.createCallbackEncoding(
2, {-1, -1}, /* VarArgsArePassed */ true)}));
}
+
+} else if (FnID == OMPRTL___kmpc_distribute_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_static_loop_8u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_distribute_for_static_loop_8u ||
+ FnID == OMPRTL___kmpc_for_static_loop_4 ||
+ FnID == OMPRTL___kmpc_for_static_loop_4u ||
+ FnID == OMPRTL___kmpc_for_static_loop_8 ||
+ FnID == OMPRTL___kmpc_for_static_loop_8u) {
+ if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
+LLVMContext &Ctx = Fn->getContext();
+MDBuilder MDB(Ctx);
+// Annotate the callback behavior of the runtime function:
+// - The callback callee is argument number 1.
+// - The first argument of the callback callee is unknown (-1).
+// - The second argument of the callback callee is argument number 2
+Fn->addMetadata(
+LLVMContext::MD_callback,
+*MDNode::get(Ctx, {MDB.createCallbackEncoding(
+ 1, {-1, 2}, /* VarArgsArePassed */ false)}));
+ }
}
LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d231a778a8a97..aca2153f85c26 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7957,4 +7957,62 @@ TEST_F(OpenMPIRBuilderTest, spliceBBWithEmptyBB) {
EXPECT_FALSE(Terminator->getDbgRecordRange().empty());
}
+TEST_F(OpenMPIRBuilderTest, callBackFunctions) {
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = true;
+ OMPBuilder.initialize();
+
+ // Test multiple runtime functions that should have callback metadata
+ std::vector CallbackFunctions = {
+OMPRTL___kmpc_distribute_static_loop_4,
+OMPRTL___kmpc_distribute_static_loop_4u,
+OMPRTL___kmpc_distribute_static_loop_8,
+OMPRTL___kmpc_distribute_static_loop_8u,
+OMPRTL___kmpc_distribute_for_static_loop_4,
+OMPRTL___kmpc_distribute_for_static_loop_4u,
+OMPRTL___kmpc_distribute_for_static_loop_8,
+OMPRTL___kmpc_distribute_for_static_loop_8u,
+OMPRTL___kmpc_for_static_loop_4,
+OMPRTL___kmpc_for_static_loop_4u,
+OMPRTL___kmpc_for_static_loop_8,
+OMPRTL___kmpc_for_static_loop_8u
+ };
+
+ for (RuntimeFunction RF : CallbackFunctions) {
+Function *Fn = OMPBuilder.getOrCreateRuntimeFunctionPtr(RF);
+ASSERT_NE(Fn, nullptr) << "Function should exist for runtime function";
+
+MDNode *CallbackMD = Fn->getMetadata(LLVMContext::MD_callback);
+EXPECT_NE(CallbackMD, nullptr) << "Function should have callback metadata";
+
+if (CallbackMD) {
+ // Should have at least one callback
+ EXPECT_GE(CallbackMD->getNumOperands(), 1U);
+
+ // Test first callback entry
+ MDNode *FirstCallback = cast(CallbackMD->getOperand(0));
+ EXPECT_EQ(FirstCallback->getNumOperands(), 4U);
+
+ // Callee index should be valid
+ auto *CalleeIdxCM =
cast(FirstCallback->getOperand(0));
+ uint64_t CalleeIdx =
cast(CalleeIdxCM->getValue())->getZExtValue();
+ EXPECT_EQ(CalleeIdx, 1u);
+
+ // Verify payload arguments are (-1, 2)
+ auto *Arg0CM = cast(FirstCallback->getOperand(1));
+ int64_t Arg0 = cast(Arg0CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg0, -1);
+ auto *Arg1CM = cast(FirstCallback->getOperand(2));
+ int64_t Arg1 = cast(Arg1CM->getValue())->getSExtValue();
+ EXPECT_EQ(Arg1, 2);
+
+ // Verify the varArgs is false.
+ auto *VarArgCM = cast(FirstCallback->getOperand(3));
+ uint64_t VarAr
[llvm-branch-commits] [X86][NewPM] Port lower-amx-intrinsics to NewPM (PR #165113)
@@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass();
/// The pass transforms amx intrinsics to scalar operation if the function has
/// optnone attribute or it is O0.
-FunctionPass *createX86LowerAMXIntrinsicsPass();
+class X86LowerAMXIntrinsicsPass
+: public PassInfoMixin {
+private:
+ const TargetMachine *TM;
+
+public:
+ X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
arsenm wrote:
Sounds like a solution to a problem that shouldn't be solved?
https://github.com/llvm/llvm-project/pull/165113
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [X86][NewPM] Port lower-amx-intrinsics to NewPM (PR #165113)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/165113 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86][NewPM] Port X86PartialReduction to NewPM (PR #166048)
https://github.com/arsenm approved this pull request. https://github.com/llvm/llvm-project/pull/166048 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [mlir] 3414bbf - Revert "[mlir][memref]: Collapse strided unit dim even if strides are dynamic…"
Author: Han-Chung Wang
Date: 2025-11-04T13:28:00-08:00
New Revision: 3414bbf12a661bc518953d91e859a5b67b6dc432
URL:
https://github.com/llvm/llvm-project/commit/3414bbf12a661bc518953d91e859a5b67b6dc432
DIFF:
https://github.com/llvm/llvm-project/commit/3414bbf12a661bc518953d91e859a5b67b6dc432.diff
LOG: Revert "[mlir][memref]: Collapse strided unit dim even if strides are
dynamic…"
This reverts commit f74e90961f51c9437461007c89b037be41e4e887.
Added:
Modified:
mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
mlir/test/Dialect/MemRef/ops.mlir
Removed:
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index e271ac58db327..1c21a2f270da6 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2568,11 +2568,6 @@ computeCollapsedLayoutMap(MemRefType srcType,
auto trailingReassocs = ArrayRef(reassoc).drop_front();
auto stride = SaturatedInteger::wrap(resultStrides[resultStrideIndex--]);
for (int64_t idx : llvm::reverse(trailingReassocs)) {
- // Dimensions of size 1 should be skipped, because their strides are
- // meaningless and could have any arbitrary value.
- if (srcShape[idx - 1] == 1)
-continue;
-
stride = stride * SaturatedInteger::wrap(srcShape[idx]);
// Both source and result stride must have the same static value. In that
@@ -2587,6 +2582,11 @@ computeCollapsedLayoutMap(MemRefType srcType,
if (strict && (stride.saturated || srcStride.saturated))
return failure();
+ // Dimensions of size 1 should be skipped, because their strides are
+ // meaningless and could have any arbitrary value.
+ if (srcShape[idx - 1] == 1)
+continue;
+
if (!stride.saturated && !srcStride.saturated && stride != srcStride)
return failure();
}
diff --git a/mlir/test/Dialect/MemRef/ops.mlir
b/mlir/test/Dialect/MemRef/ops.mlir
index b1db99bb3ad08..a90c9505a8405 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -440,8 +440,7 @@ func.func @expand_collapse_shape_dynamic(%arg0:
memref,
%arg4: index,
%arg5: index,
%arg6: index,
- %arg7: memref<4x?x4xf32>,
- %arg8: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>) {
+ %arg7: memref<4x?x4xf32>) {
// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
// CHECK-SAME: memref into memref
%0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
@@ -490,10 +489,6 @@ func.func @expand_collapse_shape_dynamic(%arg0:
memref,
// CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
%4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2,
%arg4, 2, 2]
: memref<4x?x4xf32> into memref<2x2x?x2x2xf32>
-
-// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3]]
-// CHECK-SAME: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>
into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>>
- %5 = memref.collapse_shape %arg8 [[0, 1], [2], [3]] : memref<1x1x18x?xsi8,
strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1],
offset: ?>>
return
}
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] CodeGen: More accurate mayAlias for instructions with multiple MMOs (PR #166211)
https://github.com/jayfoad commented: This patch highlights a problem that I have run into before: _if_ we want to support a single opcode (in this case BUNDLE) that may or may not load or store depending on its operands, then the current definitions of `MachineInstr::mayLoad` and `mayStore` are not good enough, because they just check a static property of the opcode. If `mayLoad` and `mayStore` were more sophisticated then this check would have been handled already at L1527. Having said that, in the short term I have no objection to this patch. https://github.com/llvm/llvm-project/pull/166211 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [X86][NewPM] Port X86PartialReduction to NewPM (PR #166048)
https://github.com/RKSimon approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/166048 ___ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
