[llvm] [mlir] [lld] [clang] [NFC][IRCE] Add unit test to show room for improvement (PR #71506)

2023-11-07 Thread Aleksandr Popov via cfe-commits

https://github.com/aleks-tmb updated 
https://github.com/llvm/llvm-project/pull/71506

From 1c946c72f19bb553ad7af2e9b4edb1796eee Mon Sep 17 00:00:00 2001
From: Aleksander Popov 
Date: Tue, 7 Nov 2023 05:16:45 +0100
Subject: [PATCH] [NFC][IRCE] Add unit test to show room for improvement

Add tests for compound loop bounds where IRCE is possible:

if (K > 0 && M > 0)
  for (i = 0; i < min(K, M); i++) {...}

if (K > 0 && M > 0)
  for (i = min(K, M); i >= 0; i--) {...}
---
 .../Transforms/IRCE/compound-loop-bound.ll| 132 ++
 1 file changed, 132 insertions(+)
 create mode 100644 llvm/test/Transforms/IRCE/compound-loop-bound.ll

diff --git a/llvm/test/Transforms/IRCE/compound-loop-bound.ll b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
new file mode 100644
index 000..0930d19e22154fc
--- /dev/null
+++ b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=irce < %s -S | FileCheck %s
+
+; if (K > 0 && M > 0)
+;   for (i = 0; i < min(K, M); i++) {...}
+;
+; TODO: Loop bounds are safe according to loop guards. IRCE is allowed.
+define void @incrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
+; CHECK-LABEL: define void @incrementing_loop(
+; CHECK-SAME: ptr [[ARR:%.*]], ptr [[LEN_PTR:%.*]], i32 [[K:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[LEN:%.*]] = load i32, ptr [[LEN_PTR]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:[[CHECK0:%.*]] = icmp sgt i32 [[K]], 0
+; CHECK-NEXT:[[CHECK1:%.*]] = icmp sgt i32 [[M]], 0
+; CHECK-NEXT:[[AND:%.*]] = and i1 [[CHECK0]], [[CHECK1]]
+; CHECK-NEXT:br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:   preheader:
+; CHECK-NEXT:[[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:br label [[LOOP:%.*]]
+; CHECK:   loop:
+; CHECK-NEXT:[[IDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ]
+; CHECK-NEXT:[[IDX_NEXT]] = add i32 [[IDX]], 1
+; CHECK-NEXT:[[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
+; CHECK-NEXT:br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK:   in.bounds:
+; CHECK-NEXT:[[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
+; CHECK-NEXT:store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:[[NEXT:%.*]] = icmp slt i32 [[IDX_NEXT]], [[SMIN]]
+; CHECK-NEXT:br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:   out.of.bounds:
+; CHECK-NEXT:ret void
+; CHECK:   exit.loopexit:
+; CHECK-NEXT:br label [[EXIT]]
+; CHECK:   exit:
+; CHECK-NEXT:ret void
+;
+entry:
+  %len = load i32, ptr %len_ptr, !range !0
+  %check0 = icmp sgt i32 %K, 0
+  %check1 = icmp sgt i32 %M, 0
+  %and = and i1 %check0, %check1
+  br i1 %and, label %preheader, label %exit
+
+preheader:
+  %smin = call i32 @llvm.smin.i32(i32 %K, i32 %M)
+  br label %loop
+
+loop:
+  %idx = phi i32 [ 0, %preheader ], [ %idx.next, %in.bounds ]
+  %idx.next = add i32 %idx, 1
+  %guard = icmp slt i32 %idx, %len
+  br i1 %guard, label %in.bounds, label %out.of.bounds
+
+in.bounds:
+  %addr = getelementptr i32, ptr %arr, i32 %idx
+  store i32 0, ptr %addr
+  %next = icmp slt i32 %idx.next, %smin
+  br i1 %next, label %loop, label %exit
+
+out.of.bounds:
+  ret void
+
+exit:
+  ret void
+}
+
+; if (K > 0 && M > 0)
+;   for (i = min(K, M); i >= 0; i--) {...}
+;
+; TODO: Loop bounds are safe according to loop guards. IRCE is allowed.
+define void @decrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
+; CHECK-LABEL: define void @decrementing_loop(
+; CHECK-SAME: ptr [[ARR:%.*]], ptr [[LEN_PTR:%.*]], i32 [[K:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:[[LEN:%.*]] = load i32, ptr [[LEN_PTR]], align 4, !range [[RNG0]]
+; CHECK-NEXT:[[CHECK0:%.*]] = icmp sgt i32 [[K]], 0
+; CHECK-NEXT:[[CHECK1:%.*]] = icmp sgt i32 [[M]], 0
+; CHECK-NEXT:[[AND:%.*]] = and i1 [[CHECK0]], [[CHECK1]]
+; CHECK-NEXT:br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:   preheader:
+; CHECK-NEXT:[[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:br label [[LOOP:%.*]]
+; CHECK:   loop:
+; CHECK-NEXT:[[IDX:%.*]] = phi i32 [ [[SMIN]], [[PREHEADER]] ], [ [[IDX_DEC:%.*]], [[IN_BOUNDS:%.*]] ]
+; CHECK-NEXT:[[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT:[[GUARD:%.*]] = icmp slt i32 [[IDX]], [[LEN]]
+; CHECK-NEXT:br i1 [[GUARD]], label [[IN_BOUNDS]], label [[OUT_OF_BOUNDS:%.*]]
+; CHECK:   in.bounds:
+; CHECK-NEXT:[[ADDR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[IDX]]
+; CHECK-NEXT:store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:[[NEXT:%.*]] = icmp sgt i32 [[IDX_DEC]], -1
+; CHECK-NEXT:br i1 [[NEXT]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:   out.of.bounds:
+; CHECK-NEXT:ret void
+; CHECK:   ex

[clang] [LoopPeeling] Fix weights updating of peeled off branches (PR #70094)

2023-10-30 Thread Aleksandr Popov via cfe-commits

aleks-tmb wrote:

@MatzeB Hi, could you please take a look? The described issue manifested after your
patch:
```
SHA: b30c9c937802a78ef986cb4219eba51148f76e6c
Author: Matthias Braun
Date: 2023-09-11 14:23:29 -0700
LoopUnrollRuntime: Add weights to all branches
```

Your patch added branch-weight scaling, so imagine a loop whose latch weights
ratio was scaled from 1:2 up to 127:254. If we decide to peel 3 iterations of
such a loop, the initial 1:2 ratio will be updated as:

```
0th peeled latch: 1:2
1st peeled latch: 1:1 
2nd peeled latch: 1:1 
```
And with initial 127:254:
```
0th peeled latch: 127:254
1st peeled latch: 127:127
2nd peeled latch: 127:1
```
So the hotness of the main loop header changes from a neutral 1:1 to 127:1,
making it almost unreachable.
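
For concreteness, here is a small standalone C++ sketch (not LLVM code; it just
replays the current clamping rule from LoopPeel.cpp, reading each X:Y pair as
exit:header latch weights, which is my reading of the example) that reproduces
both sequences above:
```
#include <cstdio>

int main() {
  // {exit, header} latch weights: the original 1:2 ratio and the same ratio
  // after profile scaling to 127:254.
  unsigned Ratios[2][2] = {{1, 2}, {127, 254}};
  for (auto &R : Ratios) {
    unsigned Exit = R[0], Header = R[1];
    for (int Peel = 0; Peel < 3; ++Peel) {
      std::printf("peeled latch %d: %u:%u\n", Peel, Exit, Header);
      // Current rule: subtract the exit weight from the header weight,
      // clamping the result at 1.
      Header = Header > Exit ? Header - Exit : 1;
    }
    std::printf("\n");
  }
  return 0;
}
```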

I tried to address this in my patch:

https://github.com/llvm/llvm-project/pull/70094


[libcxx] [libc] [flang] [clang-tools-extra] [clang] [compiler-rt] [llvm] [LoopPeeling] Fix weights updating of peeled off branches (PR #70094)

2023-10-31 Thread Aleksandr Popov via cfe-commits

https://github.com/aleks-tmb updated 
https://github.com/llvm/llvm-project/pull/70094

From 4265cc4a05d939a29e57a116e696fa1eb032c249 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov 
Date: Tue, 24 Oct 2023 16:47:50 +
Subject: [PATCH] [LoopPeeling] Fix weights updating of peeled off branches

In https://reviews.llvm.org/D64235 a new algorithm was introduced for
updating the branch weights of latch blocks and their copies.

It increases the probability of going to the exit block with each
subsequent peeled iteration, computing the weights as (F - I * E, E), where:
- F is the weight of the edge from the latch to the header.
- E is the weight of the edge from the latch to the exit.
- I is the peeling iteration number.

E.g., let's say the latch branch weights are (100,300) and the estimated
trip count is 4. If we peel off all 4 iterations, the weights of the
copied branches will be:
0: (100,300)
1: (100,200)
2: (100,100)
3: (100,1)

https://godbolt.org/z/93KnoEsT6

So we make the original loop almost unreachable from the 3rd peeled copy
according to the profile data. But that's only true if the profiling
data is accurate.
An underestimated trip count can lead to performance issues with the
register allocator, which may decide to spill intervals inside the loop,
assuming it is unreachable.

Since we don't know how accurate the profiling data is, it seems better
to set neutral 1/1 weights on the last peeled latch branch. After this
change, the weights in the example above will look like this:
0: (100,300)
1: (100,200)
2: (100,100)
3: (100,100)
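
As a hedged illustration (a standalone sketch, not part of the patch), applying
the proposed rule from the diff below to the (100,300) example yields exactly
this sequence:
```
#include <algorithm>
#include <cstdio>

int main() {
  unsigned Exit = 100, Header = 300; // latch weights, printed as (exit, header)
  for (int Peel = 0; Peel < 4; ++Peel) {
    std::printf("%d: (%u,%u)\n", Peel, Exit, Header);
    // Proposed rule: never let the header weight drop below the exit weight,
    // so the last peeled latch keeps neutral odds instead of becoming cold.
    Header = Header > Exit ? std::max(Header - Exit, Exit) : Exit;
  }
  return 0;
}
```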
---
 llvm/lib/Transforms/Utils/LoopPeel.cpp | 10 +++---
 llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll |  5 ++---
 llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll   |  5 ++---
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 31f065b691f864c..30c525dd82bbd97 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -636,9 +636,13 @@ static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
 MDB.createBranchWeights(Info.Weights));
   for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
 if (SubWeight != 0)
-  Info.Weights[Idx] = Info.Weights[Idx] > SubWeight
-  ? Info.Weights[Idx] - SubWeight
-  : 1;
+  // Don't set the probability of taking the edge from latch to loop header
+  // to less than 1, as this could significantly reduce the loop's hotness,
+  // which would be incorrect in the case of underestimating the trip count.
+  Info.Weights[Idx] =
+  Info.Weights[Idx] > SubWeight
+  ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
+  : SubWeight;
 }
 
 /// Initialize the weights for all exiting blocks.
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
index a0dc216133fb7d5..d91cb5bab382782 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo-deopt.ll
@@ -21,7 +21,7 @@
; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !18
 ; CHECK: [[NEXT2]]:
 ; CHECK: br i1 %c, label %{{.*}}, label %side_exit.loopexit, !prof !15
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !19
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !18
 
 define i32 @basic(ptr %p, i32 %k, i1 %c) #0 !prof !15 {
 entry:
@@ -85,6 +85,5 @@ attributes #1 = { nounwind optsize }
 ; This is a weights of latch and its copies.
 ;CHECK: !16 = !{!"branch_weights", i32 3001, i32 1001}
 ;CHECK: !17 = !{!"branch_weights", i32 2000, i32 1001}
-;CHECK: !18 = !{!"branch_weights", i32 999, i32 1001}
-;CHECK: !19 = !{!"branch_weights", i32 1, i32 1001}
+;CHECK: !18 = !{!"branch_weights", i32 1001, i32 1001}
 
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
index cadb6739dbc3fbb..15dce234baee91d 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -24,7 +24,7 @@
 ; CHECK: [[NEXT1]]:
; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !17
 ; CHECK: [[NEXT2]]:
-; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !18
+; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !17
 
 define void @basic(ptr %p, i32 %k) #0 !prof !15 {
 entry:
@@ -105,6 +105,5 @@ attributes #1 = { nounwind optsize }
 
 ;CHECK: !15 = !{!"branch_weights", i32 3001, i32 1001}
 ;CHECK: !16 = !{!"branch_weights", i32 2000, i32 1001}
-;CHECK: !17 = !{!"branch_weights", i32 999, i32 1001}
-;CHECK: !18 = !{!"branch_weights", i32 1, i32 1001}
+;CHECK: !17 = !{!"branch_weights", i32 1001, i32 1001}
 


[llvm] [clang-tools-extra] [LoopConstrainer] Apply loop guards to check that loop bounds are safe (PR #71531)

2024-01-18 Thread Aleksandr Popov via cfe-commits

https://github.com/aleks-tmb updated 
https://github.com/llvm/llvm-project/pull/71531

From c734c948599ace737d48b32a817bd6410bd2b13e Mon Sep 17 00:00:00 2001
From: Aleksander Popov 
Date: Tue, 7 Nov 2023 05:31:13 +0100
Subject: [PATCH] [LoopConstrainer] Apply loop guards to check that loop bounds
 are safe

Applying loop guards to the loop's SCEV bounds allows IRCE to handle cases
with compound loop bounds such as:

if (K > 0 && M > 0)
  for (i = 0; i < min(K, M); i++) {...}

if (K > 0 && M > 0)
  for (i = min(K, M); i >= 0; i--) {...}

Without the loop guards, SCEV cannot prove that the loop bounds are safe in
these cases.
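
As a rough sketch of the core pattern (the helper name isBoundSafeUnderGuards
is hypothetical; applyLoopGuards and isLoopEntryGuardedByCond are the existing
ScalarEvolution APIs used in the diff below):
```
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Refine Start and BoundSCEV with the facts implied by the loop's guarding
// conditions (e.g. K > 0 && M > 0) before asking whether the loop entry is
// guarded by BoundPred.
static bool isBoundSafeUnderGuards(ScalarEvolution &SE, const Loop *L,
                                   ICmpInst::Predicate BoundPred,
                                   const SCEV *Start, const SCEV *BoundSCEV) {
  const SCEV *StartLG = SE.applyLoopGuards(Start, L);
  const SCEV *BoundLG = SE.applyLoopGuards(BoundSCEV, L);
  return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
}
```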
---
 llvm/lib/Transforms/Utils/LoopConstrainer.cpp | 22 +++--
 .../Transforms/IRCE/compound-loop-bound.ll| 85 +--
 2 files changed, 90 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index ea6d952cfa7d4f..da6f37df5b314a 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -42,8 +42,11 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   ICmpInst::Predicate BoundPred =
   IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
 
+  auto StartLG = SE.applyLoopGuards(Start, L);
+  auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
   if (LatchBrExitIdx == 1)
-return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
 
   assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be either 0 or 1");
 
@@ -54,10 +57,10 @@ static bool isSafeDecreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
 
   const SCEV *MinusOne =
-  SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+  SE.getMinusSCEV(BoundLG, SE.getOne(BoundLG->getType()));
 
-  return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+  return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit);
 }
 
 /// Given a loop with an increasing induction variable, is it possible to
@@ -86,8 +89,11 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
   ICmpInst::Predicate BoundPred =
   IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
 
+  auto StartLG = SE.applyLoopGuards(Start, L);
+  auto BoundLG = SE.applyLoopGuards(BoundSCEV, L);
+
   if (LatchBrExitIdx == 1)
-return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+return SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG, BoundLG);
 
   assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
 
@@ -97,9 +103,9 @@ static bool isSafeIncreasingBound(const SCEV *Start, const SCEV *BoundSCEV,
: APInt::getMaxValue(BitWidth);
   const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
 
-  return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
-  SE.getAddExpr(BoundSCEV, Step)) &&
-  SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
+  return (SE.isLoopEntryGuardedByCond(L, BoundPred, StartLG,
+  SE.getAddExpr(BoundLG, Step)) &&
+  SE.isLoopEntryGuardedByCond(L, BoundPred, BoundLG, Limit));
 }
 
 /// Returns estimate for max latch taken count of the loop of the narrowest
diff --git a/llvm/test/Transforms/IRCE/compound-loop-bound.ll b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
index 0930d19e22154f..e50d8c6127f401 100644
--- a/llvm/test/Transforms/IRCE/compound-loop-bound.ll
+++ b/llvm/test/Transforms/IRCE/compound-loop-bound.ll
@@ -16,23 +16,56 @@ define void @incrementing_loop(ptr %arr, ptr %len_ptr, i32 %K, i32 %M) {
 ; CHECK-NEXT:br i1 [[AND]], label [[PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK:   preheader:
 ; CHECK-NEXT:[[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[K]], i32 [[M]])
+; CHECK-NEXT:[[SMIN1:%.*]] = call i32 @llvm.smin.i32(i32 [[LEN]], i32 [[M]])
+; CHECK-NEXT:[[SMIN2:%.*]] = call i32 @llvm.smin.i32(i32 [[SMIN1]], i32 [[K]])
+; CHECK-NEXT:[[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN2]], i32 0)
+; CHECK-NEXT:[[TMP0:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:br i1 [[TMP0]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:   loop.preheader:
 ; CHECK-NEXT:br label [[LOOP:%.*]]
 ; CHECK:   loop:
-; CHECK-NEXT:[[IDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ]
-; CHECK-NEXT:[[IDX_NEXT]] = add i32 [[IDX]], 1
+; CHECK-NEXT:[[IDX:%.*]] = phi i32 [ [[IDX_NEXT:%.*]], [[IN_BOUNDS:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:[[IDX_NEXT]] = add nsw i32 [[IDX]], 1
 ; CHECK-NEXT:[[GUARD:%.*]] = icmp slt i32 [[IDX]], [
