commit:     f848da3b363c51d555e4b1a391afd3d6cf0a6fe3
Author:     Sam James <sam <AT> gentoo <DOT> org>
AuthorDate: Fri Apr  4 19:06:02 2025 +0000
Commit:     Sam James <sam <AT> gentoo <DOT> org>
CommitDate: Fri Apr  4 19:06:31 2025 +0000
URL:        https://gitweb.gentoo.org/proj/gcc-patches.git/commit/?id=f848da3b

15.0.0: add combine patches

They should be committed shortly but I want to test all the musttail
patches now, so let's save me building twice.

Signed-off-by: Sam James <sam <AT> gentoo.org>

 ...ow-2-2-combinations-but-with-a-tweak-PR11.patch | 204 ++++++++++++++++++
 ...id-split_i2i3-search-if-i2-is-unchanged-P.patch |  54 +++++
 ...Optimise-distribute_links-search-PR116398.patch |  99 +++++++++
 ...it-insn-searchs-for-2-2-combinations-PR11.patch | 240 +++++++++++++++++++++
 15.0.0/gentoo/README.history                       |   7 +
 5 files changed, 604 insertions(+)

diff --git 
a/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch
 
b/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch
new file mode 100644
index 0000000..fd85bcd
--- /dev/null
+++ 
b/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch
@@ -0,0 +1,204 @@
+https://inbox.sourceware.org/gcc-patches/[email protected]/
+
+From de2f5e9c4ce4bb674fb0da191102838a6bdd92f6 Mon Sep 17 00:00:00 2001
+Message-ID: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+From: Richard Sandiford <[email protected]>
+Date: Fri, 4 Apr 2025 10:21:43 +0100
+Subject: [PATCH 1/4] combine: Allow 2->2 combinations, but with a tweak
+ [PR116398]
+
+One of the problems in PR101523 was that, after each successful
+2->2 combination attempt, try_combine would restart combination
+attempts at i2 even if i2 hadn't changed.  This led to quadratic
+behaviour as the same failed combinations between i2 and i3 were
+tried repeatedly.
+
+The original patch for the PR dealt with that by disallowing 2->2
+combinations.  However, that led to various optimisation regressions,
+so there was interest in allowing the combinations again, at least
+until an alternative way of getting the same results is in place.
+
+This patch is a variant of Richi's in:
+
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53
+
+but limited to when we're combining 2 instructions.
+
+This speeds up combine by 10x on the original PR101523 testcase
+and reduces combine's memory footprint by 100x.
+
+gcc/
+       PR testsuite/116398
+       * combine.cc (try_combine): Reallow 2->2 combinations.  Detect when
+       only i3 has changed and restart from i3 in that case.
+
+gcc/testsuite/
+       * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of TST.
+       * gcc.target/aarch64/popcnt-le-3.c: Likewise AND.
+       * gcc.target/aarch64/sve/pred-not-gen-1.c: Revert previous patch.
+       * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise.
+       * gcc.target/aarch64/sve/var_stride_2.c: Likewise.
+       * gcc.target/aarch64/sve/var_stride_4.c: Likewise.
+
+Co-authored-by: Richard Biener <[email protected]>
+---
+ gcc/combine.cc                                     | 14 ++++----------
+ gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c     |  4 ++--
+ gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c     |  4 ++--
+ gcc/testsuite/gcc.target/aarch64/pr100056.c        |  4 +++-
+ .../gcc.target/aarch64/sve/pred-not-gen-1.c        |  4 ++--
+ .../gcc.target/aarch64/sve/pred-not-gen-4.c        |  4 ++--
+ .../gcc.target/aarch64/sve/var_stride_2.c          |  3 ++-
+ .../gcc.target/aarch64/sve/var_stride_4.c          |  3 ++-
+ 8 files changed, 19 insertions(+), 21 deletions(-)
+
+diff --git a/gcc/combine.cc b/gcc/combine.cc
+index 1b6c4e314cc9..65a87a45b3be 100644
+--- a/gcc/combine.cc
++++ b/gcc/combine.cc
+@@ -4210,16 +4210,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
+       adjust_for_new_dest (i3);
+     }
+ 
+-  /* If I2 didn't change, this is not a combination (but a simplification or
+-     canonicalisation with context), which should not be done here.  Doing
+-     it here explodes the algorithm.  Don't.  */
+-  if (rtx_equal_p (newi2pat, PATTERN (i2)))
+-    {
+-      if (dump_file)
+-      fprintf (dump_file, "i2 didn't change, not doing this\n");
+-      undo_all ();
+-      return 0;
+-    }
++  bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
+ 
+   /* We now know that we can do this combination.  Merge the insns and
+      update the status of registers and LOG_LINKS.  */
+@@ -4787,6 +4778,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
+   combine_successes++;
+   undo_commit ();
+ 
++  if (only_i3_changed)
++    return i3;
++
+   rtx_insn *ret = newi2pat ? i2 : i3;
+   if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID 
(ret))
+     ret = added_links_insn;
+diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
+index b4141da982c9..843fdac9fd8e 100644
+--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
++++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
+@@ -8,7 +8,7 @@
+ /*
+ ** le32:
+ **    sub     w([0-9]+), w0, #1
+-**    tst     w0, w\1
++**    tst     (?:w0, w\1|w\1, w0)
+ **    cset    w0, eq
+ **    ret
+ */
+@@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) {
+ /*
+ ** gt32:
+ **    sub     w([0-9]+), w0, #1
+-**    tst     w0, w\1
++**    tst     (?:w0, w\1|w\1, w0)
+ **    cset    w0, ne
+ **    ret
+ */
+diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
+index b811e6f6e8fe..3b558e95d819 100644
+--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
++++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
+@@ -8,7 +8,7 @@
+ /*
+ ** le16:
+ **    sub     w([0-9]+), w0, #1
+-**    and     w([0-9]+), w0, w\1
++**    and     w([0-9]+), (?:w0, w\1|w\1, w0)
+ **    tst     w\2, 65535
+ **    cset    w0, eq
+ **    ret
+@@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) {
+ /*
+ ** gt16:
+ **    sub     w([0-9]+), w0, #1
+-**    and     w([0-9]+), w0, w\1
++**    and     w([0-9]+), (?:w0, w\1|w\1, w0)
+ **    tst     w\2, 65535
+ **    cset    w0, ne
+ **    ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c 
b/gcc/testsuite/gcc.target/aarch64/pr100056.c
+index 0b77824da457..70499772d285 100644
+--- a/gcc/testsuite/gcc.target/aarch64/pr100056.c
++++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c
+@@ -1,7 +1,9 @@
+ /* PR target/100056 */
+ /* { dg-do compile } */
+ /* { dg-options "-O2" } */
+-/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */
++/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { xfail 
*-*-* } } } */
++/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } } 
*/
++/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, 
uxtb\n} 2 } } */
+ 
+ int
+ or_shift_u8 (unsigned char i)
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
+index a7d2795ebe23..c9a8b82c48ac 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
+@@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * 
restrict x, double *
+     }
+ }
+ 
+-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
+-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
++/* { dg-final { scan-assembler-not {\tbic\t} } } */
++/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, #0} 1 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
+index 20cbd7550b7e..1845bd3f0f70 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
+@@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * 
restrict x, double *
+     }
+ }
+ 
+-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
+-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
++/* { dg-final { scan-assembler-not {\tbic\t} } } */
++/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, z[0-9]+\.d} 1 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
+index 33b9f0f197e4..b8afea70207f 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
+@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m)
+ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+ /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4.  */
+-/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 
10\n} 2 } } */
++/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */
++/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */
+ /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */
+ /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */
+ /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
+index 71b826a4c1bb..d2e74f9d4175 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
+@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m)
+ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+ /* Should multiply by (257-1)*4 rather than (VF-1)*4.  */
+-/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 
10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */
++/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */
++/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */
+ /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */
+ /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */
+ /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */
+
+base-commit: d25728c98682c058bfda79333c94b0a8cf2a3f49
+-- 
+2.49.0
+

diff --git 
a/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch
 
b/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch
new file mode 100644
index 0000000..19d4917
--- /dev/null
+++ 
b/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch
@@ -0,0 +1,54 @@
+https://inbox.sourceware.org/gcc-patches/[email protected]/
+
+From fac8faddc3fca52b6d51c0d1a57323012a1fe4e9 Mon Sep 17 00:00:00 2001
+Message-ID: 
<fac8faddc3fca52b6d51c0d1a57323012a1fe4e9.1743793443.git....@gentoo.org>
+In-Reply-To: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+References: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+From: Richard Sandiford <[email protected]>
+Date: Fri, 4 Apr 2025 10:22:12 +0100
+Subject: [PATCH 2/4] combine: Avoid split_i2i3 search if i2 is unchanged
+ [PR116398]
+
+When combining a single-set i2 into a multi-set i3, combine
+first tries to match the new multi-set in-place.  If that fails,
+combine considers splitting the multi-set so that one set goes in
+i2 and the other set stays in i3.  That moves a destination from i3
+to i2 and so combine needs to update any associated log link for that
+destination to point to i2 rather than i3.
+
+However, that kind of split can also occur for 2->2 combinations.
+For a 2-instruction combination in which i2 doesn't die in i3, combine
+tries a 2->1 combination by turning i3 into a parallel of the original
+i2 and the combined i3.  If that fails, combine will split the parallel
+as above, so that the first set goes in i2 and the second set goes in i3.
+But that can often leave i2 unchanged, meaning that no destinations have
+moved and so no search is necessary.
+
+gcc/
+       PR testsuite/116398
+       * combine.cc (try_combine): Shortcut the split_i2i3 handling if
+       i2 is unchanged.
+---
+ gcc/combine.cc | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/gcc/combine.cc b/gcc/combine.cc
+index 65a87a45b3be..e29cff7147d9 100644
+--- a/gcc/combine.cc
++++ b/gcc/combine.cc
+@@ -4212,6 +4212,12 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
+ 
+   bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
+ 
++  /* If only i3 has changed, any split of the combined instruction just
++     restored i2 to its original state.  No destinations moved from i3
++     to i2.  */
++  if (only_i3_changed)
++    split_i2i3 = false;
++
+   /* We now know that we can do this combination.  Merge the insns and
+      update the status of registers and LOG_LINKS.  */
+ 
+-- 
+2.49.0
+

diff --git 
a/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch 
b/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch
new file mode 100644
index 0000000..f8849ba
--- /dev/null
+++ 
b/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch
@@ -0,0 +1,99 @@
+https://inbox.sourceware.org/gcc-patches/[email protected]/
+
+From bc27a2f371602bf897ff4da1620a56eeeeebf8ff Mon Sep 17 00:00:00 2001
+Message-ID: 
<bc27a2f371602bf897ff4da1620a56eeeeebf8ff.1743793443.git....@gentoo.org>
+In-Reply-To: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+References: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+From: Richard Sandiford <[email protected]>
+Date: Fri, 4 Apr 2025 10:22:32 +0100
+Subject: [PATCH 3/4] combine: Optimise distribute_links search [PR116398]
+
+Another problem in PR101523 was that, after each successful 2->2
+combination attempt, distribute_links would search further and further
+for the next combinable use of the i2 destination.  Each search would
+start at i2 itself, making the search quadratic in the worst case.
+
+In a 2->2 combination, if i2 is unchanged, the search can start at i3
+instead of i2.  The same thing applies to i2 when distributing i2's
+links, since the only changes to earlier instructions are the deletion
+of i0 and i1.
+
+This change, combined with the previous split_i2i3 patch, gives a
+34.6% speedup in combine for the testcase in PR101523.  Combine
+goes from being 41% to 34% of compile time.
+
+gcc/
+       PR testsuite/116398
+       * combine.cc (distribute_links): Take an optional start point.
+       (try_combine): If only i3 has changed, only distribute i3's links,
+       not i2's.  Start the search for the new use from i3 rather than
+       from the definition instruction.  Likewise start the search for
+       the new use from i2 when distributing i2's links.
+---
+ gcc/combine.cc | 27 +++++++++++++++++++--------
+ 1 file changed, 19 insertions(+), 8 deletions(-)
+
+diff --git a/gcc/combine.cc b/gcc/combine.cc
+index e29cff7147d9..e99b064c98d4 100644
+--- a/gcc/combine.cc
++++ b/gcc/combine.cc
+@@ -472,7 +472,7 @@ static void move_deaths (rtx, rtx, int, rtx_insn *, rtx *);
+ static bool reg_bitfield_target_p (rtx, rtx);
+ static void distribute_notes (rtx, rtx_insn *, rtx_insn *, rtx_insn *,
+                             rtx, rtx, rtx);
+-static void distribute_links (struct insn_link *);
++static void distribute_links (struct insn_link *, rtx_insn * = nullptr);
+ static void mark_used_regs_combine (rtx);
+ static void record_promoted_value (rtx_insn *, rtx);
+ static bool unmentioned_reg_p (rtx, rtx);
+@@ -4592,10 +4592,15 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
+                           NULL_RTX, NULL_RTX, NULL_RTX);
+       }
+ 
+-    distribute_links (i3links);
+-    distribute_links (i2links);
+-    distribute_links (i1links);
+-    distribute_links (i0links);
++    if (only_i3_changed)
++      distribute_links (i3links, i3);
++    else
++      {
++      distribute_links (i3links);
++      distribute_links (i2links, i2);
++      distribute_links (i1links);
++      distribute_links (i0links);
++      }
+ 
+     if (REG_P (i2dest))
+       {
+@@ -14986,10 +14991,13 @@ distribute_notes (rtx notes, rtx_insn *from_insn, 
rtx_insn *i3, rtx_insn *i2,
+ 
+ /* Similarly to above, distribute the LOG_LINKS that used to be present on
+    I3, I2, and I1 to new locations.  This is also called to add a link
+-   pointing at I3 when I3's destination is changed.  */
++   pointing at I3 when I3's destination is changed.
++
++   If START is nonnull and an insn, we know that the next location for each
++   link is no earlier than START.  */
+ 
+ static void
+-distribute_links (struct insn_link *links)
++distribute_links (struct insn_link *links, rtx_insn *start)
+ {
+   struct insn_link *link, *next_link;
+ 
+@@ -15055,7 +15063,10 @@ distribute_links (struct insn_link *links)
+        I3 to I2.  Also note that not much searching is typically done here
+        since most links don't point very far away.  */
+ 
+-      for (insn = NEXT_INSN (link->insn);
++      insn = start;
++      if (!insn || NOTE_P (insn))
++      insn = NEXT_INSN (link->insn);
++      for (;
+          (insn && (this_basic_block->next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
+                    || BB_HEAD (this_basic_block->next_bb) != insn));
+          insn = NEXT_INSN (insn))
+-- 
+2.49.0
+

diff --git 
a/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch
 
b/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch
new file mode 100644
index 0000000..b185cba
--- /dev/null
+++ 
b/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch
@@ -0,0 +1,240 @@
+https://inbox.sourceware.org/gcc-patches/[email protected]/
+
+From 247ac81832821bade9fa36c350d220dca039d383 Mon Sep 17 00:00:00 2001
+Message-ID: 
<247ac81832821bade9fa36c350d220dca039d383.1743793443.git....@gentoo.org>
+In-Reply-To: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+References: 
<de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org>
+From: Richard Sandiford <[email protected]>
+Date: Fri, 4 Apr 2025 10:23:00 +0100
+Subject: [PATCH 4/4] combine: Limit insn searchs for 2->2 combinations
+ [PR116398]
+
+As noted in the previous patch, combine still takes >30% of
+compile time in the original testcase for PR101523.  The problem
+is that try_combine uses linear insn searches for some dataflow
+queries, so in the worst case, an unlimited number of 2->2
+combinations for the same i2 can lead to quadratic behaviour.
+
+This patch limits distribute_links to a certain number
+of instructions when i2 is unchanged.  As Segher said in the PR trail,
+it would make more conceptual sense to apply the limit unconditionally,
+but I thought it would be better to change as little as possible at
+this development stage.  Logically, in stage 1, the --param should
+be applied directly by distribute_links with no input from callers.
+
+As I mentioned in:
+
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116398#c28
+
+I think it's safe to drop log links even if a use exists.  All
+processing of log links seems to handle the absence of a link
+for a particular register in a conservative way.
+
+The initial set-up errs on the side of dropping links, since for example
+create_log_links has:
+
+             /* flow.c claimed:
+
+                 We don't build a LOG_LINK for hard registers contained
+                 in ASM_OPERANDs.  If these registers get replaced,
+                 we might wind up changing the semantics of the insn,
+                 even if reload can make what appear to be valid
+                 assignments later.  */
+              if (regno < FIRST_PSEUDO_REGISTER
+                  && asm_noperands (PATTERN (use_insn)) >= 0)
+                continue;
+
+which excludes combinations by dropping log links, rather than during
+try_combine.  And:
+
+      /* If this register is being initialized using itself, and the
+         register is uninitialized in this basic block, and there are
+         no LOG_LINKS which set the register, then part of the
+         register is uninitialized.  In that case we can't assume
+         anything about the number of nonzero bits.
+
+         ??? We could do better if we checked this in
+         reg_{nonzero_bits,num_sign_bit_copies}_for_combine.  Then we
+         could avoid making assumptions about the insn which initially
+         sets the register, while still using the information in other
+         insns.  We would have to be careful to check every insn
+         involved in the combination.  */
+
+      if (insn
+          && reg_referenced_p (x, PATTERN (insn))
+          && !REGNO_REG_SET_P (DF_LR_IN (BLOCK_FOR_INSN (insn)),
+                               REGNO (x)))
+        {
+          struct insn_link *link;
+
+          FOR_EACH_LOG_LINK (link, insn)
+            if (dead_or_set_p (link->insn, x))
+              break;
+          if (!link)
+            {
+              rsp->nonzero_bits = GET_MODE_MASK (mode);
+              rsp->sign_bit_copies = 1;
+              return;
+            }
+        }
+
+treats the lack of a log link as a possible sign of uninitialised data,
+but that would be a missed optimisation rather than a correctness issue.
+
+One question is what the default --param value should be.  I went with
+Jakub's suggestion of 3000 from:
+
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116398#c25
+
+Also, to answer Jakub's question in that comment, I tried bisecting:
+
+  int limit = atoi (getenv ("BISECT"));
+
+(so applying the limit for all calls from try_combine) with an
+abort in distribute_links if the limit caused a link to be skipped.
+The minimum BISECT value that allowed an aarch64-linux-gnu bootstrap
+to succeed with --enable-languages=all --enable-checking=yes,rtl,extra
+was 142, so much lower than the parameter value.  I realised too late
+that --enable-checking=release would probably have been a more
+interesting test.
+
+The previous patch meant that distribute_links itself is now linear
+for a given i2 definition, since each search starts at the previous
+last use, rather than at i2 itself.  This means that the limit has
+to be applied cumulatively across all searches for the same link.
+
+The patch does that by storing a counter in the insn_link structure.
+There was a 32-bit hole there on LP64 hosts.
+
+gcc/
+       PR testsuite/116398
+       * params.opt (-param=max-combine-search-insns=): New param.
+       * doc/invoke.texi: Document it.
+       * combine.cc (insn_link::insn_count): New field.
+       (alloc_insn_link): Initialize it.
+       (distribute_links): Add a limit parameter.
+       (try_combine): Use the new param to limit distribute_links
+       when only i3 has changed.
+---
+ gcc/combine.cc      | 21 +++++++++++++++++----
+ gcc/doc/invoke.texi |  9 +++++++++
+ gcc/params.opt      |  4 ++++
+ 3 files changed, 30 insertions(+), 4 deletions(-)
+
+diff --git a/gcc/combine.cc b/gcc/combine.cc
+index e99b064c98d4..5f085187cfef 100644
+--- a/gcc/combine.cc
++++ b/gcc/combine.cc
+@@ -309,6 +309,7 @@ static int *uid_insn_cost;
+ struct insn_link {
+   rtx_insn *insn;
+   unsigned int regno;
++  int insn_count;
+   struct insn_link *next;
+ };
+ 
+@@ -342,6 +343,7 @@ alloc_insn_link (rtx_insn *insn, unsigned int regno, 
struct insn_link *next)
+                                         sizeof (struct insn_link));
+   l->insn = insn;
+   l->regno = regno;
++  l->insn_count = 0;
+   l->next = next;
+   return l;
+ }
+@@ -472,7 +474,8 @@ static void move_deaths (rtx, rtx, int, rtx_insn *, rtx *);
+ static bool reg_bitfield_target_p (rtx, rtx);
+ static void distribute_notes (rtx, rtx_insn *, rtx_insn *, rtx_insn *,
+                             rtx, rtx, rtx);
+-static void distribute_links (struct insn_link *, rtx_insn * = nullptr);
++static void distribute_links (struct insn_link *, rtx_insn * = nullptr,
++                            int limit = INT_MAX);
+ static void mark_used_regs_combine (rtx);
+ static void record_promoted_value (rtx_insn *, rtx);
+ static bool unmentioned_reg_p (rtx, rtx);
+@@ -4593,7 +4596,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
+       }
+ 
+     if (only_i3_changed)
+-      distribute_links (i3links, i3);
++      distribute_links (i3links, i3, param_max_combine_search_insns);
+     else
+       {
+       distribute_links (i3links);
+@@ -14994,10 +14997,12 @@ distribute_notes (rtx notes, rtx_insn *from_insn, 
rtx_insn *i3, rtx_insn *i2,
+    pointing at I3 when I3's destination is changed.
+ 
+    If START is nonnull and an insn, we know that the next location for each
+-   link is no earlier than START.  */
++   link is no earlier than START.  LIMIT is the maximum number of nondebug
++   instructions that can be scanned when looking for the next use of a
++   definition.  */
+ 
+ static void
+-distribute_links (struct insn_link *links, rtx_insn *start)
++distribute_links (struct insn_link *links, rtx_insn *start, int limit)
+ {
+   struct insn_link *link, *next_link;
+ 
+@@ -15063,9 +15068,12 @@ distribute_links (struct insn_link *links, rtx_insn 
*start)
+        I3 to I2.  Also note that not much searching is typically done here
+        since most links don't point very far away.  */
+ 
++      int count = 0;
+       insn = start;
+       if (!insn || NOTE_P (insn))
+       insn = NEXT_INSN (link->insn);
++      else
++      count = link->insn_count;
+       for (;
+          (insn && (this_basic_block->next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
+                    || BB_HEAD (this_basic_block->next_bb) != insn));
+@@ -15086,6 +15094,11 @@ distribute_links (struct insn_link *links, rtx_insn 
*start)
+         }
+       else if (INSN_P (insn) && reg_set_p (reg, insn))
+         break;
++      else if (count >= limit)
++        break;
++      else
++        count += 1;
++      link->insn_count = count;
+ 
+       /* If we found a place to put the link, place it there unless there
+        is already a link to the same insn as LINK at that point.  */
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 4c9af429ab08..0903cd2b7f23 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -16518,6 +16518,15 @@ in combiner for a pseudo register as last known value 
of that register.
+ @item max-combine-insns
+ The maximum number of instructions the RTL combiner tries to combine.
+ 
++@item max-combine-search-insns
++The maximum number of instructions that the RTL combiner searches in order
++to find the next use of a given register definition.  If this limit is reached
++without finding such a use, the combiner will stop trying to optimize the
++definition.
++
++Currently this limit only applies after certain successful combination
++attempts, but it could be extended to other cases in future.
++
+ @item integer-share-limit
+ Small integer constants can use a shared data structure, reducing the
+ compiler's memory usage and increasing its speed.  This sets the maximum
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 4f4eb4d7a2a5..422d082b3557 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -477,6 +477,10 @@ The maximum number of instructions to consider to unroll 
in a loop on average.
+ Common Joined UInteger Var(param_max_combine_insns) Init(4) IntegerRange(2, 
4) Param Optimization
+ The maximum number of insns combine tries to combine.
+ 
++-param=max-combine-search-insns=
++Common Joined UInteger Var(param_max_combine_search_insns) Init(3000) Param 
Optimization
++The maximum number of instructions that combine searches in order to find the 
next use of a particular register.
++
+ -param=max-completely-peel-loop-nest-depth=
+ Common Joined UInteger Var(param_max_unroll_iterations) Init(8) Param 
Optimization
+ The maximum depth of a loop nest we completely peel.
+-- 
+2.49.0
+

diff --git a/15.0.0/gentoo/README.history b/15.0.0/gentoo/README.history
index f01476e..5e4d154 100644
--- a/15.0.0/gentoo/README.history
+++ b/15.0.0/gentoo/README.history
@@ -1,3 +1,10 @@
+52     ????
+
+       + 83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch
+       + 84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch
+       + 85_all_combine-Optimise-distribute_links-search-PR116398.patch
+       + 86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch
+
 51     2 April 2025
 
        - 
80_all_PR119376-tailc-Don-t-fail-musttail-calls-if-they-use-or-could.patch

Reply via email to