commit: f848da3b363c51d555e4b1a391afd3d6cf0a6fe3 Author: Sam James <sam <AT> gentoo <DOT> org> AuthorDate: Fri Apr 4 19:06:02 2025 +0000 Commit: Sam James <sam <AT> gentoo <DOT> org> CommitDate: Fri Apr 4 19:06:31 2025 +0000 URL: https://gitweb.gentoo.org/proj/gcc-patches.git/commit/?id=f848da3b
15.0.0: add combine patches They should be committed shortly but I want to test all the musttail patches now, so let's save me building twice. Signed-off-by: Sam James <sam <AT> gentoo.org> ...ow-2-2-combinations-but-with-a-tweak-PR11.patch | 204 ++++++++++++++++++ ...id-split_i2i3-search-if-i2-is-unchanged-P.patch | 54 +++++ ...Optimise-distribute_links-search-PR116398.patch | 99 +++++++++ ...it-insn-searchs-for-2-2-combinations-PR11.patch | 240 +++++++++++++++++++++ 15.0.0/gentoo/README.history | 7 + 5 files changed, 604 insertions(+) diff --git a/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch b/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch new file mode 100644 index 0000000..fd85bcd --- /dev/null +++ b/15.0.0/gentoo/83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch @@ -0,0 +1,204 @@ +https://inbox.sourceware.org/gcc-patches/[email protected]/ + +From de2f5e9c4ce4bb674fb0da191102838a6bdd92f6 Mon Sep 17 00:00:00 2001 +Message-ID: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +From: Richard Sandiford <[email protected]> +Date: Fri, 4 Apr 2025 10:21:43 +0100 +Subject: [PATCH 1/4] combine: Allow 2->2 combinations, but with a tweak + [PR116398] + +One of the problems in PR101523 was that, after each successful +2->2 combination attempt, try_combine would restart combination +attempts at i2 even if i2 hadn't changed. This led to quadratic +behaviour as the same failed combinations between i2 and i3 were +tried repeatedly. + +The original patch for the PR dealt with that by disallowing 2->2 +combinations. However, that led to various optimisation regressions, +so there was interest in allowing the combinations again, at least +until an alternative way of getting the same results is in place. + +This patch is a variant of Richi's in: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53 + +but limited to when we're combining 2 instructions. + +This speeds up combine by 10x on the original PR101523 testcase +and reduces combine's memory footprint by 100x. + +gcc/ + PR testsuite/116398 + * combine.cc (try_combine): Reallow 2->2 combinations. Detect when + only i3 has changed and restart from i3 in that case. + +gcc/testsuite/ + * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of TST. + * gcc.target/aarch64/popcnt-le-3.c: Likewise AND. + * gcc.target/aarch64/sve/pred-not-gen-1.c: Revert previous patch. + * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise. + * gcc.target/aarch64/sve/var_stride_2.c: Likewise. + * gcc.target/aarch64/sve/var_stride_4.c: Likewise. + +Co-authored-by: Richard Biener <[email protected]> +--- + gcc/combine.cc | 14 ++++---------- + gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c | 4 ++-- + gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c | 4 ++-- + gcc/testsuite/gcc.target/aarch64/pr100056.c | 4 +++- + .../gcc.target/aarch64/sve/pred-not-gen-1.c | 4 ++-- + .../gcc.target/aarch64/sve/pred-not-gen-4.c | 4 ++-- + .../gcc.target/aarch64/sve/var_stride_2.c | 3 ++- + .../gcc.target/aarch64/sve/var_stride_4.c | 3 ++- + 8 files changed, 19 insertions(+), 21 deletions(-) + +diff --git a/gcc/combine.cc b/gcc/combine.cc +index 1b6c4e314cc9..65a87a45b3be 100644 +--- a/gcc/combine.cc ++++ b/gcc/combine.cc +@@ -4210,16 +4210,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + adjust_for_new_dest (i3); + } + +- /* If I2 didn't change, this is not a combination (but a simplification or +- canonicalisation with context), which should not be done here. Doing +- it here explodes the algorithm. Don't. */ +- if (rtx_equal_p (newi2pat, PATTERN (i2))) +- { +- if (dump_file) +- fprintf (dump_file, "i2 didn't change, not doing this\n"); +- undo_all (); +- return 0; +- } ++ bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2)); + + /* We now know that we can do this combination. Merge the insns and + update the status of registers and LOG_LINKS. */ +@@ -4787,6 +4778,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + combine_successes++; + undo_commit (); + ++ if (only_i3_changed) ++ return i3; ++ + rtx_insn *ret = newi2pat ? i2 : i3; + if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID (ret)) + ret = added_links_insn; +diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c +index b4141da982c9..843fdac9fd8e 100644 +--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c ++++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c +@@ -8,7 +8,7 @@ + /* + ** le32: + ** sub w([0-9]+), w0, #1 +-** tst w0, w\1 ++** tst (?:w0, w\1|w\1, w0) + ** cset w0, eq + ** ret + */ +@@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) { + /* + ** gt32: + ** sub w([0-9]+), w0, #1 +-** tst w0, w\1 ++** tst (?:w0, w\1|w\1, w0) + ** cset w0, ne + ** ret + */ +diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c +index b811e6f6e8fe..3b558e95d819 100644 +--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c ++++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c +@@ -8,7 +8,7 @@ + /* + ** le16: + ** sub w([0-9]+), w0, #1 +-** and w([0-9]+), w0, w\1 ++** and w([0-9]+), (?:w0, w\1|w\1, w0) + ** tst w\2, 65535 + ** cset w0, eq + ** ret +@@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) { + /* + ** gt16: + ** sub w([0-9]+), w0, #1 +-** and w([0-9]+), w0, w\1 ++** and w([0-9]+), (?:w0, w\1|w\1, w0) + ** tst w\2, 65535 + ** cset w0, ne + ** ret +diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c b/gcc/testsuite/gcc.target/aarch64/pr100056.c +index 0b77824da457..70499772d285 100644 +--- a/gcc/testsuite/gcc.target/aarch64/pr100056.c ++++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c +@@ -1,7 +1,9 @@ + /* PR target/100056 */ + /* { dg-do compile } */ + /* { dg-options "-O2" } */ +-/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */ ++/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, uxtb\n} 2 } } */ + + int + or_shift_u8 (unsigned char i) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c +index a7d2795ebe23..c9a8b82c48ac 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c +@@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * restrict x, double * + } + } + +-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tbic\t} } } */ ++/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, #0} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c +index 20cbd7550b7e..1845bd3f0f70 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c +@@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * restrict x, double * + } + } + +-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tbic\t} } } */ ++/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, z[0-9]+\.d} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +index 33b9f0f197e4..b8afea70207f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) + /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ + /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ +-/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */ + /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ + /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ + /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c +index 71b826a4c1bb..d2e74f9d4175 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c +@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m) + /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ + /* Should multiply by (257-1)*4 rather than (VF-1)*4. */ +-/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */ + /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ + /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */ + /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */ + +base-commit: d25728c98682c058bfda79333c94b0a8cf2a3f49 +-- +2.49.0 + diff --git a/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch b/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch new file mode 100644 index 0000000..19d4917 --- /dev/null +++ b/15.0.0/gentoo/84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch @@ -0,0 +1,54 @@ +https://inbox.sourceware.org/gcc-patches/[email protected]/ + +From fac8faddc3fca52b6d51c0d1a57323012a1fe4e9 Mon Sep 17 00:00:00 2001 +Message-ID: <fac8faddc3fca52b6d51c0d1a57323012a1fe4e9.1743793443.git....@gentoo.org> +In-Reply-To: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +References: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +From: Richard Sandiford <[email protected]> +Date: Fri, 4 Apr 2025 10:22:12 +0100 +Subject: [PATCH 2/4] combine: Avoid split_i2i3 search if i2 is unchanged + [PR116398] + +When combining a single-set i2 into a multi-set i3, combine +first tries to match the new multi-set in-place. If that fails, +combine considers splitting the multi-set so that one set goes in +i2 and the other set stays in i3. That moves a destination from i3 +to i2 and so combine needs to update any associated log link for that +destination to point to i2 rather than i3. + +However, that kind of split can also occur for 2->2 combinations. +For a 2-instruction combination in which i2 doesn't die in i3, combine +tries a 2->1 combination by turning i3 into a parallel of the original +i2 and the combined i3. If that fails, combine will split the parallel +as above, so that the first set goes in i2 and the second set goes in i3. +But that can often leave i2 unchanged, meaning that no destinations have +moved and so no search is necessary. + +gcc/ + PR testsuite/116398 + * combine.cc (try_combine): Shortcut the split_i2i3 handling if + i2 is unchanged. +--- + gcc/combine.cc | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/gcc/combine.cc b/gcc/combine.cc +index 65a87a45b3be..e29cff7147d9 100644 +--- a/gcc/combine.cc ++++ b/gcc/combine.cc +@@ -4212,6 +4212,12 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + + bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2)); + ++ /* If only i3 has changed, any split of the combined instruction just ++ restored i2 to its original state. No destinations moved from i3 ++ to i2. */ ++ if (only_i3_changed) ++ split_i2i3 = false; ++ + /* We now know that we can do this combination. Merge the insns and + update the status of registers and LOG_LINKS. */ + +-- +2.49.0 + diff --git a/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch b/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch new file mode 100644 index 0000000..f8849ba --- /dev/null +++ b/15.0.0/gentoo/85_all_combine-Optimise-distribute_links-search-PR116398.patch @@ -0,0 +1,99 @@ +https://inbox.sourceware.org/gcc-patches/[email protected]/ + +From bc27a2f371602bf897ff4da1620a56eeeeebf8ff Mon Sep 17 00:00:00 2001 +Message-ID: <bc27a2f371602bf897ff4da1620a56eeeeebf8ff.1743793443.git....@gentoo.org> +In-Reply-To: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +References: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +From: Richard Sandiford <[email protected]> +Date: Fri, 4 Apr 2025 10:22:32 +0100 +Subject: [PATCH 3/4] combine: Optimise distribute_links search [PR116398] + +Another problem in PR101523 was that, after each successful 2->2 +combination attempt, distribute_links would search further and further +for the next combinable use of the i2 destination. Each search would +start at i2 itself, making the search quadratic in the worst case. + +In a 2->2 combination, if i2 is unchanged, the search can start at i3 +instead of i2. The same thing applies to i2 when distributing i2's +links, since the only changes to earlier instructions are the deletion +of i0 and i1. + +This change, combined with the previous split_i2i3 patch, gives a +34.6% speedup in combine for the testcase in PR101523. Combine +goes from being 41% to 34% of compile time. + +gcc/ + PR testsuite/116398 + * combine.cc (distribute_links): Take an optional start point. + (try_combine): If only i3 has changed, only distribute i3's links, + not i2's. Start the search for the new use from i3 rather than + from the definition instruction. Likewise start the search for + the new use from i2 when distributing i2's links. +--- + gcc/combine.cc | 27 +++++++++++++++++++-------- + 1 file changed, 19 insertions(+), 8 deletions(-) + +diff --git a/gcc/combine.cc b/gcc/combine.cc +index e29cff7147d9..e99b064c98d4 100644 +--- a/gcc/combine.cc ++++ b/gcc/combine.cc +@@ -472,7 +472,7 @@ static void move_deaths (rtx, rtx, int, rtx_insn *, rtx *); + static bool reg_bitfield_target_p (rtx, rtx); + static void distribute_notes (rtx, rtx_insn *, rtx_insn *, rtx_insn *, + rtx, rtx, rtx); +-static void distribute_links (struct insn_link *); ++static void distribute_links (struct insn_link *, rtx_insn * = nullptr); + static void mark_used_regs_combine (rtx); + static void record_promoted_value (rtx_insn *, rtx); + static bool unmentioned_reg_p (rtx, rtx); +@@ -4592,10 +4592,15 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + NULL_RTX, NULL_RTX, NULL_RTX); + } + +- distribute_links (i3links); +- distribute_links (i2links); +- distribute_links (i1links); +- distribute_links (i0links); ++ if (only_i3_changed) ++ distribute_links (i3links, i3); ++ else ++ { ++ distribute_links (i3links); ++ distribute_links (i2links, i2); ++ distribute_links (i1links); ++ distribute_links (i0links); ++ } + + if (REG_P (i2dest)) + { +@@ -14986,10 +14991,13 @@ distribute_notes (rtx notes, rtx_insn *from_insn, rtx_insn *i3, rtx_insn *i2, + + /* Similarly to above, distribute the LOG_LINKS that used to be present on + I3, I2, and I1 to new locations. This is also called to add a link +- pointing at I3 when I3's destination is changed. */ ++ pointing at I3 when I3's destination is changed. ++ ++ If START is nonnull and an insn, we know that the next location for each ++ link is no earlier than START. */ + + static void +-distribute_links (struct insn_link *links) ++distribute_links (struct insn_link *links, rtx_insn *start) + { + struct insn_link *link, *next_link; + +@@ -15055,7 +15063,10 @@ distribute_links (struct insn_link *links) + I3 to I2. Also note that not much searching is typically done here + since most links don't point very far away. */ + +- for (insn = NEXT_INSN (link->insn); ++ insn = start; ++ if (!insn || NOTE_P (insn)) ++ insn = NEXT_INSN (link->insn); ++ for (; + (insn && (this_basic_block->next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun) + || BB_HEAD (this_basic_block->next_bb) != insn)); + insn = NEXT_INSN (insn)) +-- +2.49.0 + diff --git a/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch b/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch new file mode 100644 index 0000000..b185cba --- /dev/null +++ b/15.0.0/gentoo/86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch @@ -0,0 +1,240 @@ +https://inbox.sourceware.org/gcc-patches/[email protected]/ + +From 247ac81832821bade9fa36c350d220dca039d383 Mon Sep 17 00:00:00 2001 +Message-ID: <247ac81832821bade9fa36c350d220dca039d383.1743793443.git....@gentoo.org> +In-Reply-To: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +References: <de2f5e9c4ce4bb674fb0da191102838a6bdd92f6.1743793443.git....@gentoo.org> +From: Richard Sandiford <[email protected]> +Date: Fri, 4 Apr 2025 10:23:00 +0100 +Subject: [PATCH 4/4] combine: Limit insn searchs for 2->2 combinations + [PR116398] + +As noted in the previous patch, combine still takes >30% of +compile time in the original testcase for PR101523. The problem +is that try_combine uses linear insn searches for some dataflow +queries, so in the worst case, an unlimited number of 2->2 +combinations for the same i2 can lead to quadratic behaviour. + +This patch limits distribute_links to a certain number +of instructions when i2 is unchanged. As Segher said in the PR trail, +it would make more conceptual sense to apply the limit unconditionally, +but I thought it would be better to change as little as possible at +this development stage. Logically, in stage 1, the --param should +be applied directly by distribute_links with no input from callers. + +As I mentioned in: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116398#c28 + +I think it's safe to drop log links even if a use exists. All +processing of log links seems to handle the absence of a link +for a particular register in a conservative way. + +The initial set-up errs on the side of dropping links, since for example +create_log_links has: + + /* flow.c claimed: + + We don't build a LOG_LINK for hard registers contained + in ASM_OPERANDs. If these registers get replaced, + we might wind up changing the semantics of the insn, + even if reload can make what appear to be valid + assignments later. */ + if (regno < FIRST_PSEUDO_REGISTER + && asm_noperands (PATTERN (use_insn)) >= 0) + continue; + +which excludes combinations by dropping log links, rather than during +try_combine. And: + + /* If this register is being initialized using itself, and the + register is uninitialized in this basic block, and there are + no LOG_LINKS which set the register, then part of the + register is uninitialized. In that case we can't assume + anything about the number of nonzero bits. + + ??? We could do better if we checked this in + reg_{nonzero_bits,num_sign_bit_copies}_for_combine. Then we + could avoid making assumptions about the insn which initially + sets the register, while still using the information in other + insns. We would have to be careful to check every insn + involved in the combination. */ + + if (insn + && reg_referenced_p (x, PATTERN (insn)) + && !REGNO_REG_SET_P (DF_LR_IN (BLOCK_FOR_INSN (insn)), + REGNO (x))) + { + struct insn_link *link; + + FOR_EACH_LOG_LINK (link, insn) + if (dead_or_set_p (link->insn, x)) + break; + if (!link) + { + rsp->nonzero_bits = GET_MODE_MASK (mode); + rsp->sign_bit_copies = 1; + return; + } + } + +treats the lack of a log link as a possible sign of uninitialised data, +but that would be a missed optimisation rather than a correctness issue. + +One question is what the default --param value should be. I went with +Jakub's suggestion of 3000 from: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116398#c25 + +Also, to answer Jakub's question in that comment, I tried bisecting: + + int limit = atoi (getenv ("BISECT")); + +(so applying the limit for all calls from try_combine) with an +abort in distribute_links if the limit caused a link to be skipped. +The minimum BISECT value that allowed an aarch64-linux-gnu bootstrap +to succeed with --enable-languages=all --enable-checking=yes,rtl,extra +was 142, so much lower than the parameter value. I realised too late +that --enable-checking=release would probably have been a more +interesting test. + +The previous patch meant that distribute_links itself is now linear +for a given i2 definition, since each search starts at the previous +last use, rather than at i2 itself. This means that the limit has +to be applied cumulatively across all searches for the same link. + +The patch does that by storing a counter in the insn_link structure. +There was a 32-bit hole there on LP64 hosts. + +gcc/ + PR testsuite/116398 + * params.opt (-param=max-combine-search-insns=): New param. + * doc/invoke.texi: Document it. + * combine.cc (insn_link::insn_count): New field. + (alloc_insn_link): Initialize it. + (distribute_links): Add a limit parameter. + (try_combine): Use the new param to limit distribute_links + when only i3 has changed. +--- + gcc/combine.cc | 21 +++++++++++++++++---- + gcc/doc/invoke.texi | 9 +++++++++ + gcc/params.opt | 4 ++++ + 3 files changed, 30 insertions(+), 4 deletions(-) + +diff --git a/gcc/combine.cc b/gcc/combine.cc +index e99b064c98d4..5f085187cfef 100644 +--- a/gcc/combine.cc ++++ b/gcc/combine.cc +@@ -309,6 +309,7 @@ static int *uid_insn_cost; + struct insn_link { + rtx_insn *insn; + unsigned int regno; ++ int insn_count; + struct insn_link *next; + }; + +@@ -342,6 +343,7 @@ alloc_insn_link (rtx_insn *insn, unsigned int regno, struct insn_link *next) + sizeof (struct insn_link)); + l->insn = insn; + l->regno = regno; ++ l->insn_count = 0; + l->next = next; + return l; + } +@@ -472,7 +474,8 @@ static void move_deaths (rtx, rtx, int, rtx_insn *, rtx *); + static bool reg_bitfield_target_p (rtx, rtx); + static void distribute_notes (rtx, rtx_insn *, rtx_insn *, rtx_insn *, + rtx, rtx, rtx); +-static void distribute_links (struct insn_link *, rtx_insn * = nullptr); ++static void distribute_links (struct insn_link *, rtx_insn * = nullptr, ++ int limit = INT_MAX); + static void mark_used_regs_combine (rtx); + static void record_promoted_value (rtx_insn *, rtx); + static bool unmentioned_reg_p (rtx, rtx); +@@ -4593,7 +4596,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + } + + if (only_i3_changed) +- distribute_links (i3links, i3); ++ distribute_links (i3links, i3, param_max_combine_search_insns); + else + { + distribute_links (i3links); +@@ -14994,10 +14997,12 @@ distribute_notes (rtx notes, rtx_insn *from_insn, rtx_insn *i3, rtx_insn *i2, + pointing at I3 when I3's destination is changed. + + If START is nonnull and an insn, we know that the next location for each +- link is no earlier than START. */ ++ link is no earlier than START. LIMIT is the maximum number of nondebug ++ instructions that can be scanned when looking for the next use of a ++ definition. */ + + static void +-distribute_links (struct insn_link *links, rtx_insn *start) ++distribute_links (struct insn_link *links, rtx_insn *start, int limit) + { + struct insn_link *link, *next_link; + +@@ -15063,9 +15068,12 @@ distribute_links (struct insn_link *links, rtx_insn *start) + I3 to I2. Also note that not much searching is typically done here + since most links don't point very far away. */ + ++ int count = 0; + insn = start; + if (!insn || NOTE_P (insn)) + insn = NEXT_INSN (link->insn); ++ else ++ count = link->insn_count; + for (; + (insn && (this_basic_block->next_bb == EXIT_BLOCK_PTR_FOR_FN (cfun) + || BB_HEAD (this_basic_block->next_bb) != insn)); +@@ -15086,6 +15094,11 @@ distribute_links (struct insn_link *links, rtx_insn *start) + } + else if (INSN_P (insn) && reg_set_p (reg, insn)) + break; ++ else if (count >= limit) ++ break; ++ else ++ count += 1; ++ link->insn_count = count; + + /* If we found a place to put the link, place it there unless there + is already a link to the same insn as LINK at that point. */ +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 4c9af429ab08..0903cd2b7f23 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -16518,6 +16518,15 @@ in combiner for a pseudo register as last known value of that register. + @item max-combine-insns + The maximum number of instructions the RTL combiner tries to combine. + ++@item max-combine-search-insns ++The maximum number of instructions that the RTL combiner searches in order ++to find the next use of a given register definition. If this limit is reached ++without finding such a use, the combiner will stop trying to optimize the ++definition. ++ ++Currently this limit only applies after certain successful combination ++attempts, but it could be extended to other cases in future. ++ + @item integer-share-limit + Small integer constants can use a shared data structure, reducing the + compiler's memory usage and increasing its speed. This sets the maximum +diff --git a/gcc/params.opt b/gcc/params.opt +index 4f4eb4d7a2a5..422d082b3557 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -477,6 +477,10 @@ The maximum number of instructions to consider to unroll in a loop on average. + Common Joined UInteger Var(param_max_combine_insns) Init(4) IntegerRange(2, 4) Param Optimization + The maximum number of insns combine tries to combine. + ++-param=max-combine-search-insns= ++Common Joined UInteger Var(param_max_combine_search_insns) Init(3000) Param Optimization ++The maximum number of instructions that combine searches in order to find the next use of a particular register. ++ + -param=max-completely-peel-loop-nest-depth= + Common Joined UInteger Var(param_max_unroll_iterations) Init(8) Param Optimization + The maximum depth of a loop nest we completely peel. +-- +2.49.0 + diff --git a/15.0.0/gentoo/README.history b/15.0.0/gentoo/README.history index f01476e..5e4d154 100644 --- a/15.0.0/gentoo/README.history +++ b/15.0.0/gentoo/README.history @@ -1,3 +1,10 @@ +52 ???? + + + 83_all_combine-Allow-2-2-combinations-but-with-a-tweak-PR11.patch + + 84_all_combine-Avoid-split_i2i3-search-if-i2-is-unchanged-P.patch + + 85_all_combine-Optimise-distribute_links-search-PR116398.patch + + 86_all_combine-Limit-insn-searchs-for-2-2-combinations-PR11.patch + 51 2 April 2025 - 80_all_PR119376-tailc-Don-t-fail-musttail-calls-if-they-use-or-could.patch
