https://gcc.gnu.org/g:4d7a634f6d41029811cdcbd5f7282b5b07890094
commit r15-9239-g4d7a634f6d41029811cdcbd5f7282b5b07890094
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Mon Apr 7 08:03:47 2025 +0100

    combine: Allow 2->2 combinations, but with a tweak [PR116398]

    One of the problems in PR101523 was that, after each successful
    2->2 combination attempt, try_combine would restart combination
    attempts at i2 even if i2 hadn't changed.  This led to quadratic
    behaviour as the same failed combinations between i2 and i3 were
    tried repeatedly.

    The original patch for the PR dealt with that by disallowing 2->2
    combinations.  However, that led to various optimisation regressions,
    so there was interest in allowing the combinations again, at least
    until an alternative way of getting the same results is in place.

    This patch is a variant of Richi's in:

      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53

    but limited to when we're combining 2 instructions.

    This speeds up combine by 10x on the original PR101523 testcase
    and reduces combine's memory footprint by 100x.

    gcc/
            PR testsuite/116398
            * combine.cc (try_combine): Reallow 2->2 combinations.  Detect when
            only i3 has changed and restart from i3 in that case.

    gcc/testsuite/
            * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of TST.
            * gcc.target/aarch64/popcnt-le-3.c: Likewise AND.
            * gcc.target/aarch64/pr100056.c: Revert previous patch.
            * gcc.target/aarch64/sve/pred-not-gen-1.c: Likewise.
            * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise.
            * gcc.target/aarch64/sve/var_stride_2.c: Likewise.
            * gcc.target/aarch64/sve/var_stride_4.c: Likewise.

Co-authored-by: Richard Biener <rguent...@suse.de> Diff: --- gcc/combine.cc | 14 ++++---------- gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c | 4 ++-- gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c | 4 ++-- gcc/testsuite/gcc.target/aarch64/pr100056.c | 4 +++- gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c | 4 ++-- gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c | 4 ++-- gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c | 3 ++- gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c | 3 ++- 8 files changed, 19 insertions(+), 21 deletions(-) diff --git a/gcc/combine.cc b/gcc/combine.cc index 1b6c4e314cc9..65a87a45b3be 100644 --- a/gcc/combine.cc +++ b/gcc/combine.cc @@ -4210,16 +4210,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, adjust_for_new_dest (i3); } - /* If I2 didn't change, this is not a combination (but a simplification or - canonicalisation with context), which should not be done here. Doing - it here explodes the algorithm. Don't. */ - if (rtx_equal_p (newi2pat, PATTERN (i2))) - { - if (dump_file) - fprintf (dump_file, "i2 didn't change, not doing this\n"); - undo_all (); - return 0; - } + bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2)); /* We now know that we can do this combination. Merge the insns and update the status of registers and LOG_LINKS. */ @@ -4787,6 +4778,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, combine_successes++; undo_commit (); + if (only_i3_changed) + return i3; + rtx_insn *ret = newi2pat ? 
i2 : i3; if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID (ret)) ret = added_links_insn; diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c index b4141da982c9..843fdac9fd8e 100644 --- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c @@ -8,7 +8,7 @@ /* ** le32: ** sub w([0-9]+), w0, #1 -** tst w0, w\1 +** tst (?:w0, w\1|w\1, w0) ** cset w0, eq ** ret */ @@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) { /* ** gt32: ** sub w([0-9]+), w0, #1 -** tst w0, w\1 +** tst (?:w0, w\1|w\1, w0) ** cset w0, ne ** ret */ diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c index b811e6f6e8fe..3b558e95d819 100644 --- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c @@ -8,7 +8,7 @@ /* ** le16: ** sub w([0-9]+), w0, #1 -** and w([0-9]+), w0, w\1 +** and w([0-9]+), (?:w0, w\1|w\1, w0) ** tst w\2, 65535 ** cset w0, eq ** ret @@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) { /* ** gt16: ** sub w([0-9]+), w0, #1 -** and w([0-9]+), w0, w\1 +** and w([0-9]+), (?:w0, w\1|w\1, w0) ** tst w\2, 65535 ** cset w0, ne ** ret diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c b/gcc/testsuite/gcc.target/aarch64/pr100056.c index 0b77824da457..70499772d285 100644 --- a/gcc/testsuite/gcc.target/aarch64/pr100056.c +++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c @@ -1,7 +1,9 @@ /* PR target/100056 */ /* { dg-do compile } */ /* { dg-options "-O2" } */ -/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */ +/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } } */ +/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, uxtb\n} 2 } } */ int or_shift_u8 (unsigned char i) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c index a7d2795ebe23..c9a8b82c48ac 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c @@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * restrict x, double * } } -/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tbic\t} } } */ +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, #0} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c index 20cbd7550b7e..1845bd3f0f70 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c @@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * restrict x, double * } } -/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tbic\t} } } */ +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, z[0-9]+\.d} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c index 33b9f0f197e4..b8afea70207f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c @@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) /* { dg-final { scan-assembler {\tldr\tw[0-9]+} 
} } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ -/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 10\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */ /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c index 71b826a4c1bb..d2e74f9d4175 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c @@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m) /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ /* Should multiply by (257-1)*4 rather than (VF-1)*4. */ -/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */ /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */ /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */