On Fri, 4 Apr 2025, Richard Sandiford wrote: > One of the problems in PR101523 was that, after each successful > 2->2 combination attempt, try_combine would restart combination > attempts at i2 even if i2 hadn't changed. This led to quadratic > behaviour as the same failed combinations between i2 and i3 were > tried repeatedly. > > The original patch for the PR dealt with that by disallowing 2->2 > combinations. However, that led to various optimisation regressions, > so there was interest in allowing the combinations again, at least > until an alternative way of getting the same results is in place. > > This patch is a variant of Richi's in: > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53 > > but limited to when we're combining 2 instructions. > > This speeds up combine by 10x on the original PR101523 testcase > and reduces combine's memory footprint by 100x.
I'll OK this one, please leave others time to comment. We have deployed this to gcc13 and gcc14 internally without any problems reported because of it. Richard. > gcc/ > PR testsuite/116398 > * combine.cc (try_combine): Reallow 2->2 combinations. Detect when > only i3 has changed and restart from i3 in that case. > > gcc/testsuite/ > * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of TST. > * gcc.target/aarch64/popcnt-le-3.c: Likewise AND. > * gcc.target/aarch64/sve/pred-not-gen-1.c: Revert previous patch. > * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise. > * gcc.target/aarch64/sve/var_stride_2.c: Likewise. > * gcc.target/aarch64/sve/var_stride_4.c: Likewise. > > Co-authored-by: Richard Biener <rguent...@suse.de> > --- > gcc/combine.cc | 14 ++++---------- > gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c | 4 ++-- > gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c | 4 ++-- > gcc/testsuite/gcc.target/aarch64/pr100056.c | 4 +++- > .../gcc.target/aarch64/sve/pred-not-gen-1.c | 4 ++-- > .../gcc.target/aarch64/sve/pred-not-gen-4.c | 4 ++-- > .../gcc.target/aarch64/sve/var_stride_2.c | 3 ++- > .../gcc.target/aarch64/sve/var_stride_4.c | 3 ++- > 8 files changed, 19 insertions(+), 21 deletions(-) > > diff --git a/gcc/combine.cc b/gcc/combine.cc > index ef13f5d5e90..e12a78c8d89 100644 > --- a/gcc/combine.cc > +++ b/gcc/combine.cc > @@ -4208,16 +4208,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, > rtx_insn *i0, > adjust_for_new_dest (i3); > } > > - /* If I2 didn't change, this is not a combination (but a simplification or > - canonicalisation with context), which should not be done here. Doing > - it here explodes the algorithm. Don't. */ > - if (rtx_equal_p (newi2pat, PATTERN (i2))) > - { > - if (dump_file) > - fprintf (dump_file, "i2 didn't change, not doing this\n"); > - undo_all (); > - return 0; > - } > + bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2)); > > /* We now know that we can do this combination. Merge the insns and > update the status of registers and LOG_LINKS. */ > @@ -4785,6 +4776,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, > rtx_insn *i0, > combine_successes++; > undo_commit (); > > + if (only_i3_changed) > + return i3; > + > rtx_insn *ret = newi2pat ? i2 : i3; > if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID > (ret)) > ret = added_links_insn; > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c > b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c > index b4141da982c..843fdac9fd8 100644 > --- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c > @@ -8,7 +8,7 @@ > /* > ** le32: > ** sub w([0-9]+), w0, #1 > -** tst w0, w\1 > +** tst (?:w0, w\1|w\1, w0) > ** cset w0, eq > ** ret > */ > @@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) { > /* > ** gt32: > ** sub w([0-9]+), w0, #1 > -** tst w0, w\1 > +** tst (?:w0, w\1|w\1, w0) > ** cset w0, ne > ** ret > */ > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c > b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c > index b811e6f6e8f..3b558e95d81 100644 > --- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c > @@ -8,7 +8,7 @@ > /* > ** le16: > ** sub w([0-9]+), w0, #1 > -** and w([0-9]+), w0, w\1 > +** and w([0-9]+), (?:w0, w\1|w\1, w0) > ** tst w\2, 65535 > ** cset w0, eq > ** ret > @@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) { > /* > ** gt16: > ** sub w([0-9]+), w0, #1 > -** and w([0-9]+), w0, w\1 > +** and w([0-9]+), (?:w0, w\1|w\1, w0) > ** tst w\2, 65535 > ** cset w0, ne > ** ret > diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c > b/gcc/testsuite/gcc.target/aarch64/pr100056.c > index 0b77824da45..70499772d28 100644 > --- a/gcc/testsuite/gcc.target/aarch64/pr100056.c > +++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c > @@ -1,7 +1,9 @@ > /* PR target/100056 */ > /* { dg-do compile } */ > /* { dg-options "-O2" } */ > -/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */ > +/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { > xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } > } */ > +/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, > uxtb\n} 2 } } */ > > int > or_shift_u8 (unsigned char i) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > index a7d2795ebe2..c9a8b82c48a 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c > @@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * > restrict x, double * > } > } > > -/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ > -/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 } } */ > /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, > z[0-9]+\.d, #0} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > index 20cbd7550b7..1845bd3f0f7 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c > @@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * > restrict x, double * > } > } > > -/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */ > -/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 { xfail *-*-* } } } */ > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b\n} 1 } } */ > /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, > z[0-9]+\.d, z[0-9]+\.d} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c > b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c > index 33b9f0f197e..b8afea70207 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c > @@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) > /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ > /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ > /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ > -/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl > 10\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } > */ > +/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } > */ > /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ > /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ > /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c > b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c > index 71b826a4c1b..d2e74f9d417 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c > @@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m) > /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ > /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ > /* Should multiply by (257-1)*4 rather than (VF-1)*4. */ > -/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, > 10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } > */ > +/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } > */ > /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ > /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */ > /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */ > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)