One of the problems in PR101523 was that, after each successful
2->2 combination attempt, try_combine would restart combination
attempts at i2 even if i2 hadn't changed.  This led to quadratic
behaviour as the same failed combinations between i2 and i3 were
tried repeatedly.

The original patch for the PR dealt with that by disallowing 2->2
combinations.  However, that led to various optimisation regressions,
so there was interest in allowing the combinations again, at least
until an alternative way of getting the same results is in place.

This patch is a variant of Richi's in:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53

but limited to when we're combining 2 instructions.

This speeds up combine by 10x on the original PR101523 testcase
and reduces combine's memory footprint by 100x.

gcc/
        PR testsuite/116398
        * combine.cc (try_combine): Reallow 2->2 combinations.  Detect when
        only i3 has changed and restart from i3 in that case.

gcc/testsuite/
        * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of TST.
        * gcc.target/aarch64/popcnt-le-3.c: Likewise AND.
        * gcc.target/aarch64/sve/pred-not-gen-1.c: Revert previous patch.
        * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise.
        * gcc.target/aarch64/sve/var_stride_2.c: Likewise.
        * gcc.target/aarch64/sve/var_stride_4.c: Likewise.

Co-authored-by: Richard Biener <rguent...@suse.de>
---
 gcc/combine.cc                                     | 14 ++++----------
 gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c     |  4 ++--
 gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c     |  4 ++--
 gcc/testsuite/gcc.target/aarch64/pr100056.c        |  4 +++-
 .../gcc.target/aarch64/sve/pred-not-gen-1.c        |  4 ++--
 .../gcc.target/aarch64/sve/pred-not-gen-4.c        |  4 ++--
 .../gcc.target/aarch64/sve/var_stride_2.c          |  3 ++-
 .../gcc.target/aarch64/sve/var_stride_4.c          |  3 ++-
 8 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index ef13f5d5e90..e12a78c8d89 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -4208,16 +4208,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
       adjust_for_new_dest (i3);
     }
 
-  /* If I2 didn't change, this is not a combination (but a simplification or
-     canonicalisation with context), which should not be done here.  Doing
-     it here explodes the algorithm.  Don't.  */
-  if (rtx_equal_p (newi2pat, PATTERN (i2)))
-    {
-      if (dump_file)
-       fprintf (dump_file, "i2 didn't change, not doing this\n");
-      undo_all ();
-      return 0;
-    }
+  bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
 
   /* We now know that we can do this combination.  Merge the insns and
      update the status of registers and LOG_LINKS.  */
@@ -4785,6 +4776,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
   combine_successes++;
   undo_commit ();
 
+  if (only_i3_changed)
+    return i3;
+
   rtx_insn *ret = newi2pat ? i2 : i3;
   if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID (ret))
     ret = added_links_insn;
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
index b4141da982c..843fdac9fd8 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
@@ -8,7 +8,7 @@
 /*
 ** le32:
 **     sub     w([0-9]+), w0, #1
-**     tst     w0, w\1
+**     tst     (?:w0, w\1|w\1, w0)
 **     cset    w0, eq
 **     ret
 */
@@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) {
 /*
 ** gt32:
 **     sub     w([0-9]+), w0, #1
-**     tst     w0, w\1
+**     tst     (?:w0, w\1|w\1, w0)
 **     cset    w0, ne
 **     ret
 */
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
index b811e6f6e8f..3b558e95d81 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
@@ -8,7 +8,7 @@
 /*
 ** le16:
 **     sub     w([0-9]+), w0, #1
-**     and     w([0-9]+), w0, w\1
+**     and     w([0-9]+), (?:w0, w\1|w\1, w0)
 **     tst     w\2, 65535
 **     cset    w0, eq
 **     ret
@@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) {
 /*
 ** gt16:
 **     sub     w([0-9]+), w0, #1
-**     and     w([0-9]+), w0, w\1
+**     and     w([0-9]+), (?:w0, w\1|w\1, w0)
 **     tst     w\2, 65535
 **     cset    w0, ne
 **     ret
diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c 
b/gcc/testsuite/gcc.target/aarch64/pr100056.c
index 0b77824da45..70499772d28 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr100056.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c
@@ -1,7 +1,9 @@
 /* PR target/100056 */
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
-/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */
+/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { xfail 
*-*-* } } } */
+/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } } 
*/
+/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, 
uxtb\n} 2 } } */
 
 int
 or_shift_u8 (unsigned char i)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
index a7d2795ebe2..c9a8b82c48a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
@@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * 
restrict x, double *
     }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, #0} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
index 20cbd7550b7..1845bd3f0f7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
@@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * 
restrict x, double *
     }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
index 33b9f0f197e..b8afea70207 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4.  */
-/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 
10\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */
 /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
index 71b826a4c1b..d2e74f9d417 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4.  */
-/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 
10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */
 /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */
 /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */
 /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */
-- 
2.25.1

Reply via email to