[gcc r15-9239] combine: Allow 2->2 combinations, but with a tweak [PR116398]

Richard Sandiford via Gcc-cvs Mon, 07 Apr 2025 00:06:10 -0700

https://gcc.gnu.org/g:4d7a634f6d41029811cdcbd5f7282b5b07890094


commit r15-9239-g4d7a634f6d41029811cdcbd5f7282b5b07890094
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Mon Apr 7 08:03:47 2025 +0100

    combine: Allow 2->2 combinations, but with a tweak [PR116398]
    
    One of the problems in PR101523 was that, after each successful
    2->2 combination attempt, try_combine would restart combination
    attempts at i2 even if i2 hadn't changed.  This led to quadratic
    behaviour as the same failed combinations between i2 and i3 were
    tried repeatedly.
    
    The original patch for the PR dealt with that by disallowing 2->2
    combinations.  However, that led to various optimisation regressions,
    so there was interest in allowing the combinations again, at least
    until an alternative way of getting the same results is in place.
    
    This patch is a variant of Richi's in:
    
      https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101523#c53
    
    but limited to when we're combining 2 instructions.
    
    This speeds up combine by 10x on the original PR101523 testcase
    and reduces combine's memory footprint by 100x.
    
    gcc/
            PR testsuite/116398
            * combine.cc (try_combine): Reallow 2->2 combinations.  Detect when
            only i3 has changed and restart from i3 in that case.
    
    gcc/testsuite/
            * gcc.target/aarch64/popcnt-le-1.c: Account for commutativity of 
TST.
            * gcc.target/aarch64/popcnt-le-3.c: Likewise AND.
            * gcc.target/aarch64/pr100056.c: Revert previous patch.
            * gcc.target/aarch64/sve/pred-not-gen-1.c: Likewise.
            * gcc.target/aarch64/sve/pred-not-gen-4.c: Likewise.
            * gcc.target/aarch64/sve/var_stride_2.c: Likewise.
            * gcc.target/aarch64/sve/var_stride_4.c: Likewise.
    
    Co-authored-by: Richard Biener <rguent...@suse.de>

Diff:
---
 gcc/combine.cc                                        | 14 ++++----------
 gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c        |  4 ++--
 gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c        |  4 ++--
 gcc/testsuite/gcc.target/aarch64/pr100056.c           |  4 +++-
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c |  4 ++--
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c |  4 ++--
 gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c   |  3 ++-
 gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c   |  3 ++-
 8 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 1b6c4e314cc9..65a87a45b3be 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -4210,16 +4210,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
       adjust_for_new_dest (i3);
     }
 
-  /* If I2 didn't change, this is not a combination (but a simplification or
-     canonicalisation with context), which should not be done here.  Doing
-     it here explodes the algorithm.  Don't.  */
-  if (rtx_equal_p (newi2pat, PATTERN (i2)))
-    {
-      if (dump_file)
-       fprintf (dump_file, "i2 didn't change, not doing this\n");
-      undo_all ();
-      return 0;
-    }
+  bool only_i3_changed = !i0 && !i1 && rtx_equal_p (newi2pat, PATTERN (i2));
 
   /* We now know that we can do this combination.  Merge the insns and
      update the status of registers and LOG_LINKS.  */
@@ -4787,6 +4778,9 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
   combine_successes++;
   undo_commit ();
 
+  if (only_i3_changed)
+    return i3;
+
   rtx_insn *ret = newi2pat ? i2 : i3;
   if (added_links_insn && DF_INSN_LUID (added_links_insn) < DF_INSN_LUID (ret))
     ret = added_links_insn;
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
index b4141da982c9..843fdac9fd8e 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-1.c
@@ -8,7 +8,7 @@
 /*
 ** le32:
 **     sub     w([0-9]+), w0, #1
-**     tst     w0, w\1
+**     tst     (?:w0, w\1|w\1, w0)
 **     cset    w0, eq
 **     ret
 */
@@ -20,7 +20,7 @@ unsigned le32 (const unsigned int a) {
 /*
 ** gt32:
 **     sub     w([0-9]+), w0, #1
-**     tst     w0, w\1
+**     tst     (?:w0, w\1|w\1, w0)
 **     cset    w0, ne
 **     ret
 */
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
index b811e6f6e8fe..3b558e95d819 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-le-3.c
@@ -8,7 +8,7 @@
 /*
 ** le16:
 **     sub     w([0-9]+), w0, #1
-**     and     w([0-9]+), w0, w\1
+**     and     w([0-9]+), (?:w0, w\1|w\1, w0)
 **     tst     w\2, 65535
 **     cset    w0, eq
 **     ret
@@ -21,7 +21,7 @@ unsigned le16 (const unsigned short a) {
 /*
 ** gt16:
 **     sub     w([0-9]+), w0, #1
-**     and     w([0-9]+), w0, w\1
+**     and     w([0-9]+), (?:w0, w\1|w\1, w0)
 **     tst     w\2, 65535
 **     cset    w0, ne
 **     ret
diff --git a/gcc/testsuite/gcc.target/aarch64/pr100056.c 
b/gcc/testsuite/gcc.target/aarch64/pr100056.c
index 0b77824da457..70499772d285 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr100056.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr100056.c
@@ -1,7 +1,9 @@
 /* PR target/100056 */
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
-/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} } } */
+/* { dg-final { scan-assembler-not {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} { xfail 
*-*-* } } } */
+/* { dg-final { scan-assembler-times {\t[us]bfiz\tw[0-9]+, w[0-9]+, 11} 2 } } 
*/
+/* { dg-final { scan-assembler-times {\tadd\tw[0-9]+, w[0-9]+, w[0-9]+, 
uxtb\n} 2 } } */
 
 int
 or_shift_u8 (unsigned char i)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
index a7d2795ebe23..c9a8b82c48ac 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
@@ -19,6 +19,6 @@ void f10(double * restrict z, double * restrict w, double * 
restrict x, double *
     }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, #0} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
index 20cbd7550b7e..1845bd3f0f70 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
@@ -8,6 +8,6 @@ void f13(double * restrict z, double * restrict w, double * 
restrict x, double *
     }
 }
 
-/* { dg-final { scan-assembler-not {\tbic\t} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, 
p[0-9]+\.b\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, 
z[0-9]+\.d, z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
index 33b9f0f197e4..b8afea70207f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4.  */
-/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 
10\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */
 /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
index 71b826a4c1bb..d2e74f9d4175 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
@@ -16,7 +16,8 @@ f (TYPE *x, TYPE *y, int n, int m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4.  */
-/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 
10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */
 /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */
 /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */
 /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */

[gcc r15-9239] combine: Allow 2->2 combinations, but with a tweak [PR116398]

Reply via email to