https://gcc.gnu.org/g:161e246cf32f1298400aa3c1d86110490a3cd0ce
commit r15-4963-g161e246cf32f1298400aa3c1d86110490a3cd0ce
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Tue Nov 5 05:10:22 2024 -0800

    PR target/117449: Restrict vector rotate match and split to pre-reload

    The vector rotate splitter has some logic to deal with post-reload
    splitting but not all cases in aarch64_emit_opt_vec_rotate are
    post-reload-safe.  In particular the ROTATE+XOR expansion for TARGET_SHA3
    can create RTL that can later be simplified to a simple ROTATE post-reload,
    which would then match the insn again and try to split it.
    So do a clean split pre-reload and avoid going down this path post-reload
    by restricting the insn_and_split to can_create_pseudo_p ().

    Bootstrapped and tested on aarch64-none-linux.

    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>

    gcc/
            PR target/117449
            * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
            Match only when can_create_pseudo_p ().
            * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate):
            Assume can_create_pseudo_p ().

    gcc/testsuite/
            PR target/117449
            * gcc.c-torture/compile/pr117449.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md             |  6 ++++--
 gcc/config/aarch64/aarch64.cc                  | 11 ++++++-----
 gcc/testsuite/gcc.c-torture/compile/pr117449.c |  8 ++++++++
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a91222b6e3b2..cfe95bd4c316 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1296,11 +1296,13 @@
 
 ;; After all the combinations and propagations of ROTATE have been
 ;; attempted split any remaining vector rotates into SHL + USRA sequences.
+;; Don't match this after reload as the various possible sequence for this
+;; require temporary registers.
 (define_insn_and_split "*aarch64_simd_rotate_imm<mode>"
   [(set (match_operand:VDQ_I 0 "register_operand" "=&w")
	(rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
		      (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
-  "TARGET_SIMD"
+  "TARGET_SIMD && can_create_pseudo_p ()"
   "#"
   "&& 1"
   [(set (match_dup 3)
@@ -1316,7 +1318,7 @@
     if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
       DONE;
 
-    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
+    operands[3] = gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
     operands[4]
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9347e06f0e9e..f2b53475adbe 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16030,6 +16030,8 @@ aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
   gcc_assert (CONST_INT_P (amnt));
   HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
+  /* Don't end up here after reload.  */
+  gcc_assert (can_create_pseudo_p ());
   /* Rotates by half the element width map down to REV* instructions and
      should always be preferred when possible.  */
   if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
@@ -16037,11 +16039,10 @@ aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
     return true;
   /* 64 and 128-bit vector modes can use the XAR instruction when available.
    */
-  else if (can_create_pseudo_p ()
-	   && ((TARGET_SHA3 && mode == V2DImode)
-	       || (TARGET_SVE2
-		   && (known_eq (GET_MODE_SIZE (mode), 8)
-		       || known_eq (GET_MODE_SIZE (mode), 16)))))
+  else if ((TARGET_SHA3 && mode == V2DImode)
+	   || (TARGET_SVE2
+	       && (known_eq (GET_MODE_SIZE (mode), 8)
+		   || known_eq (GET_MODE_SIZE (mode), 16))))
     {
       rtx zeroes = aarch64_gen_shareable_zero (mode);
       rtx xar_op
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr117449.c b/gcc/testsuite/gcc.c-torture/compile/pr117449.c
new file mode 100644
index 000000000000..8ae0071fca6b
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr117449.c
@@ -0,0 +1,8 @@
+/* { dg-additional-options "-march=armv8.2-a+sha3" { target aarch64*-*-* } } */
+
+unsigned long *a;
+int i;
+void f() {
+  for (i = 0; i < 80; i++)
+    a[i] = (a[i] >> 8 | a[i] << 64 - 8) ^ a[i];
+}
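For context, here is a small illustrative sketch (not part of the commit) of the two rotate
shapes involved, written as plain C in the style of the new test.  The function names and the
ROTR64 macro below are hypothetical.  A plain rotate by immediate is what
*aarch64_simd_rotate_imm<mode> matches and, failing a better expansion, splits into an
SHL + USRA sequence; with -march=armv8.2-a+sha3 the V2DImode case can instead go through the
ROTATE+XOR (XAR) expansion in aarch64_emit_opt_vec_rotate, which is the path the pr117449.c
reproducer exercises.

/* Illustrative sketch only; names are hypothetical.  Compile for AArch64
   with e.g. -O2 -march=armv8.2-a+sha3 to target the paths described above.  */

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

unsigned long v1[80], v2[80];

/* Plain vector rotate by immediate: once vectorized, a candidate for the
   SHL + USRA split in *aarch64_simd_rotate_imm<mode>.  */
void
rotate_only (void)
{
  for (int i = 0; i < 80; i++)
    v1[i] = ROTR64 (v1[i], 8);
}

/* Rotate XORed with the input, mirroring gcc.c-torture/compile/pr117449.c:
   on TARGET_SHA3 this is the reproducer shape that reached the ROTATE+XOR
   (XAR) expansion and the post-reload re-match fixed above.  */
void
rotate_xor (void)
{
  for (int i = 0; i < 80; i++)
    v2[i] = ROTR64 (v2[i], 8) ^ v2[i];
}

Whether the vectorizer actually produces a V2DImode rotate for these loops depends on the
optimization level, so treat the sketch as illustrative rather than a guaranteed reproducer.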