https://gcc.gnu.org/g:1411d39bc72515227de2e490eb8f629d8bf74c95

commit r15-4875-g1411d39bc72515227de2e490eb8f629d8bf74c95
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Tue Oct 15 06:33:11 2024 -0700

    PR 117048: aarch64: Add define_insn_and_split for vector ROTATE
    
    The ultimate goal in this PR is to match the XAR pattern that is represented
    as a (ROTATE (XOR X Y) VCST) from the ACLE intrinsics code in the testcase.
    The first blocker for this was the missing recognition of ROTATE in
    simplify-rtx, which is fixed in the previous patch.
    The next problem is that once combine has formed the ROTATE from the
    shifts and orr/xor/plus, it tries to recognise it as an insn before
    attempting to combine the XOR into it.  But as we don't have a backend
    pattern for a vector ROTATE this recog step fails and combine never
    tries the follow-up XOR+ROTATE combination, which would have succeeded.
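
    For illustration, the kind of intrinsics code this aims at looks like the
    sketch below (a hypothetical function, mirroring the style of the new test
    added by this patch): a rotate-by-1 written as a shift-left-by-1 combined
    with a shift-right-by-63, applied to an XOR, which should now collapse
    into a single XAR.

      #include <arm_neon.h>

      #pragma GCC target "+sha3"

      /* Rotate (a ^ b) left by 1 in each 64-bit lane; with +sha3 this is
         expected to match the XAR pattern.  */
      uint64x2_t
      rotate_xor_by_one (uint64x2_t a, uint64x2_t b)
      {
        uint64x2_t c = veorq_u64 (a, b);
        return vorrq_u64 (vshlq_n_u64 (c, 1), vshrq_n_u64 (c, 63));
      }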
    
    This patch solves that by introducing a sort of "scaffolding" pattern for
    vector ROTATE, which allows it to be combined into the XAR.
    If it fails to be combined into anything, the splitter breaks it back
    down into the SHL+USRA sequence that would have been emitted otherwise.
    Having this splitter also lets us special-case some rotate amounts in
    the future to emit more specialised instructions, e.g. from the REV*
    family, when the ROTATE is not combined into something else.
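
    For the fallback path, a plain vector rotate with nothing to combine into
    goes through the new splitter unchanged.  A minimal sketch (hypothetical
    function; the exact code generated depends on the compiler version):

      #include <arm_neon.h>

      /* Rotate each 64-bit lane left by 1 with no surrounding XOR.  There is
         no XAR opportunity here, so the splitter should fall back to the
         SHL + USRA sequence.  */
      uint64x2_t
      plain_rotate_by_one (uint64x2_t a)
      {
        return vorrq_u64 (vshlq_n_u64 (a, 1), vshrq_n_u64 (a, 63));
      }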
    
    This optimisation is done in the next patch in the series.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
    
            PR target/117048
            * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
            New define_insn_and_split.
    
    gcc/testsuite/
    
            PR target/117048
            * gcc.target/aarch64/simd/pr117048.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md               | 29 ++++++++++
 gcc/testsuite/gcc.target/aarch64/simd/pr117048.c | 73 ++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e456f693d2f3..08b121227eee 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1294,6 +1294,35 @@
   [(set_attr "type" "neon_shift_acc<q>")]
 )
 
+;; After all the combinations and propagations of ROTATE have been
+;; attempted split any remaining vector rotates into SHL + USRA sequences.
+(define_insn_and_split "*aarch64_simd_rotate_imm<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand" "=&w")
+       (rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
+                     (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
+  "TARGET_SIMD"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+       (ashift:VDQ_I (match_dup 1)
+                     (match_dup 2)))
+   (set (match_dup 0)
+       (plus:VDQ_I
+         (lshiftrt:VDQ_I
+           (match_dup 1)
+           (match_dup 4))
+         (match_dup 3)))]
+  {
+    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
+    rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
+    int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
+    operands[4]
+      = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+                                          bitwidth - INTVAL (shft_amnt));
+  }
+  [(set_attr "length" "8")]
+)
+
 (define_insn "aarch64_<sra_op>rsra_n<mode>_insn"
  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w")
        (plus:VSDQ_I_DI
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr117048.c b/gcc/testsuite/gcc.target/aarch64/simd/pr117048.c
new file mode 100644
index 000000000000..621c0f46fc4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr117048.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC target "+sha3"
+
+/*
+** func_shl_eor:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_shl_eor (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return veorq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_eor:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_add_eor (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return veorq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_shl_orr:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_shl_orr (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vorrq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_orr:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_add_orr (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vorrq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_shl_add:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_shl_add (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vaddq_u64(vshlq_n_u64(c, 1), vshrq_n_u64(c, 63));
+}
+
+/*
+** func_add_add:
+**     xar     v0\.2d, v([0-9]+)\.2d, v([0-9]+)\.2d, 63
+**     ret 
+*/
+uint64x2_t
+func_add_add (uint64x2_t a, uint64x2_t b) {
+  uint64x2_t c = veorq_u64 (a, b);
+  return vaddq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
+}
