https://gcc.gnu.org/g:14cb23e743e02e6923f7e46a14717e9f561f6723
commit r15-4877-g14cb23e743e02e6923f7e46a14717e9f561f6723
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Tue Oct 22 07:52:36 2024 -0700

    aarch64: Emit XAR for vector rotates where possible
    
    We can make use of the integrated rotate step of the XAR instruction
    to implement most vector integer rotates, as long as we zero out one
    of the input registers for it.  This allows for a lower-latency sequence
    than the fallback SHL+USRA, especially when we can hoist the zeroing
    operation away from loops and hot parts.  This should be safe to do for
    64-bit vectors as well even though the XAR instructions operate on 128-bit
    values, as the bottom 64-bit result is later accessed through the right
    subregs.
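
    To illustrate the identity this relies on, here is a minimal scalar
    sketch (not the compiler's code): XAR rotates the XOR of its two source
    operands right by an immediate, so feeding it a zeroed second operand
    degenerates into a plain rotate of the first one.

    #include <stdint.h>

    /* Scalar model of XAR on one 32-bit element: rotate (a ^ b) right
       by IMM bits.  */
    static inline uint32_t
    xar32 (uint32_t a, uint32_t b, unsigned imm)
    {
      uint32_t x = a ^ b;
      return imm ? (x >> imm) | (x << (32 - imm)) : x;
    }

    /* With B zeroed out, XAR is just a rotate right of A by IMM,
       i.e. a rotate left by (32 - IMM).  */
    static inline uint32_t
    rotr32 (uint32_t a, unsigned imm)
    {
      return xar32 (a, 0, imm);
    }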
    
    This strategy is used whenever we have XAR instructions; the logic
    in aarch64_emit_opt_vec_rotate is adjusted to resort to
    expand_rotate_as_vec_perm only when it is expected to generate a single
    REV* instruction or when XAR instructions are not present.
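
    As an illustrative example (not part of the patch), a rotate by half the
    element width, such as the v4si function below, is still expected to go
    through expand_rotate_as_vec_perm and map to a single REV* instruction
    rather than an XAR:

    v4si
    G_half (v4si r)
    {
        return (r >> 16) | (r << 16);
    }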
    
    With this patch we can generate for the input:
    v4si
    G1 (v4si r)
    {
        return (r >> 23) | (r << 9);
    }
    
    v8qi
    G2 (v8qi r)
    {
      return (r << 3) | (r >> 5);
    }
    the assembly for +sve2:
    G1:
            movi    v31.4s, 0
            xar     z0.s, z0.s, z31.s, #23
            ret
    
    G2:
            movi    v31.4s, 0
            xar     z0.b, z0.b, z31.b, #5
            ret
    
    instead of the current:
    G1:
            shl     v31.4s, v0.4s, 9
            usra    v31.4s, v0.4s, 23
            mov     v0.16b, v31.16b
            ret
    G2:
            shl     v31.8b, v0.8b, 3
            usra    v31.8b, v0.8b, 5
            mov     v0.8b, v31.8b
            ret
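
    For reference, the examples above can be reproduced with an aarch64
    compiler containing this patch using something along the lines of the
    following command (the exact -march string and file name here are just
    assumptions for illustration):

    aarch64-none-linux-gnu-gcc -O2 -march=armv8-a+sve2 -S rotates.c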
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
    
            * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Add
            generation of XAR sequences when possible.
    
    gcc/testsuite/
    
            * gcc.target/aarch64/rotate_xar_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc                   | 34 +++++++--
 gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c | 93 +++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 7388f6b8fdf1..00f99d5004ca 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16019,17 +16019,39 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
 }
 
 /* Emit an optimized sequence to perform a vector rotate
-   of REG by the vector constant amount AMNT and place the result
+   of REG by the vector constant amount AMNT_VEC and place the result
    in DST.  Return true iff successful.  */
 
 bool
-aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
 {
+  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
-  /* Attempt to expand the rotate as a vector permute.
-     For some rotate amounts they can be single instructions and
-     even the general single-vector TBL permute has good throughput.  */
-  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+  /* Rotates by half the element width map down to REV* instructions and should
+     always be preferred when possible.  */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
+      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  /* 64 and 128-bit vector modes can use the XAR instruction
+     when available.  */
+  else if (can_create_pseudo_p ()
+          && ((TARGET_SHA3 && mode == V2DImode)
+              || (TARGET_SVE2
+                  && (known_eq (GET_MODE_SIZE (mode), 8)
+                      || known_eq (GET_MODE_SIZE (mode), 16)))))
+    {
+      rtx zeroes = aarch64_gen_shareable_zero (mode);
+      rtx xar_op
+       = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
+                                               amnt_vec);
+      emit_set_insn (dst, xar_op);
+      return true;
+    }
+  /* If none of the above, try to expand rotates by any byte amount as
+     permutes.  */
+  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
     return true;
   return false;
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c b/gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c
new file mode 100644
index 000000000000..73007701cfb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rotate_xar_1.c
@@ -0,0 +1,93 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+#pragma GCC target "+sve2+sha3"
+
+/*
+** G1:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     v0\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 39
+**      ret
+*/
+v2di
+G1 (v2di r) {
+    return (r >> 39) | (r << 25);
+}
+
+/*
+** G2:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.s, z[0-9]+\.s, z[0-9]+\.s, #23
+**      ret
+*/
+v4si
+G2 (v4si r) {
+    return (r >> 23) | (r << 9);
+}
+
+/*
+** G3:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.h, z[0-9]+\.h, z[0-9]+\.h, #5
+**      ret
+*/
+v8hi
+G3 (v8hi r) {
+    return (r >> 5) | (r << 11);
+}
+
+/*
+** G4:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.b, z[0-9]+\.b, z[0-9]+\.b, #6
+**      ret
+*/
+v16qi
+G4 (v16qi r)
+{
+  return (r << 2) | (r >> 6);
+}
+
+/*
+** G5:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.s, z[0-9]+\.s, z[0-9]+\.s, #22
+**      ret
+*/
+v2si
+G5 (v2si r) {
+    return (r >> 22) | (r << 10);
+}
+
+/*
+** G6:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.h, z[0-9]+\.h, z[0-9]+\.h, #7
+**      ret
+*/
+v4hi
+G6 (v4hi r) {
+    return (r >> 7) | (r << 9);
+}
+
+/*
+** G7:
+**     movi?   [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+**     xar     z0\.b, z[0-9]+\.b, z[0-9]+\.b, #5
+**      ret
+*/
+v8qi
+G7 (v8qi r)
+{
+  return (r << 3) | (r >> 5);
+}
+
