https://gcc.gnu.org/g:19757e1c28de07b45da03117e6ff7ae3e21e5a7a

commit r15-4876-g19757e1c28de07b45da03117e6ff7ae3e21e5a7a
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Wed Oct 16 04:10:08 2024 -0700

    aarch64: Optimize vector rotates as vector permutes where possible
    
    Some vector rotate operations can be implemented in a single instruction
    rather than using the fallback SHL+USRA sequence.
    In particular, when the rotate amount is half the bitwidth of the element
    we can use a single REV64, REV32 or REV16 instruction.
    More generally, rotates by a byte amount can be implemented using vector
    permutes.
    This patch adds such a generic routine in expmed.cc called
    expand_rotate_as_vec_perm that calculates the required permute indices
    and uses the expand_vec_perm_const interface.
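
    As an illustration (not part of the patch), the little-endian index
    mapping used to build the selector can be checked in plain C: rotating
    each 32-bit lane left by 8 bits is the same byte permute that
    expand_rotate_as_vec_perm constructs, i.e. byte i of a lane comes from
    byte (i + nunits - rot_bytes) % nunits of the source lane.  A minimal,
    self-contained sketch with hypothetical test values (assumes a
    little-endian host):

        #include <stdint.h>
        #include <string.h>
        #include <assert.h>

        int
        main (void)
        {
          uint32_t lanes[4] = { 0x11223344u, 0xaabbccddu, 0x01020304u, 0xdeadbeefu };
          unsigned rot_bits = 8, rot_bytes = rot_bits / 8, nunits = 4;
          unsigned char src[16], dst[16];
          memcpy (src, lanes, sizeof lanes);

          /* Byte permute within each 4-byte lane, little-endian mapping.  */
          for (unsigned lane = 0; lane < 16; lane += nunits)
            for (unsigned i = 0; i < nunits; i++)
              dst[lane + i] = src[lane + (i + nunits - rot_bytes) % nunits];

          /* Each permuted lane must equal the lane rotated left by 8 bits.  */
          uint32_t out[4];
          memcpy (out, dst, sizeof out);
          for (unsigned i = 0; i < 4; i++)
            assert (out[i] == ((lanes[i] << rot_bits) | (lanes[i] >> (32 - rot_bits))));
          return 0;
        }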
    
    On aarch64 this ends up generating the single-instruction sequences above
    where possible, and LDR+TBL sequences otherwise, which are also a good
    choice.
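
    For instance (a hypothetical example, not taken from the testsuite), a
    per-lane rotate by 8 on 32-bit lanes is a byte amount but not half the
    element width, so it cannot become a single REV; with this patch it is
    expected to go through the permute path, i.e. roughly an LDR of the
    selector constant plus a TBL, instead of the SHL+USRA fallback:

        typedef unsigned int __attribute__ ((vector_size (16))) v4si;

        v4si
        rot8 (v4si x)
        {
          return (x << 8) | (x >> 24);
        }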
    
    With help from Richard, the routine should be VLA-safe.
    However, the only use of expand_rotate_as_vec_perm introduced in this patch
    is in aarch64-specific code that for now only handles fixed-width modes.
    
    A runtime aarch64 test is added to ensure the permute indices are computed
    correctly.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
    
            * expmed.h (expand_rotate_as_vec_perm): Declare.
            * expmed.cc (expand_rotate_as_vec_perm): Define.
            * config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
            Declare prototype.
            * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate):
            Implement.
            * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
            Call the above.
    
    gcc/testsuite/
    
            * gcc.target/aarch64/vec-rot-exec.c: New test.
            * gcc.target/aarch64/simd/pr117048_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-protos.h                |   1 +
 gcc/config/aarch64/aarch64-simd.md                 |   3 +
 gcc/config/aarch64/aarch64.cc                      |  16 ++++
 gcc/expmed.cc                                      |  44 +++++++++
 gcc/expmed.h                                       |   1 +
 gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c |  66 ++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c    | 101 +++++++++++++++++++++
 7 files changed, 232 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 05caad5e2fee..e8588e1cb177 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 08b121227eee..a91222b6e3b2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1313,6 +1313,9 @@
            (match_dup 4))
          (match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0fa7927d821a..7388f6b8fdf1 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index aa9f1abc8aba..2d5e5243ce8e 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not offer
+   native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+                            qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0a19176b77ab..2414c3cb27db 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
                                        rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
                                       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif  // EXPMED_H
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
new file mode 100644
index 000000000000..7baf35818705
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+/*
+** G1:
+**     rev64   v0\.4s, v0\.4s
+**     ret 
+*/
+v2di
+G1 (v2di r)
+{
+  return (r >> 32) | (r << 32);
+}
+
+/*
+** G2:
+**     rev32   v0\.8h, v0\.8h
+**     ret 
+*/
+v4si
+G2 (v4si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G3:
+**     rev16   v0\.16b, v0\.16b
+**     ret 
+*/
+v8hi
+G3 (v8hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
+/*
+** G4:
+**     rev32   v0\.4h, v0\.4h
+**     ret 
+*/
+v2si
+G4 (v2si r)
+{
+  return (r >> 16) | (r << 16);
+}
+
+/*
+** G5:
+**     rev16   v0\.8b, v0\.8b
+**     ret 
+*/
+v4hi
+G5 (v4hi r)
+{
+  return (r >> 8) | (r << 8);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c b/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c
new file mode 100644
index 000000000000..130a9b1aa647
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c
@@ -0,0 +1,101 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))
+
+static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";
+
+unsigned long long
+__attribute__((noipa,noinline)) 
+rot_64_one (unsigned long long x, unsigned amt)
+{
+  return (x << amt) | (x >> (64 - amt));
+}
+unsigned int
+__attribute__((noipa,noinline)) 
+rot_32_one (unsigned int x, unsigned amt)
+{
+  return (x << amt) | (x >> (32 - amt));
+}
+
+unsigned short
+__attribute__((noipa,noinline)) 
+rot_16_one (unsigned short x, unsigned short amt)
+{
+  return (x << amt) | (x >> (16 - amt));
+}
+
+
+#define ROTFUNC(M,S,A)                                 \
+M                                                      \
+__attribute__((noipa,noinline))                        \
+rot_##M##_##S##_##A (M x)                              \
+{                                                      \
+  return (x << A) | (x >> (S - A));                    \
+}                                                      \
+                                                       \
+void                                                   \
+test_rot_##M##_##S##_##A (void)                                \
+{                                                      \
+  M vec = *(M *)str;                                   \
+  M res = rot_##M##_##S##_##A (vec);                   \
+  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++)   \
+    if (res[i] != rot_##S##_one (vec[i], A))           \
+      __builtin_abort ();                              \
+}
+
+ROTFUNC (v2di, 64, 56)
+ROTFUNC (v2di, 64, 48)
+ROTFUNC (v2di, 64, 40)
+ROTFUNC (v2di, 64, 32)
+ROTFUNC (v2di, 64, 24)
+ROTFUNC (v2di, 64, 16)
+ROTFUNC (v2di, 64, 8)
+
+ROTFUNC (v4si, 32, 24)
+ROTFUNC (v4si, 32, 16)
+ROTFUNC (v4si, 32, 8)
+
+ROTFUNC (v8hi, 16, 8)
+
+ROTFUNC (v2si, 32, 24)
+ROTFUNC (v2si, 32, 16)
+ROTFUNC (v2si, 32, 8)
+
+ROTFUNC (v4hi, 16, 8)
+
+#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()
+
+int
+main (void)
+{
+  CALL_TEST (v2di, 64, 56);
+  CALL_TEST (v2di, 64, 48);
+  CALL_TEST (v2di, 64, 40);
+  CALL_TEST (v2di, 64, 32);
+  CALL_TEST (v2di, 64, 24);
+  CALL_TEST (v2di, 64, 16);
+  CALL_TEST (v2di, 64, 8);
+
+  CALL_TEST (v4si, 32, 24);
+  CALL_TEST (v4si, 32, 16);
+  CALL_TEST (v4si, 32, 8);
+
+  CALL_TEST (v8hi, 16, 8);
+
+  CALL_TEST (v2si, 32, 24);
+  CALL_TEST (v2si, 32, 16);
+  CALL_TEST (v2si, 32, 8);
+
+  CALL_TEST (v4hi, 16, 8);
+
+  return 0;
+}
+
