https://gcc.gnu.org/g:1dcc6a1a67165a469d4cd9b6b39514c46cc656ad

commit r15-4270-g1dcc6a1a67165a469d4cd9b6b39514c46cc656ad
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Wed Oct 9 09:40:33 2024 -0700

    PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction
    
    The pattern for the Advanced SIMD XAR instruction isn't very
    optimization-friendly at the moment.
    In the testcase from the PR, once simplify-rtx has done its work, it
    generates the RTL:
    (set (reg:V2DI 119 [ _14 ])
        (rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
                (reg:V2DI 116 [ *m1_01_8(D) ]))
            (const_vector:V2DI [
                    (const_int 32 [0x20]) repeated x2
                ])))
    
    which fails to match our XAR pattern because the pattern expects:
    1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
    documentation the preferred form of rotate-by-immediate is ROTATE, which
    I take to mean it's the canonical form.
    ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C), so it's better to match
    just one canonical representation (see the short C sketch after this
    list).
    2) A CONST_INT shift amount whereas the midend asks for a repeated vector
    constant.
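
    A minimal C sketch of the rotate identity behind point 1), assuming
    64-bit lanes and a rotate amount C with 0 < C < 64 (the helper names
    are purely illustrative, not GCC code):

    /* Rotate a 64-bit value left or right by c, 0 < c < 64.  */
    static inline unsigned long long
    rotl64 (unsigned long long x, unsigned c)
    {
      return (x << c) | (x >> (64 - c));
    }

    static inline unsigned long long
    rotr64 (unsigned long long x, unsigned c)
    {
      return (x >> c) | (x << (64 - c));
    }

    /* For 0 < C < 64, rotr64 (x, C) == rotl64 (x, 64 - C), so matching
       only the ROTATE (left-rotate) form loses no generality.  */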
    
    These issues are fixed by introducing a dedicated expander for the
    aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translates
    the intrinsic-level CONST_INT immediate (the right-rotate amount) into
    a repeated vector constant subtracted from 64 to give the corresponding
    left-rotate amount, which is fed to the new representation for the XAR
    define_insn that uses the ROTATE RTL code.  This is a similar approach
    to how we handle the discrepancy between intrinsic-level and RTL-level
    vector lane numbers for big-endian.
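
    As an illustrative sketch only (hypothetical helper name, not taken
    from the patch): per 64-bit lane the intrinsic computes a right rotate
    of the XOR by C, which the expander re-expresses as a left rotate by
    64 - C, again assuming 0 < C < 64:

    /* Scalar model of one V2DI lane of vxarq_u64 (a, b, C).  */
    static inline unsigned long long
    xar64_model (unsigned long long a, unsigned long long b, unsigned c)
    {
      unsigned long long t = a ^ b;
      /* ror (t, c), written as the equivalent rol (t, 64 - c).  */
      return (t << (64 - c)) | (t >> c);
    }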
    
    With this patch and [1/2], the arithmetic parts of the testcase now
    simplify to just one XAR instruction.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
            PR target/117048
            * config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
            define_expand.
            (*aarch64_xarqv2di_insn): Define.
    
    gcc/testsuite/
            PR target/117048
            * g++.target/aarch64/pr117048.C: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md          | 33 ++++++++++++++++++++++++----
 gcc/testsuite/g++.target/aarch64/pr117048.C | 34 +++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 11d405ed640f..bf272bc0b4eb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9046,18 +9046,43 @@
   [(set_attr "type" "crypto_sha3")]
 )
 
-(define_insn "aarch64_xarqv2di"
+(define_insn "*aarch64_xarqv2di_insn"
   [(set (match_operand:V2DI 0 "register_operand" "=w")
-       (rotatert:V2DI
+       (rotate:V2DI
         (xor:V2DI
          (match_operand:V2DI 1 "register_operand" "%w")
          (match_operand:V2DI 2 "register_operand" "w"))
-        (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))]
+        (match_operand:V2DI 3 "aarch64_simd_lshift_imm" "Dl")))]
   "TARGET_SHA3"
-  "xar\\t%0.2d, %1.2d, %2.2d, %3"
+  {
+    operands[3]
+      = GEN_INT (64 - INTVAL (unwrap_const_vec_duplicate (operands[3])));
+    return "xar\\t%0.2d, %1.2d, %2.2d, %3";
+  }
   [(set_attr "type" "crypto_sha3")]
 )
 
+;; The semantics of the vxarq_u64 intrinsics treat the immediate argument as a
+;; right-rotate amount but the recommended representation of rotates by a
+;; constant in RTL is with the left ROTATE code.  Translate between the
+;; intrinsic-provided amount and the RTL operands in the expander here.
+;; The define_insn for XAR will translate back to instruction semantics in its
+;; output logic.
+(define_expand "aarch64_xarqv2di"
+  [(set (match_operand:V2DI 0 "register_operand")
+       (rotate:V2DI
+        (xor:V2DI
+         (match_operand:V2DI 1 "register_operand")
+         (match_operand:V2DI 2 "register_operand"))
+        (match_operand:SI 3 "aarch64_simd_shift_imm_di")))]
+  "TARGET_SHA3"
+  {
+    operands[3]
+      = aarch64_simd_gen_const_vector_dup (V2DImode,
+                                          64 - INTVAL (operands[3]));
+  }
+)
+
 (define_insn "bcaxq<mode>4"
   [(set (match_operand:VQ_I 0 "register_operand" "=w")
        (xor:VQ_I
diff --git a/gcc/testsuite/g++.target/aarch64/pr117048.C b/gcc/testsuite/g++.target/aarch64/pr117048.C
new file mode 100644
index 000000000000..ae46e5875e4c
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/pr117048.C
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#pragma GCC target "+sha3"
+
+static inline uint64x2_t
+rotr64_vec(uint64x2_t x, const int b)
+{
+    int64x2_t neg_b = vdupq_n_s64(-b);
+    int64x2_t left_shift = vsubq_s64(vdupq_n_s64(64), vdupq_n_s64(b));
+    
+    uint64x2_t right_shifted = vshlq_u64(x, neg_b);
+    uint64x2_t left_shifted = vshlq_u64(x, left_shift);
+    
+    return vorrq_u64(right_shifted, left_shifted);
+}
+
+void G(
+    int64_t* v,
+    int64x2_t& m1_01, 
+    int64x2_t& m1_23, 
+    int64x2_t& m2_01, 
+    int64x2_t& m2_23   
+) {
+    int64x2_t vd01 = {v[12],v[13]};
+    vd01 = veorq_s64(vd01, m1_01);
+    vd01 = vreinterpretq_s64_u64(rotr64_vec( vreinterpretq_u64_s64 (vd01), 32));
+    v[12] = vgetq_lane_s64(vd01, 0);
+}
+
+/* { dg-final { scan-assembler {\txar\tv[0-9]+\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 32\n} } } */
+
