https://gcc.gnu.org/g:e840037fccceff0ad585946ff4d454f220c51759

commit r16-5212-ge840037fccceff0ad585946ff4d454f220c51759
Author: Andre Vieira <[email protected]>
Date:   Thu Nov 13 10:46:56 2025 +0000

    aarch64: Use eor3 for more double xor cases

    Expands the use of eor3 where we'd otherwise use two vector eor's.

    gcc/ChangeLog:

    	* config/aarch64/aarch64-simd.md (*eor3q<mode>4): New insn to be
    	used by combine after reload to optimize any grouping of eor's
    	that are using FP registers for scalar modes.

    gcc/testsuite/ChangeLog:

    	* gcc.target/aarch64/eor3-opt.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md          |  12 ++
 gcc/testsuite/gcc.target/aarch64/eor3-opt.c | 209 ++++++++++++++++++++++++++++
 2 files changed, 221 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e7c459dceb3a..1d2248ab57f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9449,6 +9449,18 @@
   [(set_attr "type" "crypto_sha3")]
 )
 
+(define_insn "*eor3q<mode>4"
+  [(set (match_operand:ALLI 0 "register_operand" "=w")
+	(xor:ALLI
+	 (xor:ALLI
+	  (match_operand:ALLI 2 "register_operand" "w")
+	  (match_operand:ALLI 3 "register_operand" "w"))
+	 (match_operand:ALLI 1 "register_operand" "w")))]
+  "TARGET_SHA3 && reload_completed"
+  "eor3\\t%0.16b, %1.16b, %2.16b, %3.16b"
+  [(set_attr "type" "crypto_sha3")]
+)
+
 (define_insn "aarch64_rax1qv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=w")
 	(xor:V2DI
diff --git a/gcc/testsuite/gcc.target/aarch64/eor3-opt.c b/gcc/testsuite/gcc.target/aarch64/eor3-opt.c
new file mode 100644
index 000000000000..51f36f9e7806
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/eor3-opt.c
@@ -0,0 +1,209 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" {} } } */
+#include <arm_neon.h>
+
+#pragma GCC target ("+sha3")
+
+#define EOR_SCALAR_FN(type) \
+type eor3_##type (type a, type b, type c) { \
+  return a ^ b ^ c; \
+}
+
+
+EOR_SCALAR_FN(uint64x1_t)
+/*
+** eor3_uint64x1_t:
+**	eor3	v0.16b, v0.16b, v1.16b, v2.16b
+**	ret
+*/
+EOR_SCALAR_FN(int64x1_t)
+/*
+** eor3_int64x1_t:
+**	eor3	v0.16b, v0.16b, v1.16b, v2.16b
+**	ret
+*/
+
+#define EOR_VEC_FN(type) \
+type eor3_##type (type a, type b, type c) \
+{ \
+  type res = a; \
+  res[0] = a[0] ^ b[0] ^ c[0]; \
+  return res; \
+}
+
+EOR_VEC_FN(int32x4_t)
+/*
+** eor3_int32x4_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.s\[0\], v\1.s\[0\]
+**	ret
+*/
+EOR_VEC_FN(int32x2_t)
+/*
+** eor3_int32x2_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.s\[0\], v\1.s\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint32x4_t)
+/*
+** eor3_uint32x4_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.s\[0\], v\1.s\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint32x2_t)
+/*
+** eor3_uint32x2_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.s\[0\], v\1.s\[0\]
+**	ret
+*/
+EOR_VEC_FN(int16x8_t)
+/*
+** eor3_int16x8_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.h\[0\], v\1.h\[0\]
+**	ret
+*/
+EOR_VEC_FN(int16x4_t)
+/*
+** eor3_int16x4_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.h\[0\], v\1.h\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint16x8_t)
+/*
+** eor3_uint16x8_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.h\[0\], v\1.h\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint16x4_t)
+/*
+** eor3_uint16x4_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.h\[0\], v\1.h\[0\]
+**	ret
+*/
+EOR_VEC_FN(int8x16_t)
+/*
+** eor3_int8x16_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.b\[0\], v\1.b\[0\]
+**	ret
+*/
+EOR_VEC_FN(int8x8_t)
+/*
+** eor3_int8x8_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.b\[0\], v\1.b\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint8x16_t)
+/*
+** eor3_uint8x16_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.b\[0\], v\1.b\[0\]
+**	ret
+*/
+EOR_VEC_FN(uint8x8_t)
+/*
+** eor3_uint8x8_t:
+**	eor3	v([0-2]).16b, v[0-2].16b, v[0-2].16b, v[0-2].16b
+**	ins	v0.b\[0\], v\1.b\[0\]
+**	ret
+*/
+
+/* The following tests should not be optimized to EOR3 as that would involve
+   unnecessary register file moves.  */
+
+EOR_SCALAR_FN(uint64_t)
+/*
+** eor3_uint64_t:
+**	eor	x1, x1, x2
+**	eor	x0, x1, x0
+**	ret
+*/
+EOR_SCALAR_FN(int64_t)
+/*
+** eor3_int64_t:
+**	eor	x1, x1, x2
+**	eor	x0, x1, x0
+**	ret
+*/
+EOR_SCALAR_FN(uint32_t)
+/*
+** eor3_uint32_t:
+**	eor	w1, w1, w2
+**	eor	w0, w1, w0
+**	ret
+*/
+EOR_SCALAR_FN(int32_t)
+/*
+** eor3_int32_t:
+**	eor	w1, w1, w2
+**	eor	w0, w1, w0
+**	ret
+*/
+EOR_SCALAR_FN(uint16_t)
+/*
+** eor3_uint16_t:
+**	eor	w1, w1, w2
+**	eor	w0, w0, w1
+**	ret
+*/
+EOR_SCALAR_FN(int16_t)
+/*
+** eor3_int16_t:
+**	eor	w1, w1, w2
+**	eor	w0, w0, w1
+**	ret
+*/
+EOR_SCALAR_FN(uint8_t)
+/*
+** eor3_uint8_t:
+**	eor	w1, w1, w2
+**	eor	w0, w0, w1
+**	ret
+*/
+EOR_SCALAR_FN(int8_t)
+/*
+** eor3_int8_t:
+**	eor	w1, w1, w2
+**	eor	w0, w0, w1
+**	ret
+*/
+
+void not_eor3_long(long *p)
+{
+  p[6] = p[4] ^ p[0] ^ (p[2] << 2);
+}
+/*
+** not_eor3_long:
+**	ldr	x1, \[x0\]
+**	ldr	x2, \[x0, 32\]
+**	eor	x2, x2, x1
+**	ldr	x1, \[x0, 16\]
+**	eor	x1, x2, x1, lsl 2
+**	str	x1, \[x0, 48\]
+**	ret
+*/
+
+int64x2_t not_eor3_int64_t (int64x2_t a, int64_t b, int64_t c)
+{
+  int64x2_t res;
+  res[0] = a[0] ^ b ^ c;
+  return res;
+}
+/*
+** not_eor3_int64_t:
+**	eor	x0, x0, x1
+**	fmov	d31, x0
+**	eor	v0.8b, v31.8b, v0.8b
+**	fmov	d0, d0
+**	ret
+*/
+
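
For illustration only (not part of the commit): a minimal standalone reproducer distilled from the new test above. The function name triple_xor is made up for this sketch; it assumes an aarch64 GCC built with this patch, compiling at -O2 with the +sha3 extension, where the new *eor3q<mode>4 pattern lets combine fuse the two XORs on FP-register data into a single EOR3.

/* Hypothetical reproducer, distilled from gcc.target/aarch64/eor3-opt.c;
   compile with an aarch64 GCC at -O2.  */
#include <arm_neon.h>

#pragma GCC target ("+sha3")

uint64x1_t
triple_xor (uint64x1_t a, uint64x1_t b, uint64x1_t c)
{
  /* The uint64x1_t operands live in FP/SIMD registers, so after reload the
     two XORs are expected to combine into
       eor3	v0.16b, v0.16b, v1.16b, v2.16b
     rather than two successive eor instructions.  */
  return a ^ b ^ c;
}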
