On 30/04/18 15:12, Tamar Christina wrote: > Hi All, > > This patch adds the missing neon intrinsics for all 128 bit vector Integer > modes for the > three-way XOR and negate and xor instructions for Armv8.2-a to Armv8.4-a. > > Bootstrapped and regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? And for backport to the GCC-8 branch? > > gcc/ > 2018-04-30 Tamar Christina <tamar.christ...@arm.com> > > * config/aarch64/aarch64-simd.md (aarch64_eor3qv8hi): Change to > eor3q<mode>4. > (aarch64_bcaxqv8hi): Change to bcaxq<mode>4. > * config/aarch64/aarch64-simd-builtins.def (veor3q_u8, veor3q_u32, > veor3q_u64, veor3q_s8, veor3q_s16, veor3q_s32, veor3q_s64, vbcaxq_u8, > vbcaxq_u32, vbcaxq_u64, vbcaxq_s8, vbcaxq_s16, vbcaxq_s32, > vbcaxq_s64): New. > * config/aarch64/arm_neon.h: Likewise. > * config/aarch64/iterators.md (VQ_I): New. > > gcc/testsuite/ > 2018-04-30 Tamar Christina <tamar.christ...@arm.com> > > * gcc.target/aarch64/sha3.h (veor3q_u8, veor3q_u32, > veor3q_u64, veor3q_s8, veor3q_s16, veor3q_s32, veor3q_s64, vbcaxq_u8, > vbcaxq_u32, vbcaxq_u64, vbcaxq_s8, vbcaxq_s16, vbcaxq_s32, > vbcaxq_s64): New. > * gcc.target/aarch64/sha3_1.c: Likewise. > * gcc.target/aarch64/sha3_2.c: Likewise. > * gcc.target/aarch64/sha3_3.c: Likewise. > > Thanks, > Tamar >
As just discussed off-list. There's no point in marking an operation as commutative in the register constraints if the constraints are identical. If it didn't match with the first ordering of the operands, swapping them over can't help and just wasted cycles. So please drop the redundant % markers. OK for trunk with that change. This isn't a regression, so I don't think it warrants a back-port. R. > > rb9185.patch > > > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def > b/gcc/config/aarch64/aarch64-simd-builtins.def > index > b383f2485e5a287c6d833122d6be0c9ff2ef72a2..439d4837fe724b33d4c1bd834570fb464f47eb5b > 100644 > --- a/gcc/config/aarch64/aarch64-simd-builtins.def > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def > @@ -599,14 +599,16 @@ > VAR1 (BINOPU, crypto_sha512su0q, 0, v2di) > /* Implemented by aarch64_crypto_sha512su1qv2di. */ > VAR1 (TERNOPU, crypto_sha512su1q, 0, v2di) > - /* Implemented by aarch64_eor3qv8hi. */ > - VAR1 (TERNOPU, eor3q, 0, v8hi) > + /* Implemented by eor3q<mode>4. */ > + BUILTIN_VQ_I (TERNOPU, eor3q, 4) > + BUILTIN_VQ_I (TERNOP, eor3q, 4) > /* Implemented by aarch64_rax1qv2di. */ > VAR1 (BINOPU, rax1q, 0, v2di) > /* Implemented by aarch64_xarqv2di. */ > VAR1 (TERNOPUI, xarq, 0, v2di) > - /* Implemented by aarch64_bcaxqv8hi. */ > - VAR1 (TERNOPU, bcaxq, 0, v8hi) > + /* Implemented by bcaxq<mode>4. */ > + BUILTIN_VQ_I (TERNOPU, bcaxq, 4) > + BUILTIN_VQ_I (TERNOP, bcaxq, 4) > > /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>. 
*/ > VAR1 (TERNOP, fmlal_low, 0, v2sf) > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 1154fc3d58deaa33413ea3050ff7feec37f092a6..12fea393fa74f04a61c0c81342898dfc0e7228b5 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -5955,13 +5955,13 @@ > > ;; sha3 > > -(define_insn "aarch64_eor3qv8hi" > - [(set (match_operand:V8HI 0 "register_operand" "=w") > - (xor:V8HI > - (xor:V8HI > - (match_operand:V8HI 2 "register_operand" "%w") > - (match_operand:V8HI 3 "register_operand" "w")) > - (match_operand:V8HI 1 "register_operand" "w")))] > +(define_insn "eor3q<mode>4" > + [(set (match_operand:VQ_I 0 "register_operand" "=w") > + (xor:VQ_I > + (xor:VQ_I > + (match_operand:VQ_I 2 "register_operand" "%w") > + (match_operand:VQ_I 3 "register_operand" "w")) > + (match_operand:VQ_I 1 "register_operand" "w")))] > "TARGET_SIMD && TARGET_SHA3" > "eor3\\t%0.16b, %1.16b, %2.16b, %3.16b" > [(set_attr "type" "crypto_sha3")] > @@ -5991,13 +5991,13 @@ > [(set_attr "type" "crypto_sha3")] > ) > > -(define_insn "aarch64_bcaxqv8hi" > - [(set (match_operand:V8HI 0 "register_operand" "=w") > - (xor:V8HI > - (and:V8HI > - (not:V8HI (match_operand:V8HI 3 "register_operand" "w")) > - (match_operand:V8HI 2 "register_operand" "w")) > - (match_operand:V8HI 1 "register_operand" "w")))] > +(define_insn "bcaxq<mode>4" > + [(set (match_operand:VQ_I 0 "register_operand" "=w") > + (xor:VQ_I > + (and:VQ_I > + (not:VQ_I (match_operand:VQ_I 3 "register_operand" "w")) > + (match_operand:VQ_I 2 "register_operand" "w")) > + (match_operand:VQ_I 1 "register_operand" "w")))] > "TARGET_SIMD && TARGET_SHA3" > "bcax\\t%0.16b, %1.16b, %2.16b, %3.16b" > [(set_attr "type" "crypto_sha3")] > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > c45c29ae815c9ff373eb2f57a77ebeda910a30cf..4ff76b4133959ae598468dff2554db37a0d07a62 > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ 
b/gcc/config/aarch64/arm_neon.h > @@ -32068,6 +32068,13 @@ vsha512su1q_u64 (uint64x2_t __a, uint64x2_t __b, > uint64x2_t __c) > return __builtin_aarch64_crypto_sha512su1qv2di_uuuu (__a, __b, __c); > } > > +__extension__ extern __inline uint8x16_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) > +{ > + return __builtin_aarch64_eor3qv16qi_uuuu (__a, __b, __c); > +} > + > __extension__ extern __inline uint16x8_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > veor3q_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) > @@ -32075,6 +32082,49 @@ veor3q_u16 (uint16x8_t __a, uint16x8_t __b, > uint16x8_t __c) > return __builtin_aarch64_eor3qv8hi_uuuu (__a, __b, __c); > } > > +__extension__ extern __inline uint32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) > +{ > + return __builtin_aarch64_eor3qv4si_uuuu (__a, __b, __c); > +} > + > +__extension__ extern __inline uint64x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) > +{ > + return __builtin_aarch64_eor3qv2di_uuuu (__a, __b, __c); > +} > + > + > +__extension__ extern __inline int8x16_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) > +{ > + return __builtin_aarch64_eor3qv16qi (__a, __b, __c); > +} > + > +__extension__ extern __inline int16x8_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) > +{ > + return __builtin_aarch64_eor3qv8hi (__a, __b, __c); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) > +{ > + return 
__builtin_aarch64_eor3qv4si (__a, __b, __c); > +} > + > +__extension__ extern __inline int64x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +veor3q_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c) > +{ > + return __builtin_aarch64_eor3qv2di (__a, __b, __c); > +} > + > __extension__ extern __inline uint64x2_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vrax1q_u64 (uint64x2_t __a, uint64x2_t __b) > @@ -32089,12 +32139,63 @@ vxarq_u64 (uint64x2_t __a, uint64x2_t __b, const > int imm6) > return __builtin_aarch64_xarqv2di_uuus (__a, __b,imm6); > } > > +__extension__ extern __inline uint8x16_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) > +{ > + return __builtin_aarch64_bcaxqv16qi_uuuu (__a, __b, __c); > +} > + > __extension__ extern __inline uint16x8_t > __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > vbcaxq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) > { > return __builtin_aarch64_bcaxqv8hi_uuuu (__a, __b, __c); > } > + > +__extension__ extern __inline uint32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) > +{ > + return __builtin_aarch64_bcaxqv4si_uuuu (__a, __b, __c); > +} > + > +__extension__ extern __inline uint64x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) > +{ > + return __builtin_aarch64_bcaxqv2di_uuuu (__a, __b, __c); > +} > + > +__extension__ extern __inline int8x16_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) > +{ > + return __builtin_aarch64_bcaxqv16qi (__a, __b, __c); > +} > + > +__extension__ extern __inline int16x8_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_s16 (int16x8_t 
__a, int16x8_t __b, int16x8_t __c) > +{ > + return __builtin_aarch64_bcaxqv8hi (__a, __b, __c); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) > +{ > + return __builtin_aarch64_bcaxqv4si (__a, __b, __c); > +} > + > +__extension__ extern __inline int64x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vbcaxq_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c) > +{ > + return __builtin_aarch64_bcaxqv2di (__a, __b, __c); > +} > + > + > #pragma GCC pop_options > > #pragma GCC push_options > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > fa181794392d4dc48e9a6df5cf5db14a9824cd2d..fff84329c7cb26cecbe86f1d92ce4853bcff9b62 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -78,6 +78,9 @@ > ;; Quad vector modes. > (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) > > +;; Quad integer vector modes. > +(define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) > + > ;; VQ without 2 element modes. 
> (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF]) > > diff --git a/gcc/testsuite/gcc.target/aarch64/sha3.h > b/gcc/testsuite/gcc.target/aarch64/sha3.h > index > 76dd1931dffbf60c521e824a0c5d51d9aa08c9f0..c8537c251963317258237f5346b9ff3a7282de5a > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sha3.h > +++ b/gcc/testsuite/gcc.target/aarch64/sha3.h > @@ -1,10 +1,26 @@ > #include "arm_neon.h" > > -uint16x8_t > -test_veor3q_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) > -{ > - return veor3q_u16 (a, b, c); > -} > +#define TEST_VEOR3(T, S) T \ > +test_veor3q_ ## S (T a, T b, T c) \ > +{ \ > + return veor3q_ ## S (a, b, c); \ > +} \ > + > +#define TEST_VBCAX(T, S) T \ > +test_vbcaxq_ ## S (T a, T b, T c) \ > +{ \ > + return vbcaxq_ ## S (a, b, c); \ > +} \ > + > + > +TEST_VEOR3 (uint8x16_t, u8) > +TEST_VEOR3 (uint16x8_t, u16) > +TEST_VEOR3 (uint32x4_t, u32) > +TEST_VEOR3 (uint64x2_t, u64) > +TEST_VEOR3 (int8x16_t, s8) > +TEST_VEOR3 (int16x8_t, s16) > +TEST_VEOR3 (int32x4_t, s32) > +TEST_VEOR3 (int64x2_t, s64) > > uint64x2_t > test_vrax1q_u64 (uint64x2_t a, uint64x2_t b) > @@ -18,8 +34,12 @@ test_vxarq_u64 (uint64x2_t a, uint64x2_t b) > return vxarq_u64 (a, b, 15); > } > > -uint16x8_t > -test_vbcaxq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) > -{ > - return vbcaxq_u16 (a, b, c); > -} > +TEST_VBCAX (uint8x16_t, u8) > +TEST_VBCAX (uint16x8_t, u16) > +TEST_VBCAX (uint32x4_t, u32) > +TEST_VBCAX (uint64x2_t, u64) > +TEST_VBCAX (int8x16_t, s8) > +TEST_VBCAX (int16x8_t, s16) > +TEST_VBCAX (int32x4_t, s32) > +TEST_VBCAX (int64x2_t, s64) > + > diff --git a/gcc/testsuite/gcc.target/aarch64/sha3_1.c > b/gcc/testsuite/gcc.target/aarch64/sha3_1.c > index > 879eadd875e899c70b32680d40bdb3de419f00a1..0727ce770283844ea69fe4ccdd858e03f9396bc8 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sha3_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sha3_1.c > @@ -4,7 +4,7 @@ > #include "sha3.h" > > > -/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > 
v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ > /* { dg-final { scan-assembler-times "rax1\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d" 1 } } */ > /* { dg-final { scan-assembler-times "xar\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d, 15" 1 } } */ > -/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sha3_2.c > b/gcc/testsuite/gcc.target/aarch64/sha3_2.c > index > 2afe28c47445af53194427912b9d6a9de9b5ff04..2d05116113371b37f0788334cf94f6b35ed9d43f > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sha3_2.c > +++ b/gcc/testsuite/gcc.target/aarch64/sha3_2.c > @@ -3,7 +3,7 @@ > > #include "sha3.h" > > -/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ > /* { dg-final { scan-assembler-times "rax1\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d" 1 } } */ > /* { dg-final { scan-assembler-times "xar\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d, 15" 1 } } */ > -/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sha3_3.c > b/gcc/testsuite/gcc.target/aarch64/sha3_3.c > index > 8915c805c3e55cb46691602dcf6a3627a28cb3c2..8d8ee77c2934610212286859297708528ec85ad8 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sha3_3.c > +++ b/gcc/testsuite/gcc.target/aarch64/sha3_3.c > @@ -3,7 +3,7 @@ > > 
#include "sha3.h" > > -/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "eor3\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ > /* { dg-final { scan-assembler-times "rax1\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d" 1 } } */ > /* { dg-final { scan-assembler-times "xar\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d, 15" 1 } } */ > -/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */ > +/* { dg-final { scan-assembler-times "bcax\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, > v\[0-9\]+\.16b, v\[0-9\]+\.16b" 8 } } */ >