Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> As subject, this patch rewrites the vpaddq Neon intrinsics to use RTL
> builtins rather than inline assembly code, allowing for better scheduling
> and optimization.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
OK, thanks.

Richard

> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-02-08  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64-simd-builtins.def: Use VDQ_I iterator
>         for aarch64_addp<mode> builtin macro generator.
>         * config/aarch64/aarch64-simd.md: Use VDQ_I iterator in
>         aarch64_addp<mode> RTL pattern.
>         * config/aarch64/arm_neon.h (vpaddq_s8): Use RTL builtin
>         instead of inline asm.
>         (vpaddq_s16): Likewise.
>         (vpaddq_s32): Likewise.
>         (vpaddq_s64): Likewise.
>         (vpaddq_u8): Likewise.
>         (vpaddq_u16): Likewise.
>         (vpaddq_u32): Likewise.
>         (vpaddq_u64): Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
> b/gcc/config/aarch64/aarch64-simd-builtins.def
> index
> c29c492913c06fed078f24efb144022a7d6adbbb..dc7b5d22b654d5bcca0152907c1c6967755e9548
> 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -50,7 +50,7 @@
>    VAR1 (BINOP, pmull_hi, 0, NONE, v16qi)
>    BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
>    BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
> -  BUILTIN_VD_BHSI (BINOP, addp, 0, NONE)
> +  BUILTIN_VDQ_I (BINOP, addp, 0, NONE)
>    VAR1 (UNOP, addp, 0, NONE, di)
>    BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, NONE)
>    BUILTIN_VDQ_BHSI (UNOP, clz, 2, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> 84db72478eb661ae4712e920bd4377c7c2af038b..6fc472c19493d6d10fb1c5d0686e519d53973692
> 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -6057,10 +6057,10 @@
>  ;; addp
>
>  (define_insn "aarch64_addp<mode>"
> -  [(set (match_operand:VD_BHSI 0 "register_operand" "=w")
> -       (unspec:VD_BHSI
> -         [(match_operand:VD_BHSI 1 "register_operand" "w")
> -          (match_operand:VD_BHSI 2 "register_operand" "w")]
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
> +       (unspec:VDQ_I
> +         [(match_operand:VDQ_I 1 "register_operand" "w")
> +          (match_operand:VDQ_I 2 "register_operand" "w")]
>         UNSPEC_ADDP))]
>    "TARGET_SIMD"
>    "addp\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index
> 48cce8fed8a02f4fa791fb958e772eeacecd1de1..b8de77bcc02dfddf73980442919ec1990e28ee72
> 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -8665,96 +8665,60 @@ __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_s8 (int8x16_t __a, int8x16_t __b)
>  {
> -  int8x16_t __result;
> -  __asm__ ("addp %0.16b,%1.16b,%2.16b"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_addpv16qi (__a, __b);
>  }
>
>  __extension__ extern __inline int16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_s16 (int16x8_t __a, int16x8_t __b)
>  {
> -  int16x8_t __result;
> -  __asm__ ("addp %0.8h,%1.8h,%2.8h"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_addpv8hi (__a, __b);
>  }
>
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_s32 (int32x4_t __a, int32x4_t __b)
>  {
> -  int32x4_t __result;
> -  __asm__ ("addp %0.4s,%1.4s,%2.4s"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_addpv4si (__a, __b);
>  }
>
>  __extension__ extern __inline int64x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_s64 (int64x2_t __a, int64x2_t __b)
>  {
> -  int64x2_t __result;
> -  __asm__ ("addp %0.2d,%1.2d,%2.2d"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_addpv2di (__a, __b);
>  }
>
>  __extension__ extern __inline uint8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_u8 (uint8x16_t __a, uint8x16_t __b)
>  {
> -  uint8x16_t __result;
> -  __asm__ ("addp %0.16b,%1.16b,%2.16b"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint8x16_t) __builtin_aarch64_addpv16qi ((int8x16_t) __a,
> +                                                   (int8x16_t) __b);
>  }
>
>  __extension__ extern __inline uint16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_u16 (uint16x8_t __a, uint16x8_t __b)
>  {
> -  uint16x8_t __result;
> -  __asm__ ("addp %0.8h,%1.8h,%2.8h"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint16x8_t) __builtin_aarch64_addpv8hi ((int16x8_t) __a,
> +                                                  (int16x8_t) __b);
>  }
>
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_u32 (uint32x4_t __a, uint32x4_t __b)
>  {
> -  uint32x4_t __result;
> -  __asm__ ("addp %0.4s,%1.4s,%2.4s"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint32x4_t) __builtin_aarch64_addpv4si ((int32x4_t) __a,
> +                                                  (int32x4_t) __b);
>  }
>
>  __extension__ extern __inline uint64x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vpaddq_u64 (uint64x2_t __a, uint64x2_t __b)
>  {
> -  uint64x2_t __result;
> -  __asm__ ("addp %0.2d,%1.2d,%2.2d"
> -           : "=w"(__result)
> -           : "w"(__a), "w"(__b)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint64x2_t) __builtin_aarch64_addpv2di ((int64x2_t) __a,
> +                                                  (int64x2_t) __b);
>  }
>
>  __extension__ extern __inline int16x4_t
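
As a minimal usage sketch of what the rewrite buys (not part of the patch;
the wrapper function below is made up for illustration): user code calling
the intrinsic is unchanged, but the compiler now sees the pairwise add
through the builtin rather than through opaque inline asm, so it can
schedule and combine the resulting ADDP with surrounding code.

  #include <arm_neon.h>

  /* Pairwise add of signed bytes: the low half of the result holds the
     sums of adjacent lanes of a, the high half the sums of adjacent
     lanes of b.  Expected to compile to a single ADDP instruction.  */
  int8x16_t
  pairwise_add_s8 (int8x16_t a, int8x16_t b)
  {
    return vpaddq_s8 (a, b);
  }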