[PING^4] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html Regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Jiang jiji
Re: [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi James, Thanks for your comment. Seems we need a 'dup' before 'fmul' if we use the GCC vector extension syntax way. Example: dup v1.2s, v1.s[0] fmul v0.2s, v1.2s, v0.2s And we need another pattern to combine these two insns into 'fmul %0.2s,%1.2s,%2.s[0]', which is kind of complex. BTW: maybe it's better to reconsider this issue after this patch, right? Thanks. Jiang jiji On Sat, Apr 11, 2015 at 11:37:47AM +0100, Jiangjiji wrote: > Hi, > This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html > Regtested with aarch64-linux-gnu on QEMU. > This patch has no regressions for aarch64_be-linux-gnu big-endian target > too. > OK for the trunk? > > Thanks. > Jiang jiji > > > -- > Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics > > Hi, Kyrill > Thank you for your suggestion. > I fixed it and regtested with aarch64-linux-gnu on QEMU. > This patch has no regressions for aarch64_be-linux-gnu big-endian target > too. > OK for the trunk? Hi Jiang, I'm sorry that I've taken so long to get to this, I've been out of office for several weeks. I have one comment. > +__extension__ static __inline float32x2_t __attribute__ > +((__always_inline__)) > +vmul_n_f32 (float32x2_t __a, float32_t __b) { > + return __builtin_aarch64_mul_nv2sf (__a, __b); } > + For vmul_n_* intrinsics, is there a reason we don't want to use the GCC vector extension syntax to allow us to write these as: __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmul_n_f32 (float32x2_t __a, float32_t __b) { return __a * __b; } It would be great if we could make that work. Thanks, James
[PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00775.html Regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 219845) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2014-12-11 Felix Yang + Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. 
+ * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. 
+ 2015-01-19 Jiong Wang Andrew Pinski Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 219845) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c)\ - __extension__ \ -({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result;\ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ -: "=w"(result) \ -: "w"(a_), "x"(b_), "i"(c) \ -: /* No clobbers */);
Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, Kyrill Thank you for your suggestion. I fixed it and regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Jiang jiji Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 221393) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2015-03-14 Felix Yang + Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. 
+ * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. 
+ 2015-03-12 Kyrylo Tkachov PR rtl-optimization/65235 Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 221393) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c)\ - __extension__ \ -({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result;\ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ -: "=w"(result) \ -: "w"(a_), "x"(b_), "i"(c) \ -: /* No clobbers */);
[PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html Regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Jiang jiji -- Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics Hi, Kyrill Thank you for your suggestion. I fixed it and regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Jiang jiji Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 221393) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2015-03-14 Felix Yang + Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. 
+ * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. 
+ 2015-03-12 Kyrylo Tkachov PR rtl-optimization/65235 Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 221393) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c)\ - __extension__ \ -({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result;
[PING] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00775.html Regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 219845) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2014-12-11 Felix Yang + Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. 
+ * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. 
+ 2015-01-19 Jiong Wang Andrew Pinski Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 219845) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c)\ - __extension__ \ -({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result;\ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ -: "=w"(result) \ -: "w"(a_), "x"(b_), "i"(c) \ -: /* No clobbers */);
[AArch64, NEON] Improve vmulX intrinsics
Hi, This patch converts more intrinsics to use builtin functions instead of the previous inline assembly syntax. Passed the glorious testsuite of Christophe Lyon. Three testcases are added for the testing of intrinsics which are not covered by the testsuite: gcc.target/aarch64/vmull_high.c gcc.target/aarch64/vmull_high_lane.c gcc.target/aarch64/vmull_high_n.c Regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 218464) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2014-12-09 Felix Yang +Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, + aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins.
+ * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. + 2014-12-07 Felix Yang Shanyao Chen Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c === --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c (revision 0) +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c (revision 0) @@ -0,0 +1,111 @@ +#include +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + + +/* Expected results. 
*/ +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, +0xfe24, 0xfe9b, 0xff12, 0xff89 }; +VECT_VAR_DECL(expected,int,32,4) [] = { 0xf9a0, 0xfa28, +0xfab0, 0xfb38 }; +VECT_VAR_DECL(expected,int,64,2) [] = { 0xf7a2, +0xf83b }; +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, + 0xa758, 0xa802, 0xa8ac, 0xa956 }; +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, + 0xbaf8b2, 0xbaf96d }; +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbf4d8, + 0xcbf5a4}; +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, + 0x6798, 0x6732, 0x66cc, 0x }; + +#ifndef INSN_NAME +#define INSN_NAME vmull_high +#define TEST_MSG "VMUL_HIGH" +#endif + +#define FNNAME1(NAME) exec_ ## NAME +#define FNNAME(NAME) FNNAME1(NAME) + +void FNNAME (INSN_NAME) (void) +{ +#define DECL_VMUL(T, W, N) \ + DECL_VARIABLE(vector1, T, W, N); \ + DECL_VARIABLE(vector2, T, W, N); + + /* vector_res = OP(vector1, vector2), then store the result. */ +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ + VECT_VAR(vector_res, T1, W1, N1) =\ +INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N));\ + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),\ +
Re: [AArch64, NEON] Improve vmulX intrinsics
Hi, Christophe Lyon These testcases are not covered by the glorious testsuite. If these cases are in your todo list , I will exclude them. Thanks. -邮件原件- 发件人: Christophe Lyon [mailto:christophe.l...@linaro.org] 发送时间: 2014年12月9日 21:43 收件人: Jiangjiji 抄送: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft 主题: Re: [AArch64, NEON] Improve vmulX intrinsics On 9 December 2014 at 13:52, Jiangjiji wrote: > Hi, > This patch converts more intrinsics to use builtin functions instead of > the > previous inline assembly syntax. > Passed the glorious testsuite of Christophe Lyon. > > Three testcases are added for the testing of intriniscs which are not > covered by the testsuite: > gcc.target/aarch64/vmull_high.c > gcc.target/aarch64/vmull_high_lane.c > gcc.target/aarch64/vmull_high_n.c > As I said here: https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html I am in tre process of converting my existing testsuite to GCC/Dejagnu. Please do not duplicate work. > Regtested with aarch64-linux-gnu on QEMU. > This patch has no regressions for aarch64_be-linux-gnu big-endian > target too. > OK for the trunk? > > > > Index: gcc/ChangeLog > === > --- gcc/ChangeLog (revision 218464) > +++ gcc/ChangeLog (working copy) > @@ -1,3 +1,38 @@ > +2014-12-09 Felix Yang > +Jiji Jiang > + > + * config/aarch64/aarch64-simd.md (aarch64_mul_n, > + aarch64_mull_n, aarch64_mull, > + aarch64_simd_mull2_n, aarch64_mull2_n, > + aarch64_mull_lane, aarch64_mull2_lane_internal, > + aarch64_mull_laneq, > aarch64_mull2_laneq_internal, > + aarch64_smull2_lane, aarch64_umull2_lane, > + aarch64_smull2_laneq, aarch64_umull2_laneq, > + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, > + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. 
> + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, > + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, > smull2_n, > + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, > + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, > + smull2_lane): New builtins. > + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, > + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, > + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, > + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, > + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, > + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, > + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, > + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, > + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, > + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, > + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, > + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, > + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, > + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite > + using builtin functions. > + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, > + VDQF_Q): New unspec and int iterator. > + > 2014-12-07 Felix Yang > Shanyao Chen > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > === > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > @@ -0,0 +1,111 @@ > +#include > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. 
*/ > +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, > +0xfe24, 0xfe9b, 0xff12, 0xff89 }; > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xf9a0, 0xfa28, > +0xfab0, 0xfb38 }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xf7a2, > +0xf83b }; > +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, > + 0xa758, 0xa802, 0xa8ac, 0xa956 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, > + 0xbaf8b2, 0xbaf96d }; > +V