[PING^4] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-05-04 Thread Jiangjiji

Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? 

Thanks.
Jiang jiji


Re: [PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-05-05 Thread Jiangjiji
Hi James,

Thanks for your comment.

It seems we need a 'dup' before the 'fmul' if we use the GCC vector extension
syntax.

Example:
dup v1.2s, v1.s[0]
fmul v0.2s, v1.2s, v0.2s

And we would need another pattern to combine these two insns into
'fmul %0.2s, %1.2s, %2.s[0]', which is somewhat complex.

BTW: maybe it's better to reconsider this issue in a follow-up after this patch, right?
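
For concreteness, here is a minimal standalone sketch (my illustration, not
part of the patch); compiling it at -O2 for aarch64 shows the 'dup' + 'fmul'
pair:

#include <arm_neon.h>

/* Vector-extension form of a multiply-by-scalar.  GCC converts the
   scalar 'b' to the element type and replicates it across the lanes,
   so without a dedicated combine pattern this expands to a 'dup'
   followed by a vector 'fmul', not a single
   'fmul %0.2s, %1.2s, %2.s[0]'.  */
float32x2_t
mul_n_vec_ext (float32x2_t a, float32_t b)
{
  return a * b;
}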


Thanks.
Jiang jiji



On Sat, Apr 11, 2015 at 11:37:47AM +0100, Jiangjiji wrote:
> Hi,
>   This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
>   Regtested with aarch64-linux-gnu on QEMU.
>   This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
>   OK for the trunk?
> 
> Thanks.
> Jiang jiji
> 
> 
> --
> Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
> 
> Hi, Kyrill
>   Thank you for your suggestion.
>   I fixed it and regtested with aarch64-linux-gnu on QEMU.
>   This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
>   OK for the trunk?

Hi Jiang,

I'm sorry that I've taken so long to get to this; I've been out of the office
for several weeks. I have one comment.

> +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
> +vmul_n_f32 (float32x2_t __a, float32_t __b)
> +{
> +  return __builtin_aarch64_mul_nv2sf (__a, __b);
> +}
> +

For vmul_n_* intrinsics, is there a reason we don't want to use the GCC vector 
extension syntax to allow us to write these as:

  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vmul_n_f32 (float32x2_t __a, float32_t __b)
  {
    return __a * __b;
  }

It would be great if we could make that work.
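
A quick runtime sanity check of the suggested form (a hypothetical test of
mine, not part of the patch; 'mul_n_vec_ext' is an illustrative name):

#include <arm_neon.h>
#include <stdio.h>

/* Vector-extension version: GCC converts the scalar to the element
   type and replicates it across both lanes.  */
static float32x2_t
mul_n_vec_ext (float32x2_t a, float32_t b)
{
  return a * b;
}

int
main (void)
{
  float32x2_t a = { 1.5f, -2.0f };
  float32x2_t r1 = vmul_n_f32 (a, 3.0f);     /* intrinsic */
  float32x2_t r2 = mul_n_vec_ext (a, 3.0f);  /* vector extension */

  /* Both lines should print 4.500000 -6.000000.  */
  printf ("%f %f\n", vget_lane_f32 (r1, 0), vget_lane_f32 (r1, 1));
  printf ("%f %f\n", vget_lane_f32 (r2, 0), vget_lane_f32 (r2, 1));
  return 0;
}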

Thanks,
James



[PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-03-12 Thread Jiangjiji
Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00775.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? Thanks.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog   (revision 219845)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,38 @@
+2014-12-11  Felix Yang  
+   Jiji Jiang  
+
+   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
+   aarch64_mull_n, aarch64_mull,
+   aarch64_simd_mull2_n, aarch64_mull2_n,
+   aarch64_mull_lane, aarch64_mull2_lane_internal,
+   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
+   aarch64_smull2_lane, aarch64_umull2_lane,
+   aarch64_smull2_laneq, aarch64_umull2_laneq,
+   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
+   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+   smull2_lane): New builtins.
+   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+   using builtin functions.
+   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+   VDQF_Q): New unspec and int iterator.
+
 2015-01-19  Jiong Wang  
Andrew Pinski  
 
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h   (revision 219845)
+++ gcc/config/aarch64/arm_neon.h   (working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);

Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-03-14 Thread Jiangjiji
Hi, Kyrill
  Thank you for your suggestion. 
  I fixed it and regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? 

Thanks.
Jiang jiji



Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog   (revision 221393)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  
+   Jiji Jiang  
+
+   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
+   aarch64_mull_n, aarch64_mull,
+   aarch64_simd_mull2_n, aarch64_mull2_n,
+   aarch64_mull_lane, aarch64_mull2_lane_internal,
+   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
+   aarch64_smull2_lane, aarch64_umull2_lane,
+   aarch64_smull2_laneq, aarch64_umull2_laneq,
+   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
+   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+   smull2_lane): New builtins.
+   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+   using builtin functions.
+   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+   VDQF_Q): New unspec and int iterator.
+
 2015-03-12  Kyrylo Tkachov  
 
PR rtl-optimization/65235
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h   (revision 221393)
+++ gcc/config/aarch64/arm_neon.h   (working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);

[PING^3] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-04-11 Thread Jiangjiji
Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? 

Thanks.
Jiang jiji


--
Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

Hi, Kyrill
  Thank you for your suggestion. 
  I fixed it and regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? 

Thanks.
Jiang jiji



Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog   (revision 221393)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  
+   Jiji Jiang  
+
+   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
+   aarch64_mull_n, aarch64_mull,
+   aarch64_simd_mull2_n, aarch64_mull2_n,
+   aarch64_mull_lane, aarch64_mull2_lane_internal,
+   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
+   aarch64_smull2_lane, aarch64_umull2_lane,
+   aarch64_smull2_laneq, aarch64_umull2_laneq,
+   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
+   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+   smull2_lane): New builtins.
+   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+   using builtin functions.
+   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+   VDQF_Q): New unspec and int iterator.
+
 2015-03-12  Kyrylo Tkachov  
 
PR rtl-optimization/65235
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h   (revision 221393)
+++ gcc/config/aarch64/arm_neon.h   (working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;

[PING] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

2015-01-20 Thread Jiangjiji
Hi, 
  This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2014-12/msg00775.html
  Regtested with aarch64-linux-gnu on QEMU.
  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian target.
  OK for the trunk? Thanks.


Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog   (revision 219845)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,38 @@
+2014-12-11  Felix Yang  
+   Jiji Jiang  
+
+   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
+   aarch64_mull_n, aarch64_mull,
+   aarch64_simd_mull2_n, aarch64_mull2_n,
+   aarch64_mull_lane, aarch64_mull2_lane_internal,
+   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
+   aarch64_smull2_lane, aarch64_umull2_lane,
+   aarch64_smull2_laneq, aarch64_umull2_laneq,
+   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
+   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+   smull2_lane): New builtins.
+   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+   using builtin functions.
+   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+   VDQF_Q): New unspec and int iterator.
+
 2015-01-19  Jiong Wang  
Andrew Pinski  
 
Index: gcc/config/aarch64/arm_neon.h
===================================================================
--- gcc/config/aarch64/arm_neon.h   (revision 219845)
+++ gcc/config/aarch64/arm_neon.h   (working copy)
@@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-#define vmull_high_lane_s16(a, b, c)                                    \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]"                         \
-                : "=w"(result)                                          \
-                : "w"(a_), "x"(b_), "i"(c)                              \
-                : /* No clobbers */);

[AArch64, NEON] Improve vmulX intrinsics

2014-12-09 Thread Jiangjiji

Hi,
 This patch converts more intrinsics to use builtin functions instead of the
previous inline assembly syntax.
 Passed the glorious testsuite of Christophe Lyon.

 Three testcases are added to test intrinsics that are not covered by the
testsuite:
 gcc.target/aarch64/vmull_high.c
 gcc.target/aarch64/vmull_high_lane.c
 gcc.target/aarch64/vmull_high_n.c

 Regtested with aarch64-linux-gnu on QEMU.
 This patch also shows no regressions for the aarch64_be-linux-gnu big-endian
target.
 OK for the trunk?
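
For readers unfamiliar with the "high" variants exercised by the new tests:
vmull_high_* performs a widening multiply on the upper halves of its 128-bit
inputs. A one-line reference (my own sketch, not from the patch) in terms of
existing intrinsics:

#include <arm_neon.h>

/* vmull_high_s16 widens and multiplies the upper four lanes of each
   input; this equivalence is handy for cross-checking the
   expected-result arrays in the new tests.  */
int32x4_t
ref_vmull_high_s16 (int16x8_t a, int16x8_t b)
{
  return vmull_s16 (vget_high_s16 (a), vget_high_s16 (b));
}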



Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog   (revision 218464)
+++ gcc/ChangeLog   (working copy)
@@ -1,3 +1,38 @@
+2014-12-09  Felix Yang  
+Jiji Jiang  
+
+   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
+   aarch64_mull_n, aarch64_mull,
+   aarch64_simd_mull2_n, aarch64_mull2_n,
+   aarch64_mull_lane, aarch64_mull2_lane_internal,
+   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
+   aarch64_smull2_lane, aarch64_umull2_lane,
+   aarch64_smull2_laneq, aarch64_umull2_laneq,
+   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
+   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+   smull2_lane): New builtins.
+   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+   using builtin functions.
+   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+   VDQF_Q): New unspec and int iterator.
+
 2014-12-07  Felix Yang  
Shanyao Chen  
 
Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
+++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
@@ -0,0 +1,111 @@
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+
+/* Expected results.  */
+VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad,
+0xfe24, 0xfe9b, 0xff12, 0xff89 };
+VECT_VAR_DECL(expected,int,32,4) [] = { 0xf9a0, 0xfa28,
+0xfab0, 0xfb38 };
+VECT_VAR_DECL(expected,int,64,2) [] = { 0xf7a2,
+0xf83b };
+VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae,
+ 0xa758, 0xa802, 0xa8ac, 0xa956 };
+VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7,
+ 0xbaf8b2, 0xbaf96d };
+VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbf4d8,
+ 0xcbf5a4};
+VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce,
+ 0x6798, 0x6732, 0x66cc, 0x };
+
+#ifndef INSN_NAME
+#define INSN_NAME vmull_high
+#define TEST_MSG "VMUL_HIGH"
+#endif
+
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+#define DECL_VMUL(T, W, N)  \
+  DECL_VARIABLE(vector1, T, W, N);  \
+  DECL_VARIABLE(vector2, T, W, N); 
+
+  /* vector_res = OP(vector1, vector2), then store the result.  */
+#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \
+  VECT_VAR(vector_res, T1, W1, N1) =\
+INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N),  \
+   VECT_VAR(vector2, T1, W, N));\
+  vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1),\
+  

Re: [AArch64, NEON] Improve vmulX intrinsics

2014-12-10 Thread Jiangjiji
Hi, Christophe Lyon
These testcases are not covered by the glorious testsuite.
If these cases are on your todo list, I will exclude them.

Thanks.

-----Original Message-----
From: Christophe Lyon [mailto:christophe.l...@linaro.org]
Sent: 9 December 2014 21:43
To: Jiangjiji
Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft
Subject: Re: [AArch64, NEON] Improve vmulX intrinsics

On 9 December 2014 at 13:52, Jiangjiji  wrote:
> Hi,
>  This patch converts more intrinsics to use builtin functions instead of the
> previous inline assembly syntax.
>  Passed the glorious testsuite of Christophe Lyon.
>
>  Three testcases are added to test intrinsics that are not covered by the
> testsuite:
>  gcc.target/aarch64/vmull_high.c
>  gcc.target/aarch64/vmull_high_lane.c
>  gcc.target/aarch64/vmull_high_n.c
>

As I said here:
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html
I am in the process of converting my existing testsuite to GCC/Dejagnu.
Please do not duplicate work.


>  Regtested with aarch64-linux-gnu on QEMU.
>  This patch also shows no regressions for the aarch64_be-linux-gnu big-endian
> target.
>  OK for the trunk?
>
>
>
> Index: gcc/ChangeLog
> ===
> --- gcc/ChangeLog   (revision 218464)
> +++ gcc/ChangeLog   (working copy)
> @@ -1,3 +1,38 @@
> +2014-12-09  Felix Yang  
> +Jiji Jiang  
> +
> +   * config/aarch64/aarch64-simd.md (aarch64_mul_n,
> +   aarch64_mull_n, aarch64_mull,
> +   aarch64_simd_mull2_n, aarch64_mull2_n,
> +   aarch64_mull_lane, aarch64_mull2_lane_internal,
> +   aarch64_mull_laneq, aarch64_mull2_laneq_internal,
> +   aarch64_smull2_lane, aarch64_umull2_lane,
> +   aarch64_smull2_laneq, aarch64_umull2_laneq,
> +   aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane,
> +   aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
> +   * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
> +   vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
> +   umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
> +   umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
> +   smull2_lane): New builtins.
> +   * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
> +   vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
> +   vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
> +   vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
> +   vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
> +   vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
> +   vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
> +   vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
> +   vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
> +   vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
> +   vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
> +   vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
> +   vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
> +   vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
> +   using builtin functions.
> +   * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
> +   VDQF_Q): New unspec and int iterator.
> +
>  2014-12-07  Felix Yang  
> Shanyao Chen  
>  Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
> ===================================================================
> --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
> +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c	(revision 0)
> @@ -0,0 +1,111 @@
> +#include <arm_neon.h>
> +#include "arm-neon-ref.h"
> +#include "compute-ref-data.h"
> +
> +
> +/* Expected results.  */
> +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad,
> +0xfe24, 0xfe9b, 0xff12, 0xff89 };
> +VECT_VAR_DECL(expected,int,32,4) [] = { 0xf9a0, 0xfa28,
> +0xfab0, 0xfb38 };
> +VECT_VAR_DECL(expected,int,64,2) [] = { 0xf7a2,
> +0xf83b };
> +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae,
> + 0xa758, 0xa802, 0xa8ac, 0xa956 };
> +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7,
> + 0xbaf8b2, 0xbaf96d };
> +V