Much like the first patch, this adds the equivalent ...q... intrinsics for
float16x8_t, using GCC vector extensions.
gcc/ChangeLog:
* config/arm/arm_neon.h (vdupq_lane_f16, vld1q_lane_f16, vld1q_dup_f16,
vreinterpretq_p8_f16, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
vreinterpretq_f16_f32, vreinterpretq_f16_p64, vreinterpretq_f16_p128,
vreinterpretq_f16_s64, vreinterpretq_f16_u64, vreinterpretq_f16_s8,
vreinterpretq_f16_s16, vreinterpretq_f16_s32, vreinterpretq_f16_u8,
vreinterpretq_f16_u16, vreinterpretq_f16_u32, vreinterpretq_f32_f16,
vreinterpretq_p64_f16, vreinterpretq_p128_f16, vreinterpretq_s64_f16,
vreinterpretq_u64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
vreinterpretq_u32_f16): New.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 7259852a6a450c5f693b03cf6342f33190f266c6..d214fd673565c1cd020203c40c514762dfead520 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -5264,6 +5264,16 @@ vgetq_lane_s32 (int32x4_t __a, const int __b)
return (int32_t)__builtin_neon_vget_lanev4si (__a, __b);
}
+#define vgetq_lane_f16(__v, __i) \
+ __extension__ \
+ ({ \
+ float16x8_t __vec = (__v); \
+ int __idx = (__i); \
+ __builtin_arm_lane_check (8, __idx); \
+ float16_t __res = __vec[__idx]; \
+ __res; \
+ })
+
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vgetq_lane_f32 (float32x4_t __a, const int __b)
{
@@ -5407,6 +5417,17 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
}
+#define vsetq_lane_f16(__e, __v, __i) \
+ __extension__ \
+ ({ \
+ float16_t __elem = (__e); \
+ float16x8_t __vec = (__v); \
+ int __idx = (__i); \
+ __builtin_arm_lane_check (8, __idx); \
+ __vec[__idx] = __elem; \
+ __vec; \
+ })
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
{
@@ -5642,6 +5663,13 @@ vdupq_n_s32 (int32_t __a)
return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
}
+#define vdupq_n_f16(__e1) \
+ __extension__ \
+ ({ \
+ float16_t __e = (__e1); \
+ (float16x8_t) {__e, __e, __e, __e, __e, __e, __e, __e}; \
+ })
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_n_f32 (float32_t __a)
{
@@ -5920,6 +5948,12 @@ vdupq_lane_s32 (int32x2_t __a, const int __b)
return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+  return vdupq_n_f16 (vget_lane_f16 (__a, __b));
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_lane_f32 (float32x2_t __a, const int __b)
{
@@ -8903,6 +8937,12 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c)
+{
+ return vsetq_lane_f16 (*__a, __b, __c);
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
{
@@ -9057,6 +9097,12 @@ vld1q_dup_s32 (const int32_t * __a)
return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t * __a)
+{
+ return vdupq_n_f16 (*__a);
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_dup_f32 (const float32_t * __a)
{
@@ -12851,6 +12897,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a)
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+ return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vreinterpretq_p8_f32 (float32x4_t __a)
{
return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -12996,6 +13048,88 @@ vreinterpretq_p16_u32 (uint32x4_t __a)
return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p8 (poly8x16_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p16 (poly16x8_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f32 (float32x4_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p64 (poly64x2_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p128 (poly128_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+#endif
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s64 (int64x2_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u64 (uint64x2_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s8 (int8x16_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s16 (int16x8_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s32 (int32x4_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u8 (uint8x16_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u16 (uint16x8_t __a)
+{
+ return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u32 (uint32x4_t __a)
+{
+ return (float16x8_t) __a;
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vreinterpretq_f32_p8 (poly8x16_t __a)
{
@@ -13008,6 +13142,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a)
return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
}
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+ return (float32x4_t) __a;
+}
+
#ifdef __ARM_FEATURE_CRYPTO
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vreinterpretq_f32_p64 (poly64x2_t __a)
@@ -13090,6 +13230,14 @@ vreinterpretq_p64_p16 (poly16x8_t __a)
#endif
#ifdef __ARM_FEATURE_CRYPTO
__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f16 (float16x8_t __a)
+{
+ return (poly64x2_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
vreinterpretq_p64_f32 (float32x4_t __a)
{
return (poly64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13186,6 +13334,14 @@ vreinterpretq_p128_p16 (poly16x8_t __a)
#endif
#ifdef __ARM_FEATURE_CRYPTO
__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
+vreinterpretq_p128_f16 (float16x8_t __a)
+{
+ return (poly128_t) __a;
+}
+
+#endif
+#ifdef __ARM_FEATURE_CRYPTO
+__extension__ static __inline poly128_t __attribute__ ((__always_inline__))
vreinterpretq_p128_f32 (float32x4_t __a)
{
return (poly128_t)__builtin_neon_vreinterprettiv4sf (__a);
@@ -13277,6 +13433,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a)
}
__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+ return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
vreinterpretq_s64_f32 (float32x4_t __a)
{
return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13353,6 +13515,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a)
}
__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f16 (float16x8_t __a)
+{
+ return (uint64x2_t) __a;
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
vreinterpretq_u64_f32 (float32x4_t __a)
{
return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
@@ -13429,6 +13597,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a)
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+ return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vreinterpretq_s8_f32 (float32x4_t __a)
{
return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13505,6 +13679,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a)
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+ return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vreinterpretq_s16_f32 (float32x4_t __a)
{
return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13581,6 +13761,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a)
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t) __a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vreinterpretq_s32_f32 (float32x4_t __a)
{
return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
@@ -13657,6 +13843,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a)
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+ return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vreinterpretq_u8_f32 (float32x4_t __a)
{
return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
@@ -13733,6 +13925,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a)
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+ return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vreinterpretq_u16_f32 (float32x4_t __a)
{
return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
@@ -13809,6 +14007,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a)
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+ return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vreinterpretq_u32_f32 (float32x4_t __a)
{
return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);