Hi all, Second version of the patch here implementing the bfloat16_t neon related store intrinsics: vst2_lane_bf16, vst2q_lane_bf16, vst3_lane_bf16, vst3q_lane_bf16 vst4_lane_bf16, vst4q_lane_bf16.
Please see refer to: ACLE <https://developer.arm.com/docs/101028/latest> ISA <https://developer.arm.com/docs/ddi0596/latest> This better narrows testcases so they do not cause regressions for the arm backend where these intrinsics are not yet present. Please see refer to: ACLE <https://developer.arm.com/docs/101028/latest> ISA <https://developer.arm.com/docs/ddi0596/latest> Okay for trunk? Thanks! Andrea
>From 16803710f96889ec89349c5bb6ff1fb96a9d32d8 Mon Sep 17 00:00:00 2001 From: Andrea Corallo <andrea.cora...@arm.com> Date: Thu, 8 Oct 2020 11:02:09 +0200 Subject: [PATCH] aarch64: Add vstN_lane_bf16 + vstNq_lane_bf16 intrinsics gcc/ChangeLog 2020-10-19 Andrea Corallo <andrea.cora...@arm.com> * config/aarch64/arm_neon.h (__STX_LANE_FUNC): Move to the bottom of the file so we can use these also for defining the bf16 related intrinsics. (vst2_lane_bf16, vst2q_lane_bf16, vst3_lane_bf16, vst3q_lane_bf16) (vst4_lane_bf16, vst4q_lane_bf16): Add new intrinsics. gcc/testsuite/ChangeLog 2020-10-19 Andrea Corallo <andrea.cora...@arm.com> * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (hbfloat16_t): Define type. (CHECK_FP): Make it working for bfloat types. * gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c: New file. * gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c: Likewise. --- gcc/config/aarch64/arm_neon.h | 534 +++++++++--------- .../aarch64/advsimd-intrinsics/arm-neon-ref.h | 4 +- .../advsimd-intrinsics/bf16_vstN_lane_1.c | 227 ++++++++ .../advsimd-intrinsics/bf16_vstN_lane_2.c | 52 ++ .../vst2_lane_bf16_indices_1.c | 16 + .../vst2q_lane_bf16_indices_1.c | 16 + .../vst3_lane_bf16_indices_1.c | 16 + .../vst3q_lane_bf16_indices_1.c | 16 + .../vst4_lane_bf16_indices_1.c | 16 + .../vst4q_lane_bf16_indices_1.c | 16 + 10 files changed, 656 insertions(+), 257 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 2bb20e15069..0088ea9896f 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10873,262 +10873,6 @@ __STRUCTN (poly, 8, 4) __STRUCTN (float, 64, 4) #undef __STRUCTN - -#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, - int64x2_t) - -#undef __ST2_LANE_FUNC -#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) - -#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, - int64x2_t) - -#undef __ST3_LANE_FUNC -#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) - -#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[3] \ - = vcombine_##funcsuffix (__b.val[3], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], 3); \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, - int64x2_t) - -#undef __ST4_LANE_FUNC -#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) - __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaddlv_s32 (int32x2_t __a) @@ -35568,6 +35312,284 @@ vaddq_p128 (poly128_t __a, poly128_t __b) return __a ^ __b; } +/* vst2_lane */ + +#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \ + qmode, ptr_mode, funcsuffix, signedtype) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[1], 1); \ + __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) +__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, + float32x4_t) +__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64, + float64x2_t) +__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16, + int16x8_t) +__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64, + poly64x2_t) +__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64, + int64x2_t) +__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16, + int16x8_t) +__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, + int32x4_t) +__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, + int64x2_t) +__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, bf16, + bfloat16x8_t) + +/* vst2q_lane */ + +#undef __ST2_LANE_FUNC +#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ + __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) +__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) +__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) +__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) +__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) +__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) +__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) +__ST2_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) + +/* vst3_lane */ + +#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ + qmode, ptr_mode, funcsuffix, signedtype) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[2], 2); \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) +__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, + float32x4_t) +__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, + float64x2_t) +__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, + int16x8_t) +__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64, + poly64x2_t) +__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, + int64x2_t) +__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, + int16x8_t) +__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, + int32x4_t) +__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, + int64x2_t) +__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, bf16, + bfloat16x8_t) + +/* vst3q_lane */ + +#undef __ST3_LANE_FUNC +#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ + __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) +__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) +__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) +__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) +__ST3_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16) + +/* vst4_lane */ + +#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ + qmode, ptr_mode, funcsuffix, signedtype) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] \ + = vcombine_##funcsuffix (__b.val[0], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[1] \ + = vcombine_##funcsuffix (__b.val[1], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[2] \ + = vcombine_##funcsuffix (__b.val[2], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __temp.val[3] \ + = vcombine_##funcsuffix (__b.val[3], \ + vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[0], 0); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[1], 1); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[2], 2); \ + __o = __builtin_aarch64_set_qregxi##qmode (__o, \ + (signedtype) __temp.val[3], 3); \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __o, __c); \ +} + +__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, + float16x8_t) +__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, + float32x4_t) +__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, + float64x2_t) +__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, + int16x8_t) +__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64, + poly64x2_t) +__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, + int64x2_t) +__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16, + int16x8_t) +__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, + int32x4_t) +__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, + int64x2_t) +__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, bf16, + bfloat16x8_t) + +/* vst4q_lane */ + +#undef __ST4_LANE_FUNC +#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ + intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ + __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ + __ptr, __temp.__o, __c); \ +} + +__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) +__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) +__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) +__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +__ST4_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16) + +#undef __ST4_LANE_FUNC + /* vld2_lane */ #define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h index 791972c737e..61fe7e759dc 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h @@ -11,6 +11,8 @@ typedef uint16_t hfloat16_t; typedef uint32_t hfloat32_t; typedef uint64_t hfloat64_t; +typedef uint16_t hbfloat16_t; + extern void abort(void); extern void *memset(void *, int, size_t); extern void *memcpy(void *, const void *, size_t); @@ -107,7 +109,7 @@ extern size_t strlen(const char *); { \ union fp_operand { \ uint##W##_t i; \ - float##W##_t f; \ + T##W##_t f; \ } tmp_res, tmp_exp; \ tmp_res.f = VECT_VAR(result, T, W, N)[i]; \ tmp_exp.i = VECT_VAR(EXPECTED, h##T, W, N)[i]; \ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c new file mode 100644 index 00000000000..2c70bb9de9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_1.c @@ -0,0 +1,227 @@ +/* { dg-do run { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results for vst2, chunk 0. */ +VECT_VAR_DECL(expected_st2_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst2, chunk 1. */ +VECT_VAR_DECL(expected_st2_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st2_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst3, chunk 0. */ +VECT_VAR_DECL(expected_st3_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0 }; +VECT_VAR_DECL(expected_st3_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst3, chunk 1. */ +VECT_VAR_DECL(expected_st3_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst3, chunk 2. */ +VECT_VAR_DECL(expected_st3_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st3_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst4, chunk 0. */ +VECT_VAR_DECL(expected_st4_0,hbfloat,16,4) [] = + { 0xABAB, 0x3210, 0xCAFE, 0x1234 }; +VECT_VAR_DECL(expected_st4_0,hbfloat,16,8) [] = + { 0xABAB, 0x3210, 0xCAFE, 0x1234, 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst4, chunk 1. */ +VECT_VAR_DECL(expected_st4_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst4, chunk 2. */ +VECT_VAR_DECL(expected_st4_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +/* Expected results for vst4, chunk 3. */ +VECT_VAR_DECL(expected_st4_3,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 }; +VECT_VAR_DECL(expected_st4_3,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0 }; + +typedef union +{ + bfloat16_t bf16; + uint16_t u16; +} bfloat16_u_t; + +static bfloat16_t result_bfloat16x4[4]; +static bfloat16_t result_bfloat16x8[8]; + +void exec_vstX_lane (void) +{ + bfloat16_u_t bfloat16_data[4]; + bfloat16_data[0].u16 = 0xABAB; + bfloat16_data[1].u16 = 0x3210; + bfloat16_data[2].u16 = 0xCAFE; + bfloat16_data[3].u16 = 0x1234; + + bfloat16_t buffer_vld2_lane_bfloat16x2 [2] = + { bfloat16_data[0].bf16, + bfloat16_data[1].bf16 }; + bfloat16_t buffer_vld3_lane_bfloat16x3 [3] = + { bfloat16_data[0].bf16, + bfloat16_data[1].bf16, + bfloat16_data[2].bf16 }; + bfloat16_t buffer_vld4_lane_bfloat16x4 [4] = + { bfloat16_data[0].bf16, + bfloat16_data[1].bf16, + bfloat16_data[2].bf16, + bfloat16_data[3].bf16 }; + + /* In this case, input variables are arrays of vectors. */ +#define DECL_VSTX_LANE(T1, W, N, X) \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \ + VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \ + VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N] + + /* We need to use a temporary result buffer (result_bis), because + the one used for other tests is not large enough. A subset of the + result data is moved from result_bis to result, and it is this + subset which is used to check the actual behavior. The next + macro enables to move another chunk of data from result_bis to + result. */ + /* We also use another extra input buffer (buffer_src), which we + fill with 0xAA, and which it used to load a vector from which we + read a given lane. */ +#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \ + memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \ + sizeof(VECT_VAR(buffer_src, T1, W, N))); \ + memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \ + sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \ + \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \ + vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \ + \ + VECT_ARRAY_VAR(vector, T1, W, N, X) = \ + /* Use dedicated init buffer, of size X. */ \ + vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \ + VECT_ARRAY_VAR(vector_src, T1, W, N, X), \ + L); \ + vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ + VECT_ARRAY_VAR(vector, T1, W, N, X), \ + L); \ + memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \ + sizeof(VECT_VAR(result, T1, W, N))); + + /* Overwrite "result" with the contents of "result_bis"[Y]. */ +#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \ + memcpy(VECT_VAR(result, T1, W, N), \ + &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \ + sizeof(VECT_VAR(result, T1, W, N))); + +#define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L] + + DECL_VSTX_LANE(bfloat, 16, 4, 2); + DECL_VSTX_LANE(bfloat, 16, 8, 2); + DECL_VSTX_LANE(bfloat, 16, 4, 3); + DECL_VSTX_LANE(bfloat, 16, 8, 3); + DECL_VSTX_LANE(bfloat, 16, 4, 4); + DECL_VSTX_LANE(bfloat, 16, 8, 4); + + DUMMY_ARRAY(buffer_src, bfloat, 16, 4, 4); + DUMMY_ARRAY(buffer_src, bfloat, 16, 8, 4); + + /* Check vst2_lane/vst2q_lane. */ + clean_results (); + TEST_VSTX_LANE(, bfloat, bf, 16, 4, 2, 2); + TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 2, 6); + +#undef CMT +#define CMT " (chunk 0)" +#undef TEST_MSG +#define TEST_MSG "VST2_LANE/VST2Q_LANE" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_0, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_0, CMT); + TEST_EXTRA_CHUNK(bfloat, 16, 4, 2, 1); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 2, 1); + +#undef CMT +#define CMT " (chunk 1)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_1, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_1, CMT); + + /* Check vst3_lane/vst3q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VST3_LANE/VST3Q_LANE" + TEST_VSTX_LANE(, bfloat, bf, 16, 4, 3, 2); + TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 3, 6); + +#undef CMT +#define CMT " (chunk 0)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_0, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_0, CMT); + + TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 1); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 1); + + +#undef CMT +#define CMT " (chunk 1)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_1, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_1, CMT); + + TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 2); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 2); + +#undef CMT +#define CMT " (chunk 2)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_2, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_2, CMT); + + /* Check vst4_lane/vst4q_lane. */ + clean_results (); +#undef TEST_MSG +#define TEST_MSG "VST4_LANE/VST4Q_LANE" + TEST_VSTX_LANE(, bfloat, bf, 16, 4, 4, 2); + TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 4, 6); + +#undef CMT +#define CMT " (chunk 0)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_0, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_0, CMT); + + TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 1); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 1); + +#undef CMT +#define CMT " (chunk 1)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_1, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_1, CMT); + + TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 2); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 2); + +#undef CMT +#define CMT " (chunk 2)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_2, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_2, CMT); + + TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 3); + TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 3); + +#undef CMT +#define CMT " (chunk 3)" + CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_3, CMT); + CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_3, CMT); +} + +int main (void) +{ + exec_vstX_lane (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c new file mode 100644 index 00000000000..f70c34dbd83 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstN_lane_2.c @@ -0,0 +1,52 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O2 --save-temps" } */ + +#include <arm_neon.h> + +void +test_vst2_lane_bf16 (bfloat16_t *ptr, bfloat16x4x2_t b) +{ + vst2_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "st2\\t{v2.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +void +test_vst2q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x2_t b) +{ + vst2q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "st2\\t{v0.h - v1.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +void +test_vst3_lane_bf16 (bfloat16_t *ptr, bfloat16x4x3_t b) +{ + vst3_lane_bf16 (ptr, b, 2); +} + +void +test_vst3q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x3_t b) +{ + vst3q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "st3\\t{v4.h - v6.h}\\\[2\\\], \\\[x0\\\]" 2 } } */ + +void +test_vst4_lane_bf16 (bfloat16_t *ptr, bfloat16x4x4_t b) +{ + vst4_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "st4\\t{v4.h - v7.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ + +void +test_vst4q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x4_t b) +{ + vst4q_lane_bf16 (ptr, b, 2); +} + +/* { dg-final { scan-assembler-times "st4\\t{v0.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c new file mode 100644 index 00000000000..00e3b9bfe5f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst2_lane_bf16 (bfloat16_t * p, bfloat16x4x2_t v) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst2_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst2_lane_bf16 (p, v, -1); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..e39968008f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst2q_lane_bf16 (bfloat16_t * p, bfloat16x8x2_t v) +{ + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst2q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst2q_lane_bf16 (p, v, -1); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c new file mode 100644 index 00000000000..9dcba196791 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst3_lane_bf16 (bfloat16_t * p, bfloat16x4x3_t v) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst3_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst3_lane_bf16 (p, v, -1); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..f9aa76a06dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst3q_lane_bf16 (bfloat16_t * p, bfloat16x8x3_t v) +{ + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst3q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst3q_lane_bf16 (p, v, -1); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c new file mode 100644 index 00000000000..405b583aabd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst4_lane_bf16 (bfloat16_t * p, bfloat16x4x4_t v) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst4_lane_bf16 (p, v, 4); + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + vst4_lane_bf16 (p, v, -1); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c new file mode 100644 index 00000000000..51e372197b0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c @@ -0,0 +1,16 @@ +#include <arm_neon.h> + +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok { target { arm*-*-* } } } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +void +f_vst4q_lane_bf16 (bfloat16_t * p, bfloat16x8x4_t v) +{ + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst4q_lane_bf16 (p, v, 8); + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + vst4q_lane_bf16 (p, v, -1); + return; +} -- 2.20.1