On Mon, May 16, 2016 at 10:09:26AM +0100, Jiong Wang wrote: > The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are > missing in current gcc arm_neon.h. > > Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can > also comes from "(fma (vec_dup(scalar" where the scalar value is already > sitting in vector register then duplicated to other lanes, and there is > no lane size change. > > This patch implement this and can generate better code under some > context. For example: > > cat test.c > === > typedef __Float32x2_t float32x2_t; > typedef float float32_t; > > float32x2_t > vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) > { > return __builtin_aarch64_fmav2sf (__b, (float32x2_t) {__c, > __c}, __a); > } > > before (-O2) > === > vfma_n_f32: > dup v2.2s, v2.s[0] > fmla v0.2s, v1.2s, v2.2s > ret > after > === > vfma_n_f32: > fmla v0.2s, v1.2s, v2.s[0] > ret > > OK for trunk? > > 2016-05-16 Jiong Wang <jiong.w...@arm.com>
This ChangeLog entry is not correctly formatted. There should be two spaces between your name and your email, and each line should start with a tab. > > gcc/ > * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename > to *aarch64_fma4_elt_from_dup<mode>. > (*aarch64_fnma4_elt_to_128df): Rename to > *aarch64_fnma4_elt_from_dup<mode>. > * config/aarch64/arm_neon.h (vfma_n_f64): New. > (vfms_n_f32): Likewise. > (vfms_n_f64): Likewise. > (vfmsq_n_f32): Likewise. > (vfmsq_n_f64): Likewise. > > gcc/testsuite/ > * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use > standard syntax. > * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise. The paths of these two entries are incorrect. Remove the gcc/testsuite from the front. I don't understand what you mean by "Use standard syntax.", please fix this to describe what you are actually changing. > * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry > for float64x1. > * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New. These two changes need approval from an ARM maintainer as they are in common files. From an AArch64 perspective, this patch is OK with a fixed ChangeLog. Please wait for an ARM OK for the test changes. 
Thanks, James > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -1579,16 +1579,16 @@ > [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] > ) > > -(define_insn "*aarch64_fma4_elt_to_128df" > - [(set (match_operand:V2DF 0 "register_operand" "=w") > - (fma:V2DF > - (vec_duplicate:V2DF > - (match_operand:DF 1 "register_operand" "w")) > - (match_operand:V2DF 2 "register_operand" "w") > - (match_operand:V2DF 3 "register_operand" "0")))] > +(define_insn "*aarch64_fma4_elt_from_dup<mode>" > + [(set (match_operand:VMUL 0 "register_operand" "=w") > + (fma:VMUL > + (vec_duplicate:VMUL > + (match_operand:<VEL> 1 "register_operand" "w")) > + (match_operand:VMUL 2 "register_operand" "w") > + (match_operand:VMUL 3 "register_operand" "0")))] > "TARGET_SIMD" > - "fmla\\t%0.2d, %2.2d, %1.2d[0]" > - [(set_attr "type" "neon_fp_mla_d_scalar_q")] > + "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" > + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] > ) > > (define_insn "*aarch64_fma4_elt_to_64v2df" > @@ -1656,17 +1656,17 @@ > [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] > ) > > -(define_insn "*aarch64_fnma4_elt_to_128df" > - [(set (match_operand:V2DF 0 "register_operand" "=w") > - (fma:V2DF > - (neg:V2DF > - (match_operand:V2DF 2 "register_operand" "w")) > - (vec_duplicate:V2DF > - (match_operand:DF 1 "register_operand" "w")) > - (match_operand:V2DF 3 "register_operand" "0")))] > - "TARGET_SIMD" > - "fmls\\t%0.2d, %2.2d, %1.2d[0]" > - [(set_attr "type" "neon_fp_mla_d_scalar_q")] > +(define_insn "*aarch64_fnma4_elt_from_dup<mode>" > + [(set (match_operand:VMUL 0 "register_operand" "=w") > + (fma:VMUL > + (neg:VMUL > + (match_operand:VMUL 2 "register_operand" "w")) > + (vec_duplicate:VMUL > + (match_operand:<VEL> 1 "register_operand" "w")) > + 
(match_operand:VMUL 3 "register_operand" "0")))] > + "TARGET_SIMD" > + "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]" > + [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")] > ) > > (define_insn "*aarch64_fnma4_elt_to_64v2df" > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, > float32_t __c) > return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); > } > > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) > +{ > + return (float64x1_t) {__b[0] * __c + __a[0]}; > +} > + > __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) > { > @@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, > float64x2_t __c) > return __builtin_aarch64_fmav2df (-__b, __c, __a); > } > > +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) > +vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) > +{ > + return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a); > +} > + > +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) > +vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c) > +{ > + return (float64x1_t) {-__b[0] * __c + __a[0]}; > +} > + > +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) > +vfmsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) > +{ > + return __builtin_aarch64_fmav4sf (-__b, vdupq_n_f32 (__c), __a); > +} > + > +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) > +vfmsq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) > +{ > + return 
__builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); > +} > > /* vfms_lane */ > > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > index > 49fbd843e507ede8aa81d02c175a82a1221750a4..cf90825f87391b72aca9a29980210d21f4321c04 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h > @@ -136,6 +136,7 @@ static ARRAY(result, poly, 16, 4); > #if defined (__ARM_FP16_FORMAT_IEEE) || defined > (__ARM_FP16_FORMAT_ALTERNATIVE) > static ARRAY(result, float, 16, 4); > #endif > +static ARRAY(result, float, 64, 1); > static ARRAY(result, float, 32, 2); > static ARRAY(result, int, 8, 16); > static ARRAY(result, int, 16, 8); > @@ -169,6 +170,7 @@ extern ARRAY(expected, poly, 8, 8); > extern ARRAY(expected, poly, 16, 4); > extern ARRAY(expected, hfloat, 16, 4); > extern ARRAY(expected, hfloat, 32, 2); > +extern ARRAY(expected, hfloat, 64, 1); > extern ARRAY(expected, int, 8, 16); > extern ARRAY(expected, int, 16, 8); > extern ARRAY(expected, int, 32, 4); > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..26223763c59c849607b5320f6ec37098a556ce2e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c > @@ -0,0 +1,490 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +#define A0 123.4f > +#define A1 -3.8f > +#define A2 -29.4f > +#define A3 (__builtin_inff ()) > +#define A4 0.0f > +#define A5 24.0f > +#define A6 124.0f > +#define A7 1024.0f > + > +#define B0 -5.8f > +#define B1 -0.0f > +#define B2 -10.8f > +#define B3 10.0f > +#define B4 23.4f > +#define B5 -1234.8f > +#define B6 8.9f > +#define B7 4.0f > + > +#define E0 
9.8f > +#define E1 -1024.0f > +#define E2 (-__builtin_inff ()) > +#define E3 479.0f > +float32_t elem0 = E0; > +float32_t elem1 = E1; > +float32_t elem2 = E2; > +float32_t elem3 = E3; > + > +#define DA0 1231234.4 > +#define DA1 -3.8 > +#define DA2 -2980.4 > +#define DA3 -5.8 > +#define DA4 0.01123 > +#define DA5 24.0 > +#define DA6 124.12345 > +#define DA7 1024.0 > + > +#define DB0 -5.8 > +#define DB1 (__builtin_inf ()) > +#define DB2 -105.8 > +#define DB3 10.0 > +#define DB4 (-__builtin_inf ()) > +#define DB5 -1234.8 > +#define DB6 848.9 > +#define DB7 44444.0 > + > +#define DE0 9.8 > +#define DE1 -1024.0 > +#define DE2 105.8 > +#define DE3 479.0 > +float64_t delem0 = DE0; > +float64_t delem1 = DE1; > +float64_t delem2 = DE2; > +float64_t delem3 = DE3; > + > +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) > + > +/* Expected results for vfms_n. */ > + > +VECT_VAR_DECL(expectedfms0, float, 32, 2) [] = {A0 + -B0 * E0, A1 + -B1 * > E0}; > +VECT_VAR_DECL(expectedfms1, float, 32, 2) [] = {A2 + -B2 * E1, A3 + -B3 * > E1}; > +VECT_VAR_DECL(expectedfms2, float, 32, 2) [] = {A4 + -B4 * E2, A5 + -B5 * > E2}; > +VECT_VAR_DECL(expectedfms3, float, 32, 2) [] = {A6 + -B6 * E3, A7 + -B7 * > E3}; > +VECT_VAR_DECL(expectedfma0, float, 32, 2) [] = {A0 + B0 * E0, A1 + B1 * E0}; > +VECT_VAR_DECL(expectedfma1, float, 32, 2) [] = {A2 + B2 * E1, A3 + B3 * E1}; > +VECT_VAR_DECL(expectedfma2, float, 32, 2) [] = {A4 + B4 * E2, A5 + B5 * E2}; > +VECT_VAR_DECL(expectedfma3, float, 32, 2) [] = {A6 + B6 * E3, A7 + B7 * E3}; > + > +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR 
(expectedfms3, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 2); > +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 2) = > + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 2); > + > + > +VECT_VAR_DECL(expectedfms0, float, 32, 4) [] = {A0 + -B0 * E0, A1 + -B1 * E0, > + A2 + -B2 * E0, A3 + -B3 * E0}; > +VECT_VAR_DECL(expectedfms1, float, 32, 4) [] = {A4 + -B4 * E1, A5 + -B5 * E1, > + A6 + -B6 * E1, A7 + -B7 * E1}; > +VECT_VAR_DECL(expectedfms2, float, 32, 4) [] = {A0 + -B0 * E2, A2 + -B2 * E2, > + A4 + -B4 * E2, A6 + -B6 * E2}; > +VECT_VAR_DECL(expectedfms3, float, 32, 4) [] = {A1 + -B1 * E3, A3 + -B3 * E3, > + A5 + -B5 * E3, A7 + -B7 * E3}; > +VECT_VAR_DECL(expectedfma0, float, 32, 4) [] = {A0 + B0 * E0, A1 + B1 * E0, > + A2 + B2 * E0, A3 + B3 * E0}; > +VECT_VAR_DECL(expectedfma1, float, 32, 4) [] = {A4 + B4 * E1, A5 + B5 * E1, > + A6 + B6 * E1, A7 + B7 * E1}; > +VECT_VAR_DECL(expectedfma2, float, 32, 4) [] = {A0 + B0 * E2, A2 + B2 * E2, > + A4 + B4 * E2, A6 + B6 * E2}; > +VECT_VAR_DECL(expectedfma3, float, 32, 4) [] = {A1 + B1 * E3, A3 + B3 * E3, > + A5 + B5 * E3, A7 + B7 * E3}; > + > +hfloat32_t * VECT_VAR (expectedfms0_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms0, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms1_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms1, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms2_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms2, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfms3_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfms3, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma0_static, hfloat, 32, 4) = > + 
(hfloat32_t *) VECT_VAR (expectedfma0, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma1_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma1, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma2_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma2, float, 32, 4); > +hfloat32_t * VECT_VAR (expectedfma3_static, hfloat, 32, 4) = > + (hfloat32_t *) VECT_VAR (expectedfma3, float, 32, 4); > + > +VECT_VAR_DECL(expectedfms0, float, 64, 2) [] = {DA0 + -DB0 * DE0, > + DA1 + -DB1 * DE0}; > +VECT_VAR_DECL(expectedfms1, float, 64, 2) [] = {DA2 + -DB2 * DE1, > + DA3 + -DB3 * DE1}; > +VECT_VAR_DECL(expectedfms2, float, 64, 2) [] = {DA4 + -DB4 * DE2, > + DA5 + -DB5 * DE2}; > +VECT_VAR_DECL(expectedfms3, float, 64, 2) [] = {DA6 + -DB6 * DE3, > + DA7 + -DB7 * DE3}; > +VECT_VAR_DECL(expectedfma0, float, 64, 2) [] = {DA0 + DB0 * DE0, > + DA1 + DB1 * DE0}; > +VECT_VAR_DECL(expectedfma1, float, 64, 2) [] = {DA2 + DB2 * DE1, > + DA3 + DB3 * DE1}; > +VECT_VAR_DECL(expectedfma2, float, 64, 2) [] = {DA4 + DB4 * DE2, > + DA5 + DB5 * DE2}; > +VECT_VAR_DECL(expectedfma3, float, 64, 2) [] = {DA6 + DB6 * DE3, > + DA7 + DB7 * DE3}; > +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 2); > +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma2, float, 
64, 2); > +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 2) = > + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 2); > + > +VECT_VAR_DECL(expectedfms0, float, 64, 1) [] = {DA0 + -DB0 * DE0}; > +VECT_VAR_DECL(expectedfms1, float, 64, 1) [] = {DA2 + -DB2 * DE1}; > +VECT_VAR_DECL(expectedfms2, float, 64, 1) [] = {DA4 + -DB4 * DE2}; > +VECT_VAR_DECL(expectedfms3, float, 64, 1) [] = {DA6 + -DB6 * DE3}; > +VECT_VAR_DECL(expectedfma0, float, 64, 1) [] = {DA0 + DB0 * DE0}; > +VECT_VAR_DECL(expectedfma1, float, 64, 1) [] = {DA2 + DB2 * DE1}; > +VECT_VAR_DECL(expectedfma2, float, 64, 1) [] = {DA4 + DB4 * DE2}; > +VECT_VAR_DECL(expectedfma3, float, 64, 1) [] = {DA6 + DB6 * DE3}; > + > +hfloat64_t * VECT_VAR (expectedfms0_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms0, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms1_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms1, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms2_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms2, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfms3_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfms3, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma0_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma0, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma1_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma1, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma2_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma2, float, 64, 1); > +hfloat64_t * VECT_VAR (expectedfma3_static, hfloat, 64, 1) = > + (hfloat64_t *) VECT_VAR (expectedfma3, float, 64, 1); > + > +void exec_vfma_vfms_n (void) > +{ > +#undef TEST_MSG > +#define TEST_MSG "VFMS_VFMA_N (FP32)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 32, 2); > + DECL_VARIABLE(vsrc_2, float, 32, 2); > + VECT_VAR_DECL (buf_src_1, float, 32, 2) [] = {A0, A1}; > + VECT_VAR_DECL (buf_src_2, float, 32, 2) [] = {B0, B1}; > 
+ VLOAD (vsrc_1, buf_src_1, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_2, , float, f, 32, 2); > + DECL_VARIABLE (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem0); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem0); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 32, 2) [] = {A2, A3}; > + VECT_VAR_DECL (buf_src_4, float, 32, 2) [] = {B2, B3}; > + VLOAD (vsrc_1, buf_src_3, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_4, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem1); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem1); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 32, 2) [] = {A4, A5}; > + VECT_VAR_DECL (buf_src_6, float, 32, 2) [] = {B4, B5}; > + VLOAD (vsrc_1, buf_src_5, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_6, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem2); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, 
PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem2); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 32, 2) [] = {A6, A7}; > + VECT_VAR_DECL (buf_src_8, float, 32, 2) [] = {B6, B7}; > + VLOAD (vsrc_1, buf_src_7, , float, f, 32, 2); > + VLOAD (vsrc_2, buf_src_8, , float, f, 32, 2); > + VECT_VAR (vector_res, float, 32, 2) = > + vfms_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem3); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 32, 2) = > + vfma_n_f32 (VECT_VAR (vsrc_1, float, 32, 2), > + VECT_VAR (vsrc_2, float, 32, 2), elem3); > + vst1_f32 (VECT_VAR (result, float, 32, 2), > + VECT_VAR (vector_res, float, 32, 2)); > + CHECK_FP (TEST_MSG, float, 32, 2, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMSQ_VFMAQ_N (FP32)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 32, 4); > + DECL_VARIABLE(vsrc_2, float, 32, 4); > + VECT_VAR_DECL (buf_src_1, float, 32, 4) [] = {A0, A1, A2, A3}; > + VECT_VAR_DECL (buf_src_2, float, 32, 4) [] = {B0, B1, B2, B3}; > + VLOAD (vsrc_1, buf_src_1, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_2, q, float, f, 32, 4); > + DECL_VARIABLE (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem0); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem0); > 
+ vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 32, 4) [] = {A4, A5, A6, A7}; > + VECT_VAR_DECL (buf_src_4, float, 32, 4) [] = {B4, B5, B6, B7}; > + VLOAD (vsrc_1, buf_src_3, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_4, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem1); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem1); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 32, 4) [] = {A0, A2, A4, A6}; > + VECT_VAR_DECL (buf_src_6, float, 32, 4) [] = {B0, B2, B4, B6}; > + VLOAD (vsrc_1, buf_src_5, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_6, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem2); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem2); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 32, 4) [] = {A1, A3, A5, A7}; > + VECT_VAR_DECL (buf_src_8, float, 32, 4) [] = {B1, B3, B5, B7}; > + VLOAD (vsrc_1, 
buf_src_7, q, float, f, 32, 4); > + VLOAD (vsrc_2, buf_src_8, q, float, f, 32, 4); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmsq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem3); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 32, 4) = > + vfmaq_n_f32 (VECT_VAR (vsrc_1, float, 32, 4), > + VECT_VAR (vsrc_2, float, 32, 4), elem3); > + vst1q_f32 (VECT_VAR (result, float, 32, 4), > + VECT_VAR (vector_res, float, 32, 4)); > + CHECK_FP (TEST_MSG, float, 32, 4, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMSQ_VFMAQ_N (FP64)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 64, 2); > + DECL_VARIABLE(vsrc_2, float, 64, 2); > + VECT_VAR_DECL (buf_src_1, float, 64, 2) [] = {DA0, DA1}; > + VECT_VAR_DECL (buf_src_2, float, 64, 2) [] = {DB0, DB1}; > + VLOAD (vsrc_1, buf_src_1, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_2, q, float, f, 64, 2); > + DECL_VARIABLE (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem0); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem0); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 64, 2) [] = {DA2, DA3}; > + VECT_VAR_DECL (buf_src_4, float, 64, 2) [] = {DB2, DB3}; > + VLOAD (vsrc_1, buf_src_3, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_4, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 
2), > + VECT_VAR (vsrc_2, float, 64, 2), delem1); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem1); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL (buf_src_5, float, 64, 2) [] = {DA4, DA5}; > + VECT_VAR_DECL (buf_src_6, float, 64, 2) [] = {DB4, DB5}; > + VLOAD (vsrc_1, buf_src_5, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_6, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem2); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem2); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 64, 2) [] = {DA6, DA7}; > + VECT_VAR_DECL (buf_src_8, float, 64, 2) [] = {DB6, DB7}; > + VLOAD (vsrc_1, buf_src_7, q, float, f, 64, 2); > + VLOAD (vsrc_2, buf_src_8, q, float, f, 64, 2); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmsq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), delem3); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 64, 2) = > + vfmaq_n_f64 (VECT_VAR (vsrc_1, float, 64, 2), > + VECT_VAR (vsrc_2, float, 64, 2), 
delem3); > + vst1q_f64 (VECT_VAR (result, float, 64, 2), > + VECT_VAR (vector_res, float, 64, 2)); > + CHECK_FP (TEST_MSG, float, 64, 2, PRIx16, expectedfma3_static, ""); > + > +#undef TEST_MSG > +#define TEST_MSG "VFMS_VFMA_N (FP64)" > + clean_results (); > + > + DECL_VARIABLE(vsrc_1, float, 64, 1); > + DECL_VARIABLE(vsrc_2, float, 64, 1); > + VECT_VAR_DECL (buf_src_1, float, 64, 1) [] = {DA0}; > + VECT_VAR_DECL (buf_src_2, float, 64, 1) [] = {DB0}; > + VLOAD (vsrc_1, buf_src_1, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_2, , float, f, 64, 1); > + DECL_VARIABLE (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem0); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms0_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem0); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma0_static, ""); > + > + VECT_VAR_DECL (buf_src_3, float, 64, 1) [] = {DA2}; > + VECT_VAR_DECL (buf_src_4, float, 64, 1) [] = {DB2}; > + VLOAD (vsrc_1, buf_src_3, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_4, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem1); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms1_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem1); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma1_static, ""); > + > + VECT_VAR_DECL 
(buf_src_5, float, 64, 1) [] = {DA4}; > + VECT_VAR_DECL (buf_src_6, float, 64, 1) [] = {DB4}; > + VLOAD (vsrc_1, buf_src_5, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_6, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem2); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms2_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem2); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma2_static, ""); > + > + VECT_VAR_DECL (buf_src_7, float, 64, 1) [] = {DA6}; > + VECT_VAR_DECL (buf_src_8, float, 64, 1) [] = {DB6}; > + VLOAD (vsrc_1, buf_src_7, , float, f, 64, 1); > + VLOAD (vsrc_2, buf_src_8, , float, f, 64, 1); > + VECT_VAR (vector_res, float, 64, 1) = > + vfms_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem3); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfms3_static, ""); > + VECT_VAR (vector_res, float, 64, 1) = > + vfma_n_f64 (VECT_VAR (vsrc_1, float, 64, 1), > + VECT_VAR (vsrc_2, float, 64, 1), delem3); > + vst1_f64 (VECT_VAR (result, float, 64, 1), > + VECT_VAR (vector_res, float, 64, 1)); > + CHECK_FP (TEST_MSG, float, 64, 1, PRIx16, expectedfma3_static, ""); > +} > +#endif > + > +int > +main (void) > +{ > +#if defined(__aarch64__) && defined(__ARM_FEATURE_FMA) > + exec_vfma_vfms_n (); > +#endif > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > index > 1ba1fed98a0711496815e00d2d702e5bfa2a7d43..5b348827002dcfef1f589900a4cf5ff7ada26697 > 100644 > --- 
a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c > @@ -110,6 +110,6 @@ main (int argc, char **argv) > /* vfmaq_lane_f64. > vfma_laneq_f64. > vfmaq_laneq_f64. */ > -/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ > +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > index > 887ebae10da715c8d301a8494a2225e53f15bd7d..6c194a023d34ebafb4d732edc303985531f92a63 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c > @@ -111,6 +111,6 @@ main (int argc, char **argv) > /* vfmsq_lane_f64. > vfms_laneq_f64. > vfmsq_laneq_f64. */ > -/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ > +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, > v\[0-9\]+\.2?d\\\[\[0-9\]+\\\]" 3 } } */ > > >