Some of the comments on the BFMMLA/BFMLA[LT] patch apply here too. Delia Burduv <delia.bur...@arm.com> writes: > This patch adds the Armv8.6-a ACLE intrinsics for bfmmla, bfmlalb and > bfmlalt as part of the BFloat16 extension.
That's the other patch :-) > [...] > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..ff7a1f5f34a19b05eba48dba96c736dfdfdf7bac > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7027,3 +7027,32 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +;; bfcvtn > +(define_insn "aarch64_bfcvtn<q><mode>" > + [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w") > + (unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")] > + UNSPEC_BFCVTN))] > + "TARGET_BF16_SIMD" > + "bfcvtn\\t%0.4h, %1.4s" > + [(set_attr "type" "f_cvt")] > +) > + If I've understood the naming convention correctly, the closest type seems to be "neon_fp_cvt_narrow_s_q". > +(define_insn "aarch64_bfcvtn2v8bf" > + [(set (match_operand:V8BF 0 "register_operand" "=w") > + (unspec:V8BF [(match_operand:V8BF 1 "register_operand" "w") > + (match_operand:V4SF 2 "register_operand" "w")] > + UNSPEC_BFCVTN2))] > + "TARGET_BF16_SIMD" > + "bfcvtn2\\t%0.8h, %2.4s" > + [(set_attr "type" "f_cvt")] > +) Same here. The constraint on operand 1 needs to be "0", otherwise operands 1 and 0 could end up in different registers. You could test for this using something like: bfloat16x8_t test_bfcvtnq2_untied (bfloat16x8_t unused, bfloat16x8_t inactive, float32x4_t a) { return vcvtq_high_bf16_f32 (inactive, a); } which when compiled at -O should produce something like: /* **test_bfcvtnq2_untied: ** mov v0\.8h, v1\.8h ** bfcvtn2 v0\.8h, v2\.4s ** ret */ (Completely untested, the code above is probably wrong.) > + > +(define_insn "aarch64_bfcvtbf" > + [(set (match_operand:BF 0 "register_operand" "=w") > + (unspec:BF [(match_operand:SF 1 "register_operand" "w")] > + UNSPEC_BFCVT))] > + "TARGET_BF16_SIMD" I think this just needs the scalar macro rather than *_SIMD. > + "bfcvt\\t%h0, %s1" > + [(set_attr "type" "f_cvt")] > +) > diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h > index > aedb0972735ce549fac1870bacd1ef3101e8fd26..1b9ab3690d35e153cd4f24b9e3bbb5b4cc4b4f4d > 100644 > --- a/gcc/config/aarch64/arm_bf16.h > +++ b/gcc/config/aarch64/arm_bf16.h > @@ -34,7 +34,15 @@ > #ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC > > typedef __bf16 bfloat16_t; > - > +typedef float float32_t; > + > +__extension__ extern __inline bfloat16_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vcvth_bf16_f32 \ > + (float32_t __a) No need for the line break here. > +{ > + return __builtin_aarch64_bfcvtbf (__a); > +} > > #endif > #pragma GCC pop_options > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > 6cdbf381f0156ed993f03b847228b36ebbdd14f8..120f4b7d8827aee51834e75aeaa6ab8f8451980e > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34610,6 +34610,35 @@ vrnd64xq_f64 (float64x2_t __a) > > #include "arm_bf16.h" > > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+bf16") > +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC > + > +__extension__ extern __inline bfloat16x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vcvt_bf16_f32 (float32x4_t __a) > +{ > + return __builtin_aarch64_bfcvtnv4bf (__a); > + > +} Nit: extra blank line. > + > +__extension__ extern __inline bfloat16x8_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vcvtq_low_bf16_f32 (float32x4_t __a) > +{ > + return __builtin_aarch64_bfcvtn_qv8bf (__a); > +} > + > +__extension__ extern __inline bfloat16x8_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a) > +{ > + return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a); > +} > + > +#endif > +#pragma GCC pop_options > + > #pragma GCC pop_options > > #undef __aarch64_vget_lane_any > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > 931166da5e47302afe810498eea9c8c2ab89b9de..f9f0bafb1eca4da42e564224fca1fd43d89f6ed1 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -431,6 +431,9 @@ > ;; SVE predicate modes that control 16-bit, 32-bit or 64-bit elements. > (define_mode_iterator PRED_HSD [VNx8BI VNx4BI VNx2BI]) > > +;; Bfloat16 modes to which V4SF can be converted > +(define_mode_iterator V4SF_TO_BF [V4BF V8BF]) > + > ;; ------------------------------------------------------------------ > ;; Unspec enumerations for Advance SIMD. These could well go into > ;; aarch64.md but for their use in int_iterators here. > @@ -673,6 +676,9 @@ > UNSPEC_UMULHS ; Used in aarch64-sve2.md. > UNSPEC_UMULHRS ; Used in aarch64-sve2.md. > UNSPEC_ASRD ; Used in aarch64-sve.md. > + UNSPEC_BFCVTN ; Used in aarch64-simd.md. > + UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. > + UNSPEC_BFCVT ; Used in aarch64-simd.md. > ]) > > ;; ------------------------------------------------------------------ > diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md > index > df39522f2ad63a52c910b1a6bcc7aa13aaf5d021..dbcb4d58798d7f51b1b8310cd446c58317d7b50d > 100644 > --- a/gcc/config/arm/types.md > +++ b/gcc/config/arm/types.md > @@ -1097,7 +1097,8 @@ > crypto_sm4,\ > coproc,\ > tme,\ > - memtag" > + memtag,\ > + bf_cvt" This doesn't seem to be used. > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..ebe5b578c1fa82a6f2a166d55c7dc7e905b87135 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c > @@ -0,0 +1,56 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "-save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include <arm_neon.h> > + > +/* > +**test_bfcvtn: > +** ... > +** bfcvtn\tv[0-9]+.4h, v[0-9]+.4s > +** ... > +*/ > +bfloat16x4_t test_bfcvtn (float32x4_t a) > +{ > + return vcvt_bf16_f32 (a); > +} > + > +/* > +**test_bfcvtnq: > +** ... > +** bfcvtn v[0-9]+.4h, v[0-9]+.4s > +** ... > +*/ > +bfloat16x8_t test_bfcvtnq (float32x4_t a) > +{ > + return vcvtq_low_bf16_f32 (a); > +} > + > +/* > +**test_bfcvtnq2: > +** ... > +** bfcvtn v[0-9]+.4h, v[0-9]+.4s > +** ... > +*/ > +bfloat16x8_t test_bfcvtnq2 (bfloat16x8_t inactive, float32x4_t a) > +{ > + return vcvtq_high_bf16_f32 (inactive, a); > +} > + > +/* > +**test_bfcvt: > +** ... > +** bfcvt h[0-9]+, s[0-9]+ > +** ... > +*/ > +bfloat16_t test_bfcvt (float32_t a) > +{ > + return vcvth_bf16_f32 (a); > +} > + > +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */ > +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */ > +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */ > +/* { dg-final { scan-assembler {bfcvt\th[0-9]+, s[0-9]+} } } */ Same comments as for the BFMMLA/BFMLA[BT] tests. As well as testing all these combinations for the SIMD case, it would be good to have a direct arm_bf16.h-only test for: #pragma GCC target "arch=armv8.2-a+bf16+nosimd" test_bfcvt should still work in that case. It would also be good to have a test that test_bfcvt reports an appropriate error if compiled after: #pragma GCC target "arch=armv8.2-a+nobf16" Thanks, Richard