Some of the comments on the BFMMLA/BFMLAL[BT] patch apply here too.

Delia Burduv <delia.bur...@arm.com> writes:
> This patch adds the Armv8.6-a ACLE intrinsics for bfmmla, bfmlalb and 
> bfmlalt as part of the BFloat16 extension.

That's the other patch :-)

> [...]
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..ff7a1f5f34a19b05eba48dba96c736dfdfdf7bac 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7027,3 +7027,32 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +;; bfcvtn
> +(define_insn "aarch64_bfcvtn<q><mode>"
> +  [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w")
> +        (unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")]
> +                            UNSPEC_BFCVTN))]
> +  "TARGET_BF16_SIMD"
> +  "bfcvtn\\t%0.4h, %1.4s"
> +  [(set_attr "type" "f_cvt")]
> +)
> +

If I've understood the naming convention correctly, the closest type
seems to be "neon_fp_cvt_narrow_s_q".

> +(define_insn "aarch64_bfcvtn2v8bf"
> +  [(set (match_operand:V8BF 0 "register_operand" "=w")
> +        (unspec:V8BF [(match_operand:V8BF 1 "register_operand" "w")
> +                      (match_operand:V4SF 2 "register_operand" "w")]
> +                      UNSPEC_BFCVTN2))]
> +  "TARGET_BF16_SIMD"
> +  "bfcvtn2\\t%0.8h, %2.4s"
> +  [(set_attr "type" "f_cvt")]
> +)

Same here.

The constraint on operand 1 needs to be "0", otherwise operands 1 and 0
could end up in different registers.  You could test for this using
something like:

bfloat16x8_t test_bfcvtnq2_untied (bfloat16x8_t unused, bfloat16x8_t inactive,
                                   float32x4_t a)
{
  return vcvtq_high_bf16_f32 (inactive, a);
}

which, when compiled at -O, should produce something like:

/*
**test_bfcvtnq2_untied:
**      mov     v0\.8h, v1\.8h
**      bfcvtn2 v0\.8h, v2\.4s
**      ret
*/

(Completely untested, the code above is probably wrong.)

> +
> +(define_insn "aarch64_bfcvtbf"
> +  [(set (match_operand:BF 0 "register_operand" "=w")
> +        (unspec:BF [(match_operand:SF 1 "register_operand" "w")]
> +                    UNSPEC_BFCVT))]
> +  "TARGET_BF16_SIMD"

I think this just needs the scalar macro (TARGET_BF16_FP) rather than TARGET_BF16_SIMD.

> +  "bfcvt\\t%h0, %s1"
> +  [(set_attr "type" "f_cvt")]
> +)
> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
> index aedb0972735ce549fac1870bacd1ef3101e8fd26..1b9ab3690d35e153cd4f24b9e3bbb5b4cc4b4f4d 100644
> --- a/gcc/config/aarch64/arm_bf16.h
> +++ b/gcc/config/aarch64/arm_bf16.h
> @@ -34,7 +34,15 @@
>  #ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>  
>  typedef __bf16 bfloat16_t;
> -
> +typedef float float32_t;
> +
> +__extension__ extern __inline bfloat16_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vcvth_bf16_f32 \
> +      (float32_t __a)

No need for the line break here.
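
I.e. just:

__extension__ extern __inline bfloat16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_bf16_f32 (float32_t __a)
{
  return __builtin_aarch64_bfcvtbf (__a);
}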

> +{
> +  return __builtin_aarch64_bfcvtbf (__a);
> +}
>  
>  #endif
>  #pragma GCC pop_options
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 6cdbf381f0156ed993f03b847228b36ebbdd14f8..120f4b7d8827aee51834e75aeaa6ab8f8451980e 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34610,6 +34610,35 @@ vrnd64xq_f64 (float64x2_t __a)
>  
>  #include "arm_bf16.h"
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
> +
> +__extension__ extern __inline bfloat16x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vcvt_bf16_f32 (float32x4_t __a)
> +{
> +  return __builtin_aarch64_bfcvtnv4bf (__a);
> +
> +}

Nit: extra blank line.

> +
> +__extension__ extern __inline bfloat16x8_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vcvtq_low_bf16_f32 (float32x4_t __a)
> +{
> +  return __builtin_aarch64_bfcvtn_qv8bf (__a);
> +}
> +
> +__extension__ extern __inline bfloat16x8_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a)
> +{
> +  return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
> +}
> +
> +#endif
> +#pragma GCC pop_options
> +
>  #pragma GCC pop_options
>  
>  #undef __aarch64_vget_lane_any
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 931166da5e47302afe810498eea9c8c2ab89b9de..f9f0bafb1eca4da42e564224fca1fd43d89f6ed1 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -431,6 +431,9 @@
>  ;; SVE predicate modes that control 16-bit, 32-bit or 64-bit elements.
>  (define_mode_iterator PRED_HSD [VNx8BI VNx4BI VNx2BI])
>  
> +;; Bfloat16 modes to which V4SF can be converted
> +(define_mode_iterator V4SF_TO_BF [V4BF V8BF])
> +
>  ;; ------------------------------------------------------------------
>  ;; Unspec enumerations for Advance SIMD. These could well go into
>  ;; aarch64.md but for their use in int_iterators here.
> @@ -673,6 +676,9 @@
>      UNSPEC_UMULHS    ; Used in aarch64-sve2.md.
>      UNSPEC_UMULHRS   ; Used in aarch64-sve2.md.
>      UNSPEC_ASRD              ; Used in aarch64-sve.md.
> +    UNSPEC_BFCVTN    ; Used in aarch64-simd.md.
> +    UNSPEC_BFCVTN2   ; Used in aarch64-simd.md.
> +    UNSPEC_BFCVT     ; Used in aarch64-simd.md.
>  ])
>  
>  ;; ------------------------------------------------------------------
> diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
> index df39522f2ad63a52c910b1a6bcc7aa13aaf5d021..dbcb4d58798d7f51b1b8310cd446c58317d7b50d 100644
> --- a/gcc/config/arm/types.md
> +++ b/gcc/config/arm/types.md
> @@ -1097,7 +1097,8 @@
>    crypto_sm4,\
>    coproc,\
>    tme,\
> -  memtag"
> +  memtag,\
> +  bf_cvt"

This new "bf_cvt" type doesn't seem to be used anywhere, so it looks
like it can be dropped.

> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ebe5b578c1fa82a6f2a166d55c7dc7e905b87135
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
> @@ -0,0 +1,56 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon } */
> +/* { dg-additional-options "-save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**test_bfcvtn:
> +**   ...
> +**   bfcvtn\tv[0-9]+.4h, v[0-9]+.4s
> +**   ...
> +*/
> +bfloat16x4_t test_bfcvtn (float32x4_t a)
> +{
> +  return vcvt_bf16_f32 (a);
> +}
> +
> +/*
> +**test_bfcvtnq:
> +**   ...
> +**   bfcvtn  v[0-9]+.4h, v[0-9]+.4s
> +**   ...
> +*/
> +bfloat16x8_t test_bfcvtnq (float32x4_t a)
> +{
> +  return vcvtq_low_bf16_f32 (a);
> +}
> +
> +/*
> +**test_bfcvtnq2:
> +**   ...
> +**   bfcvtn  v[0-9]+.4h, v[0-9]+.4s
> +**   ...
> +*/
> +bfloat16x8_t test_bfcvtnq2 (bfloat16x8_t inactive, float32x4_t a)
> +{
> +  return vcvtq_high_bf16_f32 (inactive, a);
> +}
> +
> +/*
> +**test_bfcvt:
> +**   ...
> +**   bfcvt   h[0-9]+, s[0-9]+
> +**   ...
> +*/
> +bfloat16_t test_bfcvt (float32_t a)
> +{
> +  return vcvth_bf16_f32 (a);
> +}
> +
> +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */
> +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */
> +/* { dg-final { scan-assembler {bfcvtn\tv[0-9]+.4h, v[0-9]+.4s} } } */
> +/* { dg-final { scan-assembler {bfcvt\th[0-9]+, s[0-9]+} } } */

Same comments as for the BFMMLA/BFMLAL[BT] tests.

As well as testing all these combinations for the SIMD case,
it would be good to have a direct arm_bf16.h-only test for:

#pragma GCC target "arch=armv8.2-a+bf16+nosimd"

test_bfcvt should still work in that case.
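
Completely untested, but I'm imagining something like the following
(the dg- directives are borrowed from bfcvt-compile.c above and might
need tweaking for the nosimd case):

/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-additional-options "-save-temps" } */

#pragma GCC target "arch=armv8.2-a+bf16+nosimd"

#include <arm_bf16.h>

/* The scalar conversion should only need scalar BF16 support,
   so this should still compile to a single bfcvt.  */
bfloat16_t test_bfcvt (float32_t a)
{
  return vcvth_bf16_f32 (a);
}

/* { dg-final { scan-assembler {bfcvt\th[0-9]+, s[0-9]+} } } */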

It would also be good to have a test that checks that test_bfcvt
reports an appropriate error when compiled after:

#pragma GCC target "arch=armv8.2-a+nobf16"
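
Again completely untested, and the exact wording and location of the
expected error would need checking, but something along these lines:

/* { dg-do compile { target { aarch64*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

#include <arm_bf16.h>

#pragma GCC target "arch=armv8.2-a+nobf16"

bfloat16_t test_bfcvt (float32_t a)
{
  /* The empty dg-error string accepts any diagnostic; the real test
     should match the specific message once we know what it is.  */
  return vcvth_bf16_f32 (a); /* { dg-error "" } */
}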

Thanks,
Richard
