Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7028,3 +7028,36 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +(define_insn "aarch64_bfdot<mode>"
> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +     (plus:VDQSF
> +       (unspec:VDQSF
> +        [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
> +         (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
> +         UNSPEC_BFDOT)
> +       (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +
> +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"

Too many blank lines: there should be only one blank line between patterns.

> +  [(set (match_operand:VDQSF 0 "register_operand" "=w")
> +     (plus:VDQSF
> +       (unspec:VDQSF
> +        [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
> +         (match_operand:VBF 3 "register_operand" "w")
> +         (match_operand:SI 4 "const_int_operand" "n")]
> +         UNSPEC_BFDOT)
> +       (match_operand:VDQSF 1 "register_operand" "0")))]
> +  "TARGET_BF16_SIMD"
> +{
> +  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
> +  int lane = INTVAL (operands[4]);
> +  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
> +  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
> +}
> +  [(set_attr "type" "neon_dot<VDQSF:q>")]
> +)
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c
> @@ -0,0 +1,91 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O -save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */

Same comment as for USDOT/SUDOT regarding the dg- markup.

> +
> +#include <arm_neon.h>
> +
> +/*
> +**ufoo:
> +**   bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
> +**   ret
> +*/
> +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}
> +
> +/*
> +**ufooq:
> +**   bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
> +**   ret
> +*/
> +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_f32 (r, x, y);
> +}

The (...|...)s here are correct.

> +
> +/*
> +**ufoo_lane:
> +**   bfdot   v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\])
> +**   ret
> +*/
> +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
> +{
> +  return vbfdot_lane_f32 (r, x, y, 0);
> +}
> +
> +/*
> +**ufooq_laneq:
> +**   bfdot   v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\])
> +**   ret
> +*/
> +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
> +{
> +  return vbfdotq_laneq_f32 (r, x, y, 2);
> +}
> +
> +/*
> +**ufoo_laneq:
> +**   bfdot   v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\])
> +**   ret
> +*/
> +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
> +{
> +  return vbfdot_laneq_f32 (r, x, y, 3);
> +}
> +
> +/*
> +**ufooq_lane:
> +**   bfdot   v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\])
> +**   ret
> +*/
> +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}

But the (...|...)s in these lane tests aren't correct, since the operands
must be in the order given: the indexed operand can't be swapped with the
vector operand.

> +
> +/*
> +**ufoo_untied:
> +**   mov     v0.8b, v1.8b
> +**   bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
> +**   ret
> +*/
> +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, 
> bfloat16x4_t y)
> +{
> +  return vbfdot_f32 (r, x, y);
> +}

Similarly, OK here.

> +
> +/*
> +**ufooq_lane_untied:
> +**   mov     v0.16b, v1.16b
> +**   bfdot   v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\])
> +**   ret
> +*/
> +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, 
> bfloat16x8_t x, bfloat16x4_t y)
> +{
> +  return vbfdotq_lane_f32 (r, x, y, 1);
> +}

...but the (...|...) is not OK here in ufooq_lane_untied.

Same comments for the big-endian test.

Thanks,
Richard

Reply via email to