Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > adfda96f077075ad53d4bea2919c4d3b326e49f5..7587bc46ba1c80389ea49fa83a0e6f8a489711e9 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -7028,3 +7028,36 @@ > "xtn\t%0.<Vntype>, %1.<Vtype>" > [(set_attr "type" "neon_shift_imm_narrow_q")] > ) > + > +(define_insn "aarch64_bfdot<mode>" > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VBFMLA_W> 2 "register_operand" "w") > + (match_operand:<VBFMLA_W> 3 "register_operand" "w")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > + "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > + > +(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
Too many blank lines above; one blank line between the patterns is enough. > + [(set (match_operand:VDQSF 0 "register_operand" "=w") > + (plus:VDQSF > + (unspec:VDQSF > + [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w") > + (match_operand:VBF 3 "register_operand" "w") > + (match_operand:SI 4 "const_int_operand" "n")] > + UNSPEC_BFDOT) > + (match_operand:VDQSF 1 "register_operand" "0")))] > + "TARGET_BF16_SIMD" > +{ > + int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); > + return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]"; > +} > + [(set_attr "type" "neon_dot<VDQSF:q>")] > +) > [...] > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..c575dcd3901172a52fa9403c9179d58eea44eb72 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-compile-1.c > @@ -0,0 +1,91 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > +/* { dg-additional-options "-O -save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ Same comment as for USDOT/SUDOT regarding the dg- markup. > + > +#include <arm_neon.h> > + > +/* > +**ufoo: > +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) > +** ret > +*/ > +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} > + > +/* > +**ufooq: > +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) > +** ret > +*/ > +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_f32 (r, x, y); > +} The (...|...) alternatives here are correct. 
> + > +/* > +**ufoo_lane: > +** bfdot v0.2s, (v1.4h, v2.2h\[0\]|v2.4h, v1.2h\[0\]) > +** ret > +*/ > +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) > +{ > + return vbfdot_lane_f32 (r, x, y, 0); > +} > + > +/* > +**ufooq_laneq: > +** bfdot v0.4s, (v1.8h, v2.2h\[2\]|v2.8h, v1.2h\[2\]) > +** ret > +*/ > +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) > +{ > + return vbfdotq_laneq_f32 (r, x, y, 2); > +} > + > +/* > +**ufoo_laneq: > +** bfdot v0.2s, (v1.4h, v2.2h\[3\]|v2.4h, v1.2h\[3\]) > +** ret > +*/ > +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) > +{ > + return vbfdot_laneq_f32 (r, x, y, 3); > +} > + > +/* > +**ufooq_lane: > +** bfdot v0.4s, (v1.8h, v2.2h\[1\]|v2.8h, v1.2h\[1\]) > +** ret > +*/ > +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} But the alternatives in these lane tests aren't correct: here the operands must appear in the order given, so the swapped form should not be accepted. > + > +/* > +**ufoo_untied: > +** mov v0.8b, v1.8b > +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) > +** ret > +*/ > +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, > bfloat16x4_t y) > +{ > + return vbfdot_f32 (r, x, y); > +} Similarly, the alternatives are OK here. > + > +/* > +**ufooq_lane_untied: > +** mov v0.16b, v1.16b > +** bfdot v0.4s, (v2.8h, v3.2h\[1\]|v3.8h, v2.2h\[1\]) > +** ret > +*/ > +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, > bfloat16x8_t x, bfloat16x4_t y) > +{ > + return vbfdotq_lane_f32 (r, x, y, 1); > +} ...but not here, for the same reason as the other lane tests. Same comments for the big-endian test. Thanks, Richard