Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > ad4676bc167f08951e693916c7ef796e3501762a..eba71f004ef67af654f9c512b720aa6cfdd1d7fc > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -506,6 +506,19 @@ > [(set_attr "type" "neon_dot<q>")] > ) > > +;; These instructions map to the __builtins for the armv8.6a I8MM usdot > +;; (vector) Dot Product operation. > +(define_insn "aarch64_usdot<vsi2qi>" > + [(set (match_operand:VS 0 "register_operand" "=w") > + (plus:VS (match_operand:VS 1 "register_operand" "0") > + (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") > + (match_operand:<VSI2QI> 3 "register_operand" "w")] > + UNSPEC_USDOT)))] > + "TARGET_SIMD && TARGET_I8MM" > + "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > ;; These expands map to the Dot Product optab the vectorizer checks for. > ;; The auto-vectorizer expects a dot product builtin that also does an > ;; accumulation into the provided register.
Sorry for not raising it last time, but this should just be "TARGET_I8MM". TARGET_SIMD is always true when TARGET_I8MM is. > @@ -573,6 +586,25 @@ > [(set_attr "type" "neon_dot<q>")] > ) > > +;; These instructions map to the __builtins for the armv8.6a I8MM usdot, > sudot > +;; (by element) Dot Product operations. > +(define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi>" > + [(set (match_operand:VS 0 "register_operand" "=w") > + (plus:VS (match_operand:VS 1 "register_operand" "0") > + (unspec:VS [(match_operand:<VS:VSI2QI> 2 "register_operand" "w") > + (match_operand:VB 3 "register_operand" "w") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD_I8MM)))] > + "TARGET_SIMD && TARGET_I8MM" > + { > + int nunits = GET_MODE_NUNITS (<VB:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode); > + return "<DOTPROD_I8MM:sur>dot\\t%0.<VS:Vtype>, %2.<VS:Vdottype>, > %3.4b[%4]"; > + } > + [(set_attr "type" "neon_dot<VS:q>")] > +) > + > (define_expand "copysign<mode>3" > [(match_operand:VHSDF 0 "register_operand") > (match_operand:VHSDF 1 "register_operand") Same here. Another thing I should have noticed last time is that the canonical order for (plus ...) is to have the more complicated expression first. Operand 1 and the (unpec ...) should therefore be the other way around in the expression above. (Having operand 1 "later" than operands 2, 3 and 4 is OK.) > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > 8b861601a48b2150aa5768d717c61e0d1416747f..95b92dff69343e2b6c74174b39f3cd9d9838ddab > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34606,6 +34606,89 @@ vrnd64xq_f64 (float64x2_t __a) > > #pragma GCC pop_options > > +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */ > + > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+i8mm") > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) > +{ > + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int > __index) > +{ > + return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_laneq_s32 \ > + (int32x2_t __r, uint8x8_t __a, int8x16_t __b, const int __index) Stray backslash. It's probably easier to split the line after "__b," instead of before "(". Same for later function. > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > new file mode 100755 > index > 0000000000000000000000000000000000000000..6a4ff054589b736c224bb2fabdcfa48439a8a420 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > @@ -0,0 +1,133 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > +/* { dg-add-options arm_v8_2a_i8mm } */ > +/* { dg-additional-options "--save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > + > +#include <arm_neon.h> > + > +/* Unsigned-Signed Dot Product instructions. */ > + > +/* > +**ufoo: > +** ... > +** usdot\tv[0-9]+.2s, v[0-9]+.8b, v[0-9]+.8b Can just use a literal tab instead of "\t". Later tests check for "\." rather than ".", so might as well do that here too. > +** ... > +** ret > +*/ > +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + return vusdot_s32 (r, x, y); > +} > + If we're using check-function-bodies anyway, it might be slightly more robust to compile at -O and check for the exact RA. E.g.: /* **ufoo: ** usdot v0\.2s, (v1\.8b, v2\.8b|v2\.8b, v1\.8b) ** ret */ Just a suggestion though -- either way is fine. OK with those changes (or without the last one), thanks. Richard