Hi all,

This patch is part of a series adding support for Armv8.6-A features.
It depends on the Armv8.6-A effective target checking patch,
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html.
This patch adds the intrinsics vmmlaq_s32, vmmlaq_u32, and vusmmlaq_s32
for the 8-bit integer matrix multiply-accumulate operations.

ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

Regtested & bootstrapped for aarch64-none-linux-gnu.

Is it OK for trunk please?

Thanks,
Dennis

gcc/ChangeLog:

2019-12-13  Dennis Zhang  <dennis.zh...@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
	(simd_ummla, simd_usmmla): New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
	(vusmmlaq_s32): New.
	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
	(sur): Likewise.
	(MATMUL): New.

gcc/testsuite/ChangeLog:

2019-12-13  Dennis Zhang  <dennis.zh...@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vmmla.c: New test.
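For context, here is a short usage sketch of the new intrinsics (not part
of the patch; the helper name and comments are just my illustration).  As
described in the ACLE, each intrinsic multiplies a 2x8 matrix of 8-bit
elements in the first data operand by an 8x2 matrix in the second and
accumulates the 2x2 32-bit result into the accumulator argument:

  #include <arm_neon.h>

  /* acc += a * b for one 2x2 int32 tile, where a holds two rows of
     eight int8 values and b holds two columns of eight int8 values.
     Illustrative only; build with -march=armv8.2-a+i8mm.  */
  static inline int32x4_t
  tile_mmla_s8 (int32x4_t acc, int8x16_t a, int8x16_t b)
  {
    return vmmlaq_s32 (acc, a, b);
  }

  /* The unsigned and mixed-sign forms have the same shape:
     vmmlaq_u32 (acc, a, b) and vusmmlaq_s32 (acc, a, b).  */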
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index c35a1b1f029..5b048dc9402 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -173,6 +173,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
       qualifier_immediate };
 #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
 
 static enum aarch64_type_qualifiers
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a5970..744f880c450 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,8 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
+  VAR1 (TERNOP, simd_smmla, 0, v16qi)
+  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
+  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
\ No newline at end of file
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad4676bc167..fc0c8d21599 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7025,3 +7025,15 @@
   "xtn\t%0.<Vntype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; 8-bit integer matrix multiply-accumulate
+(define_insn "aarch64_simd_<sur>mmlav16qi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
+		   (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
+				 (match_operand:V16QI 3 "register_operand" "w")]
+				MATMUL)))]
+  "TARGET_I8MM"
+  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
+  [(set_attr "type" "neon_mla_s_q")]
+)
\ No newline at end of file
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8b861601a48..e6af2c2960d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34606,6 +34606,36 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 83a0d156e84..f2a9298fbf8 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -650,6 +650,9 @@
     UNSPEC_UMULHS	; Used in aarch64-sve2.md.
     UNSPEC_UMULHRS	; Used in aarch64-sve2.md.
     UNSPEC_ASRD		; Used in aarch64-sve.md.
+    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
+    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -2102,6 +2105,8 @@
 (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
 
+(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -2292,6 +2297,8 @@
 		      (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
 		      (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
 		      (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+		      (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
+		      (UNSPEC_USMATMUL "us")
 ])
 
 (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
new file mode 100644
index 00000000000..348b2f51779
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
@@ -0,0 +1,37 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-options "-save-temps -O2" } */
+/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
+
+#include "arm_neon.h"
+
+extern void abort();
+
+#define VAR4(v) {v, v, v, v}
+#define VAR16(v) {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
+#define TEST(t, f, r, a, b, ...) { \
+  t##32x4_t f##_ref = { __VA_ARGS__ }; \
+  t##32x4_t f##_out = f (r, a, b); \
+  for (int i = 0; i < 4; i++) \
+    if (f##_out[i] != f##_ref[i]) \
+      abort(); \
+}
+
+int
+main()
+{
+  int32x4_t s32 = VAR4(-1);
+  uint32x4_t u32 = VAR4(1);
+  int8x16_t s8 = VAR16(-1);
+  uint8x16_t u8 = VAR16(1);
+
+  TEST(int, vmmlaq_s32, s32, s8, s8, 7, 7, 7, 7);
+  TEST(uint, vmmlaq_u32, u32, u8, u8, 9, 9, 9, 9);
+  TEST(int, vusmmlaq_s32, s32, u8, s8, -9, -9, -9, -9);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
\ No newline at end of file
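
For reviewers, the expected values in the new test follow from a simple
scalar model of the operation (my own sketch of the semantics based on my
reading of the ISA pseudocode; not part of the patch):

  #include <stdint.h>

  /* One output lane of smmla (signed case shown): out[2*i+j] =
     r[2*i+j] + sum over k of a[8*i+k] * b[8*j+k], with the 8-bit
     inputs widened to 32 bits.  With every input byte set to -1 or 1
     this gives -1 + 8 = 7 (smmla), 1 + 8 = 9 (ummla) and
     -1 - 8 = -9 (usmmla), matching the reference values in vmmla.c.  */
  static int32_t
  smmla_lane (int32_t r, const int8_t a_row[8], const int8_t b_row[8])
  {
    int32_t sum = r;
    for (int k = 0; k < 8; k++)
      sum += (int32_t) a_row[k] * (int32_t) b_row[k];
    return sum;
  }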