Dennis Zhang <dennis.zh...@arm.com> writes:
> Hi all,
>
> On 16/12/2019 13:53, Dennis Zhang wrote:
>> Hi all,
>>
>> This patch is part of a series adding support for Armv8.6-A features.
>> It depends on the Armv8.6-A effective target checking patch,
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html.
>>
>> This patch adds intrinsics for matrix multiply-accumulate operations
>> including vmmlaq_s32, vmmlaq_u32, and vusmmlaq_s32.
>>
>> ACLE documents are at https://developer.arm.com/docs/101028/latest
>> ISA documents are at https://developer.arm.com/docs/ddi0596/latest
>>
>> Regtested & bootstrapped for aarch64-none-linux-gnu.
>>
>> Is it OK for trunk please?
>
> This patch is rebased on top of trunk.
> It no longer depends on any other patches.
> Regtested again.
>
> Is it OK for trunk please?
>
> Cheers
> Dennis
>
> gcc/ChangeLog:
>
> 2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>
>
> 	* config/aarch64/aarch64-builtins.c (TYPES_TERNOP_SSUS): New macro.
> 	* config/aarch64/aarch64-simd-builtins.def (simd_smmla): New.
> 	(simd_ummla, simd_usmmla): New.
> 	* config/aarch64/aarch64-simd.md (aarch64_simd_<sur>mmlav16qi): New.
> 	* config/aarch64/arm_neon.h (vmmlaq_s32, vmmlaq_u32): New.
> 	(vusmmlaq_s32): New.
> 	* config/aarch64/iterators.md (unspec): Add UNSPEC_SMATMUL,
> 	UNSPEC_UMATMUL, and UNSPEC_USMATMUL.
> 	(sur): Likewise.
> 	(MATMUL): New iterator.
>
> gcc/testsuite/ChangeLog:
>
> 2020-01-23  Dennis Zhang  <dennis.zh...@arm.com>
>
> 	* gcc.target/aarch64/advsimd-intrinsics/vmmla.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
> index f0e0461b7f0..033a6d4e92f 100644
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -176,6 +176,10 @@
>  aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
>    = { qualifier_unsigned, qualifier_unsigned,
>        qualifier_unsigned, qualifier_immediate };
>  #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
> +static enum aarch64_type_qualifiers
> +aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
> +  = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
> +#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
>
>
>  static enum aarch64_type_qualifiers
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 57fc5933b43..06025b110cc 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -682,3 +682,8 @@
>    BUILTIN_VSFDF (UNOP, frint32x, 0)
>    BUILTIN_VSFDF (UNOP, frint64z, 0)
>    BUILTIN_VSFDF (UNOP, frint64x, 0)
> +
> +  /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> +  VAR1 (TERNOP, simd_smmla, 0, v16qi)
> +  VAR1 (TERNOPU, simd_ummla, 0, v16qi)
> +  VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
> \ No newline at end of file
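
Just to check my reading of the new qualifier list: taking the first
entry as the return type, TYPES_TERNOP_SSUS should give the usmmla
builtin a prototype along the lines of:

  int32x4_t __builtin_aarch64_simd_usmmlav16qi_ssus (int32x4_t,
                                                     uint8x16_t,
                                                     int8x16_t);

which matches the vusmmlaq_s32 intrinsic below.  (That's just my
reading of the qualifiers, not something I've checked against the
generated declarations.)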
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 2989096b170..409ec28d293 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7025,3 +7025,15 @@
>    "xtn\t%0.<Vntype>, %1.<Vtype>"
>    [(set_attr "type" "neon_shift_imm_narrow_q")]
>  )
> +
> +;; 8-bit integer matrix multiply-accumulate
> +(define_insn "aarch64_simd_<sur>mmlav16qi"
> +  [(set (match_operand:V4SI 0 "register_operand" "=w")
> +	(plus:V4SI (match_operand:V4SI 1 "register_operand" "0")
> +		   (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
> +				 (match_operand:V16QI 3 "register_operand" "w")]
> +			MATMUL)))]
> +  "TARGET_I8MM"
> +  "<sur>mmla\\t%0.4s, %2.16b, %3.16b"
> +  [(set_attr "type" "neon_mla_s_q")]
> +)
> \ No newline at end of file
(Would be good to add the newline.)

The canonical rtl order for commutative operations like plus is to put
the most complicated expression first (roughly speaking -- the rules
are a bit more precise than that).  Passes such as combine canonicalise
the rtl before matching, so a non-canonical pattern can fail to match.
So this should be:

  [(set (match_operand:V4SI 0 "register_operand" "=w")
	(plus:V4SI (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w")
				 (match_operand:V16QI 3 "register_operand" "w")]
			MATMUL)
		   (match_operand:V4SI 1 "register_operand" "0")))]

> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index eaba156e26c..918000d98dc 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -34609,6 +34609,36 @@ vrnd64xq_f64 (float64x2_t __a)
>
>  #pragma GCC pop_options
>
> +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+i8mm")
> +
> +/* Matrix Multiply-Accumulate.  */
> +
> +__extension__ extern __inline int32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
> +{
> +  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline uint32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
> +{
> +  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
> +}
> +
> +__extension__ extern __inline int32x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
> +{
> +  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
> +}
> +
> +#pragma GCC pop_options
> +
>  #include "arm_bf16.h"
>
>  #undef __aarch64_vget_lane_any
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index b9843b83c5f..57aca36f646 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -581,6 +581,9 @@
>      UNSPEC_FMLSL	; Used in aarch64-simd.md.
>      UNSPEC_FMLAL2	; Used in aarch64-simd.md.
>      UNSPEC_FMLSL2	; Used in aarch64-simd.md.
> +    UNSPEC_SMATMUL	; Used in aarch64-simd.md.
> +    UNSPEC_UMATMUL	; Used in aarch64-simd.md.
> +    UNSPEC_USMATMUL	; Used in aarch64-simd.md.
>      UNSPEC_ADR		; Used in aarch64-sve.md.
>      UNSPEC_SEL		; Used in aarch64-sve.md.
>      UNSPEC_BRKA	; Used in aarch64-sve.md.
> @@ -2531,6 +2534,8 @@
>
>  (define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
>
> +(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL UNSPEC_USMATMUL])
> +
>  ;; Iterators for atomic operations.
>
>  (define_int_iterator ATOMIC_LDOP
> @@ -2738,6 +2743,8 @@
>  		      (UNSPEC_URSHL  "ur") (UNSPEC_SRSHL  "sr")
>  		      (UNSPEC_UQRSHL  "u") (UNSPEC_SQRSHL  "s")
>  		      (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
> +		      (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u")
> +		      (UNSPEC_USMATMUL "us")
>  ])
>
>  (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
> new file mode 100644
> index 00000000000..348b2f51779
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmmla.c
> @@ -0,0 +1,37 @@
> +/* { dg-do assemble } */

I assume this should be "dg-do run", otherwise there's no point in
having the main function and comparison.  The run would need to be
conditional on whether the target supports i8mm.

Alternatively, we could keep it simple and stick to an assembler test,
in which case I think we should have one function per call, with no
main.
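
Something like this (completely untested, and the function name is
purely illustrative):

  int32x4_t
  test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b)
  {
    return vmmlaq_s32 (r, a, b);
  }

with an equivalent function for each of the other two intrinsics and
one scan-assembler per instruction.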
> +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
> +/* { dg-options "-save-temps -O2" } */
> +/* { dg-additional-options "-march=armv8.2-a+i8mm" } */
> +
> +#include "arm_neon.h"
> +
> +extern void abort();
> +
> +#define VAR4(v) {v, v, v, v}
> +#define VAR16(v) {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
> +#define TEST(t, f, r, a, b, ...) {		\
> +  t##32x4_t f##_ref = { __VA_ARGS__ };		\
> +  t##32x4_t f##_out = f (r, a, b);		\
> +  for (int i = 0; i < 4; i++)			\
> +    if (f##_out[i] != f##_ref[i])		\
> +      abort();					\
> +}
> +
> +int
> +main()
> +{
> +  int32x4_t s32 = VAR4(-1);
> +  uint32x4_t u32 = VAR4(1);
> +  int8x16_t s8 = VAR16(-1);
> +  uint8x16_t u8 = VAR16(1);
> +
> +  TEST(int, vmmlaq_s32, s32, s8, s8, 7, 7, 7, 7);
> +  TEST(uint, vmmlaq_u32, u32, u8, u8, 9, 9, 9, 9);
> +  TEST(int, vusmmlaq_s32, s32, u8, s8, -9, -9, -9, -9);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler {smmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> +/* { dg-final { scan-assembler {ummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> +/* { dg-final { scan-assembler {usmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
> \ No newline at end of file

This is going to look like inventing a new rule, sorry, since nothing
else in the directory does this yet, but: IMO it's better to put a \t
at the beginning of each scan-assembler, e.g.:

  /* { dg-final { scan-assembler {\tsmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */

As it stands the usmmla instruction would satisfy the first
scan-assembler too, so we wouldn't pick up cases in which smmla failed
to be generated.

Thanks,
Richard