Re: [Aarch64][SVE] Dot product support

Richard Sandiford Fri, 26 Apr 2019 06:29:54 -0700

Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
> Hi,
>
> This patch does two things. For the general vectorizer, it adds support to
> perform fully masked reductions over expressions that don't support masking.
> This is achieved by using VEC_COND_EXPR where possible.  At the moment this is
> implemented for DOT_PROD_EXPR only, but the framework is there to extend it to
> other expressions.
>
> Related to that, this patch adds support to vectorize dot product using SVE.  
> It
> also uses the new functionality to ensure that the resulting loop is masked.
>
> Given this input code:
>
> uint32_t
> dotprod (uint8_t *restrict x, uint8_t *restrict y, int n)
> {
>   uint32_t sum = 0;
>
>   for (int i = 0; i < n; i++)
>     {
>       sum += x[i] * y[i];
>     }
>
>   return sum;
> }
>
> The resulting SVE code is:
>
> 0000000000000000 <dotprod>:
>    0: 7100005f        cmp     w2, #0x0
>    4: 5400024d        b.le    4c <dotprod+0x4c>
>    8: d2800003        mov     x3, #0x0                        // #0
>    c: 93407c42        sxtw    x2, w2
>   10: 2538c001        mov     z1.b, #0
>   14: 25221fe0        whilelo p0.b, xzr, x2
>   18: 2538c003        mov     z3.b, #0
>   1c: d503201f        nop
>   20: a4034002        ld1b    {z2.b}, p0/z, [x0, x3]
>   24: a4034020        ld1b    {z0.b}, p0/z, [x1, x3]
>   28: 0430e3e3        incb    x3
>   2c: 0523c000        sel     z0.b, p0, z0.b, z3.b
>   30: 25221c60        whilelo p0.b, x3, x2
>   34: 44820401        udot    z1.s, z0.b, z2.b
>   38: 54ffff41        b.ne    20 <dotprod+0x20>  // b.any
>   3c: 2598e3e0        ptrue   p0.s
>   40: 04812021        uaddv   d1, p0, z1.s
>   44: 1e260020        fmov    w0, s1
>   48: d65f03c0        ret
>   4c: 1e2703e1        fmov    s1, wzr
>   50: 1e260020        fmov    w0, s1
>   54: d65f03c0        ret
>
> Notice how udot is used inside a fully masked loop.
>
> I tested this patch in an aarch64 machine bootstrapping the compiler and
> running the checks.
>
> I admit it is too late to merge this into gcc 9, but I'm posting it anyway so
> it can be considered for gcc 10.
>
> Alejandro
>
>
> gcc/Changelog:
>
> 2019-01-31  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
>
>       * config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from SVE
>       ACLE branch.
>       * config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and vsi2qi 
> from
>       SVE ACLE branch.
>       * tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check if a
>       VEC_COND_EXPR be inserted to emulate a conditional internal function.
>       (build_vect_cond_expr): Emit the VEC_COND_EXPR.
>       (vectorizable_reduction): Use the functions above to vectorize in a
>       fully masked loop codes that don't have a conditional internal
>       function.
>
> gcc/testsuite/Changelog:
>  
> 2019-01-31  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
>
>       * gcc.target/aarch64/sve/dot_1.c: New test for dot product.
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md 
> b/gcc/config/aarch64/aarch64-sve.md
> index 5bb3422..2779a21 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3128,3 +3128,17 @@
>      DONE;
>    }
>  )
> +
> +;; Unpredicated DOT product.
> +(define_insn "<sur>dot_prod<vsi2qi>"
> +  [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w")
> +     (plus:SVE_SDI (unspec:SVE_SDI [(match_operand:<VSI2QI> 1 
> "register_operand" "w, w")
> +                                    (match_operand:<VSI2QI> 2 
> "register_operand" "w, w")]
> +                    DOTPROD)
> +     (match_operand:SVE_SDI 3 "register_operand" "0, w")))]
> +  "TARGET_SVE"
> +  "@
> +   <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>
> +   movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, 
> %2.<Vetype_fourth>"
> +  [(set_attr "movprfx" "*,yes")]
> +)
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 85fa161..faf588a 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -663,6 +663,9 @@
>                         (QI "b")   (HI "h")
>                         (SI "s")   (DI "d")])
>  
> +;; Mode to one fourth individual element type mapping used in instruction 
> DOT.
> +(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])


Maybe better as:

;; Like Vetype, but map to types that are a quarter of the element size.

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
> new file mode 100644
> index 0000000..8ff6671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +#define DEF_DOT(TYPE1, TYPE2)                                                
> \
> +TYPE1 __attribute__ ((noinline, noclone))                            \
> +dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n)  \
> +{                                                                    \
> +  TYPE1 sum = 0;                                                     \
> +  for (int i = 0; i < n; i++)                                                
> \
> +    {                                                                        
> \
> +      sum += x[i] * y[i];                                            \
> +    }                                                                        
> \
> +  return sum;                                                                
> \
> +}
> +
> +DEF_DOT(uint32_t, uint8_t)
> +DEF_DOT(int32_t, int8_t)
> +DEF_DOT(int64_t, int16_t)
> +
> +/* The uint16_t->uint64_t dot product requires a casting to satisfy the C
> +   language rules.  */
> +uint64_t __attribute__ ((noinline, noclone))
> +dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n)
> +{
> +  uint64_t sum = 0;
> +  for (int i = 0; i < n; i++)
> +    {
> +      sum += (unsigned int)x[i] * y[i];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, 
> z[0-9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, 
> z[0-9]+\.b\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, 
> z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, 
> z[0-9]+\.h\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 202cab9..2278e8b 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -5885,6 +5885,52 @@ is_nonwrapping_integer_induction (stmt_vec_info 
> stmt_vinfo, struct loop *loop)
>         <= TYPE_PRECISION (lhs_type));
>  }
>  
> +/* If there is no conditional internal function availabe to vectorize CODE, 
> can

available

> +   a VEC_COND_EXPR be inserted to emulate one?  */
> +static bool
> +use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
> +                      tree vectype_in)

Need to document the COND_FN and VECTYPE_IN parameters too.

> +{
> +  if (cond_fn != IFN_LAST
> +      && direct_internal_fn_supported_p (cond_fn, vectype_in,
> +                                      OPTIMIZE_FOR_SPEED))
> +    return false;
> +
> +  switch (code)
> +    {
> +    case DOT_PROD_EXPR:
> +      return true;
> +
> +    default:
> +      return false;
> +    }
> +}
> +
> +/* Create a vectorized conditional version of code by using a non-conditional

...of CODE by...

> +   vector expression plus any VEC_COND_EXPR that may be needed.  */
> +static void
> +build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
> +                   tree vectype_in, gimple_stmt_iterator * gsi)

Need to document the parameters here too.  Should be no space after
pointer "*".  (Yeah, the code's not very consistent about this.)

> +{
> +  switch (code)
> +    {
> +    case DOT_PROD_EXPR:
> +      {
> +     tree zero = build_int_cst (TREE_TYPE (vectype_in), 0);
> +     zero = build_vector_from_val (vectype_in, zero);

Should reduce to:

  tree zero = build_zero_cst (vectype_in);

> +     tree new_op1 = make_temp_ssa_name (vectype_in, NULL, "new_op1");

Maybe "masked_" rather than "new_"?  This name will persist in
debugging dumps for later passes.

Thanks,
Richard

Re: [Aarch64][SVE] Dot product support

Reply via email to