Re: [PATCH 2/2][RFC] Add loop masking support for x86

Hongtao Liu via Gcc-patches Thu, 15 Jul 2021 18:40:51 -0700

On Thu, Jul 15, 2021 at 7:48 PM Richard Biener <[email protected]> wrote:
>
> On Thu, 15 Jul 2021, Hongtao Liu wrote:
>
> > On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> > <[email protected]> wrote:
> > >
> > > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <[email protected]> wrote:
> > > >
> > > > The following extends the existing loop masking support using
> > > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > > > you can now enable masked vectorized epilogues (=1) or fully
> > > > masked vector loops (=2).
> > > >
> > > > What's missing is using a scalar IV for the loop control
> > > > (but in principle AVX512 can use the mask here - just the patch
> > > > doesn't seem to work for AVX512 yet for some reason - likely
> > > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > > > is providing more support for predicated operations in the case
> > > > of reductions either via VEC_COND_EXPRs or via implementing
> > > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > > > to masked AVX512 operations.
> > > >
> > > > For AVX2 and
> > > >
> > > > int foo (unsigned *a, unsigned * __restrict b, int n)
> > > > {
> > > >   unsigned sum = 1;
> > > >   for (int i = 0; i < n; ++i)
> > > >     b[i] += a[i];
> > > >   return sum;
> > > > }
> > > >
> > > > we get
> > > >
> > > > .L3:
> > > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > > >         addl    $8, %edx
> > > >         vpaddd  %ymm3, %ymm1, %ymm1
> > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > > >         vmovd   %edx, %xmm1
> > > >         vpsubd  %ymm15, %ymm2, %ymm0
> > > >         addq    $32, %rax
> > > >         vpbroadcastd    %xmm1, %ymm1
> > > >         vpaddd  %ymm4, %ymm1, %ymm1
> > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > >         vptest  %ymm0, %ymm0
> > > >         jne     .L3
> > > >
> > > > for the fully masked loop body and for the masked epilogue
> > > > we see
> > > >
> > > > .L4:
> > > >         vmovdqu (%rsi,%rax), %ymm3
> > > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> > > >         vmovdqu %ymm0, (%rsi,%rax)
> > > >         addq    $32, %rax
> > > >         cmpq    %rax, %rcx
> > > >         jne     .L4
> > > >         movl    %edx, %eax
> > > >         andl    $-8, %eax
> > > >         testb   $7, %dl
> > > >         je      .L11
> > > > .L3:
> > > >         subl    %eax, %edx
> > > >         vmovdqa .LC0(%rip), %ymm1
> > > >         salq    $2, %rax
> > > >         vmovd   %edx, %xmm0
> > > >         movl    $-2147483648, %edx
> > > >         addq    %rax, %rsi
> > > >         vmovd   %edx, %xmm15
> > > >         vpbroadcastd    %xmm0, %ymm0
> > > >         vpbroadcastd    %xmm15, %ymm15
> > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > >         vpsubd  %ymm15, %ymm0, %ymm0
> > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> > > >         vpaddd  %ymm2, %ymm1, %ymm1
> > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > > > .L11:
> > > >         vzeroupper
> > > >
> > > > compared to
> > > >
> > > > .L3:
> > > >         movl    %edx, %r8d
> > > >         subl    %eax, %r8d
> > > >         leal    -1(%r8), %r9d
> > > >         cmpl    $2, %r9d
> > > >         jbe     .L6
> > > >         leaq    (%rcx,%rax,4), %r9
> > > >         vmovdqu (%rdi,%rax,4), %xmm2
> > > >         movl    %r8d, %eax
> > > >         andl    $-4, %eax
> > > >         vpaddd  (%r9), %xmm2, %xmm0
> > > >         addl    %eax, %esi
> > > >         andl    $3, %r8d
> > > >         vmovdqu %xmm0, (%r9)
> > > >         je      .L2
> > > > .L6:
> > > >         movslq  %esi, %r8
> > > >         leaq    0(,%r8,4), %rax
> > > >         movl    (%rdi,%r8,4), %r8d
> > > >         addl    %r8d, (%rcx,%rax)
> > > >         leal    1(%rsi), %r8d
> > > >         cmpl    %r8d, %edx
> > > >         jle     .L2
> > > >         addl    $2, %esi
> > > >         movl    4(%rdi,%rax), %r8d
> > > >         addl    %r8d, 4(%rcx,%rax)
> > > >         cmpl    %esi, %edx
> > > >         jle     .L2
> > > >         movl    8(%rdi,%rax), %edx
> > > >         addl    %edx, 8(%rcx,%rax)
> > > > .L2:
> > > >
> > > > I'm giving this a little testing right now but will dig on why
> > > > I don't get masked loops when AVX512 is enabled.
> > >
> > > Ah, a simple thinko - rgroup_controls vectypes seem to be
> > > always VECTOR_BOOLEAN_TYPE_P and thus we can
> > > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> > > loop then looks like
> > >
> > > .L3:
> > >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > >         vpaddd  %ymm2, %ymm1, %ymm0
> > >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > >         addq    $8, %rax
> > >         vpbroadcastd    %eax, %ymm0
> > >         vpaddd  %ymm4, %ymm0, %ymm0
> > >         vpcmpud $6, %ymm0, %ymm3, %k1
> > >         kortestb        %k1, %k1
> > >         jne     .L3
> > >
> > > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> > > mask generation but other archs may not provide all required vec_cmp
> > > expanders.
> >
> > For the main loop, the full-masked loop's codegen seems much worse.
> > Basically, we need at least 4 instructions to do what while_ult in arm does.
> >
> >          vpbroadcastd    %eax, %ymm0
> >          vpaddd  %ymm4, %ymm0, %ymm0
> >          vpcmpud $6, %ymm0, %ymm3, %k1
> >          kortestb        %k1, %k1
> > vs
> >        whilelo(or some other while<op>)
> >
> > more instructions are needed for avx2 since there's no direct
> > instruction for .COND_{ADD,SUB..}
> >
> > original
> > .L4:
> >         vmovdqu (%rcx,%rax), %ymm1
> >         vpaddd (%rdi,%rax), %ymm1, %ymm0
> >         vmovdqu %ymm0, (%rcx,%rax)
> >         addq $32, %rax
> >         cmpq %rax, %rsi
> >         jne .L4
> >
> > vs
> > avx512 full-masked loop
> > .L3:
> >          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >          vpaddd  %ymm2, %ymm1, %ymm0
> >          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >          addq    $8, %rax
> >          vpbroadcastd    %eax, %ymm0
> >          vpaddd  %ymm4, %ymm0, %ymm0
> >          vpcmpud $6, %ymm0, %ymm3, %k1
> >          kortestb        %k1, %k1
> >          jne     .L3
> >
> > vs
> > avx2 full-masked loop
> > .L3:
> >          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >          addl    $8, %edx
> >          vpaddd  %ymm3, %ymm1, %ymm1
> >          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >          vmovd   %edx, %xmm1
> >          vpsubd  %ymm15, %ymm2, %ymm0
> >          addq    $32, %rax
> >          vpbroadcastd    %xmm1, %ymm1
> >          vpaddd  %ymm4, %ymm1, %ymm1
> >          vpsubd  %ymm15, %ymm1, %ymm1
> >          vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vptest  %ymm0, %ymm0
> >          jne     .L3
> >
> > vs  arm64's code
> >
> > .L3:
> >     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> >     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> >     add z0.s, z0.s, z1.s
> >     st1w z0.s, p0, [x1, x3, lsl 2]
> >     add x3, x3, x4
> >     whilelo p0.s, w3, w2
> >     b.any .L3
>
> Yes, that's true - it might still be OK for vectorizing epilogues
> and thus --param vect-partial-vector-usage=1
>
> Can AVX512 do any better than this?
>
>         vpbroadcastd    %eax, %ymm0
>         vpaddd  %ymm4, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm3, %k1
>


we can hoist vpbroadcastd out of the loop by define
iv_vector as {start_index, start_index + 1, start_index +2, ... }
and do add and cmp in loop body.

> Note with multiple types involved things get even worse
> since you need masks for each vector mode.  But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo).  I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
>         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
>         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
>         adr     z0.d, [z0.d, z1.d, lsl 2]
>         st1d    z0.d, p0, [x1, x3, lsl 3]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)
>
> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.
>
> The reason I even started looking at this is that we now have
> so many vector modes and end up with quite big code for
> vectorized epilogues.  And I do remember Intel folks contributing
> patches to do fully masked AVX512 loops as well.
>
> Boostrap/testing on x86_64-unknown-linux-gnu (with a slightly
> altered patch) reveals no fails besides some assembler scans.
>
> For reference the tested patch is below.
>
> Thanks,
> Richard.
>
> commit 221110851fafe17d5a351f1b2da3fc3a40e3b61a
> Author: Richard Biener <[email protected]>
> Date:   Thu Jul 15 12:15:18 2021 +0200
>
>     Add loop masking support for x86
>
>     The following extends the existing loop masking support using
>     SVE WHILE_ULT to x86 by proving an alternate way to produce the
>     mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
>     you can now enable masked vectorized epilogues (=1) or fully
>     masked vector loops (=2).
>
>     What's missing is using a scalar IV for the loop control in
>     case that's profitable - the mask generation can then move
>     from preheader + latch to the header.  But AVX2 and AVX512
>     can use vptest and kortestb just fine.
>
>     What's also missing is providing more support for predicated
>     operations in the case of reductions either via VEC_COND_EXPRs
>     or via implementing some of the .COND_{ADD,SUB,MUL...} internal
>     functions as mapping to masked AVX512 operations.
>
>     For AVX2 and
>
>     int foo (unsigned *a, unsigned * __restrict b, int n)
>     {
>       unsigned sum = 1;
>       for (int i = 0; i < n; ++i)
>         b[i] += a[i];
>       return sum;
>     }
>
>     we get
>
>     .L3:
>             vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>             addl    $8, %edx
>             vpaddd  %ymm3, %ymm1, %ymm1
>             vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>             vmovd   %edx, %xmm1
>             vpsubd  %ymm15, %ymm2, %ymm0
>             addq    $32, %rax
>             vpbroadcastd    %xmm1, %ymm1
>             vpaddd  %ymm4, %ymm1, %ymm1
>             vpsubd  %ymm15, %ymm1, %ymm1
>             vpcmpgtd        %ymm1, %ymm0, %ymm0
>             vptest  %ymm0, %ymm0
>             jne     .L3
>
>     for the fully masked loop body and for the masked epilogue
>     we see
>
>     .L4:
>             vmovdqu (%rsi,%rax), %ymm3
>             vpaddd  (%rdi,%rax), %ymm3, %ymm0
>             vmovdqu %ymm0, (%rsi,%rax)
>             addq    $32, %rax
>             cmpq    %rax, %rcx
>             jne     .L4
>             movl    %edx, %eax
>             andl    $-8, %eax
>             testb   $7, %dl
>             je      .L11
>     .L3:
>             subl    %eax, %edx
>             vmovdqa .LC0(%rip), %ymm1
>             salq    $2, %rax
>             vmovd   %edx, %xmm0
>             movl    $-2147483648, %edx
>             addq    %rax, %rsi
>             vmovd   %edx, %xmm15
>             vpbroadcastd    %xmm0, %ymm0
>             vpbroadcastd    %xmm15, %ymm15
>             vpsubd  %ymm15, %ymm1, %ymm1
>             vpsubd  %ymm15, %ymm0, %ymm0
>             vpcmpgtd        %ymm1, %ymm0, %ymm0
>             vpmaskmovd      (%rsi), %ymm0, %ymm1
>             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
>             vpaddd  %ymm2, %ymm1, %ymm1
>             vpmaskmovd      %ymm1, %ymm0, (%rsi)
>     .L11:
>             vzeroupper
>
>     compared to
>
>     .L3:
>             movl    %edx, %r8d
>             subl    %eax, %r8d
>             leal    -1(%r8), %r9d
>             cmpl    $2, %r9d
>             jbe     .L6
>             leaq    (%rcx,%rax,4), %r9
>             vmovdqu (%rdi,%rax,4), %xmm2
>             movl    %r8d, %eax
>             andl    $-4, %eax
>             vpaddd  (%r9), %xmm2, %xmm0
>             addl    %eax, %esi
>             andl    $3, %r8d
>             vmovdqu %xmm0, (%r9)
>             je      .L2
>     .L6:
>             movslq  %esi, %r8
>             leaq    0(,%r8,4), %rax
>             movl    (%rdi,%r8,4), %r8d
>             addl    %r8d, (%rcx,%rax)
>             leal    1(%rsi), %r8d
>             cmpl    %r8d, %edx
>             jle     .L2
>             addl    $2, %esi
>             movl    4(%rdi,%rax), %r8d
>             addl    %r8d, 4(%rcx,%rax)
>             cmpl    %esi, %edx
>             jle     .L2
>             movl    8(%rdi,%rax), %edx
>             addl    %edx, 8(%rcx,%rax)
>     .L2:
>
>     The AVX512 fully masked loop would be
>
>             vmovdqa .LC0(%rip), %ymm4
>             vpbroadcastd    %edx, %ymm3
>             vpcmpud $6, %ymm4, %ymm3, %k1
>             xorl    %eax, %eax
>             .p2align 4,,10
>             .p2align 3
>     .L3:
>             vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>             vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>             vpaddd  %ymm2, %ymm1, %ymm0
>             vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>             addq    $8, %rax
>             vpbroadcastd    %eax, %ymm0
>             vpaddd  %ymm4, %ymm0, %ymm0
>             vpcmpud $6, %ymm0, %ymm3, %k1
>             kortestb        %k1, %k1
>             jne     .L3
>
>     loop control using %rax would likely be more latency friendly
>     here and the mask generation could be unified to a single place.
>
>     2021-07-15  Richard Biener  <[email protected]>
>
>             * tree-vect-stmts.c (can_produce_all_loop_masks_p): We
>             also can produce masks with VEC_COND_EXPRs.
>             * tree-vect-loop.c (vect_gen_while): Generate the mask
>             with a VEC_COND_EXPR in case WHILE_ULT is not supported.
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fc3dab0d143..230d6e34208 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, 
> tree cmp_type)
>  {
>    rgroup_controls *rgm;
>    unsigned int i;
> +  tree cmp_vectype;
>    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
>      if (rgm->type != NULL_TREE
>         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
>                                             cmp_type, rgm->type,
> -                                           OPTIMIZE_FOR_SPEED))
> +                                           OPTIMIZE_FOR_SPEED)
> +       && ((cmp_vectype = build_vector_type
> +                            (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type))),
> +           true)
> +       && !(VECTOR_BOOLEAN_TYPE_P (rgm->type)
> +            && expand_vec_cmp_expr_p (cmp_vectype, rgm->type, LT_EXPR)))
>        return false;
>    return true;
>  }
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 6a25d661800..18c4c66cb2d 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12007,16 +12007,43 @@ vect_gen_while (gimple_seq *seq, tree mask_type, 
> tree start_index,
>                 tree end_index, const char *name)
>  {
>    tree cmp_type = TREE_TYPE (start_index);
> -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> -                                                      cmp_type, mask_type,
> -                                                      OPTIMIZE_FOR_SPEED));
> -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> -                                           start_index, end_index,
> -                                           build_zero_cst (mask_type));
> -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> -  gimple_call_set_lhs (call, tmp);
> -  gimple_seq_add_stmt (seq, call);
> -  return tmp;
> +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> +                                     cmp_type, mask_type,
> +                                     OPTIMIZE_FOR_SPEED))
> +    {
> +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> +                                               start_index, end_index,
> +                                               build_zero_cst (mask_type));
> +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> +      gimple_call_set_lhs (call, tmp);
> +      gimple_seq_add_stmt (seq, call);
> +      return tmp;
> +    }
> +  else
> +    {
> +      /* Generate
> +          _1 = { start_index, start_index, ... };
> +          _2 = { end_index, end_index, ... };
> +          _3 = _1 + { 0, 1, 2 ... };
> +          _4 = _3 < _2;  */
> +      tree cvectype = build_vector_type (cmp_type,
> +                                        TYPE_VECTOR_SUBPARTS (mask_type));
> +      gcc_assert (VECTOR_BOOLEAN_TYPE_P (mask_type)
> +                 && expand_vec_cmp_expr_p (cvectype, mask_type, LT_EXPR));
> +      tree si = make_ssa_name (cvectype);
> +      gassign *ass = gimple_build_assign
> +                       (si, build_vector_from_val (cvectype, start_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree ei = make_ssa_name (cvectype);
> +      ass = gimple_build_assign (ei,
> +                                build_vector_from_val (cvectype, end_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> +                                   build_one_cst (cmp_type));
> +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> +      return gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> +                          si, ei);
> +    }
>  }
>
>  /* Generate a vector mask of type MASK_TYPE for which index I is false iff



-- 
BR,
Hongtao

Re: [PATCH 2/2][RFC] Add loop masking support for x86

Reply via email to