Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
> Hi Richards,
>
> This is the new version of the patch, addressing your comments.

You forgot the changelog :-)

The patch LGTM otherwise.  Richard, any objections?

>
> Alejandro
>
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandif...@arm.com>
>> Sent: 08 May 2019 14:36
>> To: Richard Biener <richard.guent...@gmail.com>
>> Cc: Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com>; GCC
>> Patches <gcc-patches@gcc.gnu.org>; nd <n...@arm.com>
>> Subject: Re: [Vectorizer] Add SLP support for masked loads
>> 
>> Richard Biener <richard.guent...@gmail.com> writes:
>> > On Fri, Apr 26, 2019 at 3:14 PM Richard Sandiford
>> > <richard.sandif...@arm.com> wrote:
>> >>
>> >> Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
>> >> > Hi,
>> >> >
>> >> > The current vectorizer doesn't support masked loads for SLP. We should
>> >> > add that, to allow things like:
>> >> >
>> >> > void
>> >> > f (int *restrict x, int *restrict y, int *restrict z, int n) {
>> >> >   for (int i = 0; i < n; i += 2)
>> >> >     {
>> >> >       x[i] = y[i] ? z[i] : 1;
>> >> >       x[i + 1] = y[i + 1] ? z[i + 1] : 2;
>> >> >     }
>> >> > }
>> >> >
>> >> > to be vectorized using contiguous loads rather than LD2 and ST2.
>> >> >
>> >> > This patch was motivated by SVE, but it is completely generic and
>> >> > should apply to any architecture with masked loads.
>> >> >
>> >> > After the patch is applied, the above code generates this output
>> >> > (-march=armv8.2-a+sve -O2 -ftree-vectorize):
>> >> >
>> >> > 0000000000000000 <f>:
>> >> >    0: 7100007f        cmp     w3, #0x0
>> >> >    4: 540002cd        b.le    5c <f+0x5c>
>> >> >    8: 51000464        sub     w4, w3, #0x1
>> >> >    c: d2800003        mov     x3, #0x0                        // #0
>> >> >   10: 90000005        adrp    x5, 0 <f>
>> >> >   14: 25d8e3e0        ptrue   p0.d
>> >> >   18: 53017c84        lsr     w4, w4, #1
>> >> >   1c: 910000a5        add     x5, x5, #0x0
>> >> >   20: 11000484        add     w4, w4, #0x1
>> >> >   24: 85c0e0a1        ld1rd   {z1.d}, p0/z, [x5]
>> >> >   28: 2598e3e3        ptrue   p3.s
>> >> >   2c: d37ff884        lsl     x4, x4, #1
>> >> >   30: 25a41fe2        whilelo p2.s, xzr, x4
>> >> >   34: d503201f        nop
>> >> >   38: a5434820        ld1w    {z0.s}, p2/z, [x1, x3, lsl #2]
>> >> >   3c: 25808c11        cmpne   p1.s, p3/z, z0.s, #0
>> >> >   40: 25808810        cmpne   p0.s, p2/z, z0.s, #0
>> >> >   44: a5434040        ld1w    {z0.s}, p0/z, [x2, x3, lsl #2]
>> >> >   48: 05a1c400        sel     z0.s, p1, z0.s, z1.s
>> >> >   4c: e5434800        st1w    {z0.s}, p2, [x0, x3, lsl #2]
>> >> >   50: 04b0e3e3        incw    x3
>> >> >   54: 25a41c62        whilelo p2.s, x3, x4
>> >> >   58: 54ffff01        b.ne    38 <f+0x38>  // b.any
>> >> >   5c: d65f03c0        ret
>> >> >
>> >> >
>> >> > I tested this patch on an aarch64 machine by bootstrapping the
>> >> > compiler and running the checks.
>> >> >
>> >> > Alejandro
>> >> >
>> >> > gcc/ChangeLog:
>> >> >
>> >> > 2019-01-16  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
>> >> >
>> >> >       * internal-fn.c: Mark mask_load_direct and mask_store_direct as
>> >> >       vectorizable.
>> >> >       * tree-data-ref.c (data_ref_compare_tree): Fix comment typo.
>> >> >       * tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to
>> >> >       be combined even if the masks differ.
>> >> >       (slp_vect_only_p): New function to detect masked loads that are
>> >> >       only vectorizable using SLP.
>> >> >       (vect_analyze_data_ref_accesses): Mark SLP-only vectorizable groups.
>> >> >       * tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to
>> >> >       dissolve SLP-only vectorizable groups when SLP has been discarded.
>> >> >       (vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when
>> >> >       needed.
>> >> >       * tree-vect-slp.c (vect_get_and_check_slp_defs): Check the masks of
>> >> >       masked loads.
>> >> >       (vect_build_slp_tree_1): Fix comment typo.
>> >> >       (vect_build_slp_tree_2): Include masks from masked loads in the SLP
>> >> >       tree.
>> >> >       * tree-vect-stmts.c (vect_get_vec_defs_for_operand): New function
>> >> >       to get vec_defs for an operand with optional SLP and vectype.
>> >> >       (vectorizable_load): Allow vectorization of masked loads for SLP
>> >> >       only.
>> >> >       * tree-vectorizer.h (_stmt_vec_info): Add a flag for SLP-only
>> >> >       vectorizable statements.
>> >> >       * tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize it.
>> >> >
>> >> > gcc/testsuite/ChangeLog:
>> >> >
>> >> > 2019-01-16  Alejandro Martinez  <alejandro.martinezvice...@arm.com>
>> >> >
>> >> >       * gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP
>> >> >       vectorized masked loads.
>> >> >
>> >> > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
>> >> > index 4f2ef45..67eee59 100644
>> >> > --- a/gcc/internal-fn.c
>> >> > +++ b/gcc/internal-fn.c
>> >> > @@ -100,11 +100,11 @@ init_internal_fns ()
>> >> >  /* Create static initializers for the information returned by
>> >> >     direct_internal_fn.  */
>> >> >  #define not_direct { -2, -2, false }
>> >> > -#define mask_load_direct { -1, 2, false }
>> >> > +#define mask_load_direct { -1, 2, true }
>> >> >  #define load_lanes_direct { -1, -1, false }
>> >> >  #define mask_load_lanes_direct { -1, -1, false }
>> >> >  #define gather_load_direct { -1, -1, false }
>> >> > -#define mask_store_direct { 3, 2, false }
>> >> > +#define mask_store_direct { 3, 2, true }
>> >> >  #define store_lanes_direct { 0, 0, false }
>> >> >  #define mask_store_lanes_direct { 0, 0, false }
>> >> >  #define scatter_store_direct { 3, 3, false }
>> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
>> >> > new file mode 100644
>> >> > index 0000000..b106cae
>> >> > --- /dev/null
>> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
>> >> > @@ -0,0 +1,74 @@
>> >> > +/* { dg-do compile } */
>> >> > +/* { dg-options "-O2 -ftree-vectorize" } */
>> >> > +
>> >> > +#include <stdint.h>
>> >> > +
>> >> > +#define MASK_SLP_2(TYPE_COND, ALT_VAL)                               \
>> >> > +void __attribute__ ((noinline, noclone))                             \
>> >> > +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y,\
>> >> > +                                    TYPE_COND *restrict z, int n)    \
>> >> > +{                                                                    \
>> >> > +  for (int i = 0; i < n; i += 2)                                     \
>> >> > +    {                                                                \
>> >> > +      x[i] = y[i] ? z[i] : 1;                                        \
>> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
>> >> > +    }                                                                \
>> >> > +}
>> >> > +
>> >> > +#define MASK_SLP_4(TYPE_COND, ALT_VAL)                               \
>> >> > +void __attribute__ ((noinline, noclone))                             \
>> >> > +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y,\
>> >> > +                                    TYPE_COND *restrict z, int n)    \
>> >> > +{                                                                    \
>> >> > +  for (int i = 0; i < n; i += 4)                                     \
>> >> > +    {                                                                \
>> >> > +      x[i] = y[i] ? z[i] : 1;                                        \
>> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
>> >> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                            \
>> >> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                      \
>> >> > +    }                                                                \
>> >> > +}
>> >> > +
>> >> > +#define MASK_SLP_8(TYPE_COND, ALT_VAL)                               \
>> >> > +void __attribute__ ((noinline, noclone))                             \
>> >> > +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y,\
>> >> > +                                    TYPE_COND *restrict z, int n)    \
>> >> > +{                                                                    \
>> >> > +  for (int i = 0; i < n; i += 8)                                     \
>> >> > +    {                                                                \
>> >> > +      x[i] = y[i] ? z[i] : 1;                                        \
>> >> > +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
>> >> > +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                            \
>> >> > +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                      \
>> >> > +      x[i + 4] = y[i + 4] ? z[i + 4] : 1;                            \
>> >> > +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;                      \
>> >> > +      x[i + 6] = y[i + 6] ? z[i + 6] : 1;                            \
>> >> > +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;                      \
>> >> > +    }                                                                \
>> >> > +}
>> >> > +
>> >> > +MASK_SLP_2(int8_t, 1)
>> >> > +MASK_SLP_2(int8_t, 2)
>> >> > +MASK_SLP_2(int, 1)
>> >> > +MASK_SLP_2(int, 2)
>> >> > +MASK_SLP_2(int64_t, 1)
>> >> > +MASK_SLP_2(int64_t, 2)
>> >> > +
>> >> > +MASK_SLP_4(int8_t, 1)
>> >> > +MASK_SLP_4(int8_t, 2)
>> >> > +MASK_SLP_4(int, 1)
>> >> > +MASK_SLP_4(int, 2)
>> >> > +MASK_SLP_4(int64_t, 1)
>> >> > +MASK_SLP_4(int64_t, 2)
>> >> > +
>> >> > +MASK_SLP_8(int8_t, 1)
>> >> > +MASK_SLP_8(int8_t, 2)
>> >> > +MASK_SLP_8(int, 1)
>> >> > +MASK_SLP_8(int, 2)
>> >> > +MASK_SLP_8(int64_t, 1)
>> >> > +MASK_SLP_8(int64_t, 2)
>> >> > +
>> >> > +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
>> >> > +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
>> >> > +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
>> >> > +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
>> >> > diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
>> >> > index 7d1f03c..1833a5f 100644
>> >> > --- a/gcc/tree-data-ref.c
>> >> > +++ b/gcc/tree-data-ref.c
>> >> > @@ -1272,7 +1272,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
>> >> >    return dr;
>> >> >  }
>> >> >
>> >> > -/*  A helper function computes order between two tree epxressions T1 and T2.
>> >> > +/*  A helper function computes order between two tree expressions T1 and T2.
>> >> >      This is used in comparator functions sorting objects based on the order
>> >> >      of tree expressions.  The function returns -1, 0, or 1.  */
>> >> >
>> >> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>> >> > index 7bbd47f..8a82147 100644
>> >> > --- a/gcc/tree-vect-data-refs.c
>> >> > +++ b/gcc/tree-vect-data-refs.c
>> >> > @@ -2837,22 +2837,72 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>> >> >        if (ifn != gimple_call_internal_fn (call2))
>> >> >       return false;
>> >> >
>> >> > -      /* Check that the masks are the same.  Cope with casts of masks,
>> >> > +      /* Check that the masks can be combined.  */
>> >> > +      tree mask1 = gimple_call_arg (call1, 2);
>> >> > +      tree mask2 = gimple_call_arg (call2, 2);
>> >> > +      if (!operand_equal_p (mask1, mask2, 0))
>> >> > +     {
>> >> > +       /* Stores need identical masks.  */
>> >> > +       if (ifn == IFN_MASK_STORE)
>> >> > +         {
>> >> > +           mask1 = strip_conversion (mask1);
>> >> > +           if (!mask1)
>> >> > +             return false;
>> >> > +           mask2 = strip_conversion (mask2);
>> >> > +           if (!mask2)
>> >> > +             return false;
>> >> > +           if (!operand_equal_p (mask1, mask2, 0))
>> >> > +             return false;
>> >> > +         }
>> >> > +       /* Loads are allowed different masks under SLP only.
>> >> > +          (See slp_vect_only_p () below).  */
>> >> > +     }
>> >> > +      return true;
>> >> > +    }
>> >> > +
>> >> > +  return false;
>> >> > +}
>> >> > +
>> >> > +/* Return true if vectorizable_* routines can handle statements STMT1_INFO
>> >> > +   and STMT2_INFO being in a single group for SLP only.  */
>> >> > +
>> >> > +static bool
>> >> > +slp_vect_only_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>> >> > +{
>> >> > +  if (gimple_assign_single_p (stmt1_info->stmt))
>> >> > +    {
>> >> > +      gcc_assert (gimple_assign_single_p (stmt2_info->stmt));
>> >> > +      return false;
>> >> > +    }
>> >> > +
>> >> > +  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
>> >> > +  if (call1 && gimple_call_internal_p (call1))
>> >> > +    {
>> >> > +      /* Check for two masked loads or two masked stores.  */
>> >> > +      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
>> >> > +      gcc_assert (call2 && gimple_call_internal_p (call2));
>> >> > +      internal_fn ifn = gimple_call_internal_fn (call1);
>> >> > +      if (ifn != IFN_MASK_LOAD)
>> >> > +     return false;
>> >> > +      gcc_assert (ifn == gimple_call_internal_fn (call2));
>> >> > +
>> >> > +      /* Check if the masks are the same.  Cope with casts of masks,
>> >> >         like those created by build_mask_conversion.  */
>> >> >        tree mask1 = gimple_call_arg (call1, 2);
>> >> >        tree mask2 = gimple_call_arg (call2, 2);
>> >> >        if (!operand_equal_p (mask1, mask2, 0))
>> >> >       {
>> >> > +       /* This is the only case that is just for SLP: non-identical but
>> >> > +          otherwise slp-compatible masks.  */
>> >> >         mask1 = strip_conversion (mask1);
>> >> >         if (!mask1)
>> >> > -         return false;
>> >> > +         return true;
>> >> >         mask2 = strip_conversion (mask2);
>> >> >         if (!mask2)
>> >> > -         return false;
>> >> > +         return true;
>> >> >         if (!operand_equal_p (mask1, mask2, 0))
>> >> > -         return false;
>> >> > +         return true;
>> >> >       }
>> >> > -      return true;
>> >> >      }
>> >> >
>> >> >    return false;
>> >>
>> >> Normally I'd say it would be better to add a bool argument to
>> >> can_group_stmts_p that says whether we want non-SLP-only rules, or
>> >> perhaps convert the return type to an enum.  But given that the
>> >> non-SLP path is going away soon anyway, I guess separate functions
>> >> are better despite the cut-&-paste.
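>> >>
>> >> (Purely for illustration, not part of the patch: the enum variant
>> >> could look roughly like the sketch below.  The names group_compat,
>> >> GROUP_INCOMPATIBLE, GROUP_SLP_ONLY, GROUP_COMPATIBLE and
>> >> can_group_stmts are hypothetical; the checks just mirror the posted
>> >> can_group_stmts_p/slp_vect_only_p pair.
>> >>
>> >>   enum group_compat
>> >>   {
>> >>     GROUP_INCOMPATIBLE,  /* Cannot be grouped at all.  */
>> >>     GROUP_SLP_ONLY,      /* Can be grouped for SLP only.  */
>> >>     GROUP_COMPATIBLE     /* Can be grouped for SLP and non-SLP.  */
>> >>   };
>> >>
>> >>   static enum group_compat
>> >>   can_group_stmts (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>> >>   {
>> >>     if (gimple_assign_single_p (stmt1_info->stmt))
>> >>       return (gimple_assign_single_p (stmt2_info->stmt)
>> >>               ? GROUP_COMPATIBLE : GROUP_INCOMPATIBLE);
>> >>
>> >>     gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
>> >>     if (!call1 || !gimple_call_internal_p (call1))
>> >>       return GROUP_INCOMPATIBLE;
>> >>
>> >>     /* Check for two masked loads or two masked stores.  */
>> >>     gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
>> >>     if (!call2 || !gimple_call_internal_p (call2))
>> >>       return GROUP_INCOMPATIBLE;
>> >>     internal_fn ifn = gimple_call_internal_fn (call1);
>> >>     if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
>> >>       return GROUP_INCOMPATIBLE;
>> >>     if (ifn != gimple_call_internal_fn (call2))
>> >>       return GROUP_INCOMPATIBLE;
>> >>
>> >>     /* Identical masks, possibly after stripping conversions, can be
>> >>        grouped unconditionally.  */
>> >>     tree mask1 = gimple_call_arg (call1, 2);
>> >>     tree mask2 = gimple_call_arg (call2, 2);
>> >>     if (operand_equal_p (mask1, mask2, 0))
>> >>       return GROUP_COMPATIBLE;
>> >>     mask1 = strip_conversion (mask1);
>> >>     mask2 = strip_conversion (mask2);
>> >>     if (mask1 && mask2 && operand_equal_p (mask1, mask2, 0))
>> >>       return GROUP_COMPATIBLE;
>> >>
>> >>     /* Loads with genuinely different masks are still usable for SLP.  */
>> >>     return ifn == IFN_MASK_LOAD ? GROUP_SLP_ONLY : GROUP_INCOMPATIBLE;
>> >>   }
>> >>
>> >> Callers would then test "can_group_stmts (a, b) != GROUP_INCOMPATIBLE"
>> >> when forming groups and "== GROUP_COMPATIBLE" when computing the
>> >> SLP-only flag.)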
>> >>
>> >> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> >> > index afbf9a9..754a2e4 100644
>> >> > --- a/gcc/tree-vect-loop.c
>> >> > +++ b/gcc/tree-vect-loop.c
>> >> > @@ -1755,6 +1755,49 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
>> >> >    return opt_result::success ();
>> >> >  }
>> >> >
>> >> > +/* Look for SLP-only access groups and turn each individual access into its own
>> >> > +   group.  */
>> >> > +static void
>> >> > +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
>> >> > +{
>> >> > +  unsigned int i;
>> >> > +  struct data_reference *dr;
>> >> > +
>> >> > +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
>> >> > +
>> >> > +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
>> >> > +  FOR_EACH_VEC_ELT (datarefs, i, dr)
>> >> > +    {
>> >> > +      gcc_assert (DR_REF (dr));
>> >> > +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
>> >> > +
>> >> > +      /* Check if the load is a part of an interleaving chain.  */
>> >> > +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>> >> > +     {
>> >> > +       stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
>> >> > +       unsigned int group_size = DR_GROUP_SIZE (first_element);
>> >> > +
>> >> > +       /* Check if SLP-only groups.  */
>> >> > +       if (STMT_VINFO_SLP_VECT_ONLY (first_element))
>> >> > +         {
>> >> > +             /* Dissolve the group.  */
>> >> > +             STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
>> >> > +
>> >> > +             stmt_vec_info vinfo = first_element;
>> >> > +             while (vinfo)
>> >> > +               {
>> >> > +                 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
>> >> > +                 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
>> >> > +                 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
>> >> > +                 DR_GROUP_SIZE (vinfo) = 1;
>> >> > +                 DR_GROUP_GAP (vinfo) = group_size - 1;
>> >> > +                 vinfo = next;
>> >>
>> >> I think DR_GROUP_FIRST_ELEMENT should be vinfo here, so that it
>> >> remains a grouped access with only one element.
>> >
>> > Then the above looks like single-element interleaving?  Do we handle
>> > interleaving at all for masked loads/stores?
>> 
>> Not yet, but it's on the wishlist.
>> 
>> > Generally a no-longer-grouped access would have DR_GROUP_FIRST_ELEMENT
>> > NULL and "no" size/gap (well, nobody looks at those fields then).  It
>> > would then need vectorization with strided accesses, so you need to
>> > set the strided flag.
>> 
>> But with the way get_load_store_type is structured, single-element groups
>> give strictly more information than a strided access.  We still fall back on
>> gather/scatter or elementwise accesses if necessary.
>> 
>> (One of the reasons for adding get_load_store_type was to avoid the
>> group/stride choice dictating a particular implementation.)
>> 
>> So I think single element groups are better here.
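>> 
>> (Illustrative only, not taken from the patch: dissolving a group of two
>> loads A and B with the DR_GROUP_FIRST_ELEMENT fix above would leave
>> 
>>   /* Before: A = { FIRST = A, NEXT = B, SIZE = 2, GAP = 0 }
>>              B = { FIRST = A, NEXT = NULL }
>>      After:  A = { FIRST = A, NEXT = NULL, SIZE = 1, GAP = 1 }
>>              B = { FIRST = B, NEXT = NULL, SIZE = 1, GAP = 1 }  */
>> 
>> so each access remains a single-element grouped access whose gap skips
>> the other group member, and get_load_store_type keeps all its options.)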
>> 
>> > I think you want to have a testcase exercising this path.
>> 
>> No argument with this of course. :-)
>> 
>> Richard
>
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 04081f3..3051a7a 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -100,7 +100,7 @@ init_internal_fns ()
>  /* Create static initializers for the information returned by
>     direct_internal_fn.  */
>  #define not_direct { -2, -2, false }
> -#define mask_load_direct { -1, 2, false }
> +#define mask_load_direct { -1, 2, true }
>  #define load_lanes_direct { -1, -1, false }
>  #define mask_load_lanes_direct { -1, -1, false }
>  #define gather_load_direct { -1, -1, false }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> new file mode 100644
> index 0000000..78c70b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> @@ -0,0 +1,90 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +#define MASK_SLP_2(TYPE_COND, ALT_VAL)                               \
> +void __attribute__ ((noinline, noclone))                             \
> +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y,\
> +                                    TYPE_COND *restrict z, int n)    \
> +{                                                                    \
> +  for (int i = 0; i < n; i += 2)                                     \
> +    {                                                                \
> +      x[i] = y[i] ? z[i] : 1;                                        \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
> +    }                                                                \
> +}
> +
> +#define MASK_SLP_4(TYPE_COND, ALT_VAL)                               \
> +void __attribute__ ((noinline, noclone))                             \
> +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y,\
> +                                    TYPE_COND *restrict z, int n)    \
> +{                                                                    \
> +  for (int i = 0; i < n; i += 4)                                     \
> +    {                                                                \
> +      x[i] = y[i] ? z[i] : 1;                                        \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                            \
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                      \
> +    }                                                                \
> +}
> +
> +#define MASK_SLP_8(TYPE_COND, ALT_VAL)                               \
> +void __attribute__ ((noinline, noclone))                             \
> +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y,\
> +                                    TYPE_COND *restrict z, int n)    \
> +{                                                                    \
> +  for (int i = 0; i < n; i += 8)                                     \
> +    {                                                                \
> +      x[i] = y[i] ? z[i] : 1;                                        \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                      \
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                            \
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                      \
> +      x[i + 4] = y[i + 4] ? z[i + 4] : 1;                            \
> +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;                      \
> +      x[i + 6] = y[i + 6] ? z[i + 6] : 1;                            \
> +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;                      \
> +    }                                                                \
> +}
> +
> +#define MASK_SLP_FAIL(TYPE_COND)                                     \
> +void __attribute__ ((noinline, noclone))                             \
> +mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y,       \
> +                             TYPE_COND *restrict z, int n)           \
> +{                                                                    \
> +  for (int i = 0; i < n; i += 2)                                     \
> +    {                                                                \
> +      x[i] = y[i] ? z[i] : 1;                                        \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : x[z[i + 1]];                  \
> +    }                                                                \
> +}
> +
> +MASK_SLP_2(int8_t, 1)
> +MASK_SLP_2(int8_t, 2)
> +MASK_SLP_2(int, 1)
> +MASK_SLP_2(int, 2)
> +MASK_SLP_2(int64_t, 1)
> +MASK_SLP_2(int64_t, 2)
> +
> +MASK_SLP_4(int8_t, 1)
> +MASK_SLP_4(int8_t, 2)
> +MASK_SLP_4(int, 1)
> +MASK_SLP_4(int, 2)
> +MASK_SLP_4(int64_t, 1)
> +MASK_SLP_4(int64_t, 2)
> +
> +MASK_SLP_8(int8_t, 1)
> +MASK_SLP_8(int8_t, 2)
> +MASK_SLP_8(int, 1)
> +MASK_SLP_8(int, 2)
> +MASK_SLP_8(int64_t, 1)
> +MASK_SLP_8(int64_t, 2)
> +
> +MASK_SLP_FAIL(int8_t)
> +MASK_SLP_FAIL(int)
> +MASK_SLP_FAIL(int64_t)
> +
> +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
> +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
> +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
> +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
> diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
> index 67b960d..4dc03ef 100644
> --- a/gcc/tree-data-ref.c
> +++ b/gcc/tree-data-ref.c
> @@ -1271,7 +1271,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
>    return dr;
>  }
>  
> -/*  A helper function computes order between two tree epxressions T1 and T2.
> +/*  A helper function computes order between two tree expressions T1 and T2.
>      This is used in comparator functions sorting objects based on the order
>      of tree expressions.  The function returns -1, 0, or 1.  */
>  
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index d71a39f..55d87f8 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -2863,10 +2863,12 @@ strip_conversion (tree op)
>  }
>  
>  /* Return true if vectorizable_* routines can handle statements STMT1_INFO
> -   and STMT2_INFO being in a single group.  */
> +   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads
> +   can be grouped in SLP mode.  */
>  
>  static bool
> -can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> +can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
> +                bool allow_slp_p)
>  {
>    if (gimple_assign_single_p (stmt1_info->stmt))
>      return gimple_assign_single_p (stmt2_info->stmt);
> @@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>        like those created by build_mask_conversion.  */
>        tree mask1 = gimple_call_arg (call1, 2);
>        tree mask2 = gimple_call_arg (call2, 2);
> -      if (!operand_equal_p (mask1, mask2, 0))
> +      if (!operand_equal_p (mask1, mask2, 0)
> +          && (ifn == IFN_MASK_STORE || !allow_slp_p))
>       {
>         mask1 = strip_conversion (mask1);
>         if (!mask1)
> @@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
>             || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
>                                       DR_BASE_ADDRESS (drb)) != 0
>             || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
> -           || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
> +           || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
>           break;
>  
>         /* Check that the data-refs have the same constant size.  */
> @@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
>         DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
>         lastinfo = stmtinfo_b;
>  
> +       STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
> +         = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
> +
> +       if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "Load suitable for SLP vectorization only.\n");
> +
>         if (init_b == init_prev
>             && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
>             && dump_enabled_p ())
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 057a874..5166b42 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1753,6 +1753,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
>    return opt_result::success ();
>  }
>  
> +/* Look for SLP-only access groups and turn each individual access into its own
> +   group.  */
> +static void
> +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
> +{
> +  unsigned int i;
> +  struct data_reference *dr;
> +
> +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
> +
> +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
> +  FOR_EACH_VEC_ELT (datarefs, i, dr)
> +    {
> +      gcc_assert (DR_REF (dr));
> +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
> +
> +      /* Check if the load is a part of an interleaving chain.  */
> +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +     {
> +       stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
> +       unsigned int group_size = DR_GROUP_SIZE (first_element);
> +
> +       /* Check if SLP-only groups.  */
> +       if (!STMT_SLP_TYPE (stmt_info)
> +           && STMT_VINFO_SLP_VECT_ONLY (first_element))
> +         {
> +           /* Dissolve the group.  */
> +           STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
> +
> +           stmt_vec_info vinfo = first_element;
> +           while (vinfo)
> +             {
> +               stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
> +               DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
> +               DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
> +               DR_GROUP_SIZE (vinfo) = 1;
> +               DR_GROUP_GAP (vinfo) = group_size - 1;
> +               vinfo = next;
> +             }
> +         }
> +     }
> +    }
> +}
> +
>  /* Function vect_analyze_loop_2.
>  
>     Apply a set of analyses on LOOP, and create a loop_vec_info struct
> @@ -1964,6 +2008,9 @@ start_over:
>       }
>      }
>  
> +  /* Dissolve SLP-only groups.  */
> +  vect_dissolve_slp_only_groups (loop_vinfo);
> +
>    /* Scan all the remaining operations in the loop that are not subject
>       to SLP and make sure they are vectorizable.  */
>    ok = vect_analyze_loop_operations (loop_vinfo);
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2a1e5b8..0d2784e 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
>       {
>         internal_fn ifn = gimple_call_internal_fn (stmt);
>         commutative_op = first_commutative_argument (ifn);
> +
> +       /* Masked load, only look at mask.  */
> +       if (ifn == IFN_MASK_LOAD)
> +         {
> +           number_of_oprnds = 1;
> +           /* Mask operand index.  */
> +           first_op_idx = 5;
> +         }
>       }
>      }
>    else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
> @@ -624,7 +632,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
>     is false then this indicates the comparison could not be
>     carried out or the stmts will never be vectorized by SLP.
>  
> -   Note COND_EXPR is possibly ismorphic to another one after swapping its
> +   Note COND_EXPR is possibly isomorphic to another one after swapping its
>     operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
>     the first stmt by swapping the two operands of comparison; set SWAP[i]
>     to 2 if stmt I is isormorphic to the first stmt by inverting the code
> @@ -1143,14 +1151,23 @@ vect_build_slp_tree_2 (vec_info *vinfo,
>                             &this_max_nunits, matches, &two_operators))
>      return NULL;
>  
> -  /* If the SLP node is a load, terminate the recursion.  */
> +  /* If the SLP node is a load, terminate the recursion unless masked.  */
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
>        && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>      {
> -      *max_nunits = this_max_nunits;
> -      (*tree_size)++;
> -      node = vect_create_new_slp_node (stmts);
> -      return node;
> +      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
> +     {
> +       /* Masked load.  */
> +       gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
> +       nops = 1;
> +     }
> +      else
> +     {
> +       *max_nunits = this_max_nunits;
> +       (*tree_size)++;
> +       node = vect_create_new_slp_node (stmts);
> +       return node;
> +     }
>      }
>  
>    /* Get at the operands, verifying they are compatible.  */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index ced4264..7122ce9 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -7622,14 +7622,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>        if (!scalar_dest)
>       return false;
>  
> -      if (slp_node != NULL)
> -     {
> -       if (dump_enabled_p ())
> -         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                          "SLP of masked loads not supported.\n");
> -       return false;
> -     }
> -
>        int mask_index = internal_fn_mask_index (ifn);
>        if (mask_index >= 0)
>       {
> @@ -7712,6 +7704,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>        first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>        group_size = DR_GROUP_SIZE (first_stmt_info);
>  
> +      /* Refuse non-SLP vectorization of SLP-only groups.  */
> +      if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                          "cannot vectorize load in non-SLP mode.\n");
> +       return false;
> +     }
> +
>        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>       slp_perm = true;
>  
> @@ -8389,8 +8390,19 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>                                         simd_lane_access_p,
>                                         byte_offset, bump);
>         if (mask)
> -         vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> -                                                  mask_vectype);
> +         {
> +           if (slp_node)
> +             {
> +               auto_vec<tree> ops (1);
> +               auto_vec<vec<tree> > vec_defs (1);
> +               ops.quick_push (mask);
> +               vect_get_slp_defs (ops, slp_node, &vec_defs);
> +               vec_mask = vec_defs[0][0];
> +             }
> +           else
> +             vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> +                                                      mask_vectype);
> +         }
>       }
>        else
>       {
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 6f59af6..62d9341 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -396,7 +396,7 @@ typedef struct _loop_vec_info : public vec_info {
>    /* Condition under which this loop is analyzed and versioned.  */
>    tree num_iters_assumptions;
>  
> -  /* Threshold of number of iterations below which vectorzation will not be
> +  /* Threshold of number of iterations below which vectorization will not be
>       performed. It is calculated from MIN_PROFITABLE_ITERS and
>       PARAM_MIN_VECT_LOOP_BOUND.  */
>    unsigned int th;
> @@ -935,6 +935,9 @@ struct _stmt_vec_info {
>       and OPERATION_BITS without changing the result.  */
>    unsigned int operation_precision;
>    signop operation_sign;
> +
> +  /* True if this is only suitable for SLP vectorization.  */
> +  bool slp_vect_only_p;
>  };
>  
>  /* Information about a gather/scatter call.  */
> @@ -1030,6 +1033,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
>  #define STMT_VINFO_NUM_SLP_USES(S)   (S)->num_slp_uses
>  #define STMT_VINFO_REDUC_TYPE(S)     (S)->reduc_type
>  #define STMT_VINFO_REDUC_DEF(S)              (S)->reduc_def
> +#define STMT_VINFO_SLP_VECT_ONLY(S)     (S)->slp_vect_only_p
>  
>  #define DR_GROUP_FIRST_ELEMENT(S) \
>    (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index d271049..4f6c65f 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -641,6 +641,7 @@ vec_info::new_stmt_vec_info (gimple *stmt)
>    STMT_VINFO_VECTORIZABLE (res) = true;
>    STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION;
>    STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK;
> +  STMT_VINFO_SLP_VECT_ONLY (res) = false;
>  
>    if (gimple_code (stmt) == GIMPLE_PHI
>        && is_loop_header_bb_p (gimple_bb (stmt)))
