From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>

Hi, Richard and Richi.

The len_mask_gather_load/len_mask_scatter_store patterns have been added.
Now, this patch applies them in the vectorizer.

Here is the example:

void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * 4] = b[i];
    }
}

Gimple IR:

  <bb 3> [local count: 105119324]:
  _58 = (unsigned long) n_13(D);

  <bb 4> [local count: 630715945]:
  # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
  # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
  # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
  # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  ivtmp_44 = _61 * 4;
  vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
  mask__24.10_49 = vect__4.9_47 != { 0, ... };
  vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
  ivtmp_54 = _61 * 16;
  .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, 
_61, 0, mask__24.10_49);
  vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
  vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
  vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
  ivtmp_60 = ivtmp_59 - _61;
  if (ivtmp_60 != 0)
    goto <bb 4>; [83.33%]
  else
    goto <bb 5>; [16.67%]

gcc/ChangeLog:

        * optabs-query.cc (supports_vec_gather_load_p): Apply 
LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
        (supports_vec_scatter_store_p): Ditto.
        * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vect_get_strided_load_store_ops): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.

---
 gcc/optabs-query.cc        |   2 +
 gcc/tree-vect-data-refs.cc |  15 +++-
 gcc/tree-vect-stmts.cc     | 136 ++++++++++++++++++++++++++++++++-----
 3 files changed, 134 insertions(+), 19 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
         || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+        || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
         ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
         || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+        || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
         ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..8d32eb3c83b 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, 
bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, len_mask_ifn;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When the target supports LEN_MASK_GATHER_LOAD, we always
+        use LEN_MASK_GATHER_LOAD regardless of whether len and
+        mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When the target supports LEN_MASK_SCATTER_STORE, we always
+        use LEN_MASK_SCATTER_STORE regardless of whether len and
+        mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3893,7 +3901,10 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, 
bool masked_p,
 
       /* Test whether the target supports this combination.  */
       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-                                                 offset_vectype, scale))
+                                                 offset_vectype, scale)
+         || internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
+                                                    memory_type,
+                                                    offset_vectype, scale))
        {
          *ifn_out = ifn;
          *offset_vectype_out = offset_vectype;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a0c39268bf0..1f607b7102b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info 
loop_vinfo, tree vectype,
                                                   gs_info->offset_vectype,
                                                   gs_info->scale))
        {
+         ifn = (is_load
+                ? IFN_LEN_MASK_GATHER_LOAD
+                : IFN_LEN_MASK_SCATTER_STORE);
+         if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+                                                     gs_info->memory_type,
+                                                     gs_info->offset_vectype,
+                                                     gs_info->scale))
+           {
+             vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+             vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+             return;
+           }
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "can't operate on partial vectors because"
@@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
 static void
 vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
                                 loop_vec_info loop_vinfo,
+                                gimple_stmt_iterator *gsi,
                                 gather_scatter_info *gs_info,
-                                tree *dataref_bump, tree *vec_offset)
+                                tree *dataref_bump, tree *vec_offset,
+                                vec_loop_lens *loop_lens)
 {
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
-  tree bump = size_binop (MULT_EXPR,
-                         fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
-                         size_int (TYPE_VECTOR_SUBPARTS (vectype)));
-  *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+  if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+    {
+      /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
+        ivtmp_8 = _31 * 16 (step in bytes);
+        .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
+        vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
+      tree loop_len
+       = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
+      tree tmp
+       = fold_build2 (MULT_EXPR, sizetype,
+                      fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                      loop_len);
+      tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
+      gassign *assign = gimple_build_assign (bump, tmp);
+      gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+      *dataref_bump = bump;
+    }
+  else
+    {
+      tree bump
+       = size_binop (MULT_EXPR,
+                     fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+                     size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+      *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+    }
 
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
@@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
              unsigned HOST_WIDE_INT align;
 
              tree final_mask = NULL_TREE;
+             tree final_len = NULL_TREE;
+             tree bias = NULL_TREE;
              if (loop_masks)
                final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
                                                 vec_num * ncopies,
@@ -8929,8 +8966,43 @@ vectorizable_store (vec_info *vinfo,
                  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                    vec_offset = vec_offsets[vec_num * j + i];
                  tree scale = size_int (gs_info.scale);
+
+                 if (internal_gather_scatter_fn_supported_p (
+                       IFN_LEN_MASK_SCATTER_STORE, vectype, 
gs_info.memory_type,
+                       TREE_TYPE (vec_offset), gs_info.scale))
+                   {
+                     if (loop_lens)
+                       {
+                         final_len
+                           = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                                vec_num * ncopies, vectype,
+                                                vec_num * j + i, 1);
+                       }
+                     else
+                       {
+                         tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                         final_len
+                           = build_int_cst (iv_type,
+                                            TYPE_VECTOR_SUBPARTS (vectype));
+                       }
+                     signed char biasval
+                       = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                     bias = build_int_cst (intQI_type_node, biasval);
+                     if (!final_mask)
+                       {
+                         mask_vectype = truth_type_for (vectype);
+                         final_mask = build_minus_one_cst (mask_vectype);
+                       }
+                   }
+
                  gcall *call;
-                 if (final_mask)
+                 if (final_len && final_mask)
+                   call
+                     = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
+                                                   7, dataref_ptr, vec_offset,
+                                                   scale, vec_oprnd, final_len,
+                                                   bias, final_mask);
+                 else if (final_mask)
                    call = gimple_build_call_internal
                      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
                       scale, vec_oprnd, final_mask);
@@ -9047,9 +9119,6 @@ vectorizable_store (vec_info *vinfo,
              machine_mode vmode = TYPE_MODE (vectype);
              machine_mode new_vmode = vmode;
              internal_fn partial_ifn = IFN_LAST;
-             /* Produce 'len' and 'bias' argument.  */
-             tree final_len = NULL_TREE;
-             tree bias = NULL_TREE;
              if (loop_lens)
                {
                  opt_machine_mode new_ovmode
@@ -10177,8 +10246,8 @@ vectorizable_load (vec_info *vinfo,
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
-      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
-                                      &bump, &vec_offset);
+      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+                                      &bump, &vec_offset, loop_lens);
     }
   else
     {
@@ -10339,6 +10408,8 @@ vectorizable_load (vec_info *vinfo,
          for (i = 0; i < vec_num; i++)
            {
              tree final_mask = NULL_TREE;
+             tree final_len = NULL_TREE;
+             tree bias = NULL_TREE;
              if (loop_masks
                  && memory_access_type != VMAT_INVARIANT)
                final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
@@ -10368,8 +10439,42 @@ vectorizable_load (vec_info *vinfo,
                          vec_offset = vec_offsets[vec_num * j + i];
                        tree zero = build_zero_cst (vectype);
                        tree scale = size_int (gs_info.scale);
+
+                       if (internal_gather_scatter_fn_supported_p (
+                             IFN_LEN_MASK_GATHER_LOAD, vectype,
+                             gs_info.memory_type, TREE_TYPE (vec_offset),
+                             gs_info.scale))
+                         {
+                           if (loop_lens)
+                             {
+                               final_len = vect_get_loop_len (
+                                 loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+                                 vectype, vec_num * j + i, 1);
+                             }
+                           else
+                             {
+                               tree iv_type
+                                 = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+                               final_len = build_int_cst (
+                                 iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+                             }
+                           signed char biasval
+                             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                           bias = build_int_cst (intQI_type_node, biasval);
+                           if (!final_mask)
+                             {
+                               mask_vectype = truth_type_for (vectype);
+                               final_mask = build_minus_one_cst (mask_vectype);
+                             }
+                         }
+
                        gcall *call;
-                       if (final_mask)
+                       if (final_len && final_mask)
+                         call = gimple_build_call_internal (
+                           IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+                           vec_offset, scale, zero, final_len, bias,
+                           final_mask);
+                       else if (final_mask)
                          call = gimple_build_call_internal
                            (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
                             vec_offset, scale, zero, final_mask);
@@ -10462,9 +10567,6 @@ vectorizable_load (vec_info *vinfo,
                    machine_mode vmode = TYPE_MODE (vectype);
                    machine_mode new_vmode = vmode;
                    internal_fn partial_ifn = IFN_LAST;
-                   /* Produce 'len' and 'bias' argument.  */
-                   tree final_len = NULL_TREE;
-                   tree bias = NULL_TREE;
                    if (loop_lens)
                      {
                        opt_machine_mode new_ovmode
-- 
2.36.1

Reply via email to