On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin <li...@linux.ibm.com> wrote: > > This patch adjusts the cost handling for VMAT_ELEMENTWISE > and VMAT_STRIDED_SLP in function vectorizable_store; we no > longer call function vect_model_store_cost for them. > > Like the earlier improvement for PR82255 on the load side, this > change gets rid of unnecessary vec_to_scalar costing for some > cases with VMAT_STRIDED_SLP. A typical test case, > gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c, is added to > cover it. The change also makes the costing more consistent for > some other cases. > > Besides, this special-cases the interleaving stores for these > two affected memory access types: for interleaving stores the > whole chain is vectorized when the last store in the chain is > reached, and the other stores in the group are skipped. To stay > consistent with that, and to follow the transform-time handling > of iterating over the whole group, we cost only the first store > in the group. Ideally we would cost only the last one, but > getting hold of it is not trivial, and using the first one is > equivalent.
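>
> For illustration, a minimal sketch of the kind of interleaving store
> group this special-casing is about (the sketch is not taken from the
> added test case, and whether it ends up classified as VMAT_ELEMENTWISE
> depends on the target and on the access strategy the vectorizer picks):
>
>   void
>   interleave (int *restrict out, const int *restrict x,
>               const int *restrict y, int n)
>   {
>     for (int i = 0; i < n; i++)
>       {
>         /* Both stores below form one interleaving group; the whole
>            chain is transformed when the last store of the group is
>            reached, so only one member of the group should carry the
>            cost.  */
>         out[2 * i] = x[i];
>         out[2 * i + 1] = y[i];
>       }
>   }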
OK > gcc/ChangeLog: > > * tree-vect-stmts.cc (vect_model_store_cost): Assert it won't get > VMAT_ELEMENTWISE and VMAT_STRIDED_SLP any more, and remove their > related handling. > (vectorizable_store): Adjust the cost handling on VMAT_ELEMENTWISE > and VMAT_STRIDED_SLP without calling vect_model_store_cost. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c: New test. > --- > .../costmodel/ppc/costmodel-vect-store-1.c | 23 +++ > gcc/tree-vect-stmts.cc | 160 +++++++++++------- > 2 files changed, 120 insertions(+), 63 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > > diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > new file mode 100644 > index 00000000000..ab5f3301492 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-O3" } */ > + > +/* This test case is partially extracted from the case > + gcc.dg/vect/vect-avg-16.c; it verifies that we don't > + cost a store with vec_to_scalar when we shouldn't. */ > + > +void > +test (signed char *restrict a, signed char *restrict b, signed char *restrict c, > + int n) > +{ > + for (int j = 0; j < n; ++j) > + { > + for (int i = 0; i < 16; ++i) > + a[i] = (b[i] + c[i]) >> 1; > + a += 20; > + b += 20; > + c += 20; > + } > +} > + > +/* { dg-final { scan-tree-dump-times "vec_to_scalar" 0 "vect" } } */ > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 048c14d291c..3d01168080a 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -964,7 +964,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies, > vec_load_store_type vls_type, slp_tree slp_node, > stmt_vector_for_cost *cost_vec) > { > - gcc_assert (memory_access_type != VMAT_GATHER_SCATTER); > + gcc_assert (memory_access_type != VMAT_GATHER_SCATTER > + && memory_access_type != VMAT_ELEMENTWISE > + && memory_access_type != VMAT_STRIDED_SLP); > unsigned int inside_cost = 0, prologue_cost = 0; > stmt_vec_info first_stmt_info = stmt_info; > bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info); > @@ -1010,29 +1012,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies, > group_size); > } > > - tree vectype = STMT_VINFO_VECTYPE (stmt_info); > /* Costs of the stores. */ > - if (memory_access_type == VMAT_ELEMENTWISE) > - { > - unsigned int assumed_nunits = vect_nunits_for_cost (vectype); > - /* N scalar stores plus extracting the elements. */ > - inside_cost += record_stmt_cost (cost_vec, > - ncopies * assumed_nunits, > - scalar_store, stmt_info, 0, vect_body); > - } > - else > - vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme, > - misalignment, &inside_cost, cost_vec); > - > - if (memory_access_type == VMAT_ELEMENTWISE > - || memory_access_type == VMAT_STRIDED_SLP) > - { > - /* N scalar stores plus extracting the elements. 
*/ > - unsigned int assumed_nunits = vect_nunits_for_cost (vectype); > - inside_cost += record_stmt_cost (cost_vec, > - ncopies * assumed_nunits, > - vec_to_scalar, stmt_info, 0, vect_body); > - } > + vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme, > + misalignment, &inside_cost, cost_vec); > > /* When vectorizing a store into the function result assign > a penalty if the function returns in a multi-register location. > @@ -8416,6 +8398,18 @@ vectorizable_store (vec_info *vinfo, > "Vectorizing an unaligned access.\n"); > > STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; > + > + /* As function vect_transform_stmt shows, for interleaving stores > + the whole chain is vectorized when the last store in the chain > + is reached, and the other stores in the group are skipped. To > + stay consistent with that, we would ideally cost only the last > + store here, but getting hold of it is not trivial; since costing > + the first one is equivalent, cost the first one instead. */ > + if (grouped_store > + && !slp > + && first_stmt_info != stmt_info > + && memory_access_type == VMAT_ELEMENTWISE) > + return true; > } > gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); > > @@ -8488,14 +8482,7 @@ vectorizable_store (vec_info *vinfo, > if (memory_access_type == VMAT_ELEMENTWISE > || memory_access_type == VMAT_STRIDED_SLP) > { > - if (costing_p) > - { > - vect_model_store_cost (vinfo, stmt_info, ncopies, memory_access_type, > - alignment_support_scheme, misalignment, > - vls_type, slp_node, cost_vec); > - return true; > - } > - > + unsigned inside_cost = 0, prologue_cost = 0; > gimple_stmt_iterator incr_gsi; > bool insert_after; > gimple *incr; > @@ -8503,7 +8490,7 @@ vectorizable_store (vec_info *vinfo, > tree ivstep; > tree running_off; > tree stride_base, stride_step, alias_off; > - tree vec_oprnd; > + tree vec_oprnd = NULL_TREE; > tree dr_offset; > unsigned int g; > /* Checked by get_load_store_type. 
*/ > @@ -8609,26 +8596,30 @@ vectorizable_store (vec_info *vinfo, > lnel = const_nunits; > ltype = vectype; > lvectype = vectype; > + alignment_support_scheme = dr_align; > + misalignment = mis_align; > } > } > ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type)); > ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > } > > - ivstep = stride_step; > - ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, > - build_int_cst (TREE_TYPE (ivstep), vf)); > + if (!costing_p) > + { > + ivstep = stride_step; > + ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, > + build_int_cst (TREE_TYPE (ivstep), vf)); > > - standard_iv_increment_position (loop, &incr_gsi, &insert_after); > + standard_iv_increment_position (loop, &incr_gsi, &insert_after); > > - stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base); > - ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep); > - create_iv (stride_base, PLUS_EXPR, ivstep, NULL, > - loop, &incr_gsi, insert_after, > - &offvar, NULL); > - incr = gsi_stmt (incr_gsi); > + stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base); > + ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep); > + create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi, > + insert_after, &offvar, NULL); > + incr = gsi_stmt (incr_gsi); > > - stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step); > + stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step); > + } > > alias_off = build_int_cst (ref_type, 0); > stmt_vec_info next_stmt_info = first_stmt_info; > @@ -8636,39 +8627,76 @@ vectorizable_store (vec_info *vinfo, > for (g = 0; g < group_size; g++) > { > running_off = offvar; > - if (g) > + if (!costing_p) > { > - tree size = TYPE_SIZE_UNIT (ltype); > - tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g), > - size); > - tree newoff = copy_ssa_name (running_off, NULL); > - incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, > - running_off, pos); > - vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi); > - running_off = newoff; > + if (g) > + { > + tree size = TYPE_SIZE_UNIT (ltype); > + tree pos > + = fold_build2 (MULT_EXPR, sizetype, size_int (g), size); > + tree newoff = copy_ssa_name (running_off, NULL); > + incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, > + running_off, pos); > + vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi); > + running_off = newoff; > + } > } > if (!slp) > op = vect_get_store_rhs (next_stmt_info); > - vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, > - op, &vec_oprnds); > + if (!costing_p) > + vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op, > + &vec_oprnds); > + else if (!slp) > + { > + enum vect_def_type cdt; > + gcc_assert (vect_is_simple_use (op, vinfo, &cdt)); > + if (cdt == vect_constant_def || cdt == vect_external_def) > + prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, > + stmt_info, 0, vect_prologue); > + } > unsigned int group_el = 0; > unsigned HOST_WIDE_INT > elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); > for (j = 0; j < ncopies; j++) > { > - vec_oprnd = vec_oprnds[j]; > - /* Pun the vector to extract from if necessary. 
*/ > - if (lvectype != vectype) > + if (!costing_p) > { > - tree tem = make_ssa_name (lvectype); > - gimple *pun > - = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR, > - lvectype, vec_oprnd)); > - vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi); > - vec_oprnd = tem; > + vec_oprnd = vec_oprnds[j]; > + /* Pun the vector to extract from if necessary. */ > + if (lvectype != vectype) > + { > + tree tem = make_ssa_name (lvectype); > + tree cvt > + = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd); > + gimple *pun = gimple_build_assign (tem, cvt); > + vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi); > + vec_oprnd = tem; > + } > } > for (i = 0; i < nstores; i++) > { > + if (costing_p) > + { > + /* Only need vector extracting when there is more > + than one store. */ > + if (nstores > 1) > + inside_cost > + += record_stmt_cost (cost_vec, 1, vec_to_scalar, > + stmt_info, 0, vect_body); > + /* Take a single-lane vector type store as a scalar > + store to avoid an ICE like PR110776. */ > + if (VECTOR_TYPE_P (ltype) > + && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U)) > + vect_get_store_cost (vinfo, stmt_info, 1, > + alignment_support_scheme, > + misalignment, &inside_cost, > + cost_vec); > + else > + inside_cost > + += record_stmt_cost (cost_vec, 1, scalar_store, > + stmt_info, 0, vect_body); > + continue; > + } > tree newref, newoff; > gimple *incr, *assign; > tree size = TYPE_SIZE (ltype); > @@ -8719,6 +8747,12 @@ vectorizable_store (vec_info *vinfo, > break; > } > > + if (costing_p && dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_store_cost: inside_cost = %d, " > + "prologue_cost = %d .\n", > + inside_cost, prologue_cost); > + > return true; > } > > -- > 2.31.1 >
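A note for readers following along (an illustration only, assuming a 128-bit vector target such as power9; the exact classification is up to the target and the cost model): in the new test the sixteen contiguous char stores of the inner loop become a single V16QI store per vector copy, so nstores is 1 and the new costing records just the vector store, with no vec_to_scalar, whereas the removed VMAT_STRIDED_SLP handling in vect_model_store_cost charged ncopies * nunits vec_to_scalar entries on top of the store cost. A genuinely strided store still pays for the lane extractions under the new code, e.g. a hypothetical loop like:

  void
  scatter3 (int *restrict out, const int *restrict in, int n)
  {
    /* The stores are three ints apart, so if this is vectorized as an
       elementwise store, each vector lane is extracted (vec_to_scalar)
       and written back with a scalar store.  */
    for (int i = 0; i < n; i++)
      out[3 * i] = in[i];
  }

which is what the nstores > 1 check in the new costing block models.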