On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin <li...@linux.ibm.com> wrote: > > This patch adjusts the cost handling for VMAT_ELEMENTWISE > and VMAT_STRIDED_SLP in function vectorizable_store; we no > longer call function vect_model_store_cost for them. > > Like the earlier improvement for PR82255 on the load side, this > change gets rid of unnecessary vec_to_scalar costing for some > cases with VMAT_STRIDED_SLP. A typical test case, > gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c, is added to > cover it. The change also makes the costing more consistent for > some other cases. > > Besides, this special-cases the interleaving stores for these > two affected memory access types: for interleaving stores the > whole chain is vectorized when the last store in the chain is > reached, and the other stores in the group are skipped. To stay > consistent with that, and to follow the transform-time handling > of iterating over the whole group, we cost only the first store > in the group. Ideally we would cost only the last one, but > getting hold of it is not trivial, and using the first one is > equivalent.
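>
> For illustration, a minimal sketch of the kind of interleaving store
> group this special-casing is about (the sketch is not taken from the
> added test case, and whether it ends up classified as VMAT_ELEMENTWISE
> depends on the target and on the access strategy the vectorizer picks):
>
>   void
>   interleave (int *restrict out, const int *restrict x,
>               const int *restrict y, int n)
>   {
>     for (int i = 0; i < n; i++)
>       {
>         /* Both stores below form one interleaving group; the whole
>            chain is transformed when the last store of the group is
>            reached, so only one member of the group should carry the
>            cost.  */
>         out[2 * i] = x[i];
>         out[2 * i + 1] = y[i];
>       }
>   }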
OK > gcc/ChangeLog: > > * tree-vect-stmts.cc (vect_model_store_cost): Assert it won't get > VMAT_ELEMENTWISE and VMAT_STRIDED_SLP any more, and remove their > related handling. > (vectorizable_store): Adjust the cost handling on VMAT_ELEMENTWISE > and VMAT_STRIDED_SLP without calling vect_model_store_cost. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c: New test. > --- > .../costmodel/ppc/costmodel-vect-store-1.c | 23 +++ > gcc/tree-vect-stmts.cc | 160 +++++++++++------- > 2 files changed, 120 insertions(+), 63 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > > diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > new file mode 100644 > index 00000000000..ab5f3301492 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-O3" } */ > + > +/* This test case is partially extracted from the case > + gcc.dg/vect/vect-avg-16.c; it verifies that we don't > + cost a store with vec_to_scalar when we shouldn't. */ > + > +void > +test (signed char *restrict a, signed char *restrict b, signed char *restrict c, > + int n) > +{ > + for (int j = 0; j < n; ++j) > + { > + for (int i = 0; i < 16; ++i) > + a[i] = (b[i] + c[i]) >> 1; > + a += 20; > + b += 20; > + c += 20; > + } > +} > + > +/* { dg-final { scan-tree-dump-times "vec_to_scalar" 0 "vect" } } */ > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 048c14d291c..3d01168080a 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -964,7 +964,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies, > vec_load_store_type vls_type, slp_tree slp_node, > stmt_vector_for_cost *cost_vec) > { > - gcc_assert (memory_access_type != VMAT_GATHER_SCATTER); > + gcc_assert (memory_access_type != VMAT_GATHER_SCATTER > + && memory_access_type != VMAT_ELEMENTWISE > + && memory_access_type != VMAT_STRIDED_SLP); > unsigned int inside_cost = 0, prologue_cost = 0; > stmt_vec_info first_stmt_info = stmt_info; > bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info); > @@ -1010,29 +1012,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies, > group_size); > } > > - tree vectype = STMT_VINFO_VECTYPE (stmt_info); > /* Costs of the stores. */ > - if (memory_access_type == VMAT_ELEMENTWISE) > - { > - unsigned int assumed_nunits = vect_nunits_for_cost (vectype); > - /* N scalar stores plus extracting the elements. */ > - inside_cost += record_stmt_cost (cost_vec, > - ncopies * assumed_nunits, > - scalar_store, stmt_info, 0, vect_body); > - } > - else > - vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme, > - misalignment, &inside_cost, cost_vec); > - > - if (memory_access_type == VMAT_ELEMENTWISE > - || memory_access_type == VMAT_STRIDED_SLP) > - { > - /* N scalar stores plus extracting the elements. 
*/ > - unsigned int assumed_nunits = vect_nunits_for_cost (vectype); > - inside_cost += record_stmt_cost (cost_vec, > - ncopies * assumed_nunits, > - vec_to_scalar, stmt_info, 0, vect_body); > - } > + vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme, > + misalignment, &inside_cost, cost_vec); > > /* When vectorizing a store into the function result assign > a penalty if the function returns in a multi-register location. > @@ -8416,6 +8398,18 @@ vectorizable_store (vec_info *vinfo, > "Vectorizing an unaligned access.\n"); > > STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; > + > + /* As function vect_transform_stmt shows, for interleaving stores > + the whole chain is vectorized when the last store in the chain > + is reached, and the other stores in the group are skipped. To > + stay consistent with that, we would ideally cost only the last > + store here, but getting hold of it is not trivial; since costing > + the first one is equivalent, cost the first one instead. */ > + if (grouped_store > + && !slp > + && first_stmt_info != stmt_info > + && memory_access_type == VMAT_ELEMENTWISE) > + return true; > } > gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); > > @@ -8488,14 +8482,7 @@ vectorizable_store (vec_info *vinfo, > if (memory_access_type == VMAT_ELEMENTWISE > || memory_access_type == VMAT_STRIDED_SLP) > { > - if (costing_p) > - { > - vect_model_store_cost (vinfo, stmt_info, ncopies, memory_access_type, > - alignment_support_scheme, misalignment, > - vls_type, slp_node, cost_vec); > - return true; > - } > - > + unsigned inside_cost = 0, prologue_cost = 0; > gimple_stmt_iterator incr_gsi; > bool insert_after; > gimple *incr; > @@ -8503,7 +8490,7 @@ vectorizable_store (vec_info *vinfo, > tree ivstep; > tree running_off; > tree stride_base, stride_step, alias_off; > - tree vec_oprnd; > + tree vec_oprnd = NULL_TREE; > tree dr_offset; > unsigned int g; > /* Checked by get_load_store_type. 
*/ > @@ -8609,26 +8596,30 @@ vectorizable_store (vec_info *vinfo, > lnel = const_nunits; > ltype = vectype; > lvectype = vectype; > + alignment_support_scheme = dr_align; > + misalignment = mis_align; > } > } > ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type)); > ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > } > > - ivstep = stride_step; > - ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, > - build_int_cst (TREE_TYPE (ivstep), vf)); > + if (!costing_p) > + { > + ivstep = stride_step; > + ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, > + build_int_cst (TREE_TYPE (ivstep), vf)); > > - standard_iv_increment_position (loop, &incr_gsi, &insert_after); > + standard_iv_increment_position (loop, &incr_gsi, &insert_after); > > - stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base); > - ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep); > - create_iv (stride_base, PLUS_EXPR, ivstep, NULL, > - loop, &incr_gsi, insert_after, > - &offvar, NULL); > - incr = gsi_stmt (incr_gsi); > + stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base); > + ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep); > + create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi, > + insert_after, &offvar, NULL); > + incr = gsi_stmt (incr_gsi); > > - stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step); > + stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step); > + } > > alias_off = build_int_cst (ref_type, 0); > stmt_vec_info next_stmt_info = first_stmt_info; > @@ -8636,39 +8627,76 @@ vectorizable_store (vec_info *vinfo, > for (g = 0; g < group_size; g++) > { > running_off = offvar; > - if (g) > + if (!costing_p) > { > - tree size = TYPE_SIZE_UNIT (ltype); > - tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g), > - size); > - tree newoff = copy_ssa_name (running_off, NULL); > - incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, > - running_off, pos); > - vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi); > - running_off = newoff; > + if (g) > + { > + tree size = TYPE_SIZE_UNIT (ltype); > + tree pos > + = fold_build2 (MULT_EXPR, sizetype, size_int (g), size); > + tree newoff = copy_ssa_name (running_off, NULL); > + incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, > + running_off, pos); > + vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi); > + running_off = newoff; > + } > } > if (!slp) > op = vect_get_store_rhs (next_stmt_info); > - vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, > - op, &vec_oprnds); > + if (!costing_p) > + vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op, > + &vec_oprnds); > + else if (!slp) > + { > + enum vect_def_type cdt; > + gcc_assert (vect_is_simple_use (op, vinfo, &cdt)); > + if (cdt == vect_constant_def || cdt == vect_external_def) > + prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, > + stmt_info, 0, vect_prologue); > + } > unsigned int group_el = 0; > unsigned HOST_WIDE_INT > elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); > for (j = 0; j < ncopies; j++) > { > - vec_oprnd = vec_oprnds[j]; > - /* Pun the vector to extract from if necessary. 
*/ > - if (lvectype != vectype) > + if (!costing_p) > { > - tree tem = make_ssa_name (lvectype); > - gimple *pun > - = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR, > - lvectype, vec_oprnd)); > - vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi); > - vec_oprnd = tem; > + vec_oprnd = vec_oprnds[j]; > + /* Pun the vector to extract from if necessary. */ > + if (lvectype != vectype) > + { > + tree tem = make_ssa_name (lvectype); > + tree cvt > + = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd); > + gimple *pun = gimple_build_assign (tem, cvt); > + vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi); > + vec_oprnd = tem; > + } > } > for (i = 0; i < nstores; i++) > { > + if (costing_p) > + { > + /* Only need vector extracting when there is more > + than one store. */ > + if (nstores > 1) > + inside_cost > + += record_stmt_cost (cost_vec, 1, vec_to_scalar, > + stmt_info, 0, vect_body); > + /* Take a single-lane vector type store as a scalar > + store to avoid an ICE like PR110776. */ > + if (VECTOR_TYPE_P (ltype) > + && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U)) > + vect_get_store_cost (vinfo, stmt_info, 1, > + alignment_support_scheme, > + misalignment, &inside_cost, > + cost_vec); > + else > + inside_cost > + += record_stmt_cost (cost_vec, 1, scalar_store, > + stmt_info, 0, vect_body); > + continue; > + } > tree newref, newoff; > gimple *incr, *assign; > tree size = TYPE_SIZE (ltype); > @@ -8719,6 +8747,12 @@ vectorizable_store (vec_info *vinfo, > break; > } > > + if (costing_p && dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_store_cost: inside_cost = %d, " > + "prologue_cost = %d .\n", > + inside_cost, prologue_cost); > + > return true; > } > > -- > 2.31.1 >
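A note for readers following along (an illustration only, assuming a 128-bit vector target such as power9; the exact classification is up to the target and the cost model): in the new test the sixteen contiguous char stores of the inner loop become a single V16QI store per vector copy, so nstores is 1 and the new costing records just the vector store, with no vec_to_scalar, whereas the removed VMAT_STRIDED_SLP handling in vect_model_store_cost charged ncopies * nunits vec_to_scalar entries on top of the store cost. A genuinely strided store still pays for the lane extractions under the new code, e.g. a hypothetical loop like:

  void
  scatter3 (int *restrict out, const int *restrict in, int n)
  {
    /* The stores are three ints apart, so if this is vectorized as an
       elementwise store, each vector lane is extracted (vec_to_scalar)
       and written back with a scalar store.  */
    for (int i = 0; i < n; i++)
      out[3 * i] = in[i];
  }

which is what the nstores > 1 check in the new costing block models.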