On Tue, May 23, 2023 at 5:01 AM Kewen.Lin <li...@linux.ibm.com> wrote: > > Hi Richi, > > Thanks for the review! > > on 2023/5/22 21:44, Richard Biener wrote: > > On Wed, May 17, 2023 at 8:15 AM Kewen.Lin <li...@linux.ibm.com> wrote: > >> > >> Hi, > >> > >> Following Richi's suggestion in [1], I'm working on deferring > >> cost evaluation next to the transformation, this patch is > >> to enhance function vect_transform_slp_perm_load_1 which > >> could under-cost for vector permutation, since the costing > >> doesn't try to consider nvectors_per_build, it's inconsistent > >> with the transformation part. > >> > >> Bootstrapped and regtested on x86_64-redhat-linux, > >> aarch64-linux-gnu and powerpc64{,le}-linux-gnu. > >> > >> Is it ok for trunk? > >> > >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/563624.html > >> > >> BR, > >> Kewen > >> ----- > >> gcc/ChangeLog: > >> > >> * tree-vect-slp.cc (vect_transform_slp_perm_load_1): Adjust the > >> calculation on n_perms by considering nvectors_per_build. > >> > >> gcc/testsuite/ChangeLog: > >> > >> * gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c: New test. > >> --- > >> .../vect/costmodel/ppc/costmodel-slp-perm.c | 23 +++++++ > >> gcc/tree-vect-slp.cc | 66 ++++++++++--------- > >> 2 files changed, 57 insertions(+), 32 deletions(-) > >> create mode 100644 > >> gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c > >> > >> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c > >> b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c > >> new file mode 100644 > >> index 00000000000..e5c4dceddfb > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c > >> @@ -0,0 +1,23 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-require-effective-target vect_int } */ > >> +/* { dg-require-effective-target powerpc_p9vector_ok } */ > >> +/* Specify power9 to ensure the vectorization is profitable > >> + and test point stands, otherwise it could be not profitable > >> + to vectorize. */ > >> +/* { dg-additional-options "-mdejagnu-cpu=power9 -mpower9-vector" } */ > >> + > >> +/* Verify we cost the exact count for required vec_perm. */ > >> + > >> +int x[1024], y[1024]; > >> + > >> +void > >> +foo () > >> +{ > >> + for (int i = 0; i < 512; ++i) > >> + { > >> + x[2 * i] = y[1023 - (2 * i)]; > >> + x[2 * i + 1] = y[1023 - (2 * i + 1)]; > >> + } > >> +} > >> + > >> +/* { dg-final { scan-tree-dump-times "2 times vec_perm" 1 "vect" } } */ > >> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > >> index e5c9d7e766e..af9a6dd4fa9 100644 > >> --- a/gcc/tree-vect-slp.cc > >> +++ b/gcc/tree-vect-slp.cc > >> @@ -8115,12 +8115,12 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, > >> slp_tree node, > >> > >> mode = TYPE_MODE (vectype); > >> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > >> + unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); > >> > >> /* Initialize the vect stmts of NODE to properly insert the generated > >> stmts later. */ > >> if (! analyze_only) > >> - for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); > >> - i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) > >> + for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); i < nstmts; > >> i++) > >> SLP_TREE_VEC_STMTS (node).quick_push (NULL); > >> > >> /* Generate permutation masks for every NODE. Number of masks for each > >> NODE > >> @@ -8161,7 +8161,10 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, > >> slp_tree node, > >> (b) the permutes only need a single vector input. */ > >> mask.new_vector (nunits, group_size, 3); > >> nelts_to_build = mask.encoded_nelts (); > >> - nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); > >> + /* It's possible to obtain zero nstmts during analyze_only, so make > >> + it at least one to ensure the later computation for n_perms > >> + proceed. */ > >> + nvectors_per_build = nstmts > 0 ? nstmts : 1; > >> in_nlanes = DR_GROUP_SIZE (stmt_info) * 3; > >> } > >> else > >> @@ -8252,40 +8255,39 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, > >> slp_tree node, > >> return false; > >> } > >> > >> - ++*n_perms; > >> - > >> + tree mask_vec = NULL_TREE; > >> if (!analyze_only) > >> - { > >> - tree mask_vec = vect_gen_perm_mask_checked (vectype, > >> indices); > >> + mask_vec = vect_gen_perm_mask_checked (vectype, indices); > >> > >> - if (second_vec_index == -1) > >> - second_vec_index = first_vec_index; > >> + if (second_vec_index == -1) > >> + second_vec_index = first_vec_index; > >> > >> - for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) > >> + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) > >> + { > >> + ++*n_perms; > > > > So the "real" change is doing > > > > *n_perms += nvectors_per_build; > > > > and *n_perms was unused when !analyze_only? And since at > > Yes, although both !analyze_only and analyze_only calls pass n_perms in, now > only the call sites with analyze_only will use the returned n_perms further. > > > analysis time we (sometimes?) have zero nvectors you have to > > fixup above? Which cases are that? > > Yes, the fixup is to avoid to result in unexpected n_perms in function > vect_optimize_slp_pass::internal_node_cost。 One typical case is > gcc.dg/vect/bb-slp-50.c, without special casing zero, slp2 fails to optimize > out one more vec_perm unexpectedly. > > In vect_optimize_slp_pass::internal_node_cost, it checks if the returned > n_perms > is zero or not (vec_perm not needed or needed). > > if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL, > nullptr, vf, true, false, > &n_perms)) > { > auto rep = SLP_TREE_REPRESENTATIVE (node); > if (out_layout_i == 0) > { > /* Use the fallback cost if the load is an N-to-N permutation. > Otherwise assume that the node will be rejected later > and rebuilt from scalars. */ > if (STMT_VINFO_GROUPED_ACCESS (rep) > && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep)) > == SLP_TREE_LANES (node))) > return fallback_cost; > return 0; > } > return -1; > } > > /* See the comment above the corresponding VEC_PERM_EXPR handling. */ > return n_perms == 0 ? 0 : 1; > > In vect_optimize_slp_pass::forward_pass (), it only considers the case that > factor > 0 (there is some vec_perm needed). > > /* Accumulate the cost of using LAYOUT_I within NODE, > both for the inputs and the outputs. */ > int factor = internal_node_cost (vertex.node, layout_i, > layout_i); > if (factor < 0) > { > is_possible = false; > break; > } > else if (factor) > layout_costs.internal_cost.add_serial_cost > ({ vertex.weight * factor, m_optimize_size });
Ah, OK - thanks for clarifying. The patch is OK. Richard. > BR, > Kewen > > > > > In principle the patch looks good to me. > > > > Richard. > > > >> + if (analyze_only) > >> + continue; > >> + /* Generate the permute statement if necessary. */ > >> + tree first_vec = dr_chain[first_vec_index + ri]; > >> + tree second_vec = dr_chain[second_vec_index + ri]; > >> + gassign *stmt = as_a<gassign *> (stmt_info->stmt); > >> + tree perm_dest > >> + = vect_create_destination_var (gimple_assign_lhs > >> (stmt), > >> + vectype); > >> + perm_dest = make_ssa_name (perm_dest); > >> + gimple *perm_stmt > >> + = gimple_build_assign (perm_dest, VEC_PERM_EXPR, > >> first_vec, > >> + second_vec, mask_vec); > >> + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, > >> + gsi); > >> + if (dce_chain) > >> { > >> - /* Generate the permute statement if necessary. */ > >> - tree first_vec = dr_chain[first_vec_index + ri]; > >> - tree second_vec = dr_chain[second_vec_index + ri]; > >> - gassign *stmt = as_a<gassign *> (stmt_info->stmt); > >> - tree perm_dest > >> - = vect_create_destination_var (gimple_assign_lhs > >> (stmt), > >> - vectype); > >> - perm_dest = make_ssa_name (perm_dest); > >> - gimple *perm_stmt > >> - = gimple_build_assign (perm_dest, VEC_PERM_EXPR, > >> - first_vec, second_vec, > >> mask_vec); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> perm_stmt, > >> - gsi); > >> - if (dce_chain) > >> - { > >> - bitmap_set_bit (used_defs, first_vec_index + ri); > >> - bitmap_set_bit (used_defs, second_vec_index + > >> ri); > >> - } > >> - > >> - /* Store the vector statement in NODE. */ > >> - SLP_TREE_VEC_STMTS (node) [vect_stmts_counter++] > >> - = perm_stmt; > >> + bitmap_set_bit (used_defs, first_vec_index + ri); > >> + bitmap_set_bit (used_defs, second_vec_index + ri); > >> } > >> + > >> + /* Store the vector statement in NODE. */ > >> + SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = > >> perm_stmt; > >> } > >> } > >> else if (!analyze_only) > >> -- > >> 2.39.1 >