On Thu, May 30, 2024 at 4:45 PM Feng Xue OS <f...@os.amperecomputing.com> wrote: > > This is a patch that is split out from > https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652626.html. > > Checking whether an operation is lane-reducing requires comparing its code against > three kinds (DOT_PROD_EXPR/WIDEN_SUM_EXPR/SAD_EXPR). Add a utility > function to make the check concise and convenient to write in the source.
OK. Thanks, Richard. > Feng > -- > gcc/ > * tree-vectorizer.h (lane_reducing_op_p): New function. > * tree-vect-slp.cc (vect_analyze_slp): Use new function > lane_reducing_op_p to check statement code. > * tree-vect-loop.cc (vect_transform_reduction): Likewise. > (vectorizable_reduction): Likewise, and change name of a local > variable that holds the result flag. > --- > gcc/tree-vect-loop.cc | 29 ++++++++++++----------------- > gcc/tree-vect-slp.cc | 4 +--- > gcc/tree-vectorizer.h | 6 ++++++ > 3 files changed, 19 insertions(+), 20 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 04a9ac64df7..a42d79c7cbf 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -7650,9 +7650,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > gimple_match_op op; > if (!gimple_extract_op (stmt_info->stmt, &op)) > gcc_unreachable (); > - bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR > - || op.code == WIDEN_SUM_EXPR > - || op.code == SAD_EXPR); > + bool lane_reducing = lane_reducing_op_p (op.code); > > if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type) > && !SCALAR_FLOAT_TYPE_P (op.type)) > @@ -7664,7 +7662,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > /* For lane-reducing ops we're reducing the number of reduction PHIs > which means the only use of that may be in the lane-reducing operation. > */ > - if (lane_reduc_code_p > + if (lane_reducing > && reduc_chain_length != 1 > && !only_slp_reduc_chain) > { > @@ -7678,7 +7676,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > since we'll mix lanes belonging to different reductions. But it's > OK to use them in a reduction chain or when the reduction group > has just one element. 
*/ > - if (lane_reduc_code_p > + if (lane_reducing > && slp_node > && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) > && SLP_TREE_LANES (slp_node) > 1) > @@ -7738,7 +7736,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > /* To properly compute ncopies we are interested in the widest > non-reduction input type in case we're looking at a widening > accumulation that we later handle in vect_transform_reduction. */ > - if (lane_reduc_code_p > + if (lane_reducing > && vectype_op[i] > && (!vectype_in > || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) > @@ -8211,7 +8209,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > && loop_vinfo->suggested_unroll_factor == 1) > single_defuse_cycle = true; > > - if (single_defuse_cycle || lane_reduc_code_p) > + if (single_defuse_cycle || lane_reducing) > { > gcc_assert (op.code != COND_EXPR); > > @@ -8227,7 +8225,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > mixed-sign dot-products can be implemented using signed > dot-products. */ > machine_mode vec_mode = TYPE_MODE (vectype_in); > - if (!lane_reduc_code_p > + if (!lane_reducing > && !directly_supported_p (op.code, vectype_in, optab_vector)) > { > if (dump_enabled_p ()) > @@ -8252,7 +8250,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > For the other cases try without the single cycle optimization. */ > if (!ok) > { > - if (lane_reduc_code_p) > + if (lane_reducing) > return false; > else > single_defuse_cycle = false; > @@ -8263,7 +8261,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > /* If the reduction stmt is one of the patterns that have lane > reduction embedded we cannot handle the case of ! single_defuse_cycle. > */ > if ((ncopies > 1 && ! 
single_defuse_cycle) > - && lane_reduc_code_p) > + && lane_reducing) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -8274,7 +8272,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > > if (slp_node > && !(!single_defuse_cycle > - && !lane_reduc_code_p > + && !lane_reducing > && reduction_type != FOLD_LEFT_REDUCTION)) > for (i = 0; i < (int) op.num_ops; i++) > if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) > @@ -8295,7 +8293,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > /* Cost the reduction op inside the loop if transformed via > vect_transform_reduction. Otherwise this is costed by the > separate vectorizable_* routines. */ > - if (single_defuse_cycle || lane_reduc_code_p) > + if (single_defuse_cycle || lane_reducing) > { > int factor = 1; > if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info)) > @@ -8313,7 +8311,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > /* All but single defuse-cycle optimized, lane-reducing and fold-left > reductions go through their own vectorizable_* routines. 
*/ > if (!single_defuse_cycle > - && !lane_reduc_code_p > + && !lane_reducing > && reduction_type != FOLD_LEFT_REDUCTION) > { > stmt_vec_info tem > @@ -8555,10 +8553,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo, > } > > bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); > - gcc_assert (single_defuse_cycle > - || code == DOT_PROD_EXPR > - || code == WIDEN_SUM_EXPR > - || code == SAD_EXPR); > + gcc_assert (single_defuse_cycle || lane_reducing_op_p (code)); > > /* Create the destination vector */ > tree scalar_dest = gimple_get_lhs (stmt_info->stmt); > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index bc7a85d6bfc..bf1f467f53f 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -3928,9 +3928,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned > max_tree_size) > /* Do not discover SLP reductions for lane-reducing ops, > that > will fail later. */ > && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT > (next_info))) > - || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR > - && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR > - && gimple_assign_rhs_code (g) != SAD_EXPR))) > + || !lane_reducing_op_p (gimple_assign_rhs_code (g)))) > scalar_stmts.quick_push (next_info); > } > if (scalar_stmts.length () > 1) > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 479823412fd..97ec9c341e7 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -2169,6 +2169,12 @@ vect_apply_runtime_profitability_check_p > (loop_vec_info loop_vinfo) > && th >= vect_vf_for_cost (loop_vinfo)); > } > > +inline bool > +lane_reducing_op_p (code_helper code) > +{ > + return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR; > +} > + > /* Source location + hotness information. */ > extern dump_user_location_t vect_location; > > -- > 2.17.1