On Thu, May 30, 2024 at 4:45 PM Feng Xue OS <f...@os.amperecomputing.com> wrote:
>
> This patch is split out from
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652626.html.
>
> Checking whether an operation is lane-reducing requires comparing its code
> against three kinds (DOT_PROD_EXPR/WIDEN_SUM_EXPR/SAD_EXPR).  Add a utility
> function so that the check can be written concisely.
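>
> For reference, a minimal sketch of the new predicate and a typical use
> site; this just restates the tree-vectorizer.h and vectorizable_reduction
> hunks below:
>
>   inline bool
>   lane_reducing_op_p (code_helper code)
>   {
>     return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR;
>   }
>
>   /* Callers then test a statement's code directly.  */
>   bool lane_reducing = lane_reducing_op_p (op.code);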

OK.

Thanks,
Richard.

> Feng
> --
> gcc/
>         * tree-vectorizer.h (lane_reducing_op_p): New function.
>         * tree-vect-slp.cc (vect_analyze_slp): Use new function
>         lane_reducing_op_p to check statement code.
>         * tree-vect-loop.cc (vect_transform_reduction): Likewise.
>         (vectorizable_reduction): Likewise, and change name of a local
>         variable that holds the result flag.
> ---
>  gcc/tree-vect-loop.cc | 29 ++++++++++++-----------------
>  gcc/tree-vect-slp.cc  |  4 +---
>  gcc/tree-vectorizer.h |  6 ++++++
>  3 files changed, 19 insertions(+), 20 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 04a9ac64df7..a42d79c7cbf 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7650,9 +7650,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    gimple_match_op op;
>    if (!gimple_extract_op (stmt_info->stmt, &op))
>      gcc_unreachable ();
> -  bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
> -                           || op.code == WIDEN_SUM_EXPR
> -                           || op.code == SAD_EXPR);
> +  bool lane_reducing = lane_reducing_op_p (op.code);
>
>    if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
>        && !SCALAR_FLOAT_TYPE_P (op.type))
> @@ -7664,7 +7662,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>    /* For lane-reducing ops we're reducing the number of reduction PHIs
>       which means the only use of that may be in the lane-reducing operation.  */
> -  if (lane_reduc_code_p
> +  if (lane_reducing
>        && reduc_chain_length != 1
>        && !only_slp_reduc_chain)
>      {
> @@ -7678,7 +7676,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>       since we'll mix lanes belonging to different reductions.  But it's
>       OK to use them in a reduction chain or when the reduction group
>       has just one element.  */
> -  if (lane_reduc_code_p
> +  if (lane_reducing
>        && slp_node
>        && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
>        && SLP_TREE_LANES (slp_node) > 1)
> @@ -7738,7 +7736,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        /* To properly compute ncopies we are interested in the widest
>          non-reduction input type in case we're looking at a widening
>          accumulation that we later handle in vect_transform_reduction.  */
> -      if (lane_reduc_code_p
> +      if (lane_reducing
>           && vectype_op[i]
>           && (!vectype_in
>               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> @@ -8211,7 +8209,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        && loop_vinfo->suggested_unroll_factor == 1)
>      single_defuse_cycle = true;
>
> -  if (single_defuse_cycle || lane_reduc_code_p)
> +  if (single_defuse_cycle || lane_reducing)
>      {
>        gcc_assert (op.code != COND_EXPR);
>
> @@ -8227,7 +8225,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>          mixed-sign dot-products can be implemented using signed
>          dot-products.  */
>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> -      if (!lane_reduc_code_p
> +      if (!lane_reducing
>           && !directly_supported_p (op.code, vectype_in, optab_vector))
>          {
>            if (dump_enabled_p ())
> @@ -8252,7 +8250,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>           For the other cases try without the single cycle optimization.  */
>        if (!ok)
>         {
> -         if (lane_reduc_code_p)
> +         if (lane_reducing)
>             return false;
>           else
>             single_defuse_cycle = false;
> @@ -8263,7 +8261,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    /* If the reduction stmt is one of the patterns that have lane
>       reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
>    if ((ncopies > 1 && ! single_defuse_cycle)
> -      && lane_reduc_code_p)
> +      && lane_reducing)
>      {
>        if (dump_enabled_p ())
>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -8274,7 +8272,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>    if (slp_node
>        && !(!single_defuse_cycle
> -          && !lane_reduc_code_p
> +          && !lane_reducing
>            && reduction_type != FOLD_LEFT_REDUCTION))
>      for (i = 0; i < (int) op.num_ops; i++)
>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> @@ -8295,7 +8293,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    /* Cost the reduction op inside the loop if transformed via
>       vect_transform_reduction.  Otherwise this is costed by the
>       separate vectorizable_* routines.  */
> -  if (single_defuse_cycle || lane_reduc_code_p)
> +  if (single_defuse_cycle || lane_reducing)
>      {
>        int factor = 1;
>        if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
> @@ -8313,7 +8311,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>    /* All but single defuse-cycle optimized, lane-reducing and fold-left
>       reductions go through their own vectorizable_* routines.  */
>    if (!single_defuse_cycle
> -      && !lane_reduc_code_p
> +      && !lane_reducing
>        && reduction_type != FOLD_LEFT_REDUCTION)
>      {
>        stmt_vec_info tem
> @@ -8555,10 +8553,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>      }
>
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> -  gcc_assert (single_defuse_cycle
> -             || code == DOT_PROD_EXPR
> -             || code == WIDEN_SUM_EXPR
> -             || code == SAD_EXPR);
> +  gcc_assert (single_defuse_cycle || lane_reducing_op_p (code));
>
>    /* Create the destination vector  */
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index bc7a85d6bfc..bf1f467f53f 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3928,9 +3928,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
>                   /* Do not discover SLP reductions for lane-reducing ops, that
>                      will fail later.  */
>                   && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
> -                     || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR
> -                         && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
> -                         && gimple_assign_rhs_code (g) != SAD_EXPR)))
> +                     || !lane_reducing_op_p (gimple_assign_rhs_code (g))))
>                 scalar_stmts.quick_push (next_info);
>             }
>           if (scalar_stmts.length () > 1)
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 479823412fd..97ec9c341e7 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2169,6 +2169,12 @@ vect_apply_runtime_profitability_check_p (loop_vec_info loop_vinfo)
>           && th >= vect_vf_for_cost (loop_vinfo));
>  }
>
> +inline bool
> +lane_reducing_op_p (code_helper code)
> +{
> +  return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR;
> +}
> +
>  /* Source location + hotness information. */
>  extern dump_user_location_t vect_location;
>
> --
> 2.17.1
