> -----Original Message-----
> From: Richard Biener <[email protected]>
> Sent: Friday, August 29, 2025 12:41 PM
> To: [email protected]
> Cc: RISC-V CI <[email protected]>; Tamar Christina
> <[email protected]>
> Subject: [PATCH 4/4] Eliminate some STMT_VINFO_REDUC_IDX for
> SLP_TREE_REDUC_IDX
> 
> The following uses SLP_TREE_REDUC_IDX where it looks more appropriate.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> Tamar, can you test 1+2 (separately)?  Possibly also the full stack
> if the first part succeeds.

1+2 is successful on aarch64 with the additional fixup in patch 2, and is
clean on arm.
The full series is also clean on arm, and on aarch64 with the fixup.

The final fixup ended up being:

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1cdd5a26a83..6c9bb39fb75 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17770,13 +17770,20 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, 
vect_cost_for_stmt kind,

    with the single accumulator being read and written multiple times.  */
 static bool
-aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
+aarch64_force_single_cycle (vec_info *vinfo, slp_tree node)
 {
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   if (!STMT_VINFO_REDUC_DEF (stmt_info))
     return false;

-  auto reduc_info = info_for_reduction (vinfo, stmt_info);
-  return VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
+  auto loop_vinfo = as_a <loop_vec_info> (vinfo);
+  vect_reduc_info reduc_info;
+
+  if (loop_vinfo
+      && (reduc_info = info_for_reduction (loop_vinfo, node)))
+    return VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
+
+  return false;
 }

 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
@@ -17803,7 +17810,7 @@ aarch64_vector_costs::count_ops (unsigned int count, 
vect_cost_for_stmt kind,
        = aarch64_in_loop_reduction_latency (m_vinfo, node,
                                             stmt_info, m_vec_flags);
       if (m_costing_for_scalar
-         || aarch64_force_single_cycle (m_vinfo, stmt_info))
+         || aarch64_force_single_cycle (m_vinfo, node))
        /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
           and then accumulate that, but at the moment the loop-carried
           dependency includes all copies.  */

---

Though I wonder why info_for_reduction is only supported for loops; I thought
we had reduction support for BB vectorization.

Thanks,
Tamar

> 
> Thanks,
> Richard.
> 
>       * tree-vect-loop.cc (vect_create_epilog_for_reduction):
>       Use SLP_TREE_REDUC_IDX for following the SLP graph and
>       for identifying whether we use the 'else' in a COND.
>       (vectorizable_lane_reducing): Simplify check of whether
>       we are in a reduction.
>       (vectorizable_reduction): Add sanity checking around
>       SLP_TREE_REDUC_IDX and use it where it looks appropriate.
>       (vect_transform_reduction): Use SLP_TREE_REDUC_IDX.
>       * tree-vect-stmts.cc (vectorizable_call): Likewise.
>       (vectorizable_operation): Likewise.
>       (vectorizable_condition): Likewise.
> ---
>  gcc/tree-vect-loop.cc  | 31 +++++++++++++------------------
>  gcc/tree-vect-stmts.cc |  8 ++++----
>  2 files changed, 17 insertions(+), 22 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 4af7283485e..b187d0d8533 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5497,7 +5497,6 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
>        while (cond_node != slp_node_instance->reduc_phis)
>       {
>         stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
> -       int slp_reduc_idx;
>         if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
>           {
>             gimple *vec_stmt
> @@ -5505,16 +5504,9 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
>             gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
>             ccompares.safe_push
>               (std::make_pair (gimple_assign_rhs1 (vec_stmt),
> -                              STMT_VINFO_REDUC_IDX (cond_info) == 2));
> -           /* ???  We probably want to have REDUC_IDX on the SLP node?
> -              We have both three and four children COND_EXPR nodes
> -              dependent on whether the comparison is still embedded
> -              as GENERIC.  So work backwards.  */
> -           slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
> -                            + STMT_VINFO_REDUC_IDX (cond_info));
> +                              SLP_TREE_REDUC_IDX (cond_node) == 2));
>           }
> -       else
> -         slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
> +       int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
>         cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
>       }
>        gcc_assert (ccompares.length () != 0);
> @@ -6882,14 +6874,13 @@ vectorizable_lane_reducing (loop_vec_info
> loop_vinfo, stmt_vec_info stmt_info,
>    if (!type_has_mode_precision_p (type))
>      return false;
> 
> +  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
> +
>    /* TODO: Support lane-reducing operation that does not directly participate
>       in loop reduction.  */
> -  if (!STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
> -      || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> +  if (!reduc_info)
>      return false;
> 
> -  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
> -
>    /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
>       recoginized.  */
>    gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
> stmt_info));
> @@ -7135,7 +7126,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
>        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
>        int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> -      if (reduc_idx == -1)
> +      if (STMT_VINFO_REDUC_IDX (vdef) == -1
> +       || SLP_TREE_REDUC_IDX (vdef_slp) == -1)
>       {
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7204,7 +7196,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>         else if (!vectype_in)
>           vectype_in = SLP_TREE_VECTYPE (slp_node);
>         if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
> -         vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
> +         {
> +           gcc_assert (reduc_idx == SLP_TREE_REDUC_IDX (vdef_slp));
> +           vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
> +         }
>       }
> 
>        reduc_def = op.ops[reduc_idx];
> @@ -7361,7 +7356,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>       return false;
> 
>        /* When the condition uses the reduction value in the condition, fail. 
>  */
> -      if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
> +      if (SLP_TREE_REDUC_IDX (slp_node) == 0)
>       {
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -8001,7 +7996,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>       The last use is the reduction variable.  In case of nested cycle this
>       assumption is not true: we use reduc_index to record the index of the
>       reduction variable.  */
> -  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> +  int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
>    tree vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
> 
>    vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 77a03ed4a7b..15e0d069dcc 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3432,7 +3432,7 @@ vectorizable_call (vec_info *vinfo,
>       }
>      }
> 
> -  int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
> +  int reduc_idx = SLP_TREE_REDUC_IDX (slp_node);
>    internal_fn cond_fn = get_conditional_internal_fn (ifn);
>    internal_fn cond_len_fn = get_len_internal_fn (ifn);
>    int len_opno = internal_fn_len_index (cond_len_fn);
> @@ -6452,7 +6452,7 @@ vectorizable_operation (vec_info *vinfo,
>        using_emulated_vectors_p = true;
>      }
> 
> -  int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
> +  int reduc_idx = SLP_TREE_REDUC_IDX (slp_node);
>    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) :
> NULL);
>    vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
>    internal_fn cond_fn = get_conditional_internal_fn (code);
> @@ -6570,7 +6570,7 @@ vectorizable_operation (vec_info *vinfo,
>    else if (arith_code_with_undefined_signed_overflow (orig_code)
>          && ANY_INTEGRAL_TYPE_P (vectype)
>          && TYPE_OVERFLOW_UNDEFINED (vectype)
> -        && STMT_VINFO_REDUC_IDX (stmt_info) != -1)
> +        && SLP_TREE_REDUC_IDX (slp_node) != -1)
>      {
>        gcc_assert (orig_code == PLUS_EXPR || orig_code == MINUS_EXPR
>                 || orig_code == MULT_EXPR || orig_code ==
> POINTER_PLUS_EXPR);
> @@ -11560,7 +11560,7 @@ vectorizable_condition (vec_info *vinfo,
>    if (code != COND_EXPR)
>      return false;
> 
> -  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> +  int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
>    vect_reduction_type reduction_type = TREE_CODE_REDUCTION;
>    bool nested_cycle_p = false;
>    bool for_reduction = vect_is_reduction (stmt_info);
> --
> 2.43.0

Reply via email to