https://gcc.gnu.org/g:855b61b61e63b17cc9770cbe1c5387e4f59c1ffe

commit r15-7985-g855b61b61e63b17cc9770cbe1c5387e4f59c1ffe
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Wed Mar 12 09:40:10 2025 +0000

    vect: Fix ncopies when costing SLP reductions [PR116901]
    
    pr110625_[24].c started failing after r15-1329-gd66b820f392aa9a7,
    which switched to single def-use cycles for single-lane SLP.
    The problem is that we only costed one vector accumulator
    operation for an N-vector cycle.
    
    The problem seems to have been latent, and meant that we also
    only costed one FADDA for reduc_strict_4.c and reduc_strict_5.c,
    even though they need 4 and 6 FADDAs respectively.
    
    I'm not sure why:
    
       if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
           && ncopies > 1)
    
    was previously only necessary for non-SLP, but the patch preserves
    that for safety.
    
    gcc/
            PR tree-optimization/116901
            * tree-vect-loop.cc (vectorizable_reduction): Set ncopies to
            SLP_TREE_NUMBER_OF_VEC_STMTS for SLP.
    
    gcc/testsuite/
            PR tree-optimization/116901
            * gcc.target/aarch64/sve/reduc_strict_4.c: Turn off costing.
            * gcc.target/aarch64/sve/reduc_strict_5.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c |  2 +-
 gcc/tree-vect-loop.cc                                 | 14 +++++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
index 9a12edad42ec..8dad5ee60166 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_4.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
 
 double mat[100][8];
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
index 7c3068fe87ad..9e117812d340 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
 
 double mat[100][12];
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 52533623cab9..9413dcef7025 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8180,7 +8180,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     return false;
 
   if (slp_node)
-    ncopies = 1;
+    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   else
     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
 
@@ -8288,7 +8288,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        || reduction_type == CONST_COND_REDUCTION
        || reduction_type == EXTRACT_LAST_REDUCTION)
       && slp_node
-      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
+      && ncopies > 1)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -8297,6 +8297,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     }
 
   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
+      && !slp_node
       && ncopies > 1)
     {
       if (dump_enabled_p ())
@@ -8523,11 +8524,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
    participating.  When unrolling we want each unrolled iteration to have its
    own reduction accumulator since one of the main goals of unrolling a
    reduction is to reduce the aggregate loop-carried latency.  */
-  if ((ncopies > 1
-       || (slp_node
-          && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
-          && SLP_TREE_LANES (slp_node) == 1
-          && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
+  if (ncopies > 1
+      && (!slp_node
+         || (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && SLP_TREE_LANES (slp_node) == 1))
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
       && reduc_chain_length == 1
       && loop_vinfo->suggested_unroll_factor == 1)

Reply via email to