This fixes double-scaling of the inner loop scalar cost, which was
caused by routing the scalar costs through the add_stmt_cost hook
while using vect_body as the location.  The issue made almost every
outer loop vectorization appear profitable.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-08-23  Richard Biener  <rguent...@suse.de>

        * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost):
        Properly scale the inner loop cost only once.
---
 gcc/tree-vect-loop.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 995d143dbbd..c521b43a47c 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1284,6 +1284,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
          else
             kind = scalar_stmt;
 
+         /* We are using vect_prologue here to avoid scaling twice
+            by the inner loop factor.  */
          record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                            factor, kind, stmt_info, 0, vect_prologue);
         }
@@ -1297,11 +1299,13 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
                    j, si)
     (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
                          si->kind, si->stmt_info, si->vectype,
-                         si->misalign, vect_body);
-  unsigned dummy, body_cost = 0;
-  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
+                         si->misalign, si->where);
+  unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
+  finish_cost (target_cost_data, &prologue_cost, &body_cost,
+              &epilogue_cost);
   destroy_cost_data (target_cost_data);
-  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
+  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
+    = prologue_cost + body_cost + epilogue_cost;
 }
 
 
-- 
2.31.1

Reply via email to