[gcc r14-10440] tree-optimization/115841 - reduction epilogue placement issue

Richard Biener via Gcc-cvs Tue, 16 Jul 2024 07:24:25 -0700

https://gcc.gnu.org/g:59ed01d5e3d2b0e59163d3248bdba9f1e35de599


commit r14-10440-g59ed01d5e3d2b0e59163d3248bdba9f1e35de599
Author: Richard Biener <[email protected]>
Date:   Tue Jul 16 11:53:17 2024 +0200

    tree-optimization/115841 - reduction epilogue placement issue
    
    When emitting the compensation to the vectorized main loop for
    a vector reduction value to be re-used in the vectorized epilogue
    we fail to place it in the correct block when the main loop is
    known to be entered (no loop_vinfo->main_loop_edge) but the
    epilogue is not (a loop_vinfo->skip_this_loop_edge).  The code
    currently disregards this situation.
    
    With the recent znver4 cost fix I couldn't trigger this situation
    with the testcase but I adjusted it so it could eventually trigger
    on other targets.
    
            PR tree-optimization/115841
            * tree-vect-loop.cc (vect_transform_cycle_phi): Correctly
            place the partial vector reduction for the accumulator
            re-use when the main loop cannot be skipped but the
            epilogue can.
    
            * gcc.dg/vect/pr115841.c: New testcase.
    
    (cherry picked from commit 016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115841.c | 42 ++++++++++++++++++++++++++++++++++++
 gcc/tree-vect-loop.cc                |  7 +++---
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115841.c b/gcc/testsuite/gcc.dg/vect/pr115841.c
new file mode 100644
index 000000000000..aa5c66004a03
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115841.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fcommon -fvect-cost-model=dynamic --param vect-partial-vector-usage=1" } */
+/* { dg-additional-options "-mavx512vl" { target avx512vl } } */
+
+/* To trigger the bug costing needs to determine that aligning the A170
+   accesses with a prologue is good and there should be a vectorized
+   epilogue with a smaller vector size, re-using the vector accumulator
+   from the vectorized main loop that's statically known to execute
+   but the epilogue loop is not.  */
+
+static unsigned char xl[192];
+unsigned char A170[192*3];
+
+void jerate (unsigned char *, unsigned char *);
+float foo (unsigned n)
+{
+  jerate (xl, A170);
+
+  unsigned i = 32;
+  int kr = 1;
+  float sfn11s = 0.f;
+  float sfn12s = 0.f;
+  do
+    {
+      int krm1 = kr - 1;
+      long j = krm1;
+      float a = (*(float(*)[n])A170)[j];
+      float b = (*(float(*)[n])xl)[j];
+      float c = a * b;
+      float d = c * 6.93149983882904052734375e-1f;
+      float e = (*(float(*)[n])A170)[j+48];
+      float f = (*(float(*)[n])A170)[j+96];
+      float g = d * e;
+      sfn11s = sfn11s + g;
+      float h = f * d;
+      sfn12s = sfn12s + h;
+      kr++;
+    }
+  while (--i != 0);
+  float tem = sfn11s + sfn12s;
+  return tem;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 832399f7e9d7..feed73585921 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8880,14 +8880,15 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
          /* And the reduction could be carried out using a different sign.  */
          if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
            def = gimple_convert (&stmts, vectype_out, def);
-         if (loop_vinfo->main_loop_edge)
+         edge e;
+         if ((e = loop_vinfo->main_loop_edge)
+             || (e = loop_vinfo->skip_this_loop_edge))
            {
              /* While we'd like to insert on the edge this will split
                 blocks and disturb bookkeeping, we also will eventually
                 need this on the skip edge.  Rely on sinking to
                 fixup optimal placement and insert in the pred.  */
-             gimple_stmt_iterator gsi
-               = gsi_last_bb (loop_vinfo->main_loop_edge->src);
+             gimple_stmt_iterator gsi = gsi_last_bb (e->src);
              /* Insert before a cond that eventually skips the
                 epilogue.  */
              if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))

[gcc r14-10440] tree-optimization/115841 - reduction epilogue placement issue

Reply via email to