The following addresses a too conservative sanity check of SLP nodes
we want to promote external.  The issue lies in code generation
for such external which relies on get_later_stmt to figure an
insert location.  But get_later_stmt relies on the ability to
totally order stmts, specifically implementation-wise that they
are all from the same BB, which is what is verified at the moment.

The patch changes this to require stmts to be orderable by
dominance queries.  For simplicity and seemingly enough for the
testcase in PR119960, this handles the case of two distinct BBs.

Bootstrapped on x86_64-unknown-linux-gnu.

I'm considering this for GCC 15.2 given it's a recent optimization
regression.  It requires some dependences to be backported though.

        PR tree-optimization/119960
        * tree-vect-slp.cc (vect_slp_can_convert_to_external):
        Handle cases where defs from multiple BBs are ordered
        by their dominance relation.

        * gcc.dg/vect/bb-slp-pr119960-1.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c | 15 +++++
 gcc/tree-vect-slp.cc                          | 63 ++++++++++++++++---
 2 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c
new file mode 100644
index 00000000000..955fc7e3220
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double foo (double *dst, double *src, int b)
+{
+  double y = src[1];
+  if (b)
+    {
+      dst[0] = src[0];
+      dst[1] = y;
+    }
+  return y;
+}
+
+/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp2" 
{ target vect_double } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7791fe1f87f..f7c51b6cf68 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -7846,21 +7846,70 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, 
slp_tree node,
                            node, node_instance, cost_vec);
 }
 
+static int
+sort_ints (const void *a_, const void *b_)
+{
+  int a = *(const int *)a_;
+  int b = *(const int *)b_;
+  return a - b;
+}
+
 /* Verify if we can externalize a set of internal defs.  */
 
 static bool
 vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
 {
+  /* Constant generation uses get_later_stmt which can only handle
+     defs from the same BB or a set of defs that can be ordered
+     with a dominance query.  */
   basic_block bb = NULL;
+  bool all_same = true;
+  auto_vec<int> bbs;
+  bbs.reserve_exact (stmts.length ());
   for (stmt_vec_info stmt : stmts)
-    if (!stmt)
-      return false;
-    /* Constant generation uses get_later_stmt which can only handle
-       defs from the same BB.  */
-    else if (!bb)
-      bb = gimple_bb (stmt->stmt);
-    else if (gimple_bb (stmt->stmt) != bb)
+    {
+      if (!stmt)
+       return false;
+      else if (!bb)
+       bb = gimple_bb (stmt->stmt);
+      else if (gimple_bb (stmt->stmt) != bb)
+       all_same = false;
+      bbs.quick_push (gimple_bb (stmt->stmt)->index);
+    }
+  if (all_same)
+    return true;
+
+  /* Produce a vector of unique BB indexes for the defs.  */
+  bbs.qsort (sort_ints);
+  unsigned i, j;
+  for (i = 1, j = 1; i < bbs.length (); ++i)
+    if (bbs[i] != bbs[j-1])
+      bbs[j++] = bbs[i];
+  gcc_assert (j >= 2);
+  bbs.truncate (j);
+
+  if (bbs.length () == 2)
+    return (dominated_by_p (CDI_DOMINATORS,
+                           BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
+                           BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
+           || dominated_by_p (CDI_DOMINATORS,
+                              BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
+                              BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
+
+  /* ???  For more than two BBs we can sort the vector and verify the
+     result is a total order.  But we can't use vec::qsort with a
+     compare function using a dominance query since there's no way to
+     signal failure and any fallback for an unordered pair would
+     fail qsort_chk later.
+     For now simply hope that ordering after BB index provides the
+     best candidate total order.  If required we can implement our
+     own mergesort or export an entry without checking.  */
+  for (unsigned i = 1; i < bbs.length (); ++i)
+    if (!dominated_by_p (CDI_DOMINATORS,
+                        BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
+                        BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
       return false;
+
   return true;
 }
 
-- 
2.43.0

Reply via email to