https://gcc.gnu.org/g:8fb3d9066266ea30de62c395239bda4e992297a3
commit r15-9782-g8fb3d9066266ea30de62c395239bda4e992297a3 Author: Richard Biener <rguent...@suse.de> Date: Tue Apr 29 13:23:41 2025 +0200 tree-optimization/119960 - failed external SLP promotion The following addresses a too conservative sanity check of SLP nodes we want to promote external. The issue lies in code generation for such external which relies on get_later_stmt to figure an insert location. But get_later_stmt relies on the ability to totally order stmts, specifically implementation-wise that they are all from the same BB, which is what is verified at the moment. The patch changes this to require stmts to be orderable by dominance queries. For simplicity and seemingly enough for the testcase in PR119960, this handles the case of two distinct BBs. PR tree-optimization/119960 * tree-vect-slp.cc (vect_slp_can_convert_to_external): Handle cases where defs from multiple BBs are ordered by their dominance relation. * gcc.dg/vect/bb-slp-pr119960-1.c: New testcase. (cherry picked from commit cc74e2f2b39b6debbef1787a087abad2108e95dd) Diff: --- gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c | 15 +++++++ gcc/tree-vect-slp.cc | 63 ++++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c new file mode 100644 index 000000000000..955fc7e32208 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr119960-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double foo (double *dst, double *src, int b) +{ + double y = src[1]; + if (b) + { + dst[0] = src[0]; + dst[1] = y; + } + return y; +} + +/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp2" { target vect_double } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index b5addd6d76bb..f5286e6b819e 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -7842,21 +7842,70 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, node, node_instance, cost_vec); } +static int +sort_ints (const void *a_, const void *b_) +{ + int a = *(const int *)a_; + int b = *(const int *)b_; + return a - b; +} + /* Verify if we can externalize a set of internal defs. */ static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts) { + /* Constant generation uses get_later_stmt which can only handle + defs from the same BB or a set of defs that can be ordered + with a dominance query. */ basic_block bb = NULL; + bool all_same = true; + auto_vec<int> bbs; + bbs.reserve_exact (stmts.length ()); for (stmt_vec_info stmt : stmts) - if (!stmt) - return false; - /* Constant generation uses get_later_stmt which can only handle - defs from the same BB. */ - else if (!bb) - bb = gimple_bb (stmt->stmt); - else if (gimple_bb (stmt->stmt) != bb) + { + if (!stmt) + return false; + else if (!bb) + bb = gimple_bb (stmt->stmt); + else if (gimple_bb (stmt->stmt) != bb) + all_same = false; + bbs.quick_push (gimple_bb (stmt->stmt)->index); + } + if (all_same) + return true; + + /* Produce a vector of unique BB indexes for the defs. */ + bbs.qsort (sort_ints); + unsigned i, j; + for (i = 1, j = 1; i < bbs.length (); ++i) + if (bbs[i] != bbs[j-1]) + bbs[j++] = bbs[i]; + gcc_assert (j >= 2); + bbs.truncate (j); + + if (bbs.length () == 2) + return (dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[0]), + BASIC_BLOCK_FOR_FN (cfun, bbs[1])) + || dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[1]), + BASIC_BLOCK_FOR_FN (cfun, bbs[0]))); + + /* ??? For more than two BBs we can sort the vector and verify the + result is a total order. But we can't use vec::qsort with a + compare function using a dominance query since there's no way to + signal failure and any fallback for an unordered pair would + fail qsort_chk later. + For now simply hope that ordering after BB index provides the + best candidate total order. If required we can implement our + own mergesort or export an entry without checking. */ + for (unsigned i = 1; i < bbs.length (); ++i) + if (!dominated_by_p (CDI_DOMINATORS, + BASIC_BLOCK_FOR_FN (cfun, bbs[i]), + BASIC_BLOCK_FOR_FN (cfun, bbs[i-1]))) return false; + return true; }