When SLP is forced, we fail to consider single-lane SLP for a case that still ends up being discovered as hybrid (in the PR in question, this happens because we run into the SLP discovery limit due to excessive association).
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. This solves a bit of the 433.milc regression. PR tree-optimization/117874 * tree-vect-loop.cc (vect_analyze_loop_2): When non-SLP analysis fails, try single-lane SLP. * gcc.dg/vect/pr117874.c: New testcase. --- gcc/testsuite/gcc.dg/vect/pr117874.c | 50 ++++++++++++++++++++++++++++ gcc/tree-vect-loop.cc | 7 ++-- 2 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr117874.c diff --git a/gcc/testsuite/gcc.dg/vect/pr117874.c b/gcc/testsuite/gcc.dg/vect/pr117874.c new file mode 100644 index 00000000000..27e5f8ca369 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr117874.c @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +typedef struct { + double real; + double imag; +} complex; + +typedef struct { complex e[3][3]; } su3_matrix; + +void mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c) +{ + int j; + double a0r,a0i,a1r,a1i,a2r,a2i; + double b0r,b0i,b1r,b1i,b2r,b2i; + for(j=0;j<3;j++) + { + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][0].real; a1i=a->e[1][0].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][0].real; a2i=a->e[2][0].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[0][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[0][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][1].real; a0i=a->e[0][1].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][1].real; a2i=a->e[2][1].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[1][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[1][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][2].real; a0i=a->e[0][2].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + 
a1r=a->e[1][2].real; a1i=a->e[1][2].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[2][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[2][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + } +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target vect_hw_misalign } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 5a24fb8bf4c..85209604486 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -3005,10 +3005,9 @@ start_over: ok = vect_analyze_loop_operations (loop_vinfo); if (!ok) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "bad operation or unsupported loop bound.\n"); - return ok; + ok = opt_result::failure_at (vect_location, + "bad operation or unsupported loop bound\n"); + goto again; } /* For now, we don't expect to mix both masking and length approaches for one -- 2.43.0