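With SLP forced, we fail to consider single-lane SLP for a case that
we still end up discovering as hybrid (in the PR in question this
happens because we run into the SLP discovery limit due to excessive
association).  Fix this by jumping to the 'again' label instead of
returning when vect_analyze_loop_operations fails, so that single-lane
SLP gets tried.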

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

This recovers part of the 433.milc regression.

        PR tree-optimization/117874
        * tree-vect-loop.cc (vect_analyze_loop_2): When non-SLP
        analysis fails, try single-lane SLP.

        * gcc.dg/vect/pr117874.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr117874.c | 50 ++++++++++++++++++++++++++++
 gcc/tree-vect-loop.cc                |  7 ++--
 2 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr117874.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr117874.c b/gcc/testsuite/gcc.dg/vect/pr117874.c
new file mode 100644
index 00000000000..27e5f8ca369
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr117874.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+typedef struct {
+    double real;
+    double imag;
+} complex;
+
+typedef struct { complex e[3][3]; } su3_matrix;
+
+void mult_su3_an(su3_matrix *a, su3_matrix *b, su3_matrix *c)
+{
+  int j;
+  double a0r,a0i,a1r,a1i,a2r,a2i;
+  double b0r,b0i,b1r,b1i,b2r,b2i;
+  for(j=0;j<3;j++)
+    {
+      a0r=a->e[0][0].real; a0i=a->e[0][0].imag;
+      b0r=b->e[0][j].real; b0i=b->e[0][j].imag;
+      a1r=a->e[1][0].real; a1i=a->e[1][0].imag;
+      b1r=b->e[1][j].real; b1i=b->e[1][j].imag;
+      a2r=a->e[2][0].real; a2i=a->e[2][0].imag;
+      b2r=b->e[2][j].real; b2i=b->e[2][j].imag;
+
+      c->e[0][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i;
+      c->e[0][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r;
+
+      a0r=a->e[0][1].real; a0i=a->e[0][1].imag;
+      b0r=b->e[0][j].real; b0i=b->e[0][j].imag;
+      a1r=a->e[1][1].real; a1i=a->e[1][1].imag;
+      b1r=b->e[1][j].real; b1i=b->e[1][j].imag;
+      a2r=a->e[2][1].real; a2i=a->e[2][1].imag;
+      b2r=b->e[2][j].real; b2i=b->e[2][j].imag;
+
+      c->e[1][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i;
+      c->e[1][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r;
+
+      a0r=a->e[0][2].real; a0i=a->e[0][2].imag;
+      b0r=b->e[0][j].real; b0i=b->e[0][j].imag;
+      a1r=a->e[1][2].real; a1i=a->e[1][2].imag;
+      b1r=b->e[1][j].real; b1i=b->e[1][j].imag;
+      a2r=a->e[2][2].real; a2i=a->e[2][2].imag;
+      b2r=b->e[2][j].real; b2i=b->e[2][j].imag;
+
+      c->e[2][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i;
+      c->e[2][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r;
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target vect_hw_misalign } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5a24fb8bf4c..85209604486 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3005,10 +3005,9 @@ start_over:
   ok = vect_analyze_loop_operations (loop_vinfo);
   if (!ok)
     {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad operation or unsupported loop bound.\n");
-      return ok;
+      ok = opt_result::failure_at (vect_location,
+                                  "bad operation or unsupported loop bound\n");
+      goto again;
     }
 
   /* For now, we don't expect to mix both masking and length approaches for one
-- 
2.43.0

Reply via email to