The following adds the ability to vectorize an FMA reduction pair
as an SLP reduction (we cannot yet handle ternary association in
reduction vectorization).

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

I'll file a bug about the missed handling for fold-left reductions.

        PR tree-optimization/109892
        * tree-vect-loop.cc (check_reduction_path): Handle fma.
        (vectorizable_reduction): Apply FOLD_LEFT_REDUCTION code
        generation constraints.

        * gcc.dg/vect/vect-reduc-fma-1.c: New testcase.
        * gcc.dg/vect/vect-reduc-fma-2.c: Likewise.
        * gcc.dg/vect/vect-reduc-fma-3.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c | 16 ++++++++++++++++
 gcc/tree-vect-loop.cc                        | 17 +++++++++++++++++
 4 files changed, 68 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
new file mode 100644
index 00000000000..e958b43e23b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+double f(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = __builtin_fma(x[0], x[0], r0);
+        r1 = __builtin_fma(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
new file mode 100644
index 00000000000..ea1ca9720e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffp-contract=on" } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+static double muladd(double x, double y, double z)
+{
+    return x * y + z;
+}
+double g(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = muladd(x[0], x[0], r0);
+        r1 = muladd(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c
new file mode 100644
index 00000000000..10cecedd8e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffast-math" } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+double f(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = __builtin_fma(x[0], x[0], r0);
+        r1 = __builtin_fma(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction, higher VF possible.  */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a3f95433a5b..9a4b89e9113 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4139,6 +4139,10 @@ pop:
          if (op.ops[2] == op.ops[opi])
            neg = ! neg;
        }
+      /* For an FMA the reduction code is the PLUS if the addition chain
+        is the reduction.  */
+      else if (op.code == IFN_FMA && opi == 2)
+       op.code = PLUS_EXPR;
       if (CONVERT_EXPR_CODE_P (op.code)
          && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
        ;
@@ -8084,6 +8088,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
                                 "in-order reduction chain without SLP.\n");
              return false;
            }
+         /* Code generation doesn't support function calls other
+            than .COND_*.  */
+         if (!op.code.is_tree_code ()
+             && !(op.code.is_internal_fn ()
+                  && conditional_internal_fn_code (internal_fn (op.code))
+                       != ERROR_MARK))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "in-order reduction chain operation not "
+                                "supported.\n");
+             return false;
+           }
          STMT_VINFO_REDUC_TYPE (reduc_info)
            = reduction_type = FOLD_LEFT_REDUCTION;
        }
-- 
2.43.0

Reply via email to