[gcc r16-1673] tree-optimization/109892 - SLP reduction of fma

Richard Biener via Gcc-cvs Wed, 25 Jun 2025 06:02:37 -0700

https://gcc.gnu.org/g:5aca8510abea6c3fac3336a7445863db07fd4a06


commit r16-1673-g5aca8510abea6c3fac3336a7445863db07fd4a06
Author: Richard Biener <rguent...@suse.de>
Date:   Wed Jun 25 10:36:59 2025 +0200

    tree-optimization/109892 - SLP reduction of fma
    
    The following adds the ability to vectorize a fma reduction pair
    as SLP reduction (we cannot yet handle ternary association in
    reduction vectorization).
    
            PR tree-optimization/109892
            * tree-vect-loop.cc (check_reduction_path): Handle fma.
            (vectorizable_reduction): Apply FOLD_LEFT_REDUCTION code
            generation constraints.
    
            * gcc.dg/vect/vect-reduc-fma-1.c: New testcase.
            * gcc.dg/vect/vect-reduc-fma-2.c: Likewise.
            * gcc.dg/vect/vect-reduc-fma-3.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c | 16 ++++++++++++++++
 gcc/tree-vect-loop.cc                        | 17 +++++++++++++++++
 4 files changed, 68 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
new file mode 100644
index 000000000000..e958b43e23b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+double f(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = __builtin_fma(x[0], x[0], r0);
+        r1 = __builtin_fma(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
new file mode 100644
index 000000000000..ea1ca9720e5a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffp-contract=on" } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+static double muladd(double x, double y, double z)
+{
+    return x * y + z;
+}
+double g(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = muladd(x[0], x[0], r0);
+        r1 = muladd(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c
new file mode 100644
index 000000000000..10cecedd8e5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffast-math" } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+double f(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = __builtin_fma(x[0], x[0], r0);
+        r1 = __builtin_fma(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction, higher VF possible.  */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9ee8e50ee75a..5b6769af31c3 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4126,6 +4126,10 @@ pop:
          if (op.ops[2] == op.ops[opi])
            neg = ! neg;
        }
+      /* For an FMA the reduction code is the PLUS if the addition chain
+        is the reduction.  */
+      else if (op.code == IFN_FMA && opi == 2)
+       op.code = PLUS_EXPR;
       if (CONVERT_EXPR_CODE_P (op.code)
          && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
        ;
@@ -8070,6 +8074,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
                                 "in-order reduction chain without SLP.\n");
              return false;
            }
+         /* Code generation doesn't support function calls other
+            than .COND_*.  */
+         if (!op.code.is_tree_code ()
+             && !(op.code.is_internal_fn ()
+                  && conditional_internal_fn_code (internal_fn (op.code))
+                       != ERROR_MARK))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "in-order reduction chain operation not "
+                                "supported.\n");
+             return false;
+           }
          STMT_VINFO_REDUC_TYPE (reduc_info)
            = reduction_type = FOLD_LEFT_REDUCTION;
        }

[gcc r16-1673] tree-optimization/109892 - SLP reduction of fma

Reply via email to