https://gcc.gnu.org/g:5aca8510abea6c3fac3336a7445863db07fd4a06
commit r16-1673-g5aca8510abea6c3fac3336a7445863db07fd4a06 Author: Richard Biener <rguent...@suse.de> Date: Wed Jun 25 10:36:59 2025 +0200 tree-optimization/109892 - SLP reduction of fma The following adds the ability to vectorize a fma reduction pair as SLP reduction (we cannot yet handle ternary association in reduction vectorization). PR tree-optimization/109892 * tree-vect-loop.cc (check_reduction_path): Handle fma. (vectorizable_reduction): Apply FOLD_LEFT_REDUCTION code generation constraints. * gcc.dg/vect/vect-reduc-fma-1.c: New testcase. * gcc.dg/vect/vect-reduc-fma-2.c: Likewise. * gcc.dg/vect/vect-reduc-fma-3.c: Likewise. Diff: --- gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c | 16 ++++++++++++++++ gcc/tree-vect-loop.cc | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c new file mode 100644 index 000000000000..e958b43e23b6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +double f(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = __builtin_fma(x[0], x[0], r0); + r1 = __builtin_fma(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. 
*/ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c new file mode 100644 index 000000000000..ea1ca9720e5a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffp-contract=on" } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +static double muladd(double x, double y, double z) +{ + return x * y + z; +} +double g(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = muladd(x[0], x[0], r0); + r1 = muladd(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c new file mode 100644 index 000000000000..10cecedd8e5f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffast-math" } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +double f(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = __builtin_fma(x[0], x[0], r0); + r1 = __builtin_fma(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction, higher VF possible. */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 9ee8e50ee75a..5b6769af31c3 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -4126,6 +4126,10 @@ pop: if (op.ops[2] == op.ops[opi]) neg = ! 
neg; } + /* For an FMA the reduction code is the PLUS if the addition chain + is the reduction. */ + else if (op.code == IFN_FMA && opi == 2) + op.code = PLUS_EXPR; if (CONVERT_EXPR_CODE_P (op.code) && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) ; @@ -8070,6 +8074,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "in-order reduction chain without SLP.\n"); return false; } + /* Code generation doesn't support function calls other + than .COND_*. */ + if (!op.code.is_tree_code () + && !(op.code.is_internal_fn () + && conditional_internal_fn_code (internal_fn (op.code)) + != ERROR_MARK)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order reduction chain operation not " + "supported.\n"); + return false; + } STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type = FOLD_LEFT_REDUCTION; }