The following adds the ability to vectorize an FMA reduction pair as an SLP reduction (we cannot yet handle ternary association in reduction vectorization).
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. I'll file a bug about the missed handling for fold-left reductions. PR tree-optimization/109892 * tree-vect-loop.cc (check_reduction_path): Handle fma. (vectorizable_reduction): Apply FOLD_LEFT_REDUCTION code generation constraints. * gcc.dg/vect/vect-reduc-fma-1.c: New testcase. * gcc.dg/vect/vect-reduc-fma-2.c: Likewise. * gcc.dg/vect/vect-reduc-fma-3.c: Likewise. --- gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c | 16 ++++++++++++++++ gcc/tree-vect-loop.cc | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c new file mode 100644 index 00000000000..e958b43e23b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +double f(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = __builtin_fma(x[0], x[0], r0); + r1 = __builtin_fma(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. 
*/ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c new file mode 100644 index 00000000000..ea1ca9720e5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffp-contract=on" } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +static double muladd(double x, double y, double z) +{ + return x * y + z; +} +double g(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = muladd(x[0], x[0], r0); + r1 = muladd(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c new file mode 100644 index 00000000000..10cecedd8e5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffast-math" } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +double f(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = __builtin_fma(x[0], x[0], r0); + r1 = __builtin_fma(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction, higher VF possible. */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index a3f95433a5b..9a4b89e9113 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -4139,6 +4139,10 @@ pop: if (op.ops[2] == op.ops[opi]) neg = ! 
neg; } + /* For an FMA the reduction code is the PLUS if the addition chain + is the reduction. */ + else if (op.code == IFN_FMA && opi == 2) + op.code = PLUS_EXPR; if (CONVERT_EXPR_CODE_P (op.code) && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))) ; @@ -8084,6 +8088,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "in-order reduction chain without SLP.\n"); return false; } + /* Code generation doesn't support function calls other + than .COND_*. */ + if (!op.code.is_tree_code () + && !(op.code.is_internal_fn () + && conditional_internal_fn_code (internal_fn (op.code)) + != ERROR_MARK)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order reduction chain operation not " + "supported.\n"); + return false; + } STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type = FOLD_LEFT_REDUCTION; } -- 2.43.0