Robin Dapp <[email protected]> writes:
> Changed as suggested.  The difference from v5 is:
>
> + if (cond_fn_p)
> + {
> + gcall *call = dyn_cast<gcall *> (use_stmt);
> + unsigned else_pos
> + = internal_fn_else_index (internal_fn (op.code));
> +
> + for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
> + {
> + if (j == else_pos)
> + continue;
> + if (gimple_call_arg (call, j) == op.ops[opi])
> + cnt++;
> + }
> + }
> + else if (!is_gimple_debug (op_use_stmt)
>
> as well as internal_fn_else_index.
>
> The testsuite on riscv is unchanged; bootstrap and testsuite on power10 are done,
> aarch64 and x86 are still running.
>
> Regards
> Robin
>
> From e11ac2b5889558c58ce711d8119ebcd78173ac6c Mon Sep 17 00:00:00 2001
> From: Robin Dapp <[email protected]>
> Date: Wed, 13 Sep 2023 22:19:35 +0200
> Subject: [PATCH v6] ifcvt/vect: Emit COND_OP for conditional scalar reduction.
>
> As described in PR111401, we currently emit a COND and a PLUS expression
> for conditional reductions. This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD/COND_OP during
> ifcvt and adjusting some vectorizer code to handle it.
>
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
>
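Just to make sure I'm reading the if-conversion change right (illustrative
gimple only; the SSA names are made up, the .COND_ADD operand order is the
one used in the patch below):

  /* Loop body for:  if (cond[i]) res += a[i];  */

  /* Before the patch ifcvt emits a COND_EXPR plus a separate PLUS_EXPR:  */
  _ifc_1 = _cond ? a_i : 0.0;
  res_2 = res_1 + _ifc_1;

  /* With the patch a single conditional call is emitted instead, reusing
     the reduction value as the else operand:  */
  res_2 = .COND_ADD (_cond, res_1, a_i, res_1);
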
> gcc/ChangeLog:
>
> PR middle-end/111401
> * internal-fn.cc (internal_fn_else_index): New function.
> * internal-fn.h (internal_fn_else_index): Define.
> * tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
> if supported.
> (predicate_scalar_phi): Add whitespace.
> * tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
> (neutral_op_for_reduction): Return -0 for PLUS.
> (check_reduction_path): Don't count else operand in COND_OP.
> (vect_is_simple_reduction): Ditto.
> (vect_create_epilog_for_reduction): Fix whitespace.
> (vectorize_fold_left_reduction): Add COND_OP handling.
> (vectorizable_reduction): Don't count else operand in COND_OP.
> (vect_transform_reduction): Add COND_OP handling.
> * tree-vectorizer.h (neutral_op_for_reduction): Add default
> parameter.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> * gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
>
> ---
> gcc/internal-fn.cc | 58 ++++++
> gcc/internal-fn.h | 1 +
> .../vect-cond-reduc-in-order-2-signed-zero.c | 141 +++++++++++++
> .../riscv/rvv/autovec/cond/pr111401.c | 139 +++++++++++++
> .../riscv/rvv/autovec/reduc/reduc_call-2.c | 4 +-
> .../riscv/rvv/autovec/reduc/reduc_call-4.c | 4 +-
> gcc/tree-if-conv.cc | 49 +++--
> gcc/tree-vect-loop.cc | 193 ++++++++++++++----
> gcc/tree-vectorizer.h | 2 +-
> 9 files changed, 536 insertions(+), 55 deletions(-)
> create mode 100644
> gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 61d5a9e4772..018175261b9 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4697,6 +4697,64 @@ internal_fn_len_index (internal_fn fn)
> }
> }
>
> +int
> +internal_fn_else_index (internal_fn fn)
The function needs a comment, maybe:

/* If FN is an IFN_COND_* or IFN_COND_LEN_* function, return the index of the
   argument that is used when the condition is false.  Return -1 otherwise.  */
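
Purely as an illustration of the mapping (my reading of the switch below,
nothing that needs to go into the patch):

  /* .COND_ADD (MASK, A, B, ELSE)     -> else operand at index 3.  */
  /* .COND_FMA (MASK, A, B, C, ELSE)  -> else operand at index 4.  */

so a caller can fetch the else value with
gimple_call_arg (call, internal_fn_else_index (ifn)).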

OK for the internal-fn* and tree-if-conv.cc bits (which were the
parts I commented on earlier).  I'll look at cleaning up the
definition of conditional internal functions separately, so that
the list of functions isn't necessary.

Thanks,
Richard
> +{
> + switch (fn)
> + {
> + case IFN_COND_NEG:
> + case IFN_COND_NOT:
> + case IFN_COND_LEN_NEG:
> + case IFN_COND_LEN_NOT:
> + return 2;
> +
> + case IFN_COND_ADD:
> + case IFN_COND_SUB:
> + case IFN_COND_MUL:
> + case IFN_COND_DIV:
> + case IFN_COND_MOD:
> + case IFN_COND_MIN:
> + case IFN_COND_MAX:
> + case IFN_COND_FMIN:
> + case IFN_COND_FMAX:
> + case IFN_COND_AND:
> + case IFN_COND_IOR:
> + case IFN_COND_XOR:
> + case IFN_COND_SHL:
> + case IFN_COND_SHR:
> + case IFN_COND_LEN_ADD:
> + case IFN_COND_LEN_SUB:
> + case IFN_COND_LEN_MUL:
> + case IFN_COND_LEN_DIV:
> + case IFN_COND_LEN_MOD:
> + case IFN_COND_LEN_MIN:
> + case IFN_COND_LEN_MAX:
> + case IFN_COND_LEN_FMIN:
> + case IFN_COND_LEN_FMAX:
> + case IFN_COND_LEN_AND:
> + case IFN_COND_LEN_IOR:
> + case IFN_COND_LEN_XOR:
> + case IFN_COND_LEN_SHL:
> + case IFN_COND_LEN_SHR:
> + return 3;
> +
> + case IFN_COND_FMA:
> + case IFN_COND_FMS:
> + case IFN_COND_FNMA:
> + case IFN_COND_FNMS:
> + case IFN_COND_LEN_FMA:
> + case IFN_COND_LEN_FMS:
> + case IFN_COND_LEN_FNMA:
> + case IFN_COND_LEN_FNMS:
> + return 4;
> +
> + default:
> + return -1;
> + }
> +
> + return -1;
> +}
> +
> /* If FN takes a vector mask argument, return the index of that argument,
> otherwise return -1. */
>
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..7d72f4db2d0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -237,6 +237,7 @@ extern bool internal_store_fn_p (internal_fn);
> extern bool internal_gather_scatter_fn_p (internal_fn);
> extern int internal_fn_mask_index (internal_fn);
> extern int internal_fn_len_index (internal_fn);
> +extern int internal_fn_else_index (internal_fn);
> extern int internal_fn_stored_value_index (internal_fn);
> extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> tree, tree, int);
> diff --git
> a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction. */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> + double res = init;
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + res += a[i];
> + return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> + double res = init;
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + res += a[i];
> + return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> + double res = init;
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + res -= a[i];
> + return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> + double res = init;
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + res -= a[i];
> + return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> + int n = 19;
> + double a[N];
> + int cond1[N], cond2[N];
> +
> + for (int i = 0; i < N; i++)
> + {
> + a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> + cond1[i] = 0;
> + cond2[i] = i & 4 ? 1 : 0;
> + asm volatile ("" ::: "memory");
> + }
> +
> + double res1 = reduc_plus_double (a, -0.0, cond1, n);
> + double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> + double res2 = reduc_minus_double (a, -0.0, cond1, n);
> + double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> + double res3 = reduc_plus_double (a, -0.0, cond1, n);
> + double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> + double res4 = reduc_minus_double (a, -0.0, cond1, n);
> + double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> + if (res1 != ref1 || signbit (res1) != signbit (ref1))
> + __builtin_abort ();
> + if (res2 != ref2 || signbit (res2) != signbit (ref2))
> + __builtin_abort ();
> + if (res3 != ref3 || signbit (res3) != signbit (ref3))
> + __builtin_abort ();
> + if (res4 != ref4 || signbit (res4) != signbit (ref4))
> + __builtin_abort ();
> +
> + res1 = reduc_plus_double (a, 0.0, cond1, n);
> + ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> + res2 = reduc_minus_double (a, 0.0, cond1, n);
> + ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> + res3 = reduc_plus_double (a, 0.0, cond1, n);
> + ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> + res4 = reduc_minus_double (a, 0.0, cond1, n);
> + ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> + if (res1 != ref1 || signbit (res1) != signbit (ref1))
> + __builtin_abort ();
> + if (res2 != ref2 || signbit (res2) != signbit (ref2))
> + __builtin_abort ();
> + if (res3 != ref3 || signbit (res3) != signbit (ref3))
> + __builtin_abort ();
> + if (res4 != ref4 || signbit (res4) != signbit (ref4))
> + __builtin_abort ();
> +
> + res1 = reduc_plus_double (a, -0.0, cond2, n);
> + ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> + res2 = reduc_minus_double (a, -0.0, cond2, n);
> + ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> + res3 = reduc_plus_double (a, -0.0, cond2, n);
> + ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> + res4 = reduc_minus_double (a, -0.0, cond2, n);
> + ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> + if (res1 != ref1 || signbit (res1) != signbit (ref1))
> + __builtin_abort ();
> + if (res2 != ref2 || signbit (res2) != signbit (ref2))
> + __builtin_abort ();
> + if (res3 != ref3 || signbit (res3) != signbit (ref3))
> + __builtin_abort ();
> + if (res4 != ref4 || signbit (res4) != signbit (ref4))
> + __builtin_abort ();
> +
> + res1 = reduc_plus_double (a, 0.0, cond2, n);
> + ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> + res2 = reduc_minus_double (a, 0.0, cond2, n);
> + ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> + res3 = reduc_plus_double (a, 0.0, cond2, n);
> + ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> + res4 = reduc_minus_double (a, 0.0, cond2, n);
> + ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> + if (res1 != ref1 || signbit (res1) != signbit (ref1))
> + __builtin_abort ();
> + if (res2 != ref2 || signbit (res2) != signbit (ref2))
> + __builtin_abort ();
> + if (res3 != ref3 || signbit (res3) != signbit (ref3))
> + __builtin_abort ();
> + if (res4 != ref4 || signbit (res4) != signbit (ref4))
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..83dbd61b3f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init += a[i];
> + return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init -= a[i];
> + return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init *= a[i];
> + return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init &= a[i];
> + return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init |= a[i];
> + return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> + for (int i = 0; i < n; i++)
> + if (cond[i])
> + init ^= a[i];
> + return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> + double res1 = 0, res2 = 0, res3 = 0;
> + double a1[SZ], a2[SZ], a3[SZ];
> + int c1[SZ], c2[SZ], c3[SZ];
> +
> + int a4[SZ], a5[SZ], a6[SZ];
> + int res4 = 0, res5 = 0, res6 = 0;
> + int c4[SZ], c5[SZ], c6[SZ];
> +
> + for (int i = 0; i < SZ; i++)
> + {
> + a1[i] = i * 3 + (i & 4) - (i & 7);
> + a2[i] = i * 3 + (i & 4) - (i & 7);
> + a3[i] = i * 0.05 + (i & 4) - (i & 7);
> + a4[i] = i * 3 + (i & 4) - (i & 7);
> + a5[i] = i * 3 + (i & 4) - (i & 7);
> + a6[i] = i * 3 + (i & 4) - (i & 7);
> + c1[i] = i & 1;
> + c2[i] = i & 2;
> + c3[i] = i & 3;
> + c4[i] = i & 4;
> + c5[i] = i & 5;
> + c6[i] = i & 6;
> + __asm__ volatile ("" : : : "memory");
> + }
> +
> + double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> + double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> + int init4 = 87, init5 = 11, init6 = -123894344;
> + int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> + for (int i = 0; i < SZ; i++)
> + {
> + if (c1[i])
> + ref1 += a1[i];
> + if (c2[i])
> + ref2 -= a2[i];
> + if (c3[i])
> + ref3 *= a3[i];
> + if (c4[i])
> + ref4 &= a4[i];
> + if (c5[i])
> + ref5 |= a5[i];
> + if (c6[i])
> + ref6 ^= a6[i];
> + }
> +
> + res1 = foo2 (a1, init1, c1, SZ);
> + res2 = foo3 (a2, init2, c2, SZ);
> + res3 = foo4 (a3, init3, c3, SZ);
> + res4 = foo5 (a4, init4, c4, SZ);
> + res5 = foo6 (a5, init5, c5, SZ);
> + res6 = foo7 (a6, init6, c6, SZ);
> +
> + if (res1 != ref1)
> + __builtin_abort ();
> + if (res2 != ref2)
> + __builtin_abort ();
> + if (res3 != ref3)
> + __builtin_abort ();
> + if (res4 != ref4)
> + __builtin_abort ();
> + if (res5 != ref5)
> + __builtin_abort ();
> + if (res6 != ref6)
> + __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> index cc07a047cd5..7be22d60bf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> @@ -3,4 +3,6 @@
>
> #include "reduc_call-1.c"
>
> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> index 6d00c404d2a..83beabeff97 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> @@ -3,4 +3,6 @@
>
> #include "reduc_call-1.c"
>
> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 262765139ff..4fea1000911 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc,
> gimple_stmt_iterator *gsi,
> gimple *new_assign;
> tree rhs;
> tree rhs1 = gimple_assign_rhs1 (reduc);
> + tree lhs = gimple_assign_lhs (reduc);
> tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
> tree c;
> enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
> - tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1),
> reduction_op, NULL);
> + tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1),
> reduction_op,
> + NULL, false);
> gimple_seq stmts = NULL;
>
> if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc,
> gimple_stmt_iterator *gsi,
> print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
> }
>
> - /* Build cond expression using COND and constant operand
> - of reduction rhs. */
> - c = fold_build_cond_expr (TREE_TYPE (rhs1),
> - unshare_expr (cond),
> - swap ? op_nochange : op1,
> - swap ? op1 : op_nochange);
> -
> - /* Create assignment stmt and insert it at GSI. */
> - new_assign = gimple_build_assign (tmp, c);
> - gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> - /* Build rhs for unconditional increment/decrement/logic_operation. */
> - rhs = gimple_build (&stmts, reduction_op,
> - TREE_TYPE (rhs1), op0, tmp);
> + /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> + The COND_OP will have a neutral_op else value. */
> + internal_fn ifn;
> + ifn = get_conditional_internal_fn (reduction_op);
> + if (ifn != IFN_LAST
> + && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> + && !swap)
> + {
> + gcall *cond_call = gimple_build_call_internal (ifn, 4,
> + unshare_expr (cond),
> + op0, op1, op0);
> + gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> + gimple_call_set_lhs (cond_call, tmp);
> + rhs = tmp;
> + }
> + else
> + {
> + /* Build cond expression using COND and constant operand
> + of reduction rhs. */
> + c = fold_build_cond_expr (TREE_TYPE (rhs1),
> + unshare_expr (cond),
> + swap ? op_nochange : op1,
> + swap ? op1 : op_nochange);
> + /* Create assignment stmt and insert it at GSI. */
> + new_assign = gimple_build_assign (tmp, c);
> + gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> + /* Build rhs for unconditional increment/decrement/logic_operation. */
> + rhs = gimple_build (&stmts, reduction_op,
> + TREE_TYPE (rhs1), op0, tmp);
> + }
>
> if (has_nop)
> {
> @@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator
> *gsi)
> {
> /* Convert reduction stmt into vectorizable form. */
> rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> - swap,has_nop, nop_reduc);
> + swap, has_nop, nop_reduc);
> redundant_ssa_names.safe_push (std::make_pair (res, rhs));
> }
> new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 40f167d2795..3b28c826b3b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared
> *shared)
> static bool
> fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
> {
> - if (code == PLUS_EXPR)
> + /* We support MINUS_EXPR by negating the operand. This also preserves an
> + initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> + (-0.0) = -0.0. */
> + if (code == PLUS_EXPR || code == MINUS_EXPR)
> {
> *reduc_fn = IFN_FOLD_LEFT_PLUS;
> return true;
> @@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code,
> internal_fn *reduc_fn)
> by the introduction of additional X elements, return that X, otherwise
> return null. CODE is the code of the reduction and SCALAR_TYPE is type
> of the scalar elements. If the reduction has just a single initial value
> - then INITIAL_VALUE is that value, otherwise it is null. */
> + then INITIAL_VALUE is that value, otherwise it is null.
> + If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> + In that case no signed zero is returned. */
>
> tree
> neutral_op_for_reduction (tree scalar_type, code_helper code,
> - tree initial_value)
> + tree initial_value, bool as_initial)
> {
> if (code.is_tree_code ())
> switch (tree_code (code))
> {
> - case WIDEN_SUM_EXPR:
> case DOT_PROD_EXPR:
> case SAD_EXPR:
> - case PLUS_EXPR:
> case MINUS_EXPR:
> case BIT_IOR_EXPR:
> case BIT_XOR_EXPR:
> return build_zero_cst (scalar_type);
> + case WIDEN_SUM_EXPR:
> + case PLUS_EXPR:
> + if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> + return build_real (scalar_type, dconstm0);
> + else
> + return build_zero_cst (scalar_type);
>
> case MULT_EXPR:
> return build_one_cst (scalar_type);
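
(For the record, the signed-zero reasoning as I understand it: when the mask
is false for every element the neutral value still gets folded into a -0.0
accumulator, and under the default rounding mode -0.0 + (-0.0) == -0.0 while
-0.0 + 0.0 == +0.0, so dconstm0 rather than build_zero_cst is what we want
when HONOR_SIGNED_ZEROS and !as_initial.)
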
> @@ -4079,12 +4088,37 @@ pop:
> use_operand_p use_p;
> gimple *op_use_stmt;
> unsigned cnt = 0;
> + bool cond_fn_p = op.code.is_internal_fn ()
> + && (conditional_internal_fn_code (internal_fn (op.code))
> + != ERROR_MARK);
> +
> FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
> - if (!is_gimple_debug (op_use_stmt)
> - && (*code != ERROR_MARK
> - || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
> - FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> - cnt++;
> + {
> + /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> + op1 twice (once as definition, once as else) in the same operation.
> + Allow this. */
> + if (cond_fn_p)
> + {
> + gcall *call = dyn_cast<gcall *> (use_stmt);
> + unsigned else_pos
> + = internal_fn_else_index (internal_fn (op.code));
> +
> + for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
> + {
> + if (j == else_pos)
> + continue;
> + if (gimple_call_arg (call, j) == op.ops[opi])
> + cnt++;
> + }
> + }
> + else if (!is_gimple_debug (op_use_stmt)
> + && (*code != ERROR_MARK
> + || flow_bb_inside_loop_p (loop,
> + gimple_bb (op_use_stmt))))
> + FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> + cnt++;
> + }
> +
> if (cnt != 1)
> {
> fail = true;
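
(Small worked example, mostly for my own sanity: for
res_2 = .COND_ADD (_mask, res_1, _val, res_1) with op.ops[opi] == res_1, the
walk above sees res_1 once as argument 1 and again at else position 3;
skipping else_pos keeps cnt == 1, so the reduction path is still accepted.)
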
> @@ -4187,8 +4221,14 @@ vect_is_simple_reduction (loop_vec_info loop_info,
> stmt_vec_info phi_info,
> return NULL;
> }
>
> - nphi_def_loop_uses++;
> - phi_use_stmt = use_stmt;
> + /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> + op1 twice (once as definition, once as else) in the same operation.
> + Only count it as one. */
> + if (use_stmt != phi_use_stmt)
> + {
> + nphi_def_loop_uses++;
> + phi_use_stmt = use_stmt;
> + }
> }
>
> tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6122,7 +6162,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
> gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
> gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
> }
> -
> +
> scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
> scalar_type = TREE_TYPE (scalar_dest);
> scalar_results.truncate (0);
> @@ -6459,7 +6499,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
> if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
> initial_value = reduc_info->reduc_initial_values[0];
> neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> - initial_value);
> + initial_value, false);
> }
> if (neutral_op)
> vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6941,8 +6981,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> gimple_stmt_iterator *gsi,
> gimple **vec_stmt, slp_tree slp_node,
> gimple *reduc_def_stmt,
> - tree_code code, internal_fn reduc_fn,
> - tree ops[3], tree vectype_in,
> + code_helper code, internal_fn reduc_fn,
> + tree *ops, int num_ops, tree vectype_in,
> int reduc_index, vec_loop_masks *masks,
> vec_loop_lens *lens)
> {
> @@ -6958,17 +6998,48 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>
> gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
> gcc_assert (ncopies == 1);
> - gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> + bool is_cond_op = false;
> + if (!code.is_tree_code ())
> + {
> + code = conditional_internal_fn_code (internal_fn (code));
> + gcc_assert (code != ERROR_MARK);
> + is_cond_op = true;
> + }
> +
> + gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>
> if (slp_node)
> - gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> - TYPE_VECTOR_SUBPARTS (vectype_in)));
> + {
> + if (is_cond_op)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "fold-left reduction on SLP not supported.\n");
> + return false;
> + }
> +
> + gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> + TYPE_VECTOR_SUBPARTS (vectype_in)));
> + }
>
> - tree op0 = ops[1 - reduc_index];
> + /* The operands either come from a binary operation or an IFN_COND
> operation.
> + The former is a gimple assign with binary rhs and the latter is a
> + gimple call with four arguments. */
> + gcc_assert (num_ops == 2 || num_ops == 4);
> + tree op0, opmask;
> + if (!is_cond_op)
> + op0 = ops[1 - reduc_index];
> + else
> + {
> + op0 = ops[2];
> + opmask = ops[0];
> + gcc_assert (!slp_node);
> + }
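
(Again just as a reading aid, not a request for changes: for
res_2 = .COND_ADD (_mask, res_1, _val, res_1) we have num_ops == 4 and
ops = { _mask, res_1, _val, res_1 }, so op0 picks up the data operand _val
from ops[2] and opmask the mask from ops[0]; the plain gimple-assign case
keeps the existing ops[1 - reduc_index] selection.)
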
>
> int group_size = 1;
> stmt_vec_info scalar_dest_def_info;
> - auto_vec<tree> vec_oprnds0;
> + auto_vec<tree> vec_oprnds0, vec_opmask;
> if (slp_node)
> {
> auto_vec<vec<tree> > vec_defs (2);
> @@ -6984,9 +7055,15 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
> vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> op0, &vec_oprnds0);
> scalar_dest_def_info = stmt_info;
> +
> + /* For an IFN_COND_OP we also need the vector mask operand. */
> + if (is_cond_op)
> + vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> + opmask, &vec_opmask);
> }
>
> - tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> + gimple *sdef = scalar_dest_def_info->stmt;
> + tree scalar_dest = gimple_get_lhs (sdef);
> tree scalar_type = TREE_TYPE (scalar_dest);
> tree reduc_var = gimple_phi_result (reduc_def_stmt);
>
> @@ -7020,13 +7097,16 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
> tree bias = NULL_TREE;
> if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in,
> i);
> + else if (is_cond_op)
> + mask = vec_opmask[0];
> if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> {
> len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
> i, 1);
> signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> bias = build_int_cst (intQI_type_node, biasval);
> - mask = build_minus_one_cst (truth_type_for (vectype_in));
> + if (!is_cond_op)
> + mask = build_minus_one_cst (truth_type_for (vectype_in));
> }
>
> /* Handle MINUS by adding the negative. */
> @@ -7038,7 +7118,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> def0 = negated;
> }
>
> - if (mask && mask_reduc_fn == IFN_LAST)
> + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> + && mask && mask_reduc_fn == IFN_LAST)
> def0 = merge_with_identity (gsi, mask, vectype_out, def0,
> vector_identity);
>
> @@ -7069,8 +7150,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> }
> else
> {
> - reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> - reduc_var, def0);
> + reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> + tree_code (code), reduc_var, def0);
> new_stmt = SSA_NAME_DEF_STMT (reduc_var);
> /* Remove the statement, so that we can use the same code paths
> as for statements that we've just created. */
> @@ -7521,8 +7602,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> continue;
>
> + /* For an IFN_COND_OP we might hit the reduction definition operand
> + twice (once as definition, once as else). */
> + if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> + continue;
> +
> /* There should be only one cycle def in the stmt, the one
> - leading to reduc_def. */
> + leading to reduc_def. */
> if (VECTORIZABLE_CYCLE_DEF (dt))
> return false;
>
> @@ -7721,6 +7807,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> when generating the code inside the loop. */
>
> code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> + /* If conversion might have created a conditional operation like
> + IFN_COND_ADD already. Use the internal code for the following checks.
> */
> + if (orig_code.is_internal_fn ())
> + {
> + tree_code new_code = conditional_internal_fn_code (internal_fn
> (orig_code));
> + orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> + }
> +
> STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>
> vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7759,7 +7854,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "reduction: not commutative/associative");
> + "reduction: not commutative/associative\n");
> return false;
> }
> }
> @@ -8143,9 +8238,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> }
> else if (reduction_type == FOLD_LEFT_REDUCTION
> - && reduc_fn == IFN_LAST
> + && internal_fn_mask_index (reduc_fn) == -1
> && FLOAT_TYPE_P (vectype_in)
> - && HONOR_SIGNED_ZEROS (vectype_in)
> && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
> {
> if (dump_enabled_p ())
> @@ -8294,6 +8388,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>
> code_helper code = canonicalize_code (op.code, op.type);
> internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
> vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> vectype_in);
> @@ -8312,17 +8407,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> if (code == COND_EXPR)
> gcc_assert (ncopies == 1);
>
> + /* A binary COND_OP reduction must have the same definition and else
> + value. */
> + bool cond_fn_p = code.is_internal_fn ()
> + && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> + if (cond_fn_p)
> + {
> + gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> + || code == IFN_COND_MUL || code == IFN_COND_AND
> + || code == IFN_COND_IOR || code == IFN_COND_XOR);
> + gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> + }
> +
> bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>
> vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> if (reduction_type == FOLD_LEFT_REDUCTION)
> {
> internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> - gcc_assert (code.is_tree_code ());
> + gcc_assert (code.is_tree_code () || cond_fn_p);
> return vectorize_fold_left_reduction
> (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> - tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> - lens);
> + code, reduc_fn, op.ops, op.num_ops, vectype_in,
> + reduc_index, masks, lens);
> }
>
> bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8335,14 +8442,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>
> + /* Get NCOPIES vector definitions for all operands except the reduction
> + definition. */
> vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> single_defuse_cycle && reduc_index == 0
> ? NULL_TREE : op.ops[0], &vec_oprnds0,
> single_defuse_cycle && reduc_index == 1
> ? NULL_TREE : op.ops[1], &vec_oprnds1,
> - op.num_ops == 3
> - && !(single_defuse_cycle && reduc_index == 2)
> + op.num_ops == 4
> + || (op.num_ops == 3
> + && !(single_defuse_cycle && reduc_index == 2))
> ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> + /* For single def-use cycles get one copy of the vectorized reduction
> + definition. */
> if (single_defuse_cycle)
> {
> gcc_assert (!slp_node);
> @@ -8382,7 +8495,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> }
> else
> {
> - if (op.num_ops == 3)
> + if (op.num_ops >= 3)
> vop[2] = vec_oprnds2[i];
>
> if (masked_loop_p && mask_by_cond_expr)
> @@ -8395,10 +8508,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> if (emulated_mixed_dot_prod)
> new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
> vec_dest, vop);
> - else if (code.is_internal_fn ())
> +
> + else if (code.is_internal_fn () && !cond_fn_p)
> new_stmt = gimple_build_call_internal (internal_fn (code),
> op.num_ops,
> vop[0], vop[1], vop[2]);
> + else if (code.is_internal_fn () && cond_fn_p)
> + new_stmt = gimple_build_call_internal (internal_fn (code),
> + op.num_ops,
> + vop[0], vop[1], vop[2],
> + vop[1]);
> else
> new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
> vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index a4043e4a656..254d172231d 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref
> (vec_info *,
> tree);
>
> /* In tree-vect-loop.cc. */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
> extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info
> loop_vinfo);
> bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
> /* Used in tree-vect-loop-manip.cc */