Hi, this restricts tree-ifcvt to creating COND_OPs only when we have versioned the loop for vectorization. Apart from that, it re-creates a VEC_COND_EXPR in vect_expand_fold_left if we emitted a COND_OP.
I'm still missing the "bail out" part for vect_expand_fold_left, though? Bootstrap and testsuites are unchanged, but so they were for the original patch, so... :) Regards Robin gcc/ChangeLog: PR tree-optimization/112361 PR target/112359 PR middle-end/112406 * tree-if-conv.cc (convert_scalar_cond_reduction): Remember if loop was versioned and only then create COND_OPs. (predicate_scalar_phi): Do not create COND_OP when not vectorizing. * tree-vect-loop.cc (vect_expand_fold_left): Re-create VEC_COND_EXPR. (vectorize_fold_left_reduction): Pass mask and whether a cond_op was used to vect_expand_fold_left. gcc/testsuite/ChangeLog: * gcc.dg/pr112359.c: New test. --- gcc/testsuite/gcc.dg/pr112359.c | 14 +++++++++++ gcc/tree-if-conv.cc | 41 ++++++++++++++++++++++----------- gcc/tree-vect-loop.cc | 22 ++++++++++++++++-- 3 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr112359.c diff --git a/gcc/testsuite/gcc.dg/pr112359.c b/gcc/testsuite/gcc.dg/pr112359.c new file mode 100644 index 00000000000..26c77457399 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr112359.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mavx512fp16 -ftree-loop-if-convert" } */ + +int i, c; +unsigned long long u; + +void +foo (void) +{ + for (; i; i++) + if (c) + u |= i; +} + diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index a7a751b668c..0190cf2369e 100644 --- a/gcc/tree-if-conv.cc +++ b/gcc/tree-if-conv.cc @@ -1845,12 +1845,15 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1, res_2 = res_13 + _ifc__1; Argument SWAP tells that arguments of conditional expression should be swapped. + If LOOP_VERSIONED is true we assume that we versioned the loop for + vectorization. In that case we can create a COND_OP. Returns rhs of resulting PHI assignment. 
*/ static tree convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi, tree cond, tree op0, tree op1, bool swap, - bool has_nop, gimple* nop_reduc) + bool has_nop, gimple* nop_reduc, + bool loop_versioned) { gimple_stmt_iterator stmt_it; gimple *new_assign; @@ -1874,7 +1877,7 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi, The COND_OP will have a neutral_op else value. */ internal_fn ifn; ifn = get_conditional_internal_fn (reduction_op); - if (ifn != IFN_LAST + if (loop_versioned && ifn != IFN_LAST && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs)) && !swap) { @@ -2129,11 +2132,13 @@ cmp_arg_entry (const void *p1, const void *p2, void * /* data. */) The generated code is inserted at GSI that points to the top of basic block's statement list. If PHI node has more than two arguments a chain of conditional - expression is produced. */ + expression is produced. + LOOP_VERSIONED should be true if we know that the loop was versioned for + vectorization. */ static void -predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) +predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi, bool loop_versioned) { gimple *new_stmt = NULL, *reduc, *nop_reduc; tree rhs, res, arg0, arg1, op0, op1, scev; @@ -2213,7 +2218,8 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) /* Convert reduction stmt into vectorizable form. */ rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, true_bb != gimple_bb (reduc), - has_nop, nop_reduc); + has_nop, nop_reduc, + loop_versioned); redundant_ssa_names.safe_push (std::make_pair (res, rhs)); } else @@ -2311,7 +2317,8 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) { /* Convert reduction stmt into vectorizable form. 
*/ rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, - swap, has_nop, nop_reduc); + swap, has_nop, nop_reduc, + loop_versioned); redundant_ssa_names.safe_push (std::make_pair (res, rhs)); } new_stmt = gimple_build_assign (res, rhs); @@ -2334,10 +2341,12 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) } /* Replaces in LOOP all the scalar phi nodes other than those in the - LOOP->header block with conditional modify expressions. */ + LOOP->header block with conditional modify expressions. + LOOP_VERSIONED should be true if we know that the loop was versioned for + vectorization. */ static void -predicate_all_scalar_phis (class loop *loop) +predicate_all_scalar_phis (class loop *loop, bool loop_versioned) { basic_block bb; unsigned int orig_loop_num_nodes = loop->num_nodes; @@ -2365,7 +2374,7 @@ predicate_all_scalar_phis (class loop *loop) gsi_next (&phi_gsi); else { - predicate_scalar_phi (phi, &gsi); + predicate_scalar_phi (phi, &gsi, loop_versioned); remove_phi_node (&phi_gsi, false); } } @@ -2887,10 +2896,12 @@ remove_conditions_and_labels (loop_p loop) } /* Combine all the basic blocks from LOOP into one or two super basic - blocks. Replace PHI nodes with conditional modify expressions. */ + blocks. Replace PHI nodes with conditional modify expressions. + LOOP_VERSIONED should be true if we know that the loop was versioned for + vectorization. 
*/ static void -combine_blocks (class loop *loop) +combine_blocks (class loop *loop, bool loop_versioned) { basic_block bb, exit_bb, merge_target_bb; unsigned int orig_loop_num_nodes = loop->num_nodes; @@ -2900,7 +2911,7 @@ combine_blocks (class loop *loop) remove_conditions_and_labels (loop); insert_gimplified_predicates (loop); - predicate_all_scalar_phis (loop); + predicate_all_scalar_phis (loop, loop_versioned); if (need_to_predicate || need_to_rewrite_undefined) predicate_statements (loop); @@ -3727,6 +3738,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds) bitmap exit_bbs; edge pe; auto_vec<data_reference_p, 10> refs; + bool loop_versioned; again: rloop = NULL; @@ -3736,6 +3748,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds) need_to_predicate = false; need_to_rewrite_undefined = false; any_complicated_phi = false; + loop_versioned = false; /* Apply more aggressive if-conversion when loop or its outer loop were marked with simd pragma. When that's the case, we try to if-convert @@ -3843,6 +3856,8 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds) will re-use that for things like runtime alias versioning whose condition can end up using those invariants. */ pe = single_pred_edge (gimple_bb (preds->last ())); + + loop_versioned = true; } if (need_to_lower_bitfields) @@ -3875,7 +3890,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds) /* Now all statements are if-convertible. Combine all the basic blocks into one huge basic block doing the if-conversion on-the-fly. */ - combine_blocks (loop); + combine_blocks (loop, loop_versioned); } /* Perform local CSE, this esp. 
helps the vectorizer analysis if loads diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 362856a6507..d48685ae308 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -6907,7 +6907,8 @@ merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, static tree vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, - tree_code code, tree lhs, tree vector_rhs) + tree_code code, tree lhs, tree vector_rhs, + bool is_cond_op, tree mask) { tree vectype = TREE_TYPE (vector_rhs); tree scalar_type = TREE_TYPE (vectype); @@ -6915,6 +6916,22 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); + /* If we created a COND_OP in ifcvt re-create a VEC_COND_EXPR to mask the + input here in order to be able to perform an unconditional element-wise + reduction of it. */ + if (is_cond_op) + { + tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL, + "masked_vector_rhs"); + tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE, + false); + tree vector_identity = build_vector_from_val (vectype, neutral_op); + gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR, + mask, vector_rhs, vector_identity); + gsi_insert_before (gsi, select, GSI_SAME_STMT); + vector_rhs = masked_vector_rhs; + } + for (unsigned HOST_WIDE_INT bit_offset = 0; bit_offset < vec_size_in_bits; bit_offset += element_bitsize) @@ -7151,7 +7168,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo, else { reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, - tree_code (code), reduc_var, def0); + tree_code (code), reduc_var, def0, + is_cond_op, mask); new_stmt = SSA_NAME_DEF_STMT (reduc_var); /* Remove the statement, so that we can use the same code paths as for statements that we've just created. */ -- 2.41.0