On Tue, 7 Nov 2023, Robin Dapp wrote:

> Hi,
> 
> this restricts tree-ifcvt to only create COND_OPs when we have versioned
> the loop for vectorization.  Apart from that, it re-creates a VEC_COND_EXPR
> in vect_expand_fold_left if we emitted a COND_OP.
> 
> I'm still missing the "bail out" part for vect_expand_fold_left, though?
> 
> Bootstrap and testsuite results are unchanged, but they also were for
> the original patch, so... :)

Looks good, modulo one comment on vect_expand_fold_left below ...

> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/112361
>       PR target/112359
>       PR middle-end/112406
> 
>       * tree-if-conv.cc (convert_scalar_cond_reduction): Remember if
>       loop was versioned and only then create COND_OPs.
>       (predicate_scalar_phi): Do not create COND_OP when not
>       vectorizing.
>       * tree-vect-loop.cc (vect_expand_fold_left): Re-create
>       VEC_COND_EXPR.
>       (vectorize_fold_left_reduction): Pass mask and whether a cond_op
>       was used to vect_expand_fold_left.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/pr112359.c: New test.
> ---
>  gcc/testsuite/gcc.dg/pr112359.c | 14 +++++++++++
>  gcc/tree-if-conv.cc             | 41 ++++++++++++++++++++++-----------
>  gcc/tree-vect-loop.cc           | 22 ++++++++++++++++--
>  3 files changed, 62 insertions(+), 15 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr112359.c
> 
> diff --git a/gcc/testsuite/gcc.dg/pr112359.c b/gcc/testsuite/gcc.dg/pr112359.c
> new file mode 100644
> index 00000000000..26c77457399
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr112359.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -mavx512fp16 -ftree-loop-if-convert" } */
> +
> +int i, c;
> +unsigned long long u;
> +
> +void
> +foo (void)
> +{
> +  for (; i; i++)
> +    if (c)
> +      u |= i;
> +}
> +
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a7a751b668c..0190cf2369e 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1845,12 +1845,15 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
>      res_2 = res_13 + _ifc__1;
>    Argument SWAP tells that arguments of conditional expression should be
>    swapped.
> +  LOOP_VERSIONED is true if we versioned the loop for vectorization.
> +  In that case we can create a COND_OP.
>    Returns rhs of resulting PHI assignment.  */
>  
>  static tree
>  convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>                              tree cond, tree op0, tree op1, bool swap,
> -                            bool has_nop, gimple* nop_reduc)
> +                            bool has_nop, gimple* nop_reduc,
> +                            bool loop_versioned)
>  {
>    gimple_stmt_iterator stmt_it;
>    gimple *new_assign;
> @@ -1874,7 +1877,7 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>       The COND_OP will have a neutral_op else value.  */
>    internal_fn ifn;
>    ifn = get_conditional_internal_fn (reduction_op);
> -  if (ifn != IFN_LAST
> +  if (loop_versioned && ifn != IFN_LAST
>        && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
>        && !swap)
>      {
> @@ -2129,11 +2132,13 @@ cmp_arg_entry (const void *p1, const void *p2, void * /* data.  */)
>     The generated code is inserted at GSI that points to the top of
>     basic block's statement list.
>     If PHI node has more than two arguments a chain of conditional
> -   expression is produced.  */
> +   expression is produced.
> +   LOOP_VERSIONED should be true if we know that the loop was versioned for
> +   vectorization.  */
>  
>  
>  static void
> -predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
> +predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi, bool loop_versioned)
>  {
>    gimple *new_stmt = NULL, *reduc, *nop_reduc;
>    tree rhs, res, arg0, arg1, op0, op1, scev;
> @@ -2213,7 +2218,8 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>         /* Convert reduction stmt into vectorizable form.  */
>         rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
>                                              true_bb != gimple_bb (reduc),
> -                                            has_nop, nop_reduc);
> +                                            has_nop, nop_reduc,
> +                                            loop_versioned);
>         redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>       }
>        else
> @@ -2311,7 +2317,8 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>       {
>         /* Convert reduction stmt into vectorizable form.  */
>         rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -                                            swap, has_nop, nop_reduc);
> +                                            swap, has_nop, nop_reduc,
> +                                            loop_versioned);
>         redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>       }
>        new_stmt = gimple_build_assign (res, rhs);
> @@ -2334,10 +2341,12 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  }
>  
>  /* Replaces in LOOP all the scalar phi nodes other than those in the
> -   LOOP->header block with conditional modify expressions.  */
> +   LOOP->header block with conditional modify expressions.
> +   LOOP_VERSIONED should be true if we know that the loop was versioned for
> +   vectorization.  */
>  
>  static void
> -predicate_all_scalar_phis (class loop *loop)
> +predicate_all_scalar_phis (class loop *loop, bool loop_versioned)
>  {
>    basic_block bb;
>    unsigned int orig_loop_num_nodes = loop->num_nodes;
> @@ -2365,7 +2374,7 @@ predicate_all_scalar_phis (class loop *loop)
>           gsi_next (&phi_gsi);
>         else
>           {
> -           predicate_scalar_phi (phi, &gsi);
> +           predicate_scalar_phi (phi, &gsi, loop_versioned);
>             remove_phi_node (&phi_gsi, false);
>           }
>       }
> @@ -2887,10 +2896,12 @@ remove_conditions_and_labels (loop_p loop)
>  }
>  
>  /* Combine all the basic blocks from LOOP into one or two super basic
> -   blocks.  Replace PHI nodes with conditional modify expressions.  */
> +   blocks.  Replace PHI nodes with conditional modify expressions.
> +   LOOP_VERSIONED should be true if we know that the loop was versioned for
> +   vectorization.  */
>  
>  static void
> -combine_blocks (class loop *loop)
> +combine_blocks (class loop *loop, bool loop_versioned)
>  {
>    basic_block bb, exit_bb, merge_target_bb;
>    unsigned int orig_loop_num_nodes = loop->num_nodes;
> @@ -2900,7 +2911,7 @@ combine_blocks (class loop *loop)
>  
>    remove_conditions_and_labels (loop);
>    insert_gimplified_predicates (loop);
> -  predicate_all_scalar_phis (loop);
> +  predicate_all_scalar_phis (loop, loop_versioned);
>  
>    if (need_to_predicate || need_to_rewrite_undefined)
>      predicate_statements (loop);
> @@ -3727,6 +3738,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds)
>    bitmap exit_bbs;
>    edge pe;
>    auto_vec<data_reference_p, 10> refs;
> +  bool loop_versioned;
>  
>   again:
>    rloop = NULL;
> @@ -3736,6 +3748,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds)
>    need_to_predicate = false;
>    need_to_rewrite_undefined = false;
>    any_complicated_phi = false;
> +  loop_versioned = false;
>  
>    /* Apply more aggressive if-conversion when loop or its outer loop were
>       marked with simd pragma.  When that's the case, we try to if-convert
> @@ -3843,6 +3856,8 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds)
>          will re-use that for things like runtime alias versioning
>          whose condition can end up using those invariants.  */
>       pe = single_pred_edge (gimple_bb (preds->last ()));
> +
> +      loop_versioned = true;
>      }
>  
>    if (need_to_lower_bitfields)
> @@ -3875,7 +3890,7 @@ tree_if_conversion (class loop *loop, vec<gimple *> *preds)
>        /* Now all statements are if-convertible.  Combine all the basic
>        blocks into one huge basic block doing the if-conversion
>        on-the-fly.  */
> -      combine_blocks (loop);
> +      combine_blocks (loop, loop_versioned);
>      }
>  
>    /* Perform local CSE, this esp. helps the vectorizer analysis if loads
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 362856a6507..d48685ae308 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -6907,7 +6907,8 @@ merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
>  
>  static tree
>  vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
> -                    tree_code code, tree lhs, tree vector_rhs)
> +                    tree_code code, tree lhs, tree vector_rhs,
> +                    bool is_cond_op, tree mask)

isn't is_cond_op implied by mask != NULL?  That said, if we ever end
up here with a non-cond op but a loop mask we effectively want the
same behavior, so I think eliding is_cond_op and instead checking
mask != NULL_TREE below is more future-proof.
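
I.e., roughly this (an untested sketch on top of your patch; the
function body otherwise stays as you have it):

 static tree
 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
-                       tree_code code, tree lhs, tree vector_rhs,
-                       bool is_cond_op, tree mask)
+                       tree_code code, tree lhs, tree vector_rhs, tree mask)
 ...
-  if (is_cond_op)
+  if (mask)

with vectorize_fold_left_reduction then passing just MASK (NULL_TREE
when there is none).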

OK with that change.

I think we already have the "bail out" part for the case of applying
loop masking here:

      else if (reduction_type == FOLD_LEFT_REDUCTION
               && internal_fn_mask_index (reduc_fn) == -1
               && FLOAT_TYPE_P (vectype_in)
               && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "can't operate on partial vectors because"
                             " signed zeros cannot be preserved.\n");
          LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;

we'd have to do a hard-fail with a similar condition for the case
where the reduction op is conditional (the above is too late for a
hard fail, I think).
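
For the hard fail I'd expect something along these lines in
vectorizable_reduction (just a sketch; exact message, placement and
how we detect the conditional op are up for grabs - I'm assuming an
is_cond_op predicate like the one in vectorize_fold_left_reduction):

      if (reduction_type == FOLD_LEFT_REDUCTION
          && is_cond_op
          && FLOAT_TYPE_P (vectype_in)
          && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "cannot vectorize conditional fold-left"
                             " reduction with sign-dependent rounding.\n");
          return false;
        }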

Thanks,
Richard.

>  {
>    tree vectype = TREE_TYPE (vector_rhs);
>    tree scalar_type = TREE_TYPE (vectype);
> @@ -6915,6 +6916,22 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
>    unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
>    unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
>  
> +  /* If we created a COND_OP in ifcvt, re-create a VEC_COND_EXPR to mask the
> +     input here in order to be able to perform an unconditional element-wise
> +     reduction of it.  */
> +  if (is_cond_op)
> +    {
> +      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
> +                                                "masked_vector_rhs");
> +      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
> +                                               false);
> +      tree vector_identity = build_vector_from_val (vectype, neutral_op);
> +      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
> +                                          mask, vector_rhs, vector_identity);
> +      gsi_insert_before (gsi, select, GSI_SAME_STMT);
> +      vector_rhs = masked_vector_rhs;
> +    }
> +
>    for (unsigned HOST_WIDE_INT bit_offset = 0;
>         bit_offset < vec_size_in_bits;
>         bit_offset += element_bitsize)
> @@ -7151,7 +7168,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        else
>       {
>         reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> -                                          tree_code (code), reduc_var, def0);
> +                                          tree_code (code), reduc_var, def0,
> +                                          is_cond_op, mask);
>         new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>         /* Remove the statement, so that we can use the same code paths
>            as for statements that we've just created.  */
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
