On Wed, 15 Nov 2023, Tamar Christina wrote:
> Patch updated to latest trunk,
>
> This splits the part of the function that does peeling for loops at exits to
> a different function. In this new function we also peel for early breaks.
>
> Peeling for early breaks works by redirecting all early break exits to a
> single "early break" block and combine them and the normal exit edge together
> later in a different block which then goes into the epilog preheader.
>
> This allows us to re-use all the existing code for IV updates, Additionally
> this
> also enables correct linking for multiple vector epilogues.
>
> flush_pending_stmts cannot be used in this scenario since it updates the PHI
> nodes in the order that they are in the exit destination blocks. This means
> they are in CFG visit order. With a single exit this doesn't matter but with
> multiple exits with different live values through the different exits the
> order
> usually does not line up.
>
> Additionally the vectorizer helper functions expect to be able to iterate over
> the nodes in the order that they occur in the loop header blocks. This is an
> invariant we must maintain. To do this we just inline the work
> flush_pending_stmts but maintain the order by using the header blocks to guide
> the work.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_is_loop_exit_latch_pred): New.
> (slpeel_tree_duplicate_loop_for_vectorization): New.
> (slpeel_tree_duplicate_loop_to_edge_cfg): use it.
> * tree-vectorizer.h (is_loop_header_bb_p): Drop assert.
> (slpeel_tree_duplicate_loop_to_edge_cfg): Update signature.
> (vect_is_loop_exit_latch_pred): New.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index
> b9161274ce401a7307f3e61ad23aa036701190d7..fafbf924e8db18eb4eec7a4a1906d10f6ce9812f
> 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1392,6 +1392,153 @@ vect_set_loop_condition (class loop *loop, edge
> loop_e, loop_vec_info loop_vinfo
> (gimple *) cond_stmt);
> }
>
> +/* Determine if the exit choosen by the loop vectorizer differs from the
> + natural loop exit. i.e. if the exit leads to the loop patch or not.
> + When this happens we need to flip the understanding of main and other
> + exits by peeling and IV updates. */
> +
> +bool inline
> +vect_is_loop_exit_latch_pred (edge loop_exit, class loop *loop)
Ick, bad name - didn't see its use(s) in this patch?
> +{
> + return single_pred (loop->latch) == loop_exit->src;
> +}
> +
> +/* Perform peeling for when the peeled loop is placed after the original
> loop.
> + This maintains LCSSA and creates the appropriate blocks for multiple exit
> + vectorization. */
> +
> +void static
> +slpeel_tree_duplicate_loop_for_vectorization (class loop *loop, edge
> loop_exit,
> + vec<edge> &loop_exits,
> + class loop *new_loop,
> + bool flow_loops,
> + basic_block new_preheader)
also bad name ;) I don't see a strong reason to factor this out.
> +{
> + bool multiple_exits_p = loop_exits.length () > 1;
> + basic_block main_loop_exit_block = new_preheader;
> + if (multiple_exits_p)
> + {
> + edge loop_entry = single_succ_edge (new_preheader);
> + new_preheader = split_edge (loop_entry);
> + }
> +
> + auto_vec <gimple *> new_phis;
> + hash_map <tree, tree> new_phi_args;
> + /* First create the empty phi nodes so that when we flush the
> + statements they can be filled in. However because there is no order
> + between the PHI nodes in the exits and the loop headers we need to
> + order them base on the order of the two headers. First record the new
> + phi nodes. Then redirect the edges and flush the changes. This writes
> out
> + the new SSA names. */
> + for (auto gsi_from = gsi_start_phis (loop_exit->dest);
> + !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> + {
> + gimple *from_phi = gsi_stmt (gsi_from);
> + tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> + gphi *res = create_phi_node (new_res, main_loop_exit_block);
> + new_phis.safe_push (res);
> + }
> +
> + for (auto exit : loop_exits)
> + {
> + basic_block dest
> + = exit == loop_exit ? main_loop_exit_block : new_preheader;
> + redirect_edge_and_branch (exit, dest);
> + }
> +
> + /* Only fush the main exit, the remaining exits we need to match the order
> + in the loop->header which with multiple exits may not be the same. */
> + flush_pending_stmts (loop_exit);
> +
> + /* Record the new SSA names in the cache so that we can skip materializing
> + them again when we fill in the rest of the LCSSA variables. */
> + for (auto phi : new_phis)
> + {
> + tree new_arg = gimple_phi_arg (phi, 0)->def;
> +
> + if (!SSA_VAR_P (new_arg))
> + continue;
> +
> + /* If the PHI MEM node dominates the loop then we shouldn't create
> + a new LC-SSSA PHI for it in the intermediate block. */
> + /* A MEM phi that consitutes a new DEF for the vUSE chain can either
> + be a .VDEF or a PHI that operates on MEM. And said definition
> + must not be inside the main loop. Or we must be a parameter.
> + In the last two cases we may remove a non-MEM PHI node, but since
> + they dominate both loops the removal is unlikely to cause trouble
> + as the exits must already be using them. */
> + if (virtual_operand_p (new_arg)
> + && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
> + || !flow_bb_inside_loop_p (loop,
> + gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
> + {
> + auto gsi = gsi_for_stmt (phi);
> + remove_phi_node (&gsi, true);
> + continue;
> + }
> +
> + /* If we decide to remove the PHI node we should also not
> + rematerialize it later on. */
> + new_phi_args.put (new_arg, gimple_phi_result (phi));
> +
> + if (TREE_CODE (new_arg) != SSA_NAME)
> + continue;
> + }
> +
> + /* Copy the current loop LC PHI nodes between the original loop exit
> + block and the new loop header. This allows us to later split the
> + preheader block and still find the right LC nodes. */
> + edge loop_entry = single_succ_edge (new_preheader);
> + if (flow_loops)
> + for (auto gsi_from = gsi_start_phis (loop->header),
> + gsi_to = gsi_start_phis (new_loop->header);
> + !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> + gsi_next (&gsi_from), gsi_next (&gsi_to))
> + {
> + gimple *from_phi = gsi_stmt (gsi_from);
> + gimple *to_phi = gsi_stmt (gsi_to);
> + tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, loop_latch_edge (loop));
> + tree *res = NULL;
> +
> + /* Check if we've already created a new phi node during edge
> + redirection. If we have, only propagate the value downwards. */
> + if ((res = new_phi_args.get (new_arg)))
> + new_arg = *res;
> +
> + /* All other exits use the previous iters. */
> + if (multiple_exits_p)
> + {
> + tree alt_arg = gimple_phi_result (from_phi);
> + tree alt_res = copy_ssa_name (alt_arg);
> + gphi *alt_lcssa_phi = create_phi_node (alt_res, new_preheader);
> + edge main_e = single_succ_edge (main_loop_exit_block);
> + for (edge e : loop_exits)
> + if (e != loop_exit)
> + {
> + add_phi_arg (alt_lcssa_phi, alt_arg, e, UNKNOWN_LOCATION);
> + SET_PHI_ARG_DEF (alt_lcssa_phi, main_e->dest_idx, new_arg);
> + }
> + new_arg = alt_res; /* Push it down to the new_loop header. */
I think it would be clearer to separate alternate exit from main exit
handling more completely - we don't have the new_phi_args map for
the alternate exits.
Thus first only redirect and fixup the main exit and then redirect
the alternate exits, immediately wiping the edge_var_map, and
manually create the only relevant PHIs.
In principle this early-break handling could be fully within the
if (flow_loops) condition (including populating the new_phi_args
map for the main exit).
The code itself looks fine to me.
Richard.
> + } else if (!res) {
> + /* For non-early break we need to keep the possibly live values in
> + the exit block. For early break these are kept in the merge
> + block in the code above. */
> + tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> + gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
> +
> + /* Main loop exit should use the final iter value. */
> + add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> + new_arg = new_res;
> + }
> +
> + adjust_phi_and_debug_stmts (to_phi, loop_entry, new_arg);
> + }
> +
> + /* Now clear all the redirect maps. */
> + for (auto exit : loop_exits)
> + redirect_edge_var_map_clear (exit);
> +}
> +
> /* Given LOOP this function generates a new copy of it and puts it
> on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
> non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
> @@ -1403,13 +1550,16 @@ vect_set_loop_condition (class loop *loop, edge
> loop_e, loop_vec_info loop_vinfo
> copies remains the same.
>
> If UPDATED_DOMS is not NULL it is update with the list of basic blocks
> whoms
> - dominators were updated during the peeling. */
> + dominators were updated during the peeling. When doing early break
> vectorization
> + then LOOP_VINFO needs to be provided and is used to keep track of any
> newly created
> + memory references that need to be updated should we decide to vectorize.
> */
>
> class loop *
> slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
> class loop *scalar_loop,
> edge scalar_exit, edge e, edge *new_e,
> - bool flow_loops)
> + bool flow_loops,
> + vec<basic_block> *updated_doms)
> {
> class loop *new_loop;
> basic_block *new_bbs, *bbs, *pbbs;
> @@ -1526,7 +1676,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> }
>
> auto loop_exits = get_loop_exit_edges (loop);
> + bool multiple_exits_p = loop_exits.length () > 1;
> auto_vec<basic_block> doms;
> + class loop *update_loop = NULL;
>
> if (at_exit) /* Add the loop copy at exit. */
> {
> @@ -1536,91 +1688,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> flush_pending_stmts (new_exit);
> }
>
> - auto_vec <gimple *> new_phis;
> - hash_map <tree, tree> new_phi_args;
> - /* First create the empty phi nodes so that when we flush the
> - statements they can be filled in. However because there is no order
> - between the PHI nodes in the exits and the loop headers we need to
> - order them base on the order of the two headers. First record the new
> - phi nodes. */
> - for (auto gsi_from = gsi_start_phis (scalar_exit->dest);
> - !gsi_end_p (gsi_from); gsi_next (&gsi_from))
> - {
> - gimple *from_phi = gsi_stmt (gsi_from);
> - tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> - gphi *res = create_phi_node (new_res, new_preheader);
> - new_phis.safe_push (res);
> - }
> -
> - /* Then redirect the edges and flush the changes. This writes out the
> new
> - SSA names. */
> - for (edge exit : loop_exits)
> - {
> - edge temp_e = redirect_edge_and_branch (exit, new_preheader);
> - flush_pending_stmts (temp_e);
> - }
> - /* Record the new SSA names in the cache so that we can skip
> materializing
> - them again when we fill in the rest of the LCSSA variables. */
> - for (auto phi : new_phis)
> - {
> - tree new_arg = gimple_phi_arg (phi, 0)->def;
> -
> - if (!SSA_VAR_P (new_arg))
> - continue;
> - /* If the PHI MEM node dominates the loop then we shouldn't create
> - a new LC-SSSA PHI for it in the intermediate block. */
> - /* A MEM phi that consitutes a new DEF for the vUSE chain can either
> - be a .VDEF or a PHI that operates on MEM. And said definition
> - must not be inside the main loop. Or we must be a parameter.
> - In the last two cases we may remove a non-MEM PHI node, but since
> - they dominate both loops the removal is unlikely to cause trouble
> - as the exits must already be using them. */
> - if (virtual_operand_p (new_arg)
> - && (SSA_NAME_IS_DEFAULT_DEF (new_arg)
> - || !flow_bb_inside_loop_p (loop,
> - gimple_bb (SSA_NAME_DEF_STMT (new_arg)))))
> - {
> - auto gsi = gsi_for_stmt (phi);
> - remove_phi_node (&gsi, true);
> - continue;
> - }
> - new_phi_args.put (new_arg, gimple_phi_result (phi));
> -
> - if (TREE_CODE (new_arg) != SSA_NAME)
> - continue;
> - }
> -
> - /* Copy the current loop LC PHI nodes between the original loop exit
> - block and the new loop header. This allows us to later split the
> - preheader block and still find the right LC nodes. */
> - edge loop_entry = single_succ_edge (new_preheader);
> - if (flow_loops)
> - for (auto gsi_from = gsi_start_phis (loop->header),
> - gsi_to = gsi_start_phis (new_loop->header);
> - !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> - gsi_next (&gsi_from), gsi_next (&gsi_to))
> - {
> - gimple *from_phi = gsi_stmt (gsi_from);
> - gimple *to_phi = gsi_stmt (gsi_to);
> - tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
> - loop_latch_edge (loop));
> -
> - /* Check if we've already created a new phi node during edge
> - redirection. If we have, only propagate the value downwards. */
> - if (tree *res = new_phi_args.get (new_arg))
> - {
> - adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
> - continue;
> - }
> -
> - tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
> - gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
> -
> - /* Main loop exit should use the final iter value. */
> - add_phi_arg (lcssa_phi, new_arg, loop_exit, UNKNOWN_LOCATION);
> -
> - adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
> - }
> + slpeel_tree_duplicate_loop_for_vectorization (loop, loop_exit,
> loop_exits,
> + new_loop, flow_loops,
> + new_preheader);
>
> set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
>
> @@ -1634,6 +1704,21 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> delete_basic_block (preheader);
> set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
> loop_preheader_edge (scalar_loop)->src);
> +
> + /* Finally after wiring the new epilogue we need to update its main
> exit
> + to the original function exit we recorded. Other exits are already
> + correct. */
> + if (multiple_exits_p)
> + {
> + update_loop = new_loop;
> + for (edge e : get_loop_exit_edges (loop))
> + doms.safe_push (e->dest);
> + doms.safe_push (exit_dest);
> +
> + /* Likely a fall-through edge, so update if needed. */
> + if (single_succ_p (exit_dest))
> + doms.safe_push (single_succ (exit_dest));
> + }
> }
> else /* Add the copy at entry. */
> {
> @@ -1681,6 +1766,34 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> delete_basic_block (new_preheader);
> set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
> loop_preheader_edge (new_loop)->src);
> +
> + if (multiple_exits_p)
> + update_loop = loop;
> + }
> +
> + if (multiple_exits_p)
> + {
> + for (edge e : get_loop_exit_edges (update_loop))
> + {
> + edge ex;
> + edge_iterator ei;
> + FOR_EACH_EDGE (ex, ei, e->dest->succs)
> + {
> + /* Find the first non-fallthrough block as fall-throughs can't
> + dominate other blocks. */
> + if (single_succ_p (ex->dest))
> + {
> + doms.safe_push (ex->dest);
> + ex = single_succ_edge (ex->dest);
> + }
> + doms.safe_push (ex->dest);
> + }
> + doms.safe_push (e->dest);
> + }
> +
> + iterate_fix_dominators (CDI_DOMINATORS, doms, false);
> + if (updated_doms)
> + updated_doms->safe_splice (doms);
> }
>
> free (new_bbs);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index
> 76451a7fefe6ff966cecfa2cbc7b11336b038565..b9a71a0b5f5407417e8366b0df132df20c7f60aa
> 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1821,7 +1821,7 @@ is_loop_header_bb_p (basic_block bb)
> {
> if (bb == (bb->loop_father)->header)
> return true;
> - gcc_checking_assert (EDGE_COUNT (bb->preds) == 1);
> +
> return false;
> }
>
> @@ -2212,7 +2212,8 @@ extern bool slpeel_can_duplicate_loop_p (const class
> loop *, const_edge,
> const_edge);
> class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, edge,
> class loop *, edge,
> - edge, edge *, bool = true);
> + edge, edge *, bool = true,
> + vec<basic_block> * = NULL);
> class loop *vect_loop_versioning (loop_vec_info, gimple *);
> extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
> tree *, tree *, tree *, int, bool, bool,
> @@ -2223,6 +2224,7 @@ extern dump_user_location_t find_loop_location (class
> loop *);
> extern bool vect_can_advance_ivs_p (loop_vec_info);
> extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
> extern edge vec_init_loop_exit_info (class loop *);
> +extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
>
> /* In tree-vect-stmts.cc. */
> extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)