On Wed, 16 Oct 2013, pins...@gmail.com wrote: > > > On Oct 15, 2013, at 5:32 AM, Jakub Jelinek <ja...@redhat.com> wrote: > > > > Hi! > > > > Especially on i?86/x86_64 the if-conversion pass often seems to be > > a pessimization, but vectorization relies on it, and without it we can't > > vectorize many of the loops. > > I think on many other targets it actually helps. For one, I know it > > helps on Octeon even though Octeon has no vector instructions. I think > > it helps on most ARM targets too.
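(To make the trade-off concrete: if-conversion flattens the control flow of a loop body into predicated straight-line code. Below is a minimal sketch in plain C of the before/after shapes; the function names are made up for illustration, and the real pass transforms GIMPLE, not source code.)

/* Before if-conversion: the loop body contains a branch.  */
unsigned int
sum_adjusted (const unsigned int *a, int n)
{
  unsigned int r = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned int x = a[i];
      if (x < 32)
        x = x + 127;
      else
        x = x - 16;
      r += x;
    }
  return r;
}

/* After if-conversion the body is a single basic block; the branch
   becomes a select (a COND_EXPR in GIMPLE), which typically lowers
   to a conditional-move instruction.  */
unsigned int
sum_adjusted_ifcvt (const unsigned int *a, int n)
{
  unsigned int r = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned int x = a[i];
      x = x < 32 ? x + 127 : x - 16;  /* both arms computed, one selected */
      r += x;
    }
  return r;
}

On targets with cheap conditional moves the branch-free form can win even without vectorization, by avoiding branch mispredictions; on i?86/x86_64, where a cmov unconditionally evaluates both arms and adds dependency-chain latency, it can lose to a well-predicted branch - the pessimization discussed above.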
The main issue is that it has no cost model - its only criterion is whether it can successfully if-convert all conditional code in a loop, resulting in a single-BB loop. So it is clearly targeted at vectorization. Its infrastructure may be useful for doing a more sensible if-conversion of scalar code on the GIMPLE level. Of course even the infrastructure needs some TLC (and better generic machinery for keeping track of and simplifying predicate combinations). Thanks, Richard. > Thanks, > Andrew > > > > > Here is a prototype of a patch that will by default (unless -ftree-loop-if-convert > > is given explicitly) only if-convert loops internally for vectorization, > > so the COND_EXPRs actually appear only as VEC_COND_EXPRs in the vectorized > > basic blocks, but will not appear if vectorization fails, or in the > > scalar loop if vectorization is conditional, or in the prologue or epilogue > > loops around the vectorized loop. > > > > Instead of moving the ifcvt pass inside the vectorizer, this patch > > performs loop versioning during ifcvt, guarded by a special internal > > call: only if the internal call returns true do we enter the if-converted > > original loop; otherwise the non-if-converted copy of the original loop > > is executed. The vectorizer is taught to fold this internal call > > to true or false depending on whether the loop was vectorized, and > > the vectorizer's loop versioning and peeling for alignment and for bound > > are adjusted to copy from the non-if-converted loop rather than the > > if-converted one. > > > > Besides fixing the various PRs where if-conversion pessimizes code, I'd like > > to also move forward with this for conditional loads and stores, > > http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html > > where the if-unconversion approach looked like a failure. > > > > This patch doesn't yet handle an if-converted inner loop in outer-loop > > vectorization, which is on my todo list (so several vect-cond-*.c tests > > FAIL because they are no longer vectorized). I also had to change two > > SLP vectorization tests that silently relied on loop if-conversion being > > performed to actually optimize the basic block (if the same code didn't > > appear in a loop, it wouldn't be optimized at all). > > > > On the newly added testcase on x86_64, there are 18 scalar conditional > > moves before this patch and just 2 with it (both in the checking > > routine). > > > > Comments? 
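(To make the control flow of the scheme concrete, here is a minimal C model of what the versioned loops look like, assuming a hypothetical predicate loop_vectorized_p in place of the IFN_LOOP_VECTORIZED internal call; the loop numbers passed to it are made up.)

/* Stand-in for the IFN_LOOP_VECTORIZED internal call; its two
   arguments are the numbers of the if-converted loop and of the
   scalar copy.  In the compiler this never executes - the vectorizer
   folds it to a constant before expansion - so it gets a dummy body
   here only to keep the sketch self-contained.  */
static int
loop_vectorized_p (int ifcvt_loop_num, int scalar_loop_num)
{
  (void) ifcvt_loop_num;
  (void) scalar_loop_num;
  return 0;  /* pretend vectorization failed */
}

void
model (unsigned int *a, int n)
{
  if (loop_vectorized_p (1, 2))
    {
      /* If-converted copy (loop 1): a single-BB body where branches
         have become COND_EXPRs; the only version the vectorizer
         considers.  */
      for (int i = 0; i < n; i++)
        a[i] = a[i] < 32 ? a[i] + 127 : a[i] - 16;
    }
  else
    {
      /* Non-if-converted copy (loop 2): original control flow,
         marked dont_vectorize so the vectorizer skips it.  */
      for (int i = 0; i < n; i++)
        {
          if (a[i] < 32)
            a[i] = a[i] + 127;
          else
            a[i] = a[i] - 16;
        }
    }
}

If vectorization succeeds, the vectorizer folds the call (and the condition testing its result) to true and cleanup-cfg removes the scalar copy; versioning, prologue and epilogue copies are taken from the scalar loop, so they contain no COND_EXPRs. If vectorization fails, the call is folded to false and the if-converted copy disappears instead, leaving the original loop untouched.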
> > > > --- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 > > +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 > > @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST > > DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > > DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > > DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > > +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > > --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 > > +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 > > @@ -374,24 +374,31 @@ LOOP-> loop1 > > > > static void > > slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, > > + struct loop *scalar_loop, > > bool is_new_loop, basic_block > > *new_exit_bb) > > { > > - gimple orig_phi, new_phi; > > + gimple orig_phi, new_phi, scalar_phi = NULL; > > gimple update_phi, update_phi2; > > tree guard_arg, loop_arg; > > basic_block new_merge_bb = guard_edge->dest; > > edge e = EDGE_SUCC (new_merge_bb, 0); > > basic_block update_bb = e->dest; > > basic_block orig_bb = loop->header; > > - edge new_exit_e; > > + edge new_exit_e, scalar_e = NULL; > > tree current_new_name; > > - gimple_stmt_iterator gsi_orig, gsi_update; > > + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); > > > > /* Create new bb between loop and new_merge_bb. */ > > *new_exit_bb = split_edge (single_exit (loop)); > > > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > > > + if (scalar_loop != NULL && !is_new_loop) > > + { > > + gsi_scalar = gsi_start_phis (scalar_loop->header); > > + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); > > + } > > + > > for (gsi_orig = gsi_start_phis (orig_bb), > > gsi_update = gsi_start_phis (update_bb); > > !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); > > @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge > > tree new_res; > > orig_phi = gsi_stmt (gsi_orig); > > update_phi = gsi_stmt (gsi_update); > > + if (scalar_e != NULL) > > + { > > + scalar_phi = gsi_stmt (gsi_scalar); > > + gsi_next (&gsi_scalar); > > + } > > > > /** 1. Handle new-merge-point phis **/ > > > > @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge > > current_new_name = loop_arg; > > else > > { > > - current_new_name = get_current_def (loop_arg); > > + if (scalar_e) > > + { > > + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); > > + current_new_name = get_current_def (current_new_name); > > + } > > + else > > + current_new_name = get_current_def (loop_arg); > > /* current_def is not available only if the variable does not > > change inside the loop, in which case we also don't care > > about recording a current_def for it because we won't be > > @@ -503,6 +521,7 @@ LOOP-> loop2 > > > > static void > > slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, > > + struct loop *scalar_loop, > > bool is_new_loop, basic_block > > *new_exit_bb) > > { > > gimple orig_phi, new_phi; > > @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge > > basic_block new_merge_bb = guard_edge->dest; > > edge e = EDGE_SUCC (new_merge_bb, 0); > > basic_block update_bb = e->dest; > > - edge new_exit_e; > > + edge new_exit_e, scalar_e = NULL; > > tree orig_def, orig_def_new_name; > > tree new_name, new_name2; > > tree arg; > > - gimple_stmt_iterator gsi; > > + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); > > > > /* Create new bb between loop and new_merge_bb. 
*/ > > *new_exit_bb = split_edge (single_exit (loop)); > > > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > > > + if (scalar_loop != NULL) > > + { > > + scalar_e = single_exit (scalar_loop); > > + gsi_scalar = gsi_start_phis (scalar_e->dest); > > + } > > + > > for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > { > > tree new_res; > > @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge > > out of the loop - the phi arg is a constant. */ > > if (TREE_CODE (orig_def) != SSA_NAME) > > continue; > > - orig_def_new_name = get_current_def (orig_def); > > + if (scalar_loop != NULL) > > + { > > + orig_def_new_name > > + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); > > + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); > > + orig_def_new_name = get_current_def (orig_def_new_name); > > + gsi_next (&gsi_scalar); > > + } > > + else > > + orig_def_new_name = get_current_def (orig_def); > > arg = NULL_TREE; > > > > /** 1. Handle new-merge-point phis **/ > > @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct > > on E which is either the entry or exit of LOOP. */ > > > > struct loop * > > -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) > > +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, > > + struct loop *scalar_loop, edge e) > > { > > struct loop *new_loop; > > basic_block *new_bbs, *bbs; > > @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > if (!at_exit && e != loop_preheader_edge (loop)) > > return NULL; > > > > - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > > - get_loop_body_with_size (loop, bbs, loop->num_nodes); > > + if (scalar_loop == NULL) > > + scalar_loop = loop; > > + > > + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); > > > > /* Check whether duplication is possible. */ > > - if (!can_copy_bbs_p (bbs, loop->num_nodes)) > > + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) > > { > > free (bbs); > > return NULL; > > } > > > > /* Generate new loop structure. */ > > - new_loop = duplicate_loop (loop, loop_outer (loop)); > > - duplicate_subloops (loop, new_loop); > > + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); > > + duplicate_subloops (scalar_loop, new_loop); > > > > exit_dest = exit->dest; > > was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, > > @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > /* Also copy the pre-header, this avoids jumping through hoops to > > duplicate the loop entry PHI arguments. Create an empty > > pre-header unconditionally for this. 
*/ > > - basic_block preheader = split_edge (loop_preheader_edge (loop)); > > + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); > > edge entry_e = single_pred_edge (preheader); > > - bbs[loop->num_nodes] = preheader; > > - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > > + bbs[scalar_loop->num_nodes] = preheader; > > + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > > > - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, > > + exit = single_exit (scalar_loop); > > + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, > > &exit, 1, &new_exit, NULL, > > e->src, true); > > - basic_block new_preheader = new_bbs[loop->num_nodes]; > > + exit = single_exit (loop); > > + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; > > > > - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); > > + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); > > > > if (at_exit) /* Add the loop copy at exit. */ > > { > > + if (scalar_loop != loop) > > + { > > + gimple_stmt_iterator gsi; > > + new_exit = redirect_edge_and_branch (new_exit, exit_dest); > > + > > + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple phi = gsi_stmt (gsi); > > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); > > + location_t orig_locus > > + = gimple_phi_arg_location_from_edge (phi, e); > > + > > + add_phi_arg (phi, orig_arg, new_exit, orig_locus); > > + } > > + } > > redirect_edge_and_branch_force (e, new_preheader); > > flush_pending_stmts (e); > > set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); > > if (was_imm_dom) > > - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); > > + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); > > > > /* And remove the non-necessary forwarder again. Keep the other > > one so we have a proper pre-header for the loop at the exit edge. > > */ > > - redirect_edge_pred (single_succ_edge (preheader), single_pred > > (preheader)); > > + redirect_edge_pred (single_succ_edge (preheader), > > + single_pred (preheader)); > > delete_basic_block (preheader); > > - set_immediate_dominator (CDI_DOMINATORS, loop->header, > > - loop_preheader_edge (loop)->src); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > > + loop_preheader_edge (scalar_loop)->src); > > } > > else /* Add the copy at entry. */ > > { > > + if (scalar_loop != loop) > > + { > > + /* Remove the non-necessary forwarder of scalar_loop again. */ > > + redirect_edge_pred (single_succ_edge (preheader), > > + single_pred (preheader)); > > + delete_basic_block (preheader); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > > + loop_preheader_edge (scalar_loop)->src); > > + preheader = split_edge (loop_preheader_edge (loop)); > > + entry_e = single_pred_edge (preheader); > > + } > > + > > redirect_edge_and_branch_force (entry_e, new_preheader); > > flush_pending_stmts (entry_e); > > set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); > > @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > > > /* And remove the non-necessary forwarder again. Keep the other > > one so we have a proper pre-header for the loop at the exit edge. 
> > */ > > - redirect_edge_pred (single_succ_edge (new_preheader), single_pred > > (new_preheader)); > > + redirect_edge_pred (single_succ_edge (new_preheader), > > + single_pred (new_preheader)); > > delete_basic_block (new_preheader); > > set_immediate_dominator (CDI_DOMINATORS, new_loop->header, > > loop_preheader_edge (new_loop)->src); > > } > > > > - for (unsigned i = 0; i < loop->num_nodes+1; i++) > > + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) > > rename_variables_in_bb (new_bbs[i]); > > > > + if (scalar_loop != loop) > > + { > > + /* Update new_loop->header PHIs, so that on the preheader > > + edge they are the ones from loop rather than scalar_loop. */ > > + gimple_stmt_iterator gsi_orig, gsi_new; > > + edge orig_e = loop_preheader_edge (loop); > > + edge new_e = loop_preheader_edge (new_loop); > > + > > + for (gsi_orig = gsi_start_phis (loop->header), > > + gsi_new = gsi_start_phis (new_loop->header); > > + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); > > + gsi_next (&gsi_orig), gsi_next (&gsi_new)) > > + { > > + gimple orig_phi = gsi_stmt (gsi_orig); > > + gimple new_phi = gsi_stmt (gsi_new); > > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); > > + location_t orig_locus > > + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); > > + > > + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); > > + } > > + } > > + > > free (new_bbs); > > free (bbs); > > > > @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ > > FORNOW the resulting code will not be in loop-closed-ssa form. > > */ > > > > -static struct loop* > > -slpeel_tree_peel_loop_to_edge (struct loop *loop, > > +static struct loop * > > +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, > > edge e, tree *first_niters, > > tree niters, bool update_first_loop_count, > > unsigned int th, bool check_profitability, > > @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo > > orig_exit_bb: > > */ > > > > - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) > > + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, > > scalar_loop, > > + e))) > > { > > loop_loc = find_loop_location (loop); > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, > > @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > > inverse_probability (first_guard_probability)); > > scale_loop_profile (first_loop, first_guard_probability, > > check_profitability && (int)th > bound1 ? th : bound1); > > - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, > > + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, > > first_loop == new_loop, > > &new_exit_bb); > > > > @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > > bb_after_second_loop, > > bb_before_first_loop, > > inverse_probability (second_guard_probability)); > > scale_loop_profile (second_loop, probability_of_second_loop, bound2); > > - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, > > + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, > > second_loop == new_loop, &new_exit_bb); > > > > /* 4. Make first-loop iterate FIRST_NITERS times, if requested. 
> > @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec > > { > > tree ni_name, ratio_mult_vf_name; > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > struct loop *new_loop; > > edge update_e; > > basic_block preheader; > > @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec > > > > loop_num = loop->num; > > > > - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), > > - &ratio_mult_vf_name, ni_name, > > false, > > - th, check_profitability, > > - cond_expr, cond_expr_stmt_list, > > - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > > + new_loop > > + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), > > + &ratio_mult_vf_name, ni_name, false, > > + th, check_profitability, > > + cond_expr, cond_expr_stmt_list, > > + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > > gcc_assert (new_loop); > > gcc_assert (loop_num == loop->num); > > #ifdef ENABLE_CHECKING > > @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ > > unsigned int th, bool check_profitability) > > { > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > tree niters_of_prolog_loop, ni_name; > > tree n_iters; > > tree wide_prolog_niters; > > @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ > > > > /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ > > new_loop = > > - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), > > + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, > > + loop_preheader_edge (loop), > > &niters_of_prolog_loop, ni_name, true, > > th, check_profitability, NULL_TREE, NULL, > > - bound, > > - 0); > > + bound, 0); > > > > gcc_assert (new_loop); > > #ifdef ENABLE_CHECKING > > @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop > > unsigned int th, bool check_profitability) > > { > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > basic_block condition_bb; > > gimple_stmt_iterator gsi, cond_exp_gsi; > > basic_block merge_bb; > > @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop > > gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); > > > > initialize_original_copy_tables (); > > - loop_version (loop, cond_expr, &condition_bb, > > - prob, prob, REG_BR_PROB_BASE - prob, true); > > + if (scalar_loop) > > + { > > + edge scalar_e; > > + basic_block preheader, scalar_preheader; > > + > > + /* We don't want to scale SCALAR_LOOP's frequencies, we need to > > + scale LOOP's frequencies instead. */ > > + loop_version (scalar_loop, cond_expr, &condition_bb, > > + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); > > + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); > > + /* CONDITION_BB was created above SCALAR_LOOP's preheader, > > + while we need to move it above LOOP's preheader. 
*/ > > + e = loop_preheader_edge (loop); > > + scalar_e = loop_preheader_edge (scalar_loop); > > + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) > > + && gimple_seq_empty_p (phi_nodes (e->src)) > > + && single_pred_p (e->src)); > > + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) > > + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) > > + && single_pred_p (scalar_e->src)); > > + gcc_assert (single_pred_p (condition_bb)); > > + preheader = e->src; > > + scalar_preheader = scalar_e->src; > > + scalar_e = find_edge (condition_bb, scalar_preheader); > > + e = single_pred_edge (preheader); > > + redirect_edge_and_branch_force (single_pred_edge (condition_bb), > > + scalar_preheader); > > + redirect_edge_and_branch_force (scalar_e, preheader); > > + redirect_edge_and_branch_force (e, condition_bb); > > + set_immediate_dominator (CDI_DOMINATORS, condition_bb, > > + single_pred (condition_bb)); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, > > + single_pred (scalar_preheader)); > > + set_immediate_dominator (CDI_DOMINATORS, preheader, > > + condition_bb); > > + } > > + else > > + loop_version (loop, cond_expr, &condition_bb, > > + prob, prob, REG_BR_PROB_BASE - prob, true); > > > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > > && dump_enabled_p ()) > > @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop > > basic block (i.e. it has two predecessors). Just in order to simplify > > following transformations in the vectorizer, we fix this situation > > here by adding a new (empty) block on the exit-edge of the loop, > > - with the proper loop-exit phis to maintain loop-closed-form. */ > > + with the proper loop-exit phis to maintain loop-closed-form. > > + If loop versioning wasn't done from loop, but scalar_loop instead, > > + merge_bb will have already just a single successor. */ > > > > merge_bb = single_exit (loop)->dest; > > - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); > > - new_exit_bb = split_edge (single_exit (loop)); > > - new_exit_e = single_exit (loop); > > - e = EDGE_SUCC (new_exit_bb, 0); > > - > > - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) > > { > > - tree new_res; > > - orig_phi = gsi_stmt (gsi); > > - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > > - new_phi = create_phi_node (new_res, new_exit_bb); > > - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > > - add_phi_arg (new_phi, arg, new_exit_e, > > - gimple_phi_arg_location_from_edge (orig_phi, e)); > > - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > > + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); > > + new_exit_bb = split_edge (single_exit (loop)); > > + new_exit_e = single_exit (loop); > > + e = EDGE_SUCC (new_exit_bb, 0); > > + > > + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next > > (&gsi)) > > + { > > + tree new_res; > > + orig_phi = gsi_stmt (gsi); > > + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > > + new_phi = create_phi_node (new_res, new_exit_bb); > > + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > > + add_phi_arg (new_phi, arg, new_exit_e, > > + gimple_phi_arg_location_from_edge (orig_phi, e)); > > + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > > + } > > } > > > > /* End loop-exit-fixes after versioning. 
*/ > > --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 > > +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 > > @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo > > } > > > > > > +/* If LOOP has been versioned during ifcvt, return the internal call > > + guarding it. */ > > + > > +static gimple > > +vect_loop_vectorized_call (struct loop *loop) > > +{ > > + basic_block bb = loop_preheader_edge (loop)->src; > > + gimple g; > > + do > > + { > > + g = last_stmt (bb); > > + if (g) > > + break; > > + if (!single_pred_p (bb)) > > + break; > > + bb = single_pred (bb); > > + } > > + while (1); > > + if (g && gimple_code (g) == GIMPLE_COND) > > + { > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gsi_prev (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (is_gimple_call (g) > > + && gimple_call_internal_p (g) > > + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED > > + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num > > + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) > > + return g; > > + } > > + } > > + return NULL; > > +} > > + > > + > > /* Function vectorize_loops. > > > > Entry point to loop vectorization phase. */ > > @@ -320,6 +357,8 @@ vectorize_loops (void) > > struct loop *loop; > > hash_table <simduid_to_vf> simduid_to_vf_htab; > > hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; > > + bool any_ifcvt_loops = false; > > + unsigned ret = 0; > > > > vect_loops_num = number_of_loops (cfun); > > > > @@ -342,8 +381,11 @@ vectorize_loops (void) > > than all previously defined loops. This fact allows us to run > > only over initial loops skipping newly generated ones. */ > > FOR_EACH_LOOP (li, loop, 0) > > - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) > > - || loop->force_vect) > > + if (loop->dont_vectorize) > > + any_ifcvt_loops = true; > > + else if ((flag_tree_loop_vectorize > > + && optimize_loop_nest_for_speed_p (loop)) > > + || loop->force_vect) > > { > > loop_vec_info loop_vinfo; > > vect_location = find_loop_location (loop); > > @@ -361,6 +403,38 @@ vectorize_loops (void) > > if (!dbg_cnt (vect_loop)) > > break; > > > > + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); > > + if (loop_vectorized_call) > > + { > > + tree arg = gimple_call_arg (loop_vectorized_call, 1); > > + basic_block *bbs; > > + unsigned int i; > > + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); > > + > > + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; > > + gcc_checking_assert (vect_loop_vectorized_call > > + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) > > + == loop_vectorized_call); > > + bbs = get_loop_body (scalar_loop); > > + for (i = 0; i < scalar_loop->num_nodes; i++) > > + { > > + basic_block bb = bbs[i]; > > + gimple_stmt_iterator gsi; > > + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple phi = gsi_stmt (gsi); > > + gimple_set_uid (phi, 0); > > + } > > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple stmt = gsi_stmt (gsi); > > + gimple_set_uid (stmt, 0); > > + } > > + } > > + free (bbs); > > + } > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > > && dump_enabled_p ()) > > dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, > > @@ -381,6 +455,25 @@ vectorize_loops (void) > > *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) > > = simduid_to_vf_data; > > } > > + > > + if (loop_vectorized_call) > > + { > > + gimple g = 
loop_vectorized_call; > > + tree lhs = gimple_call_lhs (g); > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); > > + gsi_next (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (gimple_code (g) == GIMPLE_COND > > + && gimple_cond_lhs (g) == lhs) > > + { > > + gimple_cond_set_lhs (g, boolean_true_node); > > + update_stmt (g); > > + } > > + } > > + } > > } > > > > vect_location = UNKNOWN_LOC; > > @@ -394,6 +487,34 @@ vectorize_loops (void) > > > > /* ----------- Finalize. ----------- */ > > > > + if (any_ifcvt_loops) > > + for (i = 1; i < vect_loops_num; i++) > > + { > > + loop = get_loop (cfun, i); > > + if (loop && loop->dont_vectorize) > > + { > > + gimple g = vect_loop_vectorized_call (loop); > > + if (g) > > + { > > + tree lhs = gimple_call_lhs (g); > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); > > + gsi_next (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (gimple_code (g) == GIMPLE_COND > > + && gimple_cond_lhs (g) == lhs) > > + { > > + gimple_cond_set_lhs (g, boolean_false_node); > > + update_stmt (g); > > + } > > + } > > + ret = TODO_cleanup_cfg; > > + } > > + } > > + } > > + > > for (i = 1; i < vect_loops_num; i++) > > { > > loop_vec_info loop_vinfo; > > @@ -451,7 +572,7 @@ vectorize_loops (void) > > return TODO_cleanup_cfg; > > } > > > > - return 0; > > + return ret; > > } > > > > > > --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 > > +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 > > @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { > > fix it up. */ > > bool operands_swapped; > > > > + /* If if-conversion versioned this loop before conversion, this is the > > + loop version without if-conversion. */ > > + struct loop *scalar_loop; > > + > > } *loop_vec_info; > > > > /* Access Functions. */ > > @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { > > #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data > > #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps > > #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped > > +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > > > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > > (L)->may_misalign_stmts.length () > 0 > > @@ -899,7 +904,8 @@ extern LOC vect_location; > > in tree-vect-loop-manip.c. */ > > extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); > > extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); > > -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); > > +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, > > + struct loop *, edge); > > extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); > > extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, > > unsigned int, bool); > > --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 > > +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 > > @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo > > /* True if we should try harder to vectorize this loop. */ > > bool force_vect; > > > > + /* True if this loop should never be vectorized. */ > > + bool dont_vectorize; > > + > > /* For SIMD loops, this is a unique identifier of the loop, referenced > > by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE > > builtins. 
*/ > > --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 > > +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 > > @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) > > edge preheader = loop_preheader_edge (loop); > > > > initialize_original_copy_tables (); > > - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); > > + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); > > gcc_assert (res != NULL); > > free_original_copy_tables (); > > delete_update_ssa (); > > --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 > > +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 > > @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt > > gcc_unreachable (); > > } > > > > +/* This should get folded in tree-vectorizer.c. */ > > + > > +static void > > +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) > > +{ > > + gcc_unreachable (); > > +} > > + > > /* Routines to expand each internal function, indexed by function number. > > Each routine has the prototype: > > > > --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 > > +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 > > @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) > > ifc_bbs = NULL; > > } > > > > +static bool > > +version_loop_for_if_conversion (struct loop *loop) > > +{ > > + basic_block cond_bb; > > + tree cond = make_ssa_name (boolean_type_node, NULL); > > + struct loop *new_loop; > > + gimple g; > > + gimple_stmt_iterator gsi; > > + void **aux = XNEWVEC (void *, loop->num_nodes); > > + unsigned int i; > > + > > + /* We have data stored in bb->aux, but loop_version also > > + uses it, so save it temporarily and restore after loop_version. */ > > + for (i = 0; i < loop->num_nodes; i++) > > + { > > + aux[i] = ifc_bbs[i]->aux; > > + ifc_bbs[i]->aux = NULL; > > + } > > + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, > > + build_int_cst (integer_type_node, loop->num), > > + integer_zero_node); > > + gimple_call_set_lhs (g, cond); > > + > > + initialize_original_copy_tables (); > > + new_loop = loop_version (loop, cond, &cond_bb, > > + REG_BR_PROB_BASE, REG_BR_PROB_BASE, > > + REG_BR_PROB_BASE, true); > > + free_original_copy_tables (); > > + for (i = 0; i < loop->num_nodes; i++) > > + ifc_bbs[i]->aux = aux[i]; > > + XDELETEVEC (aux); > > + if (new_loop == NULL) > > + return false; > > + new_loop->dont_vectorize = true; > > + new_loop->force_vect = false; > > + gsi = gsi_last_bb (cond_bb); > > + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, > > new_loop->num)); > > + gsi_insert_before (&gsi, g, GSI_SAME_STMT); > > + update_ssa (TODO_update_ssa); > > + return true; > > +} > > + > > /* If-convert LOOP when it is legal. For the moment this pass has no > > profitability analysis. Returns true when something changed. */ > > > > @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) > > bool changed = false; > > ifc_bbs = NULL; > > > > + if (loop->dont_vectorize) > > + goto cleanup; > > + > > if (!if_convertible_loop_p (loop) > > || !dbg_cnt (if_conversion_tree)) > > goto cleanup; > > > > + if ((flag_tree_loop_vectorize || loop->force_vect) > > + && flag_tree_loop_if_convert == -1 > > + && !version_loop_for_if_conversion (loop)) > > + goto cleanup; > > + > > /* Now all statements are if-convertible. Combine all the basic > > blocks into one huge basic block doing the if-conversion > > on-the-fly. 
*/ > > --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 > > 14:01:07.877814190 +0200 > > +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 > > 14:02:29.302414970 +0200 > > @@ -0,0 +1,116 @@ > > +#include "tree-vect.h" > > + > > +#define N 1024 > > +typedef int V __attribute__((vector_size (4))); > > +unsigned int a[N * 2] __attribute__((aligned)); > > +unsigned int b[N * 2] __attribute__((aligned)); > > +V c[N]; > > + > > +__attribute__((noinline, noclone)) unsigned int > > +foo (unsigned int *a, unsigned int *b) > > +{ > > + int i; > > + unsigned int r = 0; > > + for (i = 0; i < N; i++) > > + { > > + unsigned int x = a[i], y = b[i]; > > + if (x < 32) > > + { > > + x = x + 127; > > + y = y * 2; > > + } > > + else > > + { > > + x = x - 16; > > + y = y + 1; > > + } > > + a[i] = x; > > + b[i] = y; > > + r += x; > > + } > > + return r; > > +} > > + > > +__attribute__((noinline, noclone)) unsigned int > > +bar (unsigned int *a, unsigned int *b) > > +{ > > + int i; > > + unsigned int r = 0; > > + for (i = 0; i < N; i++) > > + { > > + unsigned int x = a[i], y = b[i]; > > + if (x < 32) > > + { > > + x = x + 127; > > + y = y * 2; > > + } > > + else > > + { > > + x = x - 16; > > + y = y + 1; > > + } > > + a[i] = x; > > + b[i] = y; > > + c[i] = c[i] + 1; > > + r += x; > > + } > > + return r; > > +} > > + > > +void > > +baz (unsigned int *a, unsigned int *b, > > + unsigned int (*fn) (unsigned int *, unsigned int *)) > > +{ > > + int i; > > + for (i = -64; i < 0; i++) > > + { > > + a[i] = 19; > > + b[i] = 17; > > + } > > + for (; i < N; i++) > > + { > > + a[i] = i - 512; > > + b[i] = i; > > + } > > + for (; i < N + 64; i++) > > + { > > + a[i] = 27; > > + b[i] = 19; > > + } > > + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) > > + __builtin_abort (); > > + for (i = -64; i < 0; i++) > > + if (a[i] != 19 || b[i] != 17) > > + __builtin_abort (); > > + for (; i < N; i++) > > + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) > > + || b[i] != (i - 512U < 32U ? 
i * 2U : i + 1U)) > > + __builtin_abort (); > > + for (; i < N + 64; i++) > > + if (a[i] != 27 || b[i] != 19) > > + __builtin_abort (); > > +} > > + > > +int > > +main () > > +{ > > + int i; > > + check_vect (); > > + baz (a + 512, b + 512, foo); > > + baz (a + 512, b + 512, bar); > > + baz (a + 512 + 1, b + 512 + 1, foo); > > + baz (a + 512 + 1, b + 512 + 1, bar); > > + baz (a + 512 + 31, b + 512 + 31, foo); > > + baz (a + 512 + 31, b + 512 + 31, bar); > > + baz (a + 512 + 1, b + 512, foo); > > + baz (a + 512 + 1, b + 512, bar); > > + baz (a + 512 + 31, b + 512, foo); > > + baz (a + 512 + 31, b + 512, bar); > > + baz (a + 512, b + 512 + 1, foo); > > + baz (a + 512, b + 512 + 1, bar); > > + baz (a + 512, b + 512 + 31, foo); > > + baz (a + 512, b + 512 + 31, bar); > > + return 0; > > +} > > + > > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 > > 14:38:40.000000000 +0200 > > +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 > > 13:19:21.704256653 +0200 > > @@ -1,4 +1,5 @@ > > /* { dg-require-effective-target vect_condition } */ > > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > > > #include "tree-vect.h" > > > > --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 > > 14:38:40.000000000 +0200 > > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 > > 13:19:35.678195952 +0200 > > @@ -1,4 +1,5 @@ > > /* { dg-require-effective-target vect_condition } */ > > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > > > #include "tree-vect.h" > > > > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imend