Hi! Especially on i?86/x86_64, the if-conversion pass often turns out to be a pessimization, but vectorization relies on it, and without it we cannot vectorize many of the loops.
Here is a prototype of a patch that, by default (unless -ftree-loop-if-convert is given explicitly), only if-converts loops internally for the sake of vectorization. The COND_EXPRs then appear only as VEC_COND_EXPRs in the vectorized basic blocks; they do not appear if vectorization fails, nor in the scalar loop if vectorization is conditional, nor in the prologue or epilogue loops around the vectorized loop. Instead of moving the ifcvt pass inside the vectorizer, this patch performs loop versioning during ifcvt, guarded by a special internal call: only if the internal call returns true do we enter the if-converted original loop; otherwise the non-if-converted copy of the original loop is executed. The vectorizer is taught to fold this internal call to true or false depending on whether the loop was vectorized, and vectorizer loop versioning, peeling for alignment, and peeling for bound are adjusted to copy from the non-if-converted loop rather than the if-converted one. Besides fixing the various PRs where if-conversion pessimizes code, I'd also like to use this to move forward with conditional loads and stores, http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html , where the if-unconversion approach looked like a failure. This patch doesn't yet handle an if-converted inner loop in outer-loop vectorization — that is on my todo list (so several vect-cond-*.c tests FAIL because they are no longer vectorized) — and I had to change two SLP vectorization tests that silently relied on loop if-conversion being performed in order to optimize the basic block at all (if the same code didn't appear in a loop, it wouldn't be optimized). On the newly added testcase on x86_64, there are 18 scalar conditional moves before this patch and just 2 with it (both in the checking routine). Comments? 
--- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 @@ -374,24 +374,31 @@ LOOP-> loop1 static void slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, + struct loop *scalar_loop, bool is_new_loop, basic_block *new_exit_bb) { - gimple orig_phi, new_phi; + gimple orig_phi, new_phi, scalar_phi = NULL; gimple update_phi, update_phi2; tree guard_arg, loop_arg; basic_block new_merge_bb = guard_edge->dest; edge e = EDGE_SUCC (new_merge_bb, 0); basic_block update_bb = e->dest; basic_block orig_bb = loop->header; - edge new_exit_e; + edge new_exit_e, scalar_e = NULL; tree current_new_name; - gimple_stmt_iterator gsi_orig, gsi_update; + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); /* Create new bb between loop and new_merge_bb. */ *new_exit_bb = split_edge (single_exit (loop)); new_exit_e = EDGE_SUCC (*new_exit_bb, 0); + if (scalar_loop != NULL && !is_new_loop) + { + gsi_scalar = gsi_start_phis (scalar_loop->header); + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); + } + for (gsi_orig = gsi_start_phis (orig_bb), gsi_update = gsi_start_phis (update_bb); !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge tree new_res; orig_phi = gsi_stmt (gsi_orig); update_phi = gsi_stmt (gsi_update); + if (scalar_e != NULL) + { + scalar_phi = gsi_stmt (gsi_scalar); + gsi_next (&gsi_scalar); + } /** 1. 
Handle new-merge-point phis **/ @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge current_new_name = loop_arg; else { - current_new_name = get_current_def (loop_arg); + if (scalar_e) + { + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); + current_new_name = get_current_def (current_new_name); + } + else + current_new_name = get_current_def (loop_arg); /* current_def is not available only if the variable does not change inside the loop, in which case we also don't care about recording a current_def for it because we won't be @@ -503,6 +521,7 @@ LOOP-> loop2 static void slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, + struct loop *scalar_loop, bool is_new_loop, basic_block *new_exit_bb) { gimple orig_phi, new_phi; @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge basic_block new_merge_bb = guard_edge->dest; edge e = EDGE_SUCC (new_merge_bb, 0); basic_block update_bb = e->dest; - edge new_exit_e; + edge new_exit_e, scalar_e = NULL; tree orig_def, orig_def_new_name; tree new_name, new_name2; tree arg; - gimple_stmt_iterator gsi; + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); /* Create new bb between loop and new_merge_bb. */ *new_exit_bb = split_edge (single_exit (loop)); new_exit_e = EDGE_SUCC (*new_exit_bb, 0); + if (scalar_loop != NULL) + { + scalar_e = single_exit (scalar_loop); + gsi_scalar = gsi_start_phis (scalar_e->dest); + } + for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) { tree new_res; @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge out of the loop - the phi arg is a constant. 
*/ if (TREE_CODE (orig_def) != SSA_NAME) continue; - orig_def_new_name = get_current_def (orig_def); + if (scalar_loop != NULL) + { + orig_def_new_name + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); + orig_def_new_name = get_current_def (orig_def_new_name); + gsi_next (&gsi_scalar); + } + else + orig_def_new_name = get_current_def (orig_def); arg = NULL_TREE; /** 1. Handle new-merge-point phis **/ @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct on E which is either the entry or exit of LOOP. */ struct loop * -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, + struct loop *scalar_loop, edge e) { struct loop *new_loop; basic_block *new_bbs, *bbs; @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( if (!at_exit && e != loop_preheader_edge (loop)) return NULL; - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); - get_loop_body_with_size (loop, bbs, loop->num_nodes); + if (scalar_loop == NULL) + scalar_loop = loop; + + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); /* Check whether duplication is possible. */ - if (!can_copy_bbs_p (bbs, loop->num_nodes)) + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) { free (bbs); return NULL; } /* Generate new loop structure. */ - new_loop = duplicate_loop (loop, loop_outer (loop)); - duplicate_subloops (loop, new_loop); + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); + duplicate_subloops (scalar_loop, new_loop); exit_dest = exit->dest; was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( /* Also copy the pre-header, this avoids jumping through hoops to duplicate the loop entry PHI arguments. Create an empty pre-header unconditionally for this. 
*/ - basic_block preheader = split_edge (loop_preheader_edge (loop)); + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); edge entry_e = single_pred_edge (preheader); - bbs[loop->num_nodes] = preheader; - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); + bbs[scalar_loop->num_nodes] = preheader; + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, + exit = single_exit (scalar_loop); + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, &exit, 1, &new_exit, NULL, e->src, true); - basic_block new_preheader = new_bbs[loop->num_nodes]; + exit = single_exit (loop); + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); if (at_exit) /* Add the loop copy at exit. */ { + if (scalar_loop != loop) + { + gimple_stmt_iterator gsi; + new_exit = redirect_edge_and_branch (new_exit, exit_dest); + + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple phi = gsi_stmt (gsi); + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); + location_t orig_locus + = gimple_phi_arg_location_from_edge (phi, e); + + add_phi_arg (phi, orig_arg, new_exit, orig_locus); + } + } redirect_edge_and_branch_force (e, new_preheader); flush_pending_stmts (e); set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); if (was_imm_dom) - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); /* And remove the non-necessary forwarder again. Keep the other one so we have a proper pre-header for the loop at the exit edge. 
*/ - redirect_edge_pred (single_succ_edge (preheader), single_pred (preheader)); + redirect_edge_pred (single_succ_edge (preheader), + single_pred (preheader)); delete_basic_block (preheader); - set_immediate_dominator (CDI_DOMINATORS, loop->header, - loop_preheader_edge (loop)->src); + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, + loop_preheader_edge (scalar_loop)->src); } else /* Add the copy at entry. */ { + if (scalar_loop != loop) + { + /* Remove the non-necessary forwarder of scalar_loop again. */ + redirect_edge_pred (single_succ_edge (preheader), + single_pred (preheader)); + delete_basic_block (preheader); + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, + loop_preheader_edge (scalar_loop)->src); + preheader = split_edge (loop_preheader_edge (loop)); + entry_e = single_pred_edge (preheader); + } + redirect_edge_and_branch_force (entry_e, new_preheader); flush_pending_stmts (entry_e); set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( /* And remove the non-necessary forwarder again. Keep the other one so we have a proper pre-header for the loop at the exit edge. */ - redirect_edge_pred (single_succ_edge (new_preheader), single_pred (new_preheader)); + redirect_edge_pred (single_succ_edge (new_preheader), + single_pred (new_preheader)); delete_basic_block (new_preheader); set_immediate_dominator (CDI_DOMINATORS, new_loop->header, loop_preheader_edge (new_loop)->src); } - for (unsigned i = 0; i < loop->num_nodes+1; i++) + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) rename_variables_in_bb (new_bbs[i]); + if (scalar_loop != loop) + { + /* Update new_loop->header PHIs, so that on the preheader + edge they are the ones from loop rather than scalar_loop. 
*/ + gimple_stmt_iterator gsi_orig, gsi_new; + edge orig_e = loop_preheader_edge (loop); + edge new_e = loop_preheader_edge (new_loop); + + for (gsi_orig = gsi_start_phis (loop->header), + gsi_new = gsi_start_phis (new_loop->header); + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); + gsi_next (&gsi_orig), gsi_next (&gsi_new)) + { + gimple orig_phi = gsi_stmt (gsi_orig); + gimple new_phi = gsi_stmt (gsi_new); + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); + location_t orig_locus + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); + + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); + } + } + free (new_bbs); free (bbs); @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ FORNOW the resulting code will not be in loop-closed-ssa form. */ -static struct loop* -slpeel_tree_peel_loop_to_edge (struct loop *loop, +static struct loop * +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, edge e, tree *first_niters, tree niters, bool update_first_loop_count, unsigned int th, bool check_profitability, @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo orig_exit_bb: */ - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, + e))) { loop_loc = find_loop_location (loop); dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo inverse_probability (first_guard_probability)); scale_loop_profile (first_loop, first_guard_probability, check_profitability && (int)th > bound1 ? 
th : bound1); - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, first_loop == new_loop, &new_exit_bb); @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo bb_after_second_loop, bb_before_first_loop, inverse_probability (second_guard_probability)); scale_loop_profile (second_loop, probability_of_second_loop, bound2); - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, second_loop == new_loop, &new_exit_bb); /* 4. Make first-loop iterate FIRST_NITERS times, if requested. @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec { tree ni_name, ratio_mult_vf_name; struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); struct loop *new_loop; edge update_e; basic_block preheader; @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec loop_num = loop->num; - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), - &ratio_mult_vf_name, ni_name, false, - th, check_profitability, - cond_expr, cond_expr_stmt_list, - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + new_loop + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), + &ratio_mult_vf_name, ni_name, false, + th, check_profitability, + cond_expr, cond_expr_stmt_list, + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); gcc_assert (new_loop); gcc_assert (loop_num == loop->num); #ifdef ENABLE_CHECKING @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ unsigned int th, bool check_profitability) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); tree niters_of_prolog_loop, ni_name; tree n_iters; tree wide_prolog_niters; @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ /* Peel the prolog loop and iterate it niters_of_prolog_loop. 
*/ new_loop = - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, + loop_preheader_edge (loop), &niters_of_prolog_loop, ni_name, true, th, check_profitability, NULL_TREE, NULL, - bound, - 0); + bound, 0); gcc_assert (new_loop); #ifdef ENABLE_CHECKING @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop unsigned int th, bool check_profitability) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); basic_block condition_bb; gimple_stmt_iterator gsi, cond_exp_gsi; basic_block merge_bb; @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); initialize_original_copy_tables (); - loop_version (loop, cond_expr, &condition_bb, - prob, prob, REG_BR_PROB_BASE - prob, true); + if (scalar_loop) + { + edge scalar_e; + basic_block preheader, scalar_preheader; + + /* We don't want to scale SCALAR_LOOP's frequencies, we need to + scale LOOP's frequencies instead. */ + loop_version (scalar_loop, cond_expr, &condition_bb, + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); + /* CONDITION_BB was created above SCALAR_LOOP's preheader, + while we need to move it above LOOP's preheader. 
*/ + e = loop_preheader_edge (loop); + scalar_e = loop_preheader_edge (scalar_loop); + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) + && gimple_seq_empty_p (phi_nodes (e->src)) + && single_pred_p (e->src)); + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) + && single_pred_p (scalar_e->src)); + gcc_assert (single_pred_p (condition_bb)); + preheader = e->src; + scalar_preheader = scalar_e->src; + scalar_e = find_edge (condition_bb, scalar_preheader); + e = single_pred_edge (preheader); + redirect_edge_and_branch_force (single_pred_edge (condition_bb), + scalar_preheader); + redirect_edge_and_branch_force (scalar_e, preheader); + redirect_edge_and_branch_force (e, condition_bb); + set_immediate_dominator (CDI_DOMINATORS, condition_bb, + single_pred (condition_bb)); + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, + single_pred (scalar_preheader)); + set_immediate_dominator (CDI_DOMINATORS, preheader, + condition_bb); + } + else + loop_version (loop, cond_expr, &condition_bb, + prob, prob, REG_BR_PROB_BASE - prob, true); if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC && dump_enabled_p ()) @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop basic block (i.e. it has two predecessors). Just in order to simplify following transformations in the vectorizer, we fix this situation here by adding a new (empty) block on the exit-edge of the loop, - with the proper loop-exit phis to maintain loop-closed-form. */ + with the proper loop-exit phis to maintain loop-closed-form. + If loop versioning wasn't done from loop, but scalar_loop instead, + merge_bb will have already just a single successor. 
*/ merge_bb = single_exit (loop)->dest; - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); - new_exit_bb = split_edge (single_exit (loop)); - new_exit_e = single_exit (loop); - e = EDGE_SUCC (new_exit_bb, 0); - - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) { - tree new_res; - orig_phi = gsi_stmt (gsi); - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); - new_phi = create_phi_node (new_res, new_exit_bb); - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); - add_phi_arg (new_phi, arg, new_exit_e, - gimple_phi_arg_location_from_edge (orig_phi, e)); - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); + new_exit_bb = split_edge (single_exit (loop)); + new_exit_e = single_exit (loop); + e = EDGE_SUCC (new_exit_bb, 0); + + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + tree new_res; + orig_phi = gsi_stmt (gsi); + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); + new_phi = create_phi_node (new_res, new_exit_bb); + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); + add_phi_arg (new_phi, arg, new_exit_e, + gimple_phi_arg_location_from_edge (orig_phi, e)); + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); + } } /* End loop-exit-fixes after versioning. */ --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo } +/* If LOOP has been versioned during ifcvt, return the internal call + guarding it. 
*/ + +static gimple +vect_loop_vectorized_call (struct loop *loop) +{ + basic_block bb = loop_preheader_edge (loop)->src; + gimple g; + do + { + g = last_stmt (bb); + if (g) + break; + if (!single_pred_p (bb)) + break; + bb = single_pred (bb); + } + while (1); + if (g && gimple_code (g) == GIMPLE_COND) + { + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gsi_prev (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (is_gimple_call (g) + && gimple_call_internal_p (g) + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) + return g; + } + } + return NULL; +} + + /* Function vectorize_loops. Entry point to loop vectorization phase. */ @@ -320,6 +357,8 @@ vectorize_loops (void) struct loop *loop; hash_table <simduid_to_vf> simduid_to_vf_htab; hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; + bool any_ifcvt_loops = false; + unsigned ret = 0; vect_loops_num = number_of_loops (cfun); @@ -342,8 +381,11 @@ vectorize_loops (void) than all previously defined loops. This fact allows us to run only over initial loops skipping newly generated ones. 
*/ FOR_EACH_LOOP (li, loop, 0) - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) - || loop->force_vect) + if (loop->dont_vectorize) + any_ifcvt_loops = true; + else if ((flag_tree_loop_vectorize + && optimize_loop_nest_for_speed_p (loop)) + || loop->force_vect) { loop_vec_info loop_vinfo; vect_location = find_loop_location (loop); @@ -361,6 +403,38 @@ vectorize_loops (void) if (!dbg_cnt (vect_loop)) break; + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); + if (loop_vectorized_call) + { + tree arg = gimple_call_arg (loop_vectorized_call, 1); + basic_block *bbs; + unsigned int i; + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); + + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; + gcc_checking_assert (vect_loop_vectorized_call + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) + == loop_vectorized_call); + bbs = get_loop_body (scalar_loop); + for (i = 0; i < scalar_loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple phi = gsi_stmt (gsi); + gimple_set_uid (phi, 0); + } + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + gimple_set_uid (stmt, 0); + } + } + free (bbs); + } if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC && dump_enabled_p ()) dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, @@ -381,6 +455,25 @@ vectorize_loops (void) *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) = simduid_to_vf_data; } + + if (loop_vectorized_call) + { + gimple g = loop_vectorized_call; + tree lhs = gimple_call_lhs (g); + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); + gsi_next (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (gimple_code (g) == GIMPLE_COND + && gimple_cond_lhs (g) == lhs) + { + gimple_cond_set_lhs (g, boolean_true_node); + update_stmt (g); + } + } + } } 
vect_location = UNKNOWN_LOC; @@ -394,6 +487,34 @@ vectorize_loops (void) /* ----------- Finalize. ----------- */ + if (any_ifcvt_loops) + for (i = 1; i < vect_loops_num; i++) + { + loop = get_loop (cfun, i); + if (loop && loop->dont_vectorize) + { + gimple g = vect_loop_vectorized_call (loop); + if (g) + { + tree lhs = gimple_call_lhs (g); + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); + gsi_next (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (gimple_code (g) == GIMPLE_COND + && gimple_cond_lhs (g) == lhs) + { + gimple_cond_set_lhs (g, boolean_false_node); + update_stmt (g); + } + } + ret = TODO_cleanup_cfg; + } + } + } + for (i = 1; i < vect_loops_num; i++) { loop_vec_info loop_vinfo; @@ -451,7 +572,7 @@ vectorize_loops (void) return TODO_cleanup_cfg; } - return 0; + return ret; } --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { fix it up. */ bool operands_swapped; + /* If if-conversion versioned this loop before conversion, this is the + loop version without if-conversion. */ + struct loop *scalar_loop; + } *loop_vec_info; /* Access Functions. */ @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ (L)->may_misalign_stmts.length () > 0 @@ -899,7 +904,8 @@ extern LOC vect_location; in tree-vect-loop-manip.c. 
*/ extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, + struct loop *, edge); extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, unsigned int, bool); --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo /* True if we should try harder to vectorize this loop. */ bool force_vect; + /* True if this loop should never be vectorized. */ + bool dont_vectorize; + /* For SIMD loops, this is a unique identifier of the loop, referenced by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE builtins. */ --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) edge preheader = loop_preheader_edge (loop); initialize_original_copy_tables (); - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); gcc_assert (res != NULL); free_original_copy_tables (); delete_update_ssa (); --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt gcc_unreachable (); } +/* This should get folded in tree-vectorizer.c. */ + +static void +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + /* Routines to expand each internal function, indexed by function number. 
Each routine has the prototype: --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) ifc_bbs = NULL; } +static bool +version_loop_for_if_conversion (struct loop *loop) +{ + basic_block cond_bb; + tree cond = make_ssa_name (boolean_type_node, NULL); + struct loop *new_loop; + gimple g; + gimple_stmt_iterator gsi; + void **aux = XNEWVEC (void *, loop->num_nodes); + unsigned int i; + + /* We have data stored in bb->aux, but loop_version also + uses it, so save it temporarily and restore after loop_version. */ + for (i = 0; i < loop->num_nodes; i++) + { + aux[i] = ifc_bbs[i]->aux; + ifc_bbs[i]->aux = NULL; + } + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, + build_int_cst (integer_type_node, loop->num), + integer_zero_node); + gimple_call_set_lhs (g, cond); + + initialize_original_copy_tables (); + new_loop = loop_version (loop, cond, &cond_bb, + REG_BR_PROB_BASE, REG_BR_PROB_BASE, + REG_BR_PROB_BASE, true); + free_original_copy_tables (); + for (i = 0; i < loop->num_nodes; i++) + ifc_bbs[i]->aux = aux[i]; + XDELETEVEC (aux); + if (new_loop == NULL) + return false; + new_loop->dont_vectorize = true; + new_loop->force_vect = false; + gsi = gsi_last_bb (cond_bb); + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + update_ssa (TODO_update_ssa); + return true; +} + /* If-convert LOOP when it is legal. For the moment this pass has no profitability analysis. Returns true when something changed. 
*/ @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) bool changed = false; ifc_bbs = NULL; + if (loop->dont_vectorize) + goto cleanup; + if (!if_convertible_loop_p (loop) || !dbg_cnt (if_conversion_tree)) goto cleanup; + if ((flag_tree_loop_vectorize || loop->force_vect) + && flag_tree_loop_if_convert == -1 + && !version_loop_for_if_conversion (loop)) + goto cleanup; + /* Now all statements are if-convertible. Combine all the basic blocks into one huge basic block doing the if-conversion on-the-fly. */ --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 14:01:07.877814190 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 14:02:29.302414970 +0200 @@ -0,0 +1,116 @@ +#include "tree-vect.h" + +#define N 1024 +typedef int V __attribute__((vector_size (4))); +unsigned int a[N * 2] __attribute__((aligned)); +unsigned int b[N * 2] __attribute__((aligned)); +V c[N]; + +__attribute__((noinline, noclone)) unsigned int +foo (unsigned int *a, unsigned int *b) +{ + int i; + unsigned int r = 0; + for (i = 0; i < N; i++) + { + unsigned int x = a[i], y = b[i]; + if (x < 32) + { + x = x + 127; + y = y * 2; + } + else + { + x = x - 16; + y = y + 1; + } + a[i] = x; + b[i] = y; + r += x; + } + return r; +} + +__attribute__((noinline, noclone)) unsigned int +bar (unsigned int *a, unsigned int *b) +{ + int i; + unsigned int r = 0; + for (i = 0; i < N; i++) + { + unsigned int x = a[i], y = b[i]; + if (x < 32) + { + x = x + 127; + y = y * 2; + } + else + { + x = x - 16; + y = y + 1; + } + a[i] = x; + b[i] = y; + c[i] = c[i] + 1; + r += x; + } + return r; +} + +void +baz (unsigned int *a, unsigned int *b, + unsigned int (*fn) (unsigned int *, unsigned int *)) +{ + int i; + for (i = -64; i < 0; i++) + { + a[i] = 19; + b[i] = 17; + } + for (; i < N; i++) + { + a[i] = i - 512; + b[i] = i; + } + for (; i < N + 64; i++) + { + a[i] = 27; + b[i] = 19; + } + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) + __builtin_abort (); + for (i = -64; i < 0; i++) + if 
(a[i] != 19 || b[i] != 17) + __builtin_abort (); + for (; i < N; i++) + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) + || b[i] != (i - 512U < 32U ? i * 2U : i + 1U)) + __builtin_abort (); + for (; i < N + 64; i++) + if (a[i] != 27 || b[i] != 19) + __builtin_abort (); +} + +int +main () +{ + int i; + check_vect (); + baz (a + 512, b + 512, foo); + baz (a + 512, b + 512, bar); + baz (a + 512 + 1, b + 512 + 1, foo); + baz (a + 512 + 1, b + 512 + 1, bar); + baz (a + 512 + 31, b + 512 + 31, foo); + baz (a + 512 + 31, b + 512 + 31, bar); + baz (a + 512 + 1, b + 512, foo); + baz (a + 512 + 1, b + 512, bar); + baz (a + 512 + 31, b + 512, foo); + baz (a + 512 + 31, b + 512, bar); + baz (a + 512, b + 512 + 1, foo); + baz (a + 512, b + 512 + 1, bar); + baz (a + 512, b + 512 + 31, foo); + baz (a + 512, b + 512 + 31, bar); + return 0; +} + +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 14:38:40.000000000 +0200 +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 13:19:21.704256653 +0200 @@ -1,4 +1,5 @@ /* { dg-require-effective-target vect_condition } */ +/* { dg-additional-options "-ftree-loop-if-convert" } */ #include "tree-vect.h" --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 14:38:40.000000000 +0200 +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 13:19:35.678195952 +0200 @@ -1,4 +1,5 @@ /* { dg-require-effective-target vect_condition } */ +/* { dg-additional-options "-ftree-loop-if-convert" } */ #include "tree-vect.h" Jakub