Hi, This patch introduces the changes required to run the vectorizer on a loop epilogue. It also enables epilogue vectorization using a smaller vector size.
Thanks, Ilya -- gcc/ 2016-05-19 Ilya Enkovich <ilya.enkov...@intel.com> * tree-if-conv.c (tree_if_conversion): Make public. * tree-if-conv.h: New file. * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't try to enhance alignment for epilogues. * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return created loop. * tree-vect-loop.c: include tree-if-conv.h. (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in loop->aux. (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset loop->aux. (vect_analyze_loop): Reset loop->aux. (vect_transform_loop): Check if created epilogue should be returned for further vectorization. If-convert epilogue if required. * tree-vectorizer.c (vectorize_loops): Add a queue of loops to process and insert vectorized loop epilogues into this queue. * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return created loop. (vect_transform_loop): Return created loop. diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c index c38e21b..41b6c99 100644 --- a/gcc/tree-if-conv.c +++ b/gcc/tree-if-conv.c @@ -2801,7 +2801,7 @@ ifcvt_local_dce (basic_block bb) profitability analysis. Returns non-zero todo flags when something changed. */ -static unsigned int +unsigned int tree_if_conversion (struct loop *loop) { unsigned int todo = 0; diff --git a/gcc/tree-if-conv.h b/gcc/tree-if-conv.h new file mode 100644 index 0000000..3a732c2 --- /dev/null +++ b/gcc/tree-if-conv.h @@ -0,0 +1,24 @@ +/* Copyright (C) 2016 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef GCC_TREE_IF_CONV_H +#define GCC_TREE_IF_CONV_H + +unsigned int tree_if_conversion (struct loop *); + +#endif /* GCC_TREE_IF_CONV_H */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 7652e21..f275933 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -1595,7 +1595,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) /* Check if we can possibly peel the loop. */ if (!vect_can_advance_ivs_p (loop_vinfo) || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) - || loop->inner) + || loop->inner + /* Required peeling was performed in prologue and + is not required for epilogue. */ + || LOOP_VINFO_EPILOGUE_P (loop_vinfo)) do_peeling = false; if (do_peeling @@ -1875,7 +1878,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) do_versioning = optimize_loop_nest_for_speed_p (loop) - && (!loop->inner); /* FORNOW */ + && (!loop->inner) /* FORNOW */ + /* Required versioning was performed for the + original loop and is not required for epilogue. */ + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo); if (do_versioning) { diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 7ec6dae..fab5879 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -1742,9 +1742,11 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). COND_EXPR and COND_EXPR_STMT_LIST are combined with a new generated - test. */ + test. -void + Return created loop. 
*/ + +struct loop * vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree ni_name, tree ratio_mult_vf_name, unsigned int th, bool check_profitability) @@ -1812,6 +1814,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, scev_reset (); free_original_copy_tables (); + + return new_loop; } diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index aac0df9..a537ef4 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-vectorizer.h" #include "gimple-fold.h" #include "cgraph.h" +#include "tree-if-conv.h" /* Loop Vectorization Pass. @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); loop_vinfo->scalar_cost_vec.release (); + loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); free (loop_vinfo); - loop->aux = NULL; } @@ -1499,13 +1500,24 @@ vect_analyze_loop_form (struct loop *loop) if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1, &number_of_iterations, &inner_loop_cond)) - return NULL; + { + loop->aux = NULL; + return NULL; + } loop_vec_info loop_vinfo = new_loop_vec_info (loop); LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; + /* For epilogues we want to vectorize aux holds + loop_vec_info of the original loop. 
*/ + if (loop->aux) + { + gcc_assert (LOOP_VINFO_VECTORIZABLE_P ((loop_vec_info)loop->aux)); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = (loop_vec_info)loop->aux; + } + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { if (dump_enabled_p ()) @@ -1522,7 +1534,6 @@ vect_analyze_loop_form (struct loop *loop) STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond)) = loop_exit_ctrl_vec_info_type; - gcc_assert (!loop->aux); loop->aux = loop_vinfo; return loop_vinfo; } @@ -2280,7 +2291,10 @@ vect_analyze_loop (struct loop *loop) if (fatal || vector_sizes == 0 || current_vector_size == 0) - return NULL; + { + loop->aux = NULL; + return NULL; + } /* Try the next biggest vector size. */ current_vector_size = 1 << floor_log2 (vector_sizes); @@ -6576,10 +6590,11 @@ vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, Vectorize the loop - created vectorized stmts to replace the scalar stmts in the loop, and update the loop exit condition. */ -void +struct loop * vect_transform_loop (loop_vec_info loop_vinfo) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *epilogue = NULL; basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; int i; @@ -6661,8 +6676,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) ni_name = vect_build_loop_niters (loop_vinfo); vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf, &ratio); - vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf, - th, check_profitability); + epilogue = vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, + ratio_mult_vf, th, + check_profitability); } else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), @@ -6959,6 +6975,64 @@ vect_transform_loop (loop_vec_info loop_vinfo) FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) vect_free_slp_instance (instance); LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); + + /* Don't vectorize epilogue for epilogue. 
*/ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + epilogue = NULL; + /* Scalar epilogue is not vectorized in case + we use combined vector epilogue. */ + else if (LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo)) + epilogue = NULL; + /* FORNOW: Currently alias checks are not inherited for epilogues. + Don't try to vectorize epilogue because it will require + additional alias checks. */ + else if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) + epilogue = NULL; + + if (epilogue) + { + if (!LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)) + { + unsigned int vector_sizes + = targetm.vectorize.autovectorize_vector_sizes (); + vector_sizes &= current_vector_size - 1; + + if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_NOMASK)) + epilogue = NULL; + else if (!vector_sizes) + epilogue = NULL; + else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) + { + int smallest_vec_size = 1 << ctz_hwi (vector_sizes); + int ratio = current_vector_size / smallest_vec_size; + int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo) + - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + eiters = eiters % vectorization_factor; + + epilogue->nb_iterations_upper_bound = eiters - 1; + + if (eiters < vectorization_factor / ratio) + epilogue = NULL; + } + } + } + + if (epilogue) + { + epilogue->force_vectorize = loop->force_vectorize; + epilogue->safelen = loop->safelen; + epilogue->dont_vectorize = false; + + /* We may need to if-convert epilogue to vectorize it. 
*/ + if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) + tree_if_conversion (epilogue); + + gcc_assert (!epilogue->aux); + epilogue->aux = loop_vinfo; + } + + return epilogue; } /* The code below is trying to perform simple optimization - revert diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 2b25b45..5f15246 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -491,14 +491,16 @@ vectorize_loops (void) { unsigned int i; unsigned int num_vectorized_loops = 0; - unsigned int vect_loops_num; + unsigned int vect_loops_num = number_of_loops (cfun); struct loop *loop; hash_table<simduid_to_vf> *simduid_to_vf_htab = NULL; hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL; bool any_ifcvt_loops = false; unsigned ret = 0; + auto_vec<unsigned int> loops (vect_loops_num); - vect_loops_num = number_of_loops (cfun); + FOR_EACH_LOOP (loop, 0) + loops.quick_push (loop->num); /* Bail out if there are no loops. */ if (vect_loops_num <= 1) @@ -514,14 +516,18 @@ vectorize_loops (void) /* If some loop was duplicated, it gets bigger number than all previously defined loops. This fact allows us to run only over initial loops skipping newly generated ones. 
*/ - FOR_EACH_LOOP (loop, 0) - if (loop->dont_vectorize) + for (i = 0; i < loops.length (); i++) + if (!(loop = get_loop (cfun, loops[i]))) + continue; + else if (loop->dont_vectorize) any_ifcvt_loops = true; else if ((flag_tree_loop_vectorize - && optimize_loop_nest_for_speed_p (loop)) + && (optimize_loop_nest_for_speed_p (loop) + || loop->aux)) || loop->force_vectorize) { loop_vec_info loop_vinfo; + struct loop *new_loop; vect_location = find_loop_location (loop); if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION && dump_enabled_p ()) @@ -551,12 +557,21 @@ vectorize_loops (void) && dump_enabled_p ()) dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "loop vectorized\n"); - vect_transform_loop (loop_vinfo); + new_loop = vect_transform_loop (loop_vinfo); num_vectorized_loops++; /* Now that the loop has been vectorized, allow it to be unrolled etc. */ loop->force_vectorize = false; + /* Add new loop to a processing queue. To make it easier + to match loop and its epilogue vectorization in dumps + put new loop as the next loop to process. 
*/ + if (new_loop) + { + loops.safe_insert (i + 1, new_loop->num); + vect_loops_num = number_of_loops (cfun); + } + if (loop->simduid) { simduid_to_vf *simduid_to_vf_data = XNEW (simduid_to_vf); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 4c19317..b269752 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -984,8 +984,8 @@ extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, struct loop *, edge); extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); -extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree, - unsigned int, bool); +extern struct loop *vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree, + unsigned int, bool); extern void vect_do_peeling_for_alignment (loop_vec_info, tree, unsigned int, bool); extern source_location find_loop_location (struct loop *); @@ -1101,7 +1101,7 @@ extern gimple *vect_force_simple_reduction (loop_vec_info, gimple *, bool, /* Drive for loop analysis stage. */ extern loop_vec_info vect_analyze_loop (struct loop *); /* Drive for loop transformation stage. */ -extern void vect_transform_loop (loop_vec_info); +extern struct loop *vect_transform_loop (loop_vec_info); extern loop_vec_info vect_analyze_loop_form (struct loop *); extern bool vectorizable_live_operation (gimple *, gimple_stmt_iterator *, gimple **);