This patch adds a mode in which the vectoriser tries each available base vector mode and picks the one with the lowest cost. For now the behaviour is behind a default-off --param, but a later patch enables it by default for SVE.
The patch keeps the current behaviour of preferring a VF of loop->simdlen over any larger or smaller VF, regardless of costs or target preferences. 2019-11-05 Richard Sandiford <richard.sandif...@arm.com> gcc/ * params.def (vect-compare-loop-costs): New param. * doc/invoke.texi: Document it. * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) (_loop_vec_info::vec_inside_cost): New member variables. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. (vect_analyze_loop): When the new parameter allows, try vectorizing the loop with each available vector mode and picking the one with the lowest cost. (vect_estimate_min_profitable_iters): Record the computed costs in the loop_vec_info. Index: gcc/params.def =================================================================== --- gcc/params.def 2019-10-31 17:15:25.470517368 +0000 +++ gcc/params.def 2019-11-05 14:19:58.781197820 +0000 @@ -661,6 +661,13 @@ DEFPARAM(PARAM_VECT_MAX_PEELING_FOR_ALIG "Maximum number of loop peels to enhance alignment of data references in a loop.", -1, -1, 64) +DEFPARAM(PARAM_VECT_COMPARE_LOOP_COSTS, + "vect-compare-loop-costs", + "Whether to try vectorizing a loop using each supported" + " combination of vector types and picking the version with the" + " lowest cost.", + 0, 0, 1) + DEFPARAM(PARAM_MAX_CSELIB_MEMORY_LOCATIONS, "max-cselib-memory-locations", "The maximum memory locations recorded by cselib.", Index: gcc/doc/invoke.texi =================================================================== --- gcc/doc/invoke.texi 2019-11-04 21:13:57.611756365 +0000 +++ gcc/doc/invoke.texi 2019-11-05 14:19:58.777197850 +0000 @@ -11563,6 +11563,12 @@ doing loop versioning for alias in the v The maximum number of loop peels to enhance access alignment for vectorizer. Value -1 means no limit. +@item vect-compare-loop-costs +Whether to try vectorizing a loop using each supported combination of +vector types and picking the version with the lowest cost. This parameter +has no effect when @option{-fno-vect-cost-model} or +@option{-fvect-cost-model=unlimited} are used. + @item max-iterations-to-track The maximum number of iterations of a loop the brute-force algorithm for analysis of the number of iterations of the loop tries to evaluate. Index: gcc/tree-vectorizer.h =================================================================== --- gcc/tree-vectorizer.h 2019-11-05 14:19:33.829371745 +0000 +++ gcc/tree-vectorizer.h 2019-11-05 14:19:58.781197820 +0000 @@ -601,6 +601,13 @@ typedef class _loop_vec_info : public ve /* Cost of a single scalar iteration. */ int single_scalar_iteration_cost; + /* The cost of the vector prologue and epilogue, including peeled + iterations and set-up code. */ + int vec_outside_cost; + + /* The cost of the vector loop body. */ + int vec_inside_cost; + /* Is the loop vectorizable? */ bool vectorizable; Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c 2019-11-05 14:19:33.829371745 +0000 +++ gcc/tree-vect-loop.c 2019-11-05 14:19:58.781197820 +0000 @@ -830,6 +830,8 @@ _loop_vec_info::_loop_vec_info (class lo scan_map (NULL), slp_unrolling_factor (1), single_scalar_iteration_cost (0), + vec_outside_cost (0), + vec_inside_cost (0), vectorizable (false), can_fully_mask_p (true), fully_masked_p (false), @@ -2373,6 +2375,80 @@ vect_analyze_loop_2 (loop_vec_info loop_ goto start_over; } +/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears + to be better than vectorizing it using OLD_LOOP_VINFO. Assume that + OLD_LOOP_VINFO is better unless something specifically indicates + otherwise. + + Note that this deliberately isn't a partial order. */ + +static bool +vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); + gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); + + poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); + poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); + + /* Always prefer a VF of loop->simdlen over any other VF. */ + if (loop->simdlen) + { + bool new_simdlen_p = known_eq (new_vf, loop->simdlen); + bool old_simdlen_p = known_eq (old_vf, loop->simdlen); + if (new_simdlen_p != old_simdlen_p) + return new_simdlen_p; + } + + /* Limit the VFs to what is likely to be the maximum number of iterations, + to handle cases in which at least one loop_vinfo is fully-masked. */ + HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + if (estimated_max_niter != -1) + { + if (known_le (estimated_max_niter, new_vf)) + new_vf = estimated_max_niter; + if (known_le (estimated_max_niter, old_vf)) + old_vf = estimated_max_niter; + } + + /* Check whether the (fractional) cost per scalar iteration is lower + or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ + poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost + * poly_widest_int (old_vf)); + poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost + * poly_widest_int (new_vf)); + if (maybe_lt (rel_old, rel_new)) + return false; + if (known_lt (rel_new, rel_old)) + return true; + + /* If there's nothing to choose between the loop bodies, see whether + there's a difference in the prologue and epilogue costs. */ + if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) + return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost; + + return false; +} + +/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return + true if we should. */ + +static bool +vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) + return false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Preferring vector mode %s to vector mode %s\n", + GET_MODE_NAME (new_loop_vinfo->vector_mode), + GET_MODE_NAME (old_loop_vinfo->vector_mode)); + return true; +} + /* Function vect_analyze_loop. Apply a set of analyses on LOOP, and create a loop_vec_info struct @@ -2408,6 +2484,8 @@ vect_analyze_loop (class loop *loop, vec machine_mode next_vector_mode = VOIDmode; poly_uint64 lowest_th = 0; unsigned vectorized_loops = 0; + bool pick_lowest_cost_p = (PARAM_VALUE (PARAM_VECT_COMPARE_LOOP_COSTS) + && !unlimited_cost_model (loop)); bool vect_epilogues = false; opt_result res = opt_result::success (); @@ -2428,6 +2506,34 @@ vect_analyze_loop (class loop *loop, vec bool fatal = false; + /* When pick_lowest_cost_p is true, we should in principle iterate + over all the loop_vec_infos that LOOP_VINFO could replace and + try to vectorize LOOP_VINFO under the same conditions. + E.g. when trying to replace an epilogue loop, we should vectorize + LOOP_VINFO as an epilogue loop with the same VF limit. When trying + to replace the main loop, we should vectorize LOOP_VINFO as a main + loop too. + + However, autovectorize_vector_modes is usually sorted as follows: + + - Modes that naturally produce lower VFs usually follow modes that + naturally produce higher VFs. + + - When modes naturally produce the same VF, maskable modes + usually follow unmaskable ones, so that the maskable mode + can be used to vectorize the epilogue of the unmaskable mode. + + This order is preferred because it leads to the maximum + epilogue vectorization opportunities. Targets should only use + a different order if they want to make wide modes available while + disparaging them relative to earlier, smaller modes. The assumption + in that case is that the wider modes are more expensive in some + way that isn't reflected directly in the costs. + + There should therefore be few interesting cases in which + LOOP_VINFO fails when treated as an epilogue loop, succeeds when + treated as a standalone loop, and ends up being genuinely cheaper + than FIRST_LOOP_VINFO. */ if (vect_epilogues) LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; @@ -2475,13 +2581,34 @@ vect_analyze_loop (class loop *loop, vec LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; simdlen = 0; } + else if (pick_lowest_cost_p && first_loop_vinfo) + { + /* Keep trying to roll back vectorization attempts while the + loop_vec_infos they produced were worse than this one. */ + vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos; + while (!vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) + { + gcc_assert (vect_epilogues); + delete vinfos.pop (); + } + if (vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) + { + delete first_loop_vinfo; + first_loop_vinfo = opt_loop_vec_info::success (NULL); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; + } + } if (first_loop_vinfo == NULL) { first_loop_vinfo = loop_vinfo; lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); } - else if (vect_epilogues) + else if (vect_epilogues + /* For now only allow one epilogue loop. */ + && first_loop_vinfo->epilogue_vinfos.is_empty ()) { first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); @@ -2501,12 +2628,14 @@ vect_analyze_loop (class loop *loop, vec && loop->inner == NULL && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK) && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) - /* For now only allow one epilogue loop. */ - && first_loop_vinfo->epilogue_vinfos.is_empty ()); + /* For now only allow one epilogue loop, but allow + pick_lowest_cost_p to replace it. */ + && (first_loop_vinfo->epilogue_vinfos.is_empty () + || pick_lowest_cost_p)); /* Commit to first_loop_vinfo if we have no reason to try alternatives. */ - if (!simdlen && !vect_epilogues) + if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) break; } else @@ -3454,7 +3583,11 @@ vect_estimate_min_profitable_iters (loop &vec_inside_cost, &vec_epilogue_cost); vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); - + + /* Stash the costs so that we can compare two loop_vec_infos. */ + loop_vinfo->vec_inside_cost = vec_inside_cost; + loop_vinfo->vec_outside_cost = vec_outside_cost; + if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");