Richard Biener <[email protected]> writes:
> The following removes late deciding to elide vectorized epilogues to
> the analysis phase and also avoids altering the epilogues niter.
> The costing part from vect_determine_partial_vectors_and_peeling is
> moved to vect_analyze_loop_costing where we use the main loop
> analysis to constrain the epilogue scalar iterations.
>
> I have not tried to integrate this with vect_known_niters_smaller_than_vf.
>
> It seems the for_epilogue_p parameter in
> vect_determine_partial_vectors_and_peeling is largely useless and
> we could compute that in the function itself.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
>
> I suppose testing on aarch64 would be nice-to-have - any takers?
Sorry, ran this earlier today and then forgot about it. And yeah,
it passes bootstrap & regtest on aarch64-linux-gnu (all languages).
LGTM FWIW, except:
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 0a03f56aae7..f39a1ecb306 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2144,14 +2144,76 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
>
> /* Only loops that can handle partially-populated vectors can have
> iteration
> counts less than the vectorization factor. */
> - if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> + && vect_known_niters_smaller_than_vf (loop_vinfo))
> {
> - if (vect_known_niters_smaller_than_vf (loop_vinfo))
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "not vectorized: iteration count smaller than "
> + "vectorization factor.\n");
> + return 0;
> + }
> +
> + /* If we know the number of iterations we can do better, for the
> + epilogue we can also decide whether the main loop leaves us
> + with enough iterations, prefering a smaller vector epilog then
> + also possibly used for the case we skip the vector loop. */
> + if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
> + {
> + widest_int scalar_niters
> + = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
> + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> + {
> + loop_vec_info orig_loop_vinfo
> + = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> + unsigned lowest_vf
> + = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
> + int prolog_peeling = 0;
> + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
> + prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
> + if (prolog_peeling >= 0
> + && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
> + lowest_vf))
> + {
> + unsigned gap
> + = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
> + scalar_niters = ((scalar_niters - gap - prolog_peeling)
> + % lowest_vf + gap);
Are you sure we want this + gap? A vectorised epilogue can't handle the
gap either, at least for things that use (say) the first vector of LD2
and ignore the second vector.
Thanks,
Richard
> + if (scalar_niters == 0)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "not vectorized: loop never entered\n");
> + return 0;
> + }
> + }
> + }
> +
> + /* Check that the loop processes at least one full vector. */
> + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> + if (known_lt (scalar_niters, vf))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "not vectorized: iteration count smaller than "
> - "vectorization factor.\n");
> + "loop does not have enough iterations "
> + "to support vectorization.\n");
> + return 0;
> + }
> +
> + /* If we need to peel an extra epilogue iteration to handle data
> + accesses with gaps, check that there are enough scalar iterations
> + available.
> +
> + The check above is redundant with this one when peeling for gaps,
> + but the distinction is useful for diagnostics. */
> + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> + && known_le (scalar_niters, vf))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "loop does not have enough iterations "
> + "to support peeling for gaps.\n");
> return 0;
> }
> }
> @@ -2502,31 +2564,6 @@ vect_determine_partial_vectors_and_peeling
> (loop_vec_info loop_vinfo,
> LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
> }
>
> - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> - && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> - {
> - /* Check that the loop processes at least one full vector. */
> - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> - tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
> - if (known_lt (wi::to_widest (scalar_niters), vf))
> - return opt_result::failure_at (vect_location,
> - "loop does not have enough iterations"
> - " to support vectorization.\n");
> -
> - /* If we need to peel an extra epilogue iteration to handle data
> - accesses with gaps, check that there are enough scalar iterations
> - available.
> -
> - The check above is redundant with this one when peeling for gaps,
> - but the distinction is useful for diagnostics. */
> - tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
> - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> - && known_lt (wi::to_widest (scalar_nitersm1), vf))
> - return opt_result::failure_at (vect_location,
> - "loop does not have enough iterations"
> - " to support peeling for gaps.\n");
> - }
> -
> LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> && need_peeling_or_partial_vectors_p);
> @@ -3002,7 +3039,8 @@ start_over:
> assuming that the loop will be used as a main loop. We will redo
> this analysis later if we instead decide to use the loop as an
> epilogue loop. */
> - ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
> + ok = vect_determine_partial_vectors_and_peeling
> + (loop_vinfo, LOOP_VINFO_EPILOGUE_P (loop_vinfo));
> if (!ok)
> return ok;