Thanks for the review.
Richard Biener <[email protected]> writes:
>> @@ -588,6 +600,23 @@ public:
>> /* Unrolling factor */
>> poly_uint64 vectorization_factor;
>>
>> + /* If this loop is an epilogue loop whose main loop can be skipped,
>> + MAIN_LOOP_EDGE is the edge from the main loop to this loop's
>> + preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the
>> + main loop and goes straight to this loop's preheader.
>> +
>> + Both fields are null otherwise. */
>> + edge main_loop_edge;
>> + edge skip_main_loop_edge;
>> +
>> + /* If this loop is an epilogue loop that might be skipped after executing
>> + the main loop, this edge is the one that skips the epilogue. */
>> + edge skip_this_loop_edge;
>> +
>> + /* After vectorization, maps live-out SSA names to information about
>> + the reductions that generated them. */
>> + hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
>
> Is that the LC PHI node defs or the definition inside of the loop?
> If the latter we could attach the info directly to its stmt-info?
Ah, yeah, I should improve the comment there. It's the vectoriser's
replacement for the original LC PHI node, i.e. the final scalar result
after the reduction has taken place.
>> @@ -1186,6 +1215,21 @@ public:
>> /* The vector type for performing the actual reduction. */
>> tree reduc_vectype;
>>
>> + /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> + elements in parallel, this vector gives the initial values of these
>> + N elements. */
>
> That's N scalar elements or N vector elements? I suppose it's for
> SLP reductions (rather than SLP reduction chains) and never non-SLP
> reductions?
Yeah, poor wording again, sorry. I meant something closer to:
/* If IS_REDUC_INFO is true and if the vector code is performing
N scalar reductions in parallel, this vector gives the initial
scalar values of those N reductions. */
>> + vec<tree> reduc_initial_values;
>> +
>> + /* If IS_REDUC_INFO is true and if the reduction is operating on N
>> + elements in parallel, this vector gives the scalar result of each
>> + reduction. */
>> + vec<tree> reduc_scalar_results;
Same change here.
>> […]
>> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
>> index 2909e8a0fc3..b7b0523e3c8 100644
>> --- a/gcc/tree-vect-loop-manip.c
>> +++ b/gcc/tree-vect-loop-manip.c
>> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info
>> epilogue_vinfo,
>> return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
>> }
>>
>> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit
>> + from the corresponding main loop. Return a value that is available in
>> + LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped.
>> + Passing a null SKIP_VALUE is equivalent to passing zero. */
>> +
>> +tree
>> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
>> + tree skip_value)
>> +{
>> + if (!loop_vinfo->main_loop_edge)
>> + return main_loop_value;
>> +
>> + if (!skip_value)
>> + skip_value = build_zero_cst (TREE_TYPE (main_loop_value));
>
> shouldn't that be the initial value?
For the current use case, the above two conditions are never true.
I wrote it like this because I had a follow-on patch (which might
not go anywhere) that needed this function for 0-based IVs.
Maybe that's a bad risk/reward trade-off though. Not having to pass
zero makes things only slightly simpler for the follow-on patch,
and I guess could be dangerous in other cases.
Perhaps in that case though I should change loop_vinfo->main_loop_edge
into a gcc_assert as well.
>> + tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
>> + basic_block bb = loop_vinfo->main_loop_edge->dest;
>> + gphi *new_phi = create_phi_node (phi_result, bb);
>> + add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
>> + UNKNOWN_LOCATION);
>> + add_phi_arg (new_phi, skip_value,
>> + loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
>> + return phi_result;
>> +}
>> +
>> /* Function vect_do_peeling.
>>
>> Input:
>> […]
>> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info
>> stmt_info)
>> return stmt_info;
>> }
>>
>> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using
>> vector
>> + type VECTYPE. See if LOOP_VINFO is an epilogue loop whose main loop had
>> a
>> + matching reduction that we can build on. Adjust REDUC_INFO and return
>> true
>> + if so, otherwise return false. */
>> +
>> +static bool
>> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
>> + stmt_vec_info reduc_info)
>> +{
>> + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>> + if (!main_loop_vinfo)
>> + return false;
>> +
>> + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
>> + return false;
>> +
>> + unsigned int num_phis = reduc_info->reduc_initial_values.length ();
>> + auto_vec<tree, 16> main_loop_results (num_phis);
>> + auto_vec<tree, 16> initial_values (num_phis);
>> + if (edge main_loop_edge = loop_vinfo->main_loop_edge)
>> + {
>> + /* The epilogue loop can be entered either from the main loop or
>> + from an earlier guard block. */
>> + edge skip_edge = loop_vinfo->skip_main_loop_edge;
>> + for (tree incoming_value : reduc_info->reduc_initial_values)
>> + {
>> + /* Look for:
>> +
>> + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
>> + INITIAL_VALUE(guard block)>. */
>> + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
>> +
>> + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
>> + gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
>> +
>> + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
>> + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
>> +
>> + main_loop_results.quick_push (from_main_loop);
>> + initial_values.quick_push (from_skip);
>> + }
>> + }
>> + else
>> + /* The main loop dominates the epilogue loop. */
>> + main_loop_results.splice (reduc_info->reduc_initial_values);
>> +
>> + /* See if the main loop has the kind of accumulator we need. */
>> + vect_reusable_accumulator *accumulator
>> + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
>> + if (!accumulator
>> + || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
>> + || !std::equal (main_loop_results.begin (), main_loop_results.end (),
>> + accumulator->reduc_info->reduc_scalar_results.begin
>> ()))
>> + return false;
>> +
>> + /* For now, only handle the case in which both loops are operating on the
>> + same vector types. In future we could reduce wider vectors to narrower
>> + ones as well. */
>> + tree vectype = STMT_VINFO_VECTYPE (reduc_info);
>> + tree old_vectype = TREE_TYPE (accumulator->reduc_input);
>> + if (!useless_type_conversion_p (old_vectype, vectype))
>
> It should be indeed quite trivial to handle, likewise the case where we
> have multiple PHIs - just reduce to a single input vector and have the
> possibly multiple input vectors in the epilogue filled with neutral
> elements. I'll see if I can cook up stuff for this next week.
Yeah, agreed. The multi-vector epilogue case should be especially easy
to handle, but it's not interesting for SVE as things stand, since:
(a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature
IMO -- on targets with wide pipelines we want exactly the opposite)
(b) SLP reductions are limited to single vectors for variable-length targets.
So it wasn't possible to trigger multiple epilogue vectors for the
motivating SVE use case.
>> […]
>> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info
>> loop_vinfo,
>> reduc_inputs.safe_push (single_input);
>> }
>>
>> + tree orig_reduc_input = reduc_inputs[0];
>> +
>> + /* If this loop is an epilogue loop that can be skipped after the
>> + main loop, we can only share a reduction operation between the
>> + main loop and the epilogue if we put it at the target of the
>> + skip edge.
>
> Do you have a testcase where we cannot do this?
No, it's being defensive. I wasn't sure how the epilogue code would
evolve in future.
>> + We can still reuse accumulators if this check fails. Doing so has
>> + the minor(?) benefit of making the epilogue loop's scalar result
>> + independent of the main loop's scalar result. */
>> + bool unify_with_main_loop_p = false;
>> + if (reduc_info->reused_accumulator
>> + && loop_vinfo->skip_this_loop_edge
>> + && single_succ_p (exit_bb)
>> + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
>> + {
>> + unify_with_main_loop_p = true;
>> +
>> + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
>> + reduc_inputs[0] = make_ssa_name (vectype);
>> + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
>> + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
>> + UNKNOWN_LOCATION);
>> + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
>> + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
>> + exit_gsi = gsi_after_labels (reduc_block);
>> + }
>> +
>> + /* Shouldn't be used beyond this point. */
>> + exit_bb = nullptr;
>> +
>> if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
>> && reduc_fn != IFN_LAST)
>> {
>> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info
>> loop_vinfo,
>> scalar_results[0] = new_temp;
>> }
>>
>> + /* Record this operation if it could be reused by the epilogue loop. */
>> + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
>> + && !double_reduc)
>
> what's the issue with double_reduc?
Probably nothing TBH. I haven't been able to construct a case that
uses predicated double reductions with vect-partial-vector-usage=1,
but that's probably a missed optimisation.
There again, double reductions themselves seem to be hard to trigger
now that we have loop interchange. Is there a good way of testing
them without -fno-loop-interchange?
Thanks,
Richard
>> + loop_vinfo->reusable_accumulators.put (scalar_results[0],
>> + { orig_reduc_input, reduc_info });
>> +
>> if (double_reduc)
>> loop = outer_loop;
>>