This extends optimized reduction epilog handling to cover the trivial single-lane SLP reduction case.

	* tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow
	direct opcode and shift reduction also for SLP reductions
	with a single lane.
---
 gcc/tree-vect-loop.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 83c0544b6aa..31abfe047a4 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6500,7 +6500,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   /* 2.3 Create the reduction code, using one of the three schemes described
          above. In SLP we simply need to extract all the elements from the
          vector (without reducing them), so we use scalar shifts.  */
-  else if (reduc_fn != IFN_LAST && !slp_reduc)
+  else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
     {
       tree tmp;
       tree vec_elem_type;
@@ -6670,7 +6670,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       reduc_inputs[0] = new_temp;

-      if (reduce_with_shift && !slp_reduc)
+      if (reduce_with_shift && (!slp_reduc || group_size == 1))
 	{
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
-- 
2.35.3