https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101842
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
OK, so with a hack like the following we vectorize the BB as

  vect__1.10_62 = MEM <vector(4) float> [(float *)p_34];
  vect_powmult_9.11_61 = vect__1.10_62 * vect__1.10_62;
  _60 = .REDUC_PLUS (vect_powmult_9.11_61);
  d_25 = d_35 - _60;
  p_26 = p_34 + 16;
  i_27 = i_37 + 4;
  _10 = len_20(D) > i_27;
  _11 = lim_21(D) <= d_25;
  _12 = _10 & _11;
  if (_12 != 0)

and on x86_64 we get

.L3:
        movups  (%rdi), %xmm2
        addl    $4, %eax
        addq    $16, %rdi
        mulps   %xmm2, %xmm2
        movaps  %xmm2, %xmm3
        movhlps %xmm2, %xmm3
        addps   %xmm2, %xmm3
        movaps  %xmm3, %xmm2
        shufps  $85, %xmm3, %xmm2
        addps   %xmm3, %xmm2
        subss   %xmm2, %xmm0
        cmpl    %eax, %esi
        jle     .L2
        comiss  %xmm1, %xmm0
        jnb     .L3
.L2:
        ret

or with AVX

.L3:
        vmovups (%rdi), %xmm4
        addl    $4, %eax
        addq    $16, %rdi
        vmulps  %xmm4, %xmm4, %xmm2
        vmovhlps %xmm2, %xmm2, %xmm3
        vaddps  %xmm2, %xmm3, %xmm3
        vshufps $85, %xmm3, %xmm3, %xmm2
        vaddps  %xmm3, %xmm2, %xmm2
        vsubss  %xmm2, %xmm0, %xmm0
        cmpl    %eax, %esi
        jle     .L2
        vcomiss %xmm1, %xmm0
        jnb     .L3
.L2:
        ret

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index f9ca24415a2..0e14c164635 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -5637,6 +5637,11 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
                  || (gimple_assign_rhs_code (use_stmt)
                      != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
        {
+         gassign *next_stmt = assign;
+         while (next_stmt)
+           {
+             assign = next_stmt;
+             next_stmt = NULL;
          /* We start the match at the end of a possible association
             chain.  */
          auto_vec<chain_op_t> chain;
@@ -5666,10 +5671,12 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
            {
              if (chain[i].dt != vect_internal_def)
                invalid_cst = true;
-             else if (chain[i].code != code)
-               invalid_op = true;
              else
-               valid_lanes++;
+               {
+                 valid_lanes++;
+                 if (chain[i].code != code)
+                   invalid_op = true;
+               }
            }
          if (!invalid_op && !invalid_cst)
            {
@@ -5707,8 +5714,13 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
                statistics_counter_event (cfun, "BB reduction missed (cst)", 1);
              statistics_histogram_event (cfun, "BB reduction missed lanes",
                                          valid_lanes);
+
+             /* Try again.  */
+             if (valid_lanes > 2)
+               next_stmt = as_a <gassign *> (chain_stmts[1]);
            }
        }
+           }
     }
     }
 }

The hack simply restarts reduction discovery at the "previous" stmt (this
eventually breaks down after skipping the first stmt).  As said, it's a hack.
But is that the kind of vectorization you expect?