https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101929
--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9188d727e33..7f1f12fb6c6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2374,7 +2375,7 @@ fail:
              n_vector_builds++;
            }
        }
-      if (all_uniform_p
+      if ((all_uniform_p && !two_operators)
          || n_vector_builds > 1
          || (n_vector_builds == children.length ()
              && is_a <gphi *> (stmt_info->stmt)))

This change will re-enable the vectorization: it evades the vect_construct cost bump by using scalar_to_vec (aka splat) instead, which has not yet been fixed to account for a possible GPR-to-XMM move (so it would be a temporary "solution" at best).

Another change that was mentioned, which mutes the effect somewhat (but does not fix x264), is:

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b2bf90576d5..acf2cc977b4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22595,7 +22595,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
       case vec_construct:
        {
          /* N element inserts into SSE vectors. */
-         int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
+         int cost = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * ix86_cost->sse_op;
          /* One vinserti128 for combining two SSE vectors for AVX256. */
          if (GET_MODE_BITSIZE (mode) == 256)
            cost += ix86_vec_cost (mode, ix86_cost->addss);

This makes sense, as the initial value of the xmm destination is now costed separately when it comes from a GPR and is free when it comes from an xmm register.