https://gcc.gnu.org/bugzilla/show_bug.cgi?id=29756
--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> --- So I have it down to an x86 combine issue: ;; v_28 = BIT_FIELD_INSERT <v_27(D), _30, 0 (32 bits)>; (insn 7 6 8 (set (reg:SF 116) (vec_select:SF (reg/v:V4SF 115 [ v ]) (parallel [ (const_int 0 [0]) ]))) t.c:5 -1 (nil)) (insn 8 7 9 (set (reg:V4SF 117) (reg/v:V4SF 109 [ v ])) t.c:11 -1 (nil)) (insn 9 8 10 (set (reg:V4SF 117) (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF 116)) (reg:V4SF 117) (const_int 1 [0x1]))) t.c:11 -1 (nil)) (insn 10 9 0 (set (reg/v:V4SF 110 [ v ]) (reg:V4SF 117)) t.c:11 -1 (nil)) That's what vec_set_optab produces. ;; _29 = __builtin_ia32_shufps (v_28, v_28, 0); (insn 11 10 12 (set (reg:V4SF 119) (reg/v:V4SF 110 [ v ])) t.c:12 -1 (nil)) (insn 12 11 13 (set (reg:V4SF 120) (reg/v:V4SF 110 [ v ])) t.c:12 -1 (nil)) (insn 13 12 14 (set (reg:V4SF 118) (vec_select:V4SF (vec_concat:V8SF (reg:V4SF 119) (reg:V4SF 120)) (parallel [ (const_int 0 [0]) (const_int 0 [0]) (const_int 4 [0x4]) (const_int 4 [0x4]) ]))) t.c:12 -1 (nil)) (insn 14 13 0 (set (reg:V4SF 111 [ _29 ]) (reg:V4SF 118)) t.c:12 -1 (nil)) And that's the shuffle. 
And after combine we have (insn 7 4 53 2 (set (reg:SF 116) (vec_select:SF (reg/v:V4SF 115 [ v ]) (parallel [ (const_int 0 [0]) ]))) t.c:5 2423 {*vec_extractv4sf_0} (nil)) (insn 9 53 13 2 (set (reg:V4SF 117 [ v ]) (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF 116)) (const_vector:V4SF [ (const_double:SF 0.0 [0x0.0p+0]) (const_double:SF 0.0 [0x0.0p+0]) (const_double:SF 0.0 [0x0.0p+0]) (const_double:SF 0.0 [0x0.0p+0]) ]) (const_int 1 [0x1]))) t.c:11 2420 {vec_setv4sf_0} (expr_list:REG_DEAD (reg:SF 116) (nil))) (insn 13 9 15 2 (set (reg:V4SF 118) (vec_select:V4SF (vec_concat:V8SF (reg:V4SF 117 [ v ]) (reg:V4SF 117 [ v ])) (parallel [ (const_int 0 [0]) (const_int 0 [0]) (const_int 4 [0x4]) (const_int 4 [0x4]) ]))) t.c:12 2405 {sse_shufps_v4sf} (expr_list:REG_DEAD (reg:V4SF 117 [ v ]) (nil))) which combine doesn't manage to get down to (insn 9 4 13 2 (set (reg:V4SF 104) (vec_select:V4SF (vec_concat:V8SF (reg/v:V4SF 103 [ v ]) (reg/v:V4SF 103 [ v ])) (parallel [ (const_int 0 [0]) (const_int 0 [0]) (const_int 4 [0x4]) (const_int 4 [0x4]) ]))) t.c:18 2405 {sse_shufps_v4sf} (nil)) The testcase was the following. #include <emmintrin.h> template <int N> inline float component(__v4sf v) { return (reinterpret_cast<const float*>(&v))[N]; } inline __v4sf fill(float f) { __v4sf v; *(reinterpret_cast<float*>(&v))=f; return ((__m128) __builtin_ia32_shufps ((__v4sf)(v), (__v4sf)(v), 0)); } template <int N> inline __v4sf component_fill(__v4sf v) { return ((__m128) __builtin_ia32_shufps ((__v4sf)(v), (__v4sf)(v), ((((N) << 6) | ((N) << 4) | ((N) << 2) | (N))))); } __v4sf transform_bad(__v4sf m[4],__v4sf v) { return m[0]*fill(component<0>(v)) +m[1]*fill(component<1>(v)) +m[2]*fill(component<2>(v)) +m[3]*fill(component<3>(v)); } __v4sf transform_good(__v4sf m[4],__v4sf v) { return m[0]*component_fill<0>(v) +m[1]*component_fill<1>(v) +m[2]*component_fill<2>(v) +m[3]*component_fill<3>(v); }