https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83250
--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
That said, I think we should eventually try to add some define_insn_and_split
that would help with generating better code e.g. on

#include <immintrin.h>

__m256d foo1 (__m128d __A) { return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__A), _mm_setzero_pd (), 1); }
__m256d foo2 (__m128d __A) { return (__m256d) __builtin_shuffle (_mm256_castpd128_pd256 (__A), _mm256_setzero_pd (), (__v4di) { 0, 1, 4, 5 }); }
__m256d foo3 (__m128d __A) { return __extension__ (__m256d) { __A[0], __A[1], 0.0, 0.0 }; }
__m256 foo4 (__m128 __A) { return _mm256_insertf128_ps (_mm256_castps128_ps256 (__A), _mm_setzero_ps (), 1); }
__m256 foo5 (__m128 __A) { return (__m256) __builtin_shuffle (_mm256_castps128_ps256 (__A), _mm256_setzero_ps (), (__v8si) { 0, 1, 2, 3, 8, 9, 10, 11 }); }
__m256 foo6 (__m128 __A) { return __extension__ (__m256) { __A[0], __A[1], __A[2], __A[3], 0.0f, 0.0f, 0.0f, 0.0f }; }
__m256i foo7 (__m128i __A) { return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__A), _mm_setzero_si128 (), 1); }
__m256i foo8 (__m128i __A) { return (__m256i) __builtin_shuffle (_mm256_castsi128_si256 (__A), _mm256_setzero_si256 (), (__v4di) { 0, 1, 4, 5 }); }
__m256i foo9 (__m128i __A) { return __extension__ (__m256i) { __A[0], __A[1], 0, 0 }; }
__m256d foo10 (__m128d __A) { return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0); }
__m256 foo11 (__m128 __A) { return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0); }
__m256i foo12 (__m128i __A) { return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0); }

and

#include <immintrin.h>

__m512d foo1 (__m256d __A) { return _mm512_insertf64x4 (_mm512_castpd256_pd512 (__A), _mm256_setzero_pd (), 1); }
__m512 foo2 (__m256 __A) { return (__m512) _mm512_insertf64x4 (_mm512_castpd256_pd512 ((__m256d) __A), _mm256_setzero_pd (), 1); }
__m512i foo3 (__m256i __A) { return _mm512_inserti64x4 (_mm512_castsi256_si512 (__A), _mm256_setzero_si256 (), 1); }
__m512d foo4 (__m256d __A) { return _mm512_insertf32x4 (_mm512_castps128_ps512 ((__m256) __A), _mm_setzero_ps (), 1); }
__m512 foo5 (__m256 __A) { return (__m512) _mm512_insertf64x4 (_mm512_castpd256_pd512 ((__m256d) __A), _mm256_setzero_pd (), 1); }
__m512i foo6 (__m256i __A) { return _mm512_inserti64x4 (_mm512_castsi256_si512 (__A), _mm256_setzero_si256 (), 1); }

WIP patch is:

--- gcc/config/i386/sse.md.jj	2019-08-05 12:25:34.477667658 +0200
+++ gcc/config/i386/sse.md	2019-08-12 14:06:44.748772344 +0200
@@ -20784,6 +20784,35 @@ (define_insn "vec_set_hi_v32qi"
    (set_attr "prefix" "vex,evex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*vec_set_hi_<mode>_cast"
+  [(set (match_operand:VI8F_256 0 "register_operand" "=Yv")
+	(vec_concat:VI8F_256
+	  (vec_select:<ssehalfvecmode>
+	    (unspec:VI8F_256 [(match_operand:<ssehalfvecmode> 1
+					      "register_operand" "vm")]
+			     UNSPEC_CAST)
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")))]
+  "TARGET_AVX"
+  "#"
+  ""
+  [(set (match_dup 0) (vec_concat:VI8F_256 (match_dup 1) (match_dup 2)))])
+
+(define_insn_and_split "*vec_set_hi_<mode>_cast"
+  [(set (match_operand:VI4F_256 0 "register_operand" "=Yv")
+	(vec_concat:VI4F_256
+	  (vec_select:<ssehalfvecmode>
+	    (unspec:VI4F_256 [(match_operand:<ssehalfvecmode> 1
+					      "register_operand" "vm")]
+			     UNSPEC_CAST)
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))
+	  (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")))]
+  "TARGET_AVX"
+  "#"
+  ""
+  [(set (match_dup 0) (vec_concat:VI4F_256 (match_dup 1) (match_dup 2)))])
+
 (define_insn "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:V48_AVX2 0 "register_operand" "=x")
	(unspec:V48_AVX2

that improves foo1, foo4 and foo7 in the first testcase above. If nothing else, at least the cases with { __A[0], __A[1], __A[2], ..., 0, 0, 0, ... } should be handled as a priority, as that can happen even with generic vector code without any intrinsics.
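(Editorial note, not from the PR: a minimal sketch of that last, intrinsic-free case, written with GCC's generic vector extension; the v2df/v4df typedefs and the widen_with_zeros name are made up for illustration.)

/* Sketch only: same { __A[0], __A[1], 0.0, 0.0 } shape as foo3 above,
   but using only the generic vector extension.  */
typedef double v2df __attribute__ ((vector_size (16)));
typedef double v4df __attribute__ ((vector_size (32)));

v4df
widen_with_zeros (v2df a)
{
  /* Low half is A, high half is zero.  Ideally this lowers to a single
     128-bit vmovaps, since VEX-encoded 128-bit moves already clear
     bits 128..255 of the destination register.  */
  return (v4df) { a[0], a[1], 0.0, 0.0 };
}

Presumably, without patterns like the ones above this still goes through an explicit zeroing plus an insert-style sequence rather than that single move, which is why these cases would be worth prioritizing.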