https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85482
Bug ID: 85482 Summary: unnecessary vmovaps/vmovapd/vmovdqa emitted Product: gcc Version: 8.0.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: kretz at kde dot org Target Milestone: --- Test case (cf. https://godbolt.org/g/QkJYSK): #include <x86intrin.h> __m256 zero_extend1(__m128 a) { return _mm256_insertf128_ps(__m256(), a, 0); } __m256d zero_extend1(__m128d a) { return _mm256_insertf128_pd(__m256d(), a, 0); } __m256i zero_extend1(__m128i a) { return _mm256_insertf128_si256(__m256i(), a, 0); } __m512 zero_extend2(__m128 a) { return _mm512_insertf32x4(__m512(), a, 0); } __m512d zero_extend2(__m128d a) { return _mm512_insertf64x2(__m512d(), a, 0); } __m512i zero_extend2(__m128i a) { return _mm512_inserti32x4(__m512i(), a, 0); } __m512 zero_extend3(__m256 a) { return _mm512_insertf32x8(__m512(), a, 0); } __m512d zero_extend3(__m256d a) { return _mm512_insertf64x4(__m512d(), a, 0); } __m512i zero_extend3(__m256i a) { return _mm512_inserti64x4(__m512i(), a, 0); } template <class T> T blackhole; void test(void *mem) { blackhole<__m256 > = zero_extend1(_mm_load_ps((float *)mem)); blackhole<__m256d> = zero_extend1(_mm_load_pd((double *)mem)); blackhole<__m256i> = zero_extend1(_mm_load_si128((__m128i *)mem)); blackhole<__m512 > = zero_extend2(_mm_load_ps((float *)mem)); blackhole<__m512d> = zero_extend2(_mm_load_pd((double *)mem)); blackhole<__m512i> = zero_extend2(_mm_load_si128((__m128i *)mem)); blackhole<__m512 > = zero_extend3(_mm256_load_ps((float *)mem)); blackhole<__m512d> = zero_extend3(_mm256_load_pd((double *)mem)); blackhole<__m512i> = zero_extend3(_mm256_load_si256((__m256i *)mem)); } Between every load and store instruction in the `test` function, the vmov(aps|apd|dqa) is superfluous. The preceding load instruction already zeroes the high bits. 
Instead of the load instruction, there could also be a different instruction, or instruction sequence, that already guarantees the high bits to be zero.