https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104438
Bug ID: 104438
Summary: Combine optimization exposed after pro_and_epilogue
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: crazylht at gmail dot com
Target Milestone: ---
Host: x86_64-pc-linux-gnu
Target: x86_64-*-* i?86-*-*

#include <stdint.h>
#include <immintrin.h>

static __m256i __attribute__((always_inline))
load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
{
    __m128i src01, src23;

    src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride));
    src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
    src23 = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
    src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);

    return _mm256_setr_m128i(src01, src23);
}

uint32_t compute4x_m_sad_avx2_intrin(
    uint8_t  *src,        // input parameter, source samples Ptr
    uint32_t  src_stride, // input parameter, source stride
    uint8_t  *ref,        // input parameter, reference samples Ptr
    uint32_t  ref_stride, // input parameter, reference stride
    uint32_t  height,     // input parameter, block height (M)
    uint32_t  width)      // input parameter, block width (N)
{
    __m128i xmm0;
    __m256i ymm = _mm256_setzero_si256();
    uint32_t y;
    (void)width;

    for (y = 0; y < height; y += 4) {
        const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
        const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
        ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
        src += src_stride << 2;
        ref += ref_stride << 2;
    }

    xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
                         _mm256_extracti128_si256(ymm, 1));

    return (uint32_t)_mm_cvtsi128_si32(xmm0);
}

gcc -O2 -mavx2 -S produces this suboptimal asm:

.L4:
        vpxor   xmm3, xmm3, xmm3        # 12  [c=4 l=4]  movv4di_internal/0
        vpxor   xmm0, xmm0, xmm0        # 11  [c=4 l=4]  movv8si_internal/0
        vextracti128    xmm3, ymm3, 0x1 # 409 [c=4 l=6]  vec_extract_hi_v4di
        vpaddd  xmm0, xmm0, xmm3        # 429 [c=4 l=4]  *addv4si3/1
        vmovd   eax, xmm0               # 430 [c=4 l=4]  *movsi_internal/12
        ret                             # 437 [c=0 l=1]  simple_return_internal

It can be optimized to just

        xor     eax, eax

Before pro_and_epilogue, the CFG looks like:

.L2:
        ...asm...
        jmp     .L4
.L3:
        vpxor   xmm3, xmm3, xmm3        # 12  [c=4 l=4]  movv4di_internal/0
        vpxor   xmm0, xmm0, xmm0        # 11  [c=4 l=4]  movv8si_internal/0
.L4:
        vextracti128    xmm3, ymm3, 0x1 # 409 [c=4 l=6]  vec_extract_hi_v4di
        vpaddd  xmm0, xmm0, xmm3        # 429 [c=4 l=4]  *addv4si3/1
        vmovd   eax, xmm0               # 430 [c=4 l=4]  *movsi_internal/12
        ret                             # 437 [c=0 l=1]  simple_return_internal

Since .L4 has two predecessor basic blocks, the redundant sequence cannot be optimized away at that point. After pro_and_epilogue, however, GCC copies .L4 into .L2 and merges .L4 with .L3, which exposes the optimization opportunity.
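To spell out why the merged block folds to a constant: a VEX-encoded vpxor on an xmm register zeroes the whole ymm register, so every value in the block is a known zero. An annotated sketch follows (annotations mine, not part of the compiler dump):

.L4:
        vpxor   xmm3, xmm3, xmm3        ; ymm3 = 0 (VEX also zeroes bits 255:128)
        vpxor   xmm0, xmm0, xmm0        ; ymm0 = 0
        vextracti128    xmm3, ymm3, 0x1 ; high half of zero is zero
        vpaddd  xmm0, xmm0, xmm3        ; 0 + 0 = 0
        vmovd   eax, xmm0               ; eax = 0
        ret

; every source operand is a known zero, so the block is equivalent to:
.L4:
        xor     eax, eax
        ret

Once .L4 has a single predecessor after pro_and_epilogue, this folding becomes a straightforward combine/constant-propagation opportunity, hence the summary: the opportunity is only exposed after that pass has run.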