https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104438
Bug ID: 104438
Summary: Combine optimization exposed after pro_and_epilogue
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: crazylht at gmail dot com
Target Milestone: ---
Host: x86_64-pc-linux-gnu
Target: x86_64-*-* i?86-*-*
Testcase:

#include <stdint.h>
#include <immintrin.h>

/* Load four 4-byte rows (src + 0..3*stride) into one 256-bit register:
   rows 0/1 in the low 128 bits, rows 2/3 in the high 128 bits. */
static __m256i __attribute__((always_inline))
load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
{
    __m128i src01, src23;
    src01 = _mm_cvtsi32_si128(*(int32_t *)(src + 0 * stride));
    src01 = _mm_insert_epi32(src01, *(int32_t *)(src + 1 * stride), 1);
    src23 = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
    src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
    return _mm256_setr_m128i(src01, src23);
}
uint32_t compute4x_m_sad_avx2_intrin(
    uint8_t *src,        // input parameter, source samples Ptr
    uint32_t src_stride, // input parameter, source stride
    uint8_t *ref,        // input parameter, reference samples Ptr
    uint32_t ref_stride, // input parameter, reference stride
    uint32_t height,     // input parameter, block height (M)
    uint32_t width)      // input parameter, block width (N)
{
    __m128i xmm0;
    __m256i ymm = _mm256_setzero_si256();
    uint32_t y;
    (void)width;

    for (y = 0; y < height; y += 4) {
        const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
        const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
        ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
        src += src_stride << 2;
        ref += ref_stride << 2;
    }

    xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
                         _mm256_extracti128_si256(ymm, 1));
    return (uint32_t)_mm_cvtsi128_si32(xmm0);
}
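For readers who do not track the intrinsics, here is a hypothetical scalar
reference of what the function computes (my sketch, not part of the original
report): the sum of absolute differences over a height x 4 block.

#include <stdint.h>

static uint32_t compute4x_m_sad_scalar(const uint8_t *src, uint32_t src_stride,
                                       const uint8_t *ref, uint32_t ref_stride,
                                       uint32_t height)
{
    uint32_t sad = 0;
    for (uint32_t y = 0; y < height; y++)
        for (uint32_t x = 0; x < 4; x++) {
            /* Absolute difference of one sample pair, accumulated. */
            int d = (int)src[y * src_stride + x] - (int)ref[y * ref_stride + x];
            sad += (uint32_t)(d < 0 ? -d : d);
        }
    return sad;
}

Note that with height == 0 the loop never runs and the result is 0; that
zero-trip path is the one the suboptimal tail below comes from.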
gcc -O2 -mavx2 -S
Suboptimal asm:
.L4:
vpxor xmm3, xmm3, xmm3 # 12 [c=4 l=4] movv4di_internal/0
vpxor xmm0, xmm0, xmm0 # 11 [c=4 l=4] movv8si_internal/0
vextracti128 xmm3, ymm3, 0x1 # 409 [c=4 l=6] vec_extract_hi_v4di
vpaddd xmm0, xmm0, xmm3 # 429 [c=4 l=4] *addv4si3/1
vmovd eax, xmm0 # 430 [c=4 l=4] *movsi_internal/12
ret # 437 [c=0 l=1] simple_return_internal
Since both vector registers are zeroed immediately before the tail, the whole
sequence folds to a constant and can be optimized to just
xor eax, eax
Before pro_and_epilogue, the CFG looks like:
.L2:
...asm...
jmp .L4
.L3:
vpxor xmm3, xmm3, xmm3 # 12 [c=4 l=4] movv4di_internal/0
vpxor xmm0, xmm0, xmm0 # 11 [c=4 l=4] movv8si_internal/0
.L4:
vextracti128 xmm3, ymm3, 0x1 # 409 [c=4 l=6] vec_extract_hi_v4di
vpaddd xmm0, xmm0, xmm3 # 429 [c=4 l=4] *addv4si3/1
vmovd eax, xmm0 # 430 [c=4 l=4] *movsi_internal/12
ret # 437 [c=0 l=1] simple_return_internal
Since .L4 has two predecessor basic blocks, the tail cannot be optimized away
there. But after pro_and_epilogue, GCC copies .L4 into .L2 and merges .L4 with
.L3, which exposes the optimization opportunity.
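To see why the merged block is now trivially foldable, here is an
intrinsic-level mirror (my sketch, not from the report) of the merged .L3+.L4
block: every operation consumes an all-zero value, so the whole chain is
constant-foldable to 0, i.e. "xor eax, eax".

#include <stdint.h>
#include <immintrin.h>

static uint32_t merged_tail_block(void)
{
    __m256i ymm  = _mm256_setzero_si256();           /* vpxor: ymm3 = 0 */
    __m128i xmm0 = _mm_setzero_si128();              /* vpxor: xmm0 = 0 */
    __m128i hi   = _mm256_extracti128_si256(ymm, 1); /* vextracti128: high lane of 0 is 0 */
    xmm0 = _mm_add_epi32(xmm0, hi);                  /* vpaddd: 0 + 0 */
    return (uint32_t)_mm_cvtsi128_si32(xmm0);        /* vmovd: returns 0 */
}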