https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122412
Bug ID: 122412
Summary: gcc generates bad code
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: rockeet at gmail dot com
Target Milestone: ---
#include <stddef.h>
#include <immintrin.h>
// Copies `len` bytes from `bsrc` to `bdst` using unaligned AVX-512 byte
// loads/stores and returns a pointer just past the last byte written
// (equal to the incoming bdst + the incoming len).
// Whole 64-byte chunks are copied in the loop; the remaining (< 64) bytes are
// copied with a single masked load/store pair instead of a scalar tail loop.
char* small_memcpy_align_1(char* bdst, const char* bsrc, size_t len) {
// The condition is hinted as unlikely (expected value 0): the common case is
// expected to be len < 64, which skips the loop entirely.
while (__builtin_expect(len >= 64, 0)) {
// Copy one full 64-byte chunk.
_mm512_storeu_epi8(bdst, _mm512_loadu_epi8(bsrc));
// Advance past the chunk just copied.
len -= 64;
bsrc += 64;
bdst += 64;
}
// Here len < 64. _bzhi_u64 clears the bits of the all-ones value from bit
// index `len` upward, leaving a mask with exactly the low `len` bits set
// (one bit per remaining byte; zero when len == 0).
auto mask = _bzhi_u64(-1, len);
// Zero-masking load: only bytes selected by the mask are taken from bsrc,
// the other lanes of `tail` are zero.
auto tail = _mm512_maskz_loadu_epi8(mask, bsrc);
// Masked store: only the selected bytes are written to bdst.
_mm512_mask_storeu_epi8(bdst, mask, tail);
// bdst already points past the full chunks; adding the remainder yields the
// end of the copied region.
return bdst + len;
}
-------------------------
GCC generates suboptimal code; it even contains an `imul` (https://godbolt.org/z/r431M1q8x):
-------------------------
"small_memcpy_align_1(char*, char const*, unsigned long)":
# NOTE(review): register roles below assume the System V x86-64 calling
# convention (rdi = bdst, rsi = bsrc, rdx = len); the usage here is consistent
# with that.
# rax = running byte offset into both buffers, starting at 0.
xor eax, eax
.L2:
# rcx = len - offset (bytes still to copy); leave the loop once it is <= 63.
mov rcx, rdx
sub rcx, rax
cmp rcx, 63
jbe .L5
# Copy one 64-byte chunk at the current offset, then advance the offset.
vmovdqu8 zmm0, ZMMWORD PTR [rsi+rax]
vmovdqu8 ZMMWORD PTR [rdi+rax], zmm0
add rax, 64
jmp .L2
.L5:
# The loop above left rdx/rdi/rsi unmodified, so the tail recomputes the
# post-loop values from the original len:
# rax = -64 * (len >> 6), i.e. minus the number of bytes in full chunks.
# This shr/imul pair is the sequence the report objects to.
mov rax, rdx
shr rax, 6
imul rax, rax, -64
# rdx = len - 64 * (len >> 6) (remaining bytes, < 64);
# rdi and rsi are advanced past the full chunks.
add rdx, rax
sub rdi, rax
sub rsi, rax
# k1 = mask with the low rdx bits set (all-ones value truncated by bzhi).
mov rax, -1
bzhi rax, rax, rdx
kmovq k1, rax
# Zero-masking load of the tail bytes.
vmovdqu8 zmm0{k1}{z}, ZMMWORD PTR [rsi]
# Return value: pointer just past the tail (advanced dst + remaining bytes).
lea rax, [rdi+rdx]
# Masked store of the tail bytes.
vmovdqu8 ZMMWORD PTR [rdi]{k1}, zmm0
ret
-------------------------
whereas Clang generates perfect code:
-------------------------
# NOTE(review): register roles below assume the System V x86-64 calling
# convention (rdi = bdst, rsi = bsrc, rdx = len); the usage here is consistent
# with that.
small_memcpy_align_1(char*, char const*, unsigned long):
# rax = running destination pointer; it also becomes the return value.
mov rax, rdi
# Enter the 64-byte chunk loop only when len >= 64; otherwise fall through
# directly into the tail.
cmp rdx, 64
jae .LBB0_1
.LBB0_2:
# Tail: k1 = mask with the low rdx bits set (rdx < 64 here).
mov rcx, -1
bzhi rcx, rcx, rdx
kmovq k1, rcx
# Zero-masking load and masked store of the remaining bytes.
vmovdqu8 zmm0 {k1} {z}, zmmword ptr [rsi]
vmovdqu8 zmmword ptr [rax] {k1}, zmm0
# Return value: destination pointer + remaining bytes.
add rax, rdx
vzeroupper
ret
.LBB0_1:
# Chunk loop: copy 64 bytes, then update len, src and dst in place, so the
# tail needs no recomputation.
vmovups zmm0, zmmword ptr [rsi]
vmovups zmmword ptr [rax], zmm0
add rdx, -64
add rsi, 64
add rax, 64
# Repeat while more than 63 bytes remain, then jump back to the tail.
cmp rdx, 63
ja .LBB0_1
jmp .LBB0_2