https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122412

            Bug ID: 122412
           Summary: gcc generate bad code
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rockeet at gmail dot com
  Target Milestone: ---

#include <stddef.h>
#include <immintrin.h>
// Reproducer: small memcpy built on AVX-512 masked loads/stores.
// Copies `len` bytes from `bsrc` to `bdst`; returns `bdst + len`, i.e. one
// past the last destination byte written (original bdst advanced by the
// original len, since the loop advances bdst/bsrc and decrements len in lock-step).
char* small_memcpy_align_1(char* bdst, const char* bsrc, size_t len) {
    // Bulk phase, hinted unlikely via __builtin_expect: copy one unaligned
    // 64-byte ZMM block per iteration until fewer than 64 bytes remain.
    while (__builtin_expect(len >= 64, 0)) {
        _mm512_storeu_epi8(bdst, _mm512_loadu_epi8(bsrc));
        len  -= 64;
        bsrc += 64;
        bdst += 64;
    }
    // Tail phase: here len < 64. _bzhi_u64(-1, len) produces a mask with the
    // low `len` bits set; the masked load/store then touch exactly those
    // `len` bytes (and no bytes at all when len == 0, mask == 0).
    auto mask = _bzhi_u64(-1, len);
    auto tail = _mm512_maskz_loadu_epi8(mask, bsrc);
    _mm512_mask_storeu_epi8(bdst, mask, tail);
    return bdst + len;
}
-------------------------
GCC generates suboptimal code — it even contains an `imul` (https://godbolt.org/z/r431M1q8x):
-------------------------
"small_memcpy_align_1(char*, char const*, unsigned long)":
        xor     eax, eax
.L2:
        mov     rcx, rdx
        sub     rcx, rax
        cmp     rcx, 63
        jbe     .L5
        vmovdqu8        zmm0, ZMMWORD PTR [rsi+rax]
        vmovdqu8        ZMMWORD PTR [rdi+rax], zmm0
        add     rax, 64
        jmp     .L2
.L5:
        mov     rax, rdx
        shr     rax, 6
        imul    rax, rax, -64
        add     rdx, rax
        sub     rdi, rax
        sub     rsi, rax
        mov     rax, -1
        bzhi    rax, rax, rdx
        kmovq   k1, rax
        vmovdqu8        zmm0{k1}{z}, ZMMWORD PTR [rsi]
        lea     rax, [rdi+rdx]
        vmovdqu8        ZMMWORD PTR [rdi]{k1}, zmm0
        ret
-------------------------
while Clang generates near-optimal code:
-------------------------
small_memcpy_align_1(char*, char const*, unsigned long):
        mov     rax, rdi
        cmp     rdx, 64
        jae     .LBB0_1
.LBB0_2:
        mov     rcx, -1
        bzhi    rcx, rcx, rdx
        kmovq   k1, rcx
        vmovdqu8        zmm0 {k1} {z}, zmmword ptr [rsi]
        vmovdqu8        zmmword ptr [rax] {k1}, zmm0
        add     rax, rdx
        vzeroupper
        ret
.LBB0_1:
        vmovups zmm0, zmmword ptr [rsi]
        vmovups zmmword ptr [rax], zmm0
        add     rdx, -64
        add     rsi, 64
        add     rax, 64
        cmp     rdx, 63
        ja      .LBB0_1
        jmp     .LBB0_2

Reply via email to