https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122444

            Bug ID: 122444
           Summary: gcc suboptimal code for load __int128 from 8 & 12
                    bytes
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rockeet at gmail dot com
  Target Milestone: ---

#include <stddef.h>
#include <stdint.h>
#include <string.h>
// 128-bit unsigned integer built on the compiler's __int128 extension.
using Uint128 = unsigned __int128;
// Overlays a 128-bit integer with two 64-bit words so the value can be
// assembled one word at a time and then read back as a whole.
union NoAlias {
    Uint128 u128;     // the full 128-bit value
    uint64_t u64[2];  // the same storage viewed as two 64-bit words
};
// Builds a Uint128 from the first 8 bytes at `src`, with the remaining bytes
// of the result set to zero. This variant uses memcpy for the prefix and
// memset for the suffix.
Uint128 LoadPrefixZeroSuffix08_1(const void* src) {
  constexpr size_t PrefixLen = 8;
  Uint128 dst;
  // Copy the 8-byte prefix into the start of dst's storage.
  memcpy(&dst, src, PrefixLen);
  // Zero the rest of dst (sizeof(dst) - PrefixLen bytes after the prefix).
  memset((char*)&dst + PrefixLen, 0, sizeof(dst) - PrefixLen);
  return dst;
}
// Builds a Uint128 by reading one uint64_t through `src` and widening it;
// the conversion from an unsigned 64-bit value leaves the upper bits zero.
// NOTE(review): dereferences `src` as a uint64_t, so it presumably relies on
// the caller providing suitably aligned, uint64_t-compatible storage — confirm.
Uint128 LoadPrefixZeroSuffix08_2(const void* src) {
  return Uint128(*(const uint64_t*)src);
}
// Builds a Uint128 from 12 bytes at `src` by filling the two 64-bit words of
// a NoAlias union and reading the result back through its u128 member.
// NOTE(review): reading u128 after writing u64 relies on union type punning,
// which GCC documents as supported but ISO C++ does not guarantee — confirm
// this is acceptable for the intended compilers.
Uint128 LoadPrefixZeroSuffix12_1(const void* src) {
    NoAlias un;
    // First word: the uint64_t at the start of src.
    un.u64[0] = ((const uint64_t*)src)[0];
    // Second word: the uint32_t at index 2 (the 4 bytes following the first word).
    un.u64[1] = ((const uint32_t*)src)[2]; // zero extend uint32 to uint64
    return un.u128;
}
// Builds a Uint128 from 12 bytes at `src` using shift-and-or arithmetic
// instead of a union.
// Note on naming: `behi` ends up in the LOW 64 bits of the result and `belo`
// in the HIGH 64 bits (it is the operand shifted left by 64).
Uint128 LoadPrefixZeroSuffix12_2(const void* src) {
    // The uint64_t at the start of src.
    uint64_t behi = ((const uint64_t*)src)[0];
    // The uint32_t at index 2 (the 4 bytes following the first 8).
    uint64_t belo = ((const uint32_t*)src)[2]; // zero extend uint32 to uint64
    // `<<` binds tighter than `|`: (Uint128(belo) << 64) | behi.
    return Uint128(belo) << 64 | behi;
}
// Builds a Uint128 by loading a full Uint128 through `src` and masking it
// down to its low 96 bits.
// Unlike the other 12-byte variants, the source expression reads a whole
// Uint128 (sizeof(Uint128) bytes) from `src`, not just 12 bytes.
Uint128 LoadPrefixZeroSuffix12_3(const void* src) {
    return ((const Uint128*)src)[0] &
     // Mask = bits 32..95 (64 ones shifted left by 32) | bits 0..31,
     // i.e. the low 96 bits set and the top 32 bits clear.
     (Uint128(0xFFFF'FFFF'FFFF'FFFFull) << 32 | 0xFFFF'FFFF);
}

-----------------------------------------
For the functions above, gcc generates the following code, which is suboptimal in most cases:

"LoadPrefixZeroSuffix08_1(void const*)":
        mov     rax, QWORD PTR [rdi]
        mov     QWORD PTR [rsp-16], 0
        mov     QWORD PTR [rsp-24], rax
        mov     rdx, QWORD PTR [rsp-16]
        mov     rax, QWORD PTR [rsp-24]
        ret
"LoadPrefixZeroSuffix08_2(void const*)":
        mov     rax, QWORD PTR [rdi]
        xor     edx, edx
        ret
"LoadPrefixZeroSuffix12_1(void const*)":
        mov     ecx, DWORD PTR [rdi+8]
        mov     rax, QWORD PTR [rdi]
        mov     rdx, rcx
        ret
"LoadPrefixZeroSuffix12_2(void const*)":
        mov     eax, DWORD PTR [rdi+8]
        mov     rdx, rax
        mov     rax, QWORD PTR [rdi]
        ret
"LoadPrefixZeroSuffix12_3(void const*)":
        mov     rax, -1
        mov     edx, 4294967295
        and     rax, QWORD PTR [rdi]
        and     rdx, QWORD PTR [rdi+8]
        ret

whereas clang generates much better code:

LoadPrefixZeroSuffix08_1(void const*):
        mov     rax, qword ptr [rdi]
        xor     edx, edx
        ret

LoadPrefixZeroSuffix08_2(void const*):
        mov     rax, qword ptr [rdi]
        xor     edx, edx
        ret

LoadPrefixZeroSuffix12_1(void const*):
        mov     rax, qword ptr [rdi]
        mov     edx, dword ptr [rdi + 8]
        ret

LoadPrefixZeroSuffix12_2(void const*):
        mov     rax, qword ptr [rdi]
        mov     edx, dword ptr [rdi + 8]
        ret

LoadPrefixZeroSuffix12_3(void const*):
        mov     rax, qword ptr [rdi]
        mov     edx, dword ptr [rdi + 8]
        ret

--------------------
gcc is as good as clang only for the simplest case, LoadPrefixZeroSuffix08_2.

https://godbolt.org/z/fWrn61hxW

Reply via email to