https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122444
Bug ID: 122444
Summary: gcc suboptimal code for load __int128 from 8 & 12
bytes
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: rockeet at gmail dot com
Target Milestone: ---
#include <stddef.h>
#include <stdint.h>
#include <string.h>
// 128-bit unsigned integer type used by every function below; an alias for
// the compiler-provided `unsigned __int128`.
using Uint128 = unsigned __int128;
// Overlays a single Uint128 with an array of two uint64_t words, so that a
// value can be written as two 64-bit pieces and read back as one 128-bit
// integer.
// NOTE(review): reading a union member other than the one most recently
// written is not sanctioned by ISO C++; GCC documents it as a supported
// extension. Confirm this is acceptable for every compiler being compared.
union NoAlias {
Uint128 u128; // the whole 128-bit value
uint64_t u64[2]; // the same storage viewed as two 64-bit words
};
// 8-byte load, formulation 1 (memcpy + memset).
// Copies the first 8 bytes at `src` into the start of a Uint128, fills the
// rest of the object with zero bytes, and returns the result.
// `src` must point to at least 8 readable bytes.
// NOTE(review): the copied bytes form the low 64 bits of the result only on
// a little-endian target (the listings in this report are x86-64).
Uint128 LoadPrefixZeroSuffix08_1(const void* src) {
constexpr size_t PrefixLen = 8; // number of bytes taken from src
Uint128 dst; // not initialized here; the memcpy and memset below together write every byte
memcpy(&dst, src, PrefixLen);
// Zero the bytes that follow the copied prefix, up to the end of dst.
memset((char*)&dst + PrefixLen, 0, sizeof(dst) - PrefixLen);
return dst;
}
// 8-byte load, formulation 2 (direct typed load).
// Reads one uint64_t through `src` and converts it to Uint128; the
// unsigned-to-unsigned widening leaves the upper 64 bits zero.
// NOTE(review): the cast assumes `src` is suitably aligned for uint64_t and
// that accessing the pointed-to object as uint64_t does not break the
// aliasing rules — confirm against callers.
Uint128 LoadPrefixZeroSuffix08_2(const void* src) {
return Uint128(*(const uint64_t*)src);
}
// 12-byte load, formulation 1 (assemble through the NoAlias union).
// Word 0 of the union receives the uint64_t at bytes 0..7 of `src`; word 1
// receives the uint32_t at bytes 8..11, widened to 64 bits. The union is then
// read back as a Uint128.
// NOTE(review): the pointer casts assume `src` is suitably aligned for
// uint64_t/uint32_t access and that such access is alias-safe — confirm.
Uint128 LoadPrefixZeroSuffix12_1(const void* src) {
NoAlias un; // both u64 words are assigned below before u128 is read
un.u64[0] = ((const uint64_t*)src)[0];
// Index 2 in uint32_t units is byte offset 8.
un.u64[1] = ((const uint32_t*)src)[2]; // zero extend uint32 to uint64
return un.u128;
}
// 12-byte load, formulation 2 (shift and OR).
// Reads the uint64_t at bytes 0..7 and the uint32_t at bytes 8..11 of `src`,
// then combines them arithmetically: the 32-bit piece is widened and shifted
// into bits 64 and up, and the 64-bit piece is OR-ed into the low 64 bits.
// NOTE(review): the pointer casts assume `src` is suitably aligned for
// uint64_t/uint32_t access and that such access is alias-safe — confirm.
Uint128 LoadPrefixZeroSuffix12_2(const void* src) {
uint64_t behi = ((const uint64_t*)src)[0]; // becomes the LOW 64 bits of the result
// Index 2 in uint32_t units is byte offset 8; this becomes the UPPER half of the result.
uint64_t belo = ((const uint32_t*)src)[2]; // zero extend uint32 to uint64
// Widen belo to 128 bits before shifting so the shift by 64 is well defined.
return Uint128(belo) << 64 | behi;
}
// 12-byte load, formulation 3 (full 128-bit load + mask).
// Reads an entire Uint128 through `src` and keeps only its low 96 bits.
// NOTE(review): unlike the other 12-byte formulations, this expression
// dereferences a whole Uint128 (sizeof(Uint128) bytes) at `src`, so the
// caller must guarantee that many readable bytes and suitable alignment —
// confirm. The low 96 bits correspond to the first 12 bytes in memory only
// on a little-endian target.
Uint128 LoadPrefixZeroSuffix12_3(const void* src) {
// Mask = (2^64 - 1) << 32 | (2^32 - 1): bits 0..95 set, bits 96..127 clear.
return ((const Uint128*)src)[0] &
(Uint128(0xFFFF'FFFF'FFFF'FFFFull) << 32 | 0xFFFF'FFFF);
}
-----------------------------------------
For the above functions, gcc generates the following code, which is suboptimal in all but one case:
"LoadPrefixZeroSuffix08_1(void const*)":
mov rax, QWORD PTR [rdi]
mov QWORD PTR [rsp-16], 0
mov QWORD PTR [rsp-24], rax
mov rdx, QWORD PTR [rsp-16]
mov rax, QWORD PTR [rsp-24]
ret
"LoadPrefixZeroSuffix08_2(void const*)":
mov rax, QWORD PTR [rdi]
xor edx, edx
ret
"LoadPrefixZeroSuffix12_1(void const*)":
mov ecx, DWORD PTR [rdi+8]
mov rax, QWORD PTR [rdi]
mov rdx, rcx
ret
"LoadPrefixZeroSuffix12_2(void const*)":
mov eax, DWORD PTR [rdi+8]
mov rdx, rax
mov rax, QWORD PTR [rdi]
ret
"LoadPrefixZeroSuffix12_3(void const*)":
mov rax, -1
mov edx, 4294967295
and rax, QWORD PTR [rdi]
and rdx, QWORD PTR [rdi+8]
ret
whereas clang generates much better code:
LoadPrefixZeroSuffix08_1(void const*):
mov rax, qword ptr [rdi]
xor edx, edx
ret
LoadPrefixZeroSuffix08_2(void const*):
mov rax, qword ptr [rdi]
xor edx, edx
ret
LoadPrefixZeroSuffix12_1(void const*):
mov rax, qword ptr [rdi]
mov edx, dword ptr [rdi + 8]
ret
LoadPrefixZeroSuffix12_2(void const*):
mov rax, qword ptr [rdi]
mov edx, dword ptr [rdi + 8]
ret
LoadPrefixZeroSuffix12_3(void const*):
mov rax, qword ptr [rdi]
mov edx, dword ptr [rdi + 8]
ret
--------------------
gcc is as good as clang only for the simplest case, LoadPrefixZeroSuffix08_2.
https://godbolt.org/z/fWrn61hxW