--- failure.c ---
/* Count the leading zero bits of a 64-bit value using GCC's
 * __builtin_clzll.
 *
 * This is a minimal reproducer: the assembly listings further down were
 * generated from exactly this code, so the body is intentionally bare.
 *
 * NOTE: GCC documents the result of __builtin_clzll as undefined when
 * argument is 0; no guard is added here on purpose. */
int _clz(unsigned long long argument) {
return __builtin_clzll(argument);
}
/* Count the trailing zero bits of a 64-bit value using GCC's
 * __builtin_ctzll.
 *
 * This is a minimal reproducer: the assembly listings further down were
 * generated from exactly this code, so the body is intentionally bare.
 *
 * NOTE: GCC documents the result of __builtin_ctzll as undefined when
 * argument is 0; no guard is added here on purpose. */
int _ctz(unsigned long long argument) {
return __builtin_ctzll(argument);
}
--- EOF ---
GCC 13.1 -m32 -mabm -mbmi -mlzcnt -O3 failure.c
<https://godbolt.org/z/MMf11hKch>
_clz(unsigned long long):
mov edx, DWORD PTR [esp+8]
xor ecx, ecx
xor eax, eax
lzcnt eax, DWORD PTR [esp+4]
add eax, 32
lzcnt ecx, edx
test edx, edx
cmovne eax, ecx
ret
_ctz(unsigned long long):
sub esp, 20
push DWORD PTR [esp+28]
push DWORD PTR [esp+28]
call __ctzdi2
add esp, 28
ret
OUCH: although TZCNT is EXPLICITLY enabled via -mabm (for AMD processors) and
-mbmi (for Intel processors), GCC generates slow-motion code that calls
__ctzdi2() instead of using the TZCNT instruction, which has been available
for 10 (in words: TEN) years.
GCC 13.1 -m32 -march=i386 -O3 failure.c
<https://godbolt.org/z/16ezfaexb>
_clz(unsigned long long):
mov edx, DWORD PTR [esp+4]
mov eax, DWORD PTR [esp+8]
test eax, eax
je .L2
bsr eax, eax
xor eax, 31
ret
.L2:
bsr eax, edx
xor eax, 31
lea eax, [eax+32]
ret
_ctz(unsigned long long):
sub esp, 20
push DWORD PTR [esp+28]
push DWORD PTR [esp+28]
call __ctzdi2
add esp, 28
ret
OUCH²: the BSF/BSR instructions were introduced 38 (in words: THIRTY-EIGHT)
years ago with the i386 processor, and GCC does use BSR for _clz() above,
yet it fails to know/use BSF for _ctz() and calls __ctzdi2() instead --
a real shame!
OUCH³: since the 64-bit argument already sits on the stack exactly where
__ctzdi2() expects it, an optimising compiler would of course generate
just "JMP __ctzdi2" instead of code fiddling with the stack!
Stefan Kanthak