Hi,
compile the following 128-bit GCD routine for the AMD64 processor
with full optimization:
--- gcdti3.c ---
// Stein's algorithm: greatest common divisor
__uint128_t __gcdti3(__uint128_t p, __uint128_t q)
{
unsigned r, s = 0;
__uint128_t t;
if (p == 0)
return q;
if (q == 0)
return p;
if (p == q)
return p;
if (((unsigned long long) p | (unsigned long long) q) == 0)
p >>= 64, q >>= 64, s = 64;
r = __builtin_ctzll((unsigned long long) p | (unsigned long long) q), p >>=
r, q >>= r, s += r;
if ((unsigned long long) p == 0)
p >>= 64;
r = __builtin_ctzll(p), p >>= r;
do
{
if ((unsigned long long) q == 0)
q >>= 64;
r = __builtin_ctzll(q), q >>= r;
if (p < q)
t = q, q = p, p = t;
} while (q -= p);
return p << s;
}
--- EOF ---
GCC 12.2: gcc -O3 gcdti3.c
# https://godbolt.org/z/d1Ma9qnsf
__gcdti3(unsigned __int128, unsigned __int128):
mov rax, rsi # OOPS: GCCs plays six rounds of shell game!
mov r8, rdi #
mov rsi, rdi #
mov rdi, rax #
mov rax, rdx #
mov rdx, rcx #
mov rcx, rdi
or rcx, r8
je .L1
mov rcx, r8
mov r8, rdi
xor rcx, rax
xor r8, rdx
or rcx, r8
je .L9
mov rcx, rax
or rcx, rdx
je .L9
mov rcx, rsi
xor r10d, r10d
or rcx, rax
jne .L3
mov rsi, rdi
mov rax, rdx
xor edi, edi
xor edx, edx
mov rcx, rsi
mov r10d, 64
or rcx, rax
.L3:
rep bsf rcx, rcx
xor r8d, r8d # OUCH: BSF and TZCNT return at most 63,
shrd rsi, rdi, cl
shr rdi, cl
test cl, 64 # so this is dead code!
cmovne rsi, rdi #
cmovne rdi, r8 #
shrd rax, rdx, cl
xor r11d, r11d # OUCH: BSF and TZCNT return at most 63,
shr rdx, cl
test cl, 64 # so this is dead code!
cmovne rax, rdx #
cmovne rdx, r11 #
mov r8, rsi
mov r9, rdi
add r10d, ecx
mov rcx, r8
mov rsi, rax
mov rdi, rdx
test r8, r8
je .L14
.L4:
rep bsf rcx, rcx
mov rax, r8
mov rdx, r9
xor r11d, r11d # OUCH: BSF and TZCNT return at most 63,
shr rdx, cl
shrd rax, r9, cl
and ecx, 64 # (there's also no need to modify ECX)
cmovne rax, rdx # so this is dead code!
cmovne rdx, r11 #
.L7:
mov rcx, rsi
test rsi, rsi
jne .L5
mov rsi, rdi
xor edi, edi
mov rcx, rsi
.L5:
rep bsf rcx, rcx
xor r8d, r8d # OUCH: BSF and TZCNT return at most 63,
shrd rsi, rdi, cl
shr rdi, cl
test cl, 64 # so this is dead code,
mov rcx, rdx
cmovne rsi, rdi # and that too!
cmovne rdi, r8 #
cmp rax, rsi
sbb rcx, rdi
jnc .L6
mov r8, rax
mov r9, rdx
mov rax, rsi
mov rdx, rdi
mov rsi, r8
mov rdi, r9
.L6:
sub rsi, rax
sbb rdi, rdx
mov rcx, rdi
or rcx, rsi
jne .L7
mov ecx, r10d
xor esi, esi
shld rdx, rax, cl
sal rax, cl
and ecx, 64 # Oops: there's no need to modify ECX!
cmovne rdx, rax
cmovne rax, rsi
ret
.L9:
mov rax, rsi
mov rdx, rdi
.L1:
ret
.L14:
mov r8, r9
xor r9d, r9d
mov rcx, r8
jmp .L4
20 superfluous instructions of the total 102 instructions!
NOT AMUSED
Stefan Kanthak
PS: <https://skanthak.homepage.t-online.de/gcc.html#case27-amd64-s>
shows properly written assembly code.