https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974
/*
 * --- Comment #11 from Mason <slash.tmp at free dot fr> ---
 * Here's umul_least_64() rewritten as mul_64x64_128() in C
 */

typedef unsigned int u32;
typedef unsigned long long u64;

/*
 * Multiply a[0] by b[0] and add the 64-bit product into the three-word
 * little-endian accumulator acc[0..2] (acc[0] is the least significant word).
 *
 * u32 acc[3], a[1], b[1]
 *
 * The carry is propagated through acc[2] only; a carry out of acc[2] is
 * discarded.
 *
 * On x86 with a GNU-compatible compiler the addition is a single
 * add/adc/adc chain operating directly on memory.  Elsewhere an equivalent
 * portable sequence is used (it assumes u32 is exactly 32 bits wide).
 */
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
    u64 res = (u64)a[0] * b[0];
    u32 lo = (u32)res;
    u32 hi = (u32)(res >> 32);

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /*
     * __asm__ rather than asm: the plain keyword is unavailable in strict
     * ISO modes such as -std=c11.
     * Explicit 'l' suffixes: "adc $0, mem" has no register operand, so the
     * operand size would otherwise be ambiguous to the assembler.
     */
    __asm__("addl %[LO], %[D0]\n\t"
            "adcl %[HI], %[D1]\n\t"
            "adcl $0, %[D2]"
            : [D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2])
            : [LO] "r" (lo), [HI] "r" (hi)
            : "cc");
#else
    /* Same three-word carry chain, expressed with 64-bit intermediates. */
    u64 sum = (u64)acc[0] + lo;
    acc[0] = (u32)sum;
    sum = (u64)acc[1] + hi + (sum >> 32);
    acc[1] = (u32)sum;
    acc[2] += (u32)(sum >> 32);
#endif
}

/*
 * Schoolbook 64x64 multiplication on 32-bit limbs: the four partial
 * products a[i] * b[j] are added into acc at word offset i + j.
 *
 * u32 acc[5], a[2], b[2]
 *
 * All operands are little-endian word arrays.  The partial products are
 * ADDED to the existing contents of acc, so the caller must zero acc to
 * obtain the plain product; in that case the 128-bit result is in
 * acc[0..3] and acc[4] stays 0.  With a non-zero starting acc, note that
 * each step propagates its carry through three words only.
 */
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
    mul_add_32x32(acc + 0, a + 0, b + 0);
    mul_add_32x32(acc + 1, a + 0, b + 1);
    mul_add_32x32(acc + 1, a + 1, b + 0);
    mul_add_32x32(acc + 2, a + 1, b + 1);
}

/*
 * Listing as posted in the report (generated from the unsuffixed add/adc
 * template, hence the mnemonics below carry no 'l' suffix):
 *
 * gcc-trunk -O3 -m32
 *
 * mul_64x64_128:
 *     pushl %esi
 *     pushl %ebx
 *     movl 16(%esp), %ebx ; ebx = a
 *     movl 20(%esp), %esi ; esi = b
 *     movl 12(%esp), %ecx ; ecx = acc
 *     movl (%esi), %eax ; b0
 *     mull (%ebx) ; a0*b0
 *     add %eax, (%ecx)
 *     adc %edx, 4(%ecx)
 *     adc $0, 8(%ecx)
 *     movl 4(%esi), %eax ; b1
 *     mull (%ebx) ; a0*b1
 *     add %eax, 4(%ecx)
 *     adc %edx, 8(%ecx)
 *     adc $0, 12(%ecx)
 *     movl (%esi), %eax ; b0
 *     mull 4(%ebx) ; a1*b0
 *     add %eax, 4(%ecx)
 *     adc %edx, 8(%ecx)
 *     adc $0, 12(%ecx)
 *     movl 4(%esi), %eax ; b1
 *     mull 4(%ebx) ; a1*b1
 *     add %eax, 8(%ecx)
 *     adc %edx, 12(%ecx)
 *     adc $0, 16(%ecx)
 *     popl %ebx
 *     popl %esi
 *     ret
 */