https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124546
Bug ID: 124546
Summary: Missed optimisation of 128-bit widening multiply
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
Target Milestone: ---
Input:
#include <cstdint>

unsigned char add_uint64(uint64_t op1, uint64_t op2, uint64_t *r) {
  *r = op1 + op2;
  return static_cast<unsigned char>(*r < op1); // carry out of the 64-bit add
}

void mul_uint64 (uint64_t op1, uint64_t op2, uint64_t *r) {
  // Split each operand into 32-bit low and high halves.
  auto op1_c_r = op1 & 0x00000000FFFFFFFF;
  auto op2_c_r = op2 & 0x00000000FFFFFFFF;
  op1 >>= 32;
  op2 >>= 32;
  // Cross products (high x low), summed with their carry.
  auto m1 = op1 * op2_c_r;
  uint64_t m;
  auto left = op1 * op2 +
      (static_cast<uint64_t>(add_uint64(m1, op2 * op1_c_r, &m)) << 32);
  // Low product, then recombine the partial products into the 128-bit result.
  auto rt = op1_c_r * op2_c_r;
  auto ts = (rt >> 32) + (m & 0x00000000FFFFFFFF);
  r[1] = static_cast<uint64_t>(left + (m >> 32) + (ts >> 32));
  r[0] = static_cast<uint64_t>((ts << 32) | (rt & 0x00000000FFFFFFFF));
}
On aarch64 with -O3 we generate:
add_uint64(unsigned long, unsigned long, unsigned long*):
        adds    x1, x0, x1
        str     x1, [x2]
        cset    w0, cs
        ret
mul_uint64(unsigned long, unsigned long, unsigned long*):
        lsr     x4, x0, 32
        lsr     x6, x1, 32
        umull   x3, w0, w1
        umull   x1, w1, w4
        and     x5, x3, 4294967295
        umaddl  x0, w0, w6, x1
        cmp     x1, x0
        and     x1, x0, 4294967295
        lsr     x0, x0, 32
        add     x3, x1, x3, lsr 32
        cset    x1, hi
        orr     x5, x5, x3, lsl 32
        umaddl  x4, w4, w6, x0
        extr    x0, x1, x3, 32
        add     x0, x0, x4
        stp     x5, x0, [x2]
        ret
whereas Clang recognises the full widening multiply and emits:
add_uint64(unsigned long, unsigned long, unsigned long*):
        adds    x8, x1, x0
        cset    w0, hs
        str     x8, [x2]
        ret
mul_uint64(unsigned long, unsigned long, unsigned long*):
        umulh   x8, x1, x0
        mul     x9, x0, x1
        stp     x9, x8, [x2]
        ret
This makes GCC much slower than Clang on a crypto workload where this
pattern appears.
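
For reference, a minimal sketch (not part of the reported testcase; the
function name mul_uint64_ref is hypothetical) of the same 64x64->128-bit
product expressed through unsigned __int128, which aarch64 compilers
generally lower directly to a mul/umulh pair. The manual limb-by-limb
version above should ideally be recognised as equivalent to this form:

#include <cstdint>

// Sketch only: full product via unsigned __int128.
void mul_uint64_ref(uint64_t op1, uint64_t op2, uint64_t *r) {
  unsigned __int128 p = static_cast<unsigned __int128>(op1) * op2;
  r[0] = static_cast<uint64_t>(p);       // low 64 bits
  r[1] = static_cast<uint64_t>(p >> 64); // high 64 bits
}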