https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51837
Peter Cordes <peter at cordes dot ca> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |peter at cordes dot ca
--- Comment #1 from Peter Cordes <peter at cordes dot ca> ---
This bug isn't limited to __uint128_t. There seems to be a general problem with
full-multiplies (at least on x86). I see similar excess-mov instructions (or
saving/restoring a register that is not touched) with a 32*32->64b full
multiply in 32bit code. gcc does a bad job with mulx as well.
Reminder: mulx has two read-only input operands: implicit rdx and explicit
source operand (first operand in AT&T syntax). The next two operands are
write-only output operands: high half and low half, in that order in AT&T
syntax.
summary of things I noticed:
* sometimes mul r32 is the right choice, even when imul r64,r64 is available.
It's faster on some CPUs, but even -mtune=atom (the most extreme case of slow
64bit imul) uses imul r64,r64.
* mulx r64,r64,r64 has one cycle more latency than mul r64 on Haswell.
* gcc doesn't notice that it should put C in edx for mulx when it needs B*C
and A*C.
* gcc is terrible at choosing output registers for mulx
* It can matter which order multiplies appear in the source. If one of the
multiplies has only half its results used, it can save instructions to do it
first, since only one of rdx and rax needs to be saved. Compilers should
re-order for optimal code regardless of source order.
* is it normal for -m32 -Os to default to making stack frames? clang -m32 -Os
doesn't.
test cases: (godbolt link: http://goo.gl/2iL7f2)
// 32bit version of Steven's testcase
#include <stdint.h>
uint64_t foo64(unsigned x, unsigned y) {
uint64_t z = (uint64_t)x * y;
return z ^ (z >> 32);
}
gcc 5.3.0 -O3 -m32
pushl %ebx
movl 12(%esp), %eax
mull 8(%esp)
popl %ebx # save/restore of an otherwise-unused register.
Regression from 4.9.2
xorl %edx, %eax
gcc 5.3.0 -O3 -m32 -mbmi2
pushl %ebx
movl 12(%esp), %eax
movl %eax, %edx # oops we're using mulx, not mul?
mulx 8(%esp), %ecx, %ebx
movl %ecx, %eax # this is sub-optimal even with that choice of
output reg for mulx
movl %ebx, %edx
xorl %ebx, %eax # note that even with ideal sequences, mulx
didn't gain anything
popl %ebx
64b: gcc 5.3.0 -O3 -mbmi2 # 64bit mode: 32bit mulx would help significantly,
but it isn't used
movl %edi, %eax
movl %esi, %esi
imulq %rax, %rsi
movq %rsi, %rax
shrq $32, %rax
xorq %rsi, %rax
hand-optimized 64bit: same as the 32bit case.
mov %edi, %eax
mul %esi # mulx isn't helpful here
xor %edx, %eax
Even when inlined somewhere that doesn't need the upper halves of input
regs zeroed, it's 3 uops on SnB for mul r32 vs 3 for imul r,r + mov + shr, with
better or equal latency (depending on mov being 0 or 1c)
I guess this would require recognizing when we want 2 halves of a multiply, and
using the otherwise-slower single-operand form of mul. Note that AMD BD-family
runs `mul r32` faster than `imul r64, r64`, and so does Atom (but not
Silvermont).
-------------------
//Steven's function:
uint64_t foo128(uint64_t x, uint64_t y) {
__uint128_t z = (__uint128_t)x * y;
return z ^ (z >> 64);
}
gcc 5.3.0 -O3: same as Steven reported for gcc 4.7
gcc 5.3.0 -O3 -march=haswell
movq %rdi, %rdx # correct startup sequence
mulx %rsi, %r9, %r10 # bad choice of output regs, like 32bit
movq %r10, %rax # correct sequence for handling the
badly-chosen mulx outputs
xorq %r9, %rax
At 64bit operand size, mul has one cycle lower latency than mulx on Haswell,
so it's only a better choice when the choice of outputs helps, or the different
implicit input (rdx instead of rax).
Obviously we can avoid the mov and the REX prefixes by choosing different
output registers. clang uses rcx and rax as output registers for mulx, which
is the obvious choice. (or overwrite an input register).
----
// A slightly more complex function:
struct DE64 { uint64_t D,E; };
struct DE64 f64_structret(uint64_t A, uint64_t B, uint64_t C) {
__uint128_t AC = A * (__uint128_t)C; // gcc makes slightly better code with
BC first. Order shouldn't matter
__uint128_t BC = B * (__uint128_t)C;
uint64_t D = AC >> 64; // high half
uint64_t E = AC + (BC >> 64);
struct DE64 retval = { D, E };
return retval;
}
# C is already in rdx, which is perfect for mulx. In the 32bit case (below),
gcc doesn't realize it should put C into edx for easy reuse.
gcc 5.3.0 -O3 -march=haswell: one insn shorter if BC comes first in the
source!
movq %rsi, %rcx
pushq %rbx # at least rbx is actually used
mulx %rdi, %r9, %r10
movq %r10, %rax
mulx %rcx, %rcx, %rbx
leaq (%rbx,%r9), %rdx
popq %rbx
ret
clang 3.7.1 -O3 -march=haswell:
mulxq %rdi, %rcx, %rax
mulxq %rsi, %rsi, %rdx
addq %rcx, %rdx
retq
gcc 5.3.0 -O3 -march=haswell -mno-bmi2
movq %rdi, %rax
movq %rsi, %rcx
movq %rdx, %rsi
mulq %rdx
movq %rax, %r9
movq %rsi, %rax
movq %rdx, %r10
mulq %rcx
movq %r10, %rax
leaq (%rdx,%r9), %rdx
clang 3.7.1 -O3 -march=haswell -mno-bmi2 (one insn shorter than gcc, and
doesn't need to use an lea for adding)
movq %rdx, %rcx
movq %rcx, %rax
mulq %rdi
movq %rax, %rdi
movq %rdx, %r8
movq %rcx, %rax
mulq %rsi
addq %rdi, %rdx
movq %r8, %rax
# putting the BC= and AC= lines in the other order in the source leads to
clang using fewer regs, and not needing REX prefixes for r8, but the same
number of total insns.
I didn't try very hard, but I don't think we can get the job done any
more efficiently than this.
If the struct had its values in the opposite order, doing the multiplies
in the other order would save insns.
----
// 32bit version of the same function
struct DE { uint32_t D,E; } global_result; // simpler compiler output than
returning a struct
void f(uint32_t A, uint32_t B, uint32_t C)
{
uint64_t AC = A * (uint64_t)C;
uint64_t BC = B * (uint64_t)C;
global_result.D = AC >> 32; // high half
global_result.E = AC + (BC >> 32);
}
64bit output looks fine (uses imul r64,r64 and shifts even when mulx r32 is
available, but that's minor)
clang 3.7.1 -m32 -O3 -march=haswell (optimal: what we'd like gcc to do)
pushl %esi
movl 16(%esp), %edx
mulxl 8(%esp), %ecx, %eax
mulxl 12(%esp), %edx, %esi
addl %ecx, %esi
movl %eax, global_result
movl %esi, global_result+4
popl %esi
gcc 5.3.0 -m32 -O3 -march=haswell: doesn't think of putting the common
multiplicand (C) in %edx
pushl %edi
pushl %esi
pushl %ebx
movl 24(%esp), %ecx
movl %ecx, %edx
mulx 16(%esp), %esi, %edi
movl %ecx, %edx
movl %edi, global_result
mulx 20(%esp), %ecx, %ebx
leal (%esi,%ebx), %eax
popl %ebx
movl %eax, global_result+4
popl %esi
popl %edi
gcc 5.3.0 -m32 -Os -march=haswell: optimal (by good luck maybe?)
pushl %ebp # Is it normal for -Os to make stack frames?
movl %esp, %ebp
pushl %ebx
movl 16(%ebp), %edx
mulx 8(%ebp), %ecx, %ebx
mulx 12(%ebp), %eax, %edx
addl %edx, %ecx
movl %ebx, global_result
movl %ecx, global_result+4
popl %ebx
popl %ebp