https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51837
Peter Cordes <peter at cordes dot ca> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |peter at cordes dot ca
--- Comment #1 from Peter Cordes <peter at cordes dot ca> ---
This bug isn't limited to __uint128_t. There seems to be a general problem with
full-multiplies (at least on x86). I see similar excess-mov instructions (or
saving/restoring a register that is not touched) with a 32*32->64b full
multiply in 32bit code. gcc does a bad job with mulx as well.
Reminder: mulx has two read-only input operands: implicit rdx and explicit
source operand (first operand in AT&T syntax). The next two operands are
write-only output operands: high half and low half, in that order in AT&T
syntax.
summary of things I noticed:
* sometimes mul r32 is the right choice, even when imul r64,r64 is available.
It's faster on some CPUs, but even -mtune=atom (the most extreme case of slow
64bit imul) uses imul r64,r64.
* mulx r64,r64,r64 has one cycle more latency than mul r64 on Haswell.
* gcc doesn't notice that it should put C in edx for mulx when it needs B*C
and A*C.
* gcc is terrible at choosing output registers for mulx
* It can matter which order multiplies appear in the source. If one of the
multiplies has only half its results used, it can save instructions to do it
first, since only one of rdx and rax needs to be saved. Compilers should
re-order for optimal code regardless of source order.
* is it normal for -m32 -Os to default to making stack frames? clang -m32 -Os
doesn't.
test cases: (godbolt link: http://goo.gl/2iL7f2)
// 32bit version of Steven's testcase
#include <stdint.h>
uint64_t foo64(unsigned x, unsigned y) {
uint64_t z = (uint64_t)x * y;
return z ^ (z >> 32);
}
gcc 5.3.0 -O3 -m32
pushl %ebx
movl 12(%esp), %eax
mull 8(%esp)
popl %ebx # save/restore of an otherwise-unused register.
Regression from 4.9.2
xorl %edx, %eax
gcc 5.3.0 -O3 -m32 -mbmi2
pushl %ebx
movl 12(%esp), %eax
movl %eax, %edx # oops we're using mulx, not mul?
mulx 8(%esp), %ecx, %ebx
movl %ecx, %eax # this is sub-optimal even with that choice of
output reg for mulx
movl %ebx, %edx
xorl %ebx, %eax # note that even with ideal sequences, mulx
didn't gain anything
popl %ebx
64b: gcc 5.3.0 -O3 -mbmi2 # 64bit mode: 32bit mulx would help significantly,
but it isn't used
movl %edi, %eax
movl %esi, %esi
imulq %rax, %rsi
movq %rsi, %rax
shrq $32, %rax
xorq %rsi, %rax
hand-optimized 64bit: same as the 32bit case.
mov %edi, %eax
mul %esi # mulx isn't helpful here
xor %edx, %eax
Even when inlined somewhere that doesn't need the upper halves of input
regs zeroed, it's 3 uops on SnB for mul r32 vs 3 for imul r,r + mov + shr, with
better or equal latency (depending on mov being 0 or 1c)
I guess this would require recognizing when we want 2 halves of a multiply, and
using the otherwise-slower single-operand form of mul. Note that AMD BD-family
runs `mul r32` faster than `imul r64, r64`, and so does Atom (but not
Silvermont).
-------------------
//Steven's function:
uint64_t foo128(uint64_t x, uint64_t y) {
__uint128_t z = (__uint128_t)x * y;
return z ^ (z >> 64);
}
gcc 5.3.0 -O3: same as Steven reported for gcc 4.7
gcc 5.3.0 -O3 -march=haswell
movq %rdi, %rdx # correct startup sequence
mulx %rsi, %r9, %r10 # bad choice of output regs, like 32bit
movq %r10, %rax # correct sequence for handling the
badly-chosen mulx outputs
xorq %r9, %rax
At 64bit operand size, mul has one cycle lower latency than mulx on Haswell,
so it's only a better choice when the choice of outputs helps, or the different
implicit input (rdx instead of rax).
Obviously we can avoid the mov and the REX prefixes by choosing different
output registers. clang uses rcx and rax as output registers for mulx, which
is the obvious choice. (or overwrite an input register).
----
// A slightly more complex function:
struct DE64 { uint64_t D,E; };
struct DE64 f64_structret(uint64_t A, uint64_t B, uint64_t C) {
__uint128_t AC = A * (__uint128_t)C; // gcc makes slightly better code with
BC first. Order shouldn't matter
__uint128_t BC = B * (__uint128_t)C;
uint64_t D = AC >> 64; // high half
uint64_t E = AC + (BC >> 64);
struct DE64 retval = { D, E };
return retval;
}
# C is already in rdx, which is perfect for mulx. In the 32bit case (below),
gcc doesn't realize it should put C into edx for easy reuse.
gcc 5.3.0 -O3 -march=haswell: one insn shorter if BC comes first in the
source!
movq %rsi, %rcx
pushq %rbx # at least rbx is actually used
mulx %rdi, %r9, %r10
movq %r10, %rax
mulx %rcx, %rcx, %rbx
leaq (%rbx,%r9), %rdx
popq %rbx
ret
clang 3.7.1 -O3 -march=haswell:
mulxq %rdi, %rcx, %rax
mulxq %rsi, %rsi, %rdx
addq %rcx, %rdx
retq
gcc 5.3.0 -O3 -march=haswell -mno-bmi2
movq %rdi, %rax
movq %rsi, %rcx
movq %rdx, %rsi
mulq %rdx
movq %rax, %r9
movq %rsi, %rax
movq %rdx, %r10
mulq %rcx
movq %r10, %rax
leaq (%rdx,%r9), %rdx
clang 3.7.1 -O3 -march=haswell -mno-bmi2 (one insn shorter than gcc, and
doesn't need to use an lea for adding)
movq %rdx, %rcx
movq %rcx, %rax
mulq %rdi
movq %rax, %rdi
movq %rdx, %r8
movq %rcx, %rax
mulq %rsi
addq %rdi, %rdx
movq %r8, %rax
# putting the BC= and AC= lines in the other order in the source leads to
clang using fewer regs, and not needing REX prefixes for r8, but the same
number of total insns.
I didn't try very hard, but I don't think we can get the job done any
more efficiently than this.
If the struct had its values in the opposite order, doing the multiplies
in the other order would save insns.
----
// 32bit version of the same function
struct DE { uint32_t D,E; } global_result; // simpler compiler output than
returning a struct
void f(uint32_t A, uint32_t B, uint32_t C)
{
uint64_t AC = A * (uint64_t)C;
uint64_t BC = B * (uint64_t)C;
global_result.D = AC >> 32; // high half
global_result.E = AC + (BC >> 32);
}
64bit output looks fine (uses imul r64,r64 and shifts even when mulx r32 is
available, but that's minor)
clang 3.7.1 -m32 -O3 -march=haswell (optimal: what we'd like gcc to do)
pushl %esi
movl 16(%esp), %edx
mulxl 8(%esp), %ecx, %eax
mulxl 12(%esp), %edx, %esi
addl %ecx, %esi
movl %eax, global_result
movl %esi, global_result+4
popl %esi
gcc 5.3.0 -m32 -O3 -march=haswell: doesn't think of putting the common
multiplicand (C) in %edx
pushl %edi
pushl %esi
pushl %ebx
movl 24(%esp), %ecx
movl %ecx, %edx
mulx 16(%esp), %esi, %edi
movl %ecx, %edx
movl %edi, global_result
mulx 20(%esp), %ecx, %ebx
leal (%esi,%ebx), %eax
popl %ebx
movl %eax, global_result+4
popl %esi
popl %edi
gcc 5.3.0 -m32 -Os -march=haswell: optimal (by good luck maybe?)
pushl %ebp # Is it normal for -Os to make stack frames?
movl %esp, %ebp
pushl %ebx
movl 16(%ebp), %edx
mulx 8(%ebp), %ecx, %ebx
mulx 12(%ebp), %eax, %edx
addl %edx, %ecx
movl %ebx, global_result
movl %ecx, global_result+4
popl %ebx
popl %ebp