Complex multiplication in gcc

Sean McAllister Mon, 17 Jul 2017 09:52:25 -0700

When generating code for a simple inner loop (instantiated with
std::complex<float>)


// Benchmark kernel: for every index ii in [0, nn) stores
//     dd[ii] = aa[ii]*uu + bb[ii]*vv + cc[ii]
//
//   aa, bb, cc  input arrays, each read at indices 0 .. nn-1
//   dd          output array, written at indices 0 .. nn-1
//   uu, vv      scalar factors applied to aa[ii] and bb[ii] respectively
//   nn          number of elements to process; nothing is written when 0
//
// The pointers are __restrict__-qualified, so the caller promises the four
// arrays do not alias.  noinline asks the compiler not to inline the
// function into its caller.
template <typename cx>
void __attribute__((noinline)) benchcore(const cx* __restrict__ aa,
                                         const cx* __restrict__ bb,
                                         const cx* __restrict__ cc,
                                         cx* __restrict__ dd,
                                         cx uu, cx vv, size_t nn) {
    // The index has the same type as nn: no signed/unsigned comparison and
    // no dependence on the POSIX-only ssize_t.
    for (size_t ii = 0; ii < nn; ++ii) {
        dd[ii] = aa[ii]*uu + bb[ii]*vv + cc[ii];
    }
}

g++ generates the following assembly code (g++ 7.1.0) (compiled with:
g++ -I. test.cc -O3 -ggdb3 -o test)

Dump of assembler code for function benchcore<std::complex<float>
>(std::complex<float> const*, std::complex<float> const*,
std::complex<float> const*, std::complex<float>*, std::complex<float>,
std::complex<float>, unsigned long):
   0x00000000004029d0 <+0>: push   %r15
   0x00000000004029d2 <+2>: push   %r14
   0x00000000004029d4 <+4>: push   %r13
   0x00000000004029d6 <+6>: push   %r12
   0x00000000004029d8 <+8>: push   %rbp
   0x00000000004029d9 <+9>: push   %rbx
   0x00000000004029da <+10>: sub    $0x38,%rsp
   0x00000000004029de <+14>: test   %r8,%r8
   0x00000000004029e1 <+17>: movq   %xmm0,0x28(%rsp)
   0x00000000004029e7 <+23>: movq   %xmm1,0x20(%rsp)
   0x00000000004029ed <+29>: movss  0x28(%rsp),%xmm4
   0x00000000004029f3 <+35>: movss  0x2c(%rsp),%xmm5
   0x00000000004029f9 <+41>: movss  0x20(%rsp),%xmm6
   0x00000000004029ff <+47>: movss  0x24(%rsp),%xmm7
   0x0000000000402a05 <+53>: movss  %xmm4,(%rsp)
   0x0000000000402a0a <+58>: movss  %xmm5,0x4(%rsp)
   0x0000000000402a10 <+64>: movss  %xmm6,0x8(%rsp)
   0x0000000000402a16 <+70>: movss  %xmm7,0xc(%rsp)
   0x0000000000402a1c <+76>: je     0x402abd
<benchcore<std::complex<float> >(std::complex<float> const*,
std::complex<float> const*, std::complex<float> const*,
std::complex<float>*, std::complex<float>, std::complex<float>,
unsigned long)+237>
   0x0000000000402a22 <+82>: mov    %r8,%r13
   0x0000000000402a25 <+85>: mov    %rcx,%r15
   0x0000000000402a28 <+88>: mov    %rdx,%r14
   0x0000000000402a2b <+91>: mov    %rsi,%r12
   0x0000000000402a2e <+94>: mov    %rdi,%rbp
   0x0000000000402a31 <+97>: xor    %ebx,%ebx
   0x0000000000402a33 <+99>: nopl   0x0(%rax,%rax,1)
   0x0000000000402a38 <+104>: movss  0x4(%r12,%rbx,8),%xmm3
   0x0000000000402a3f <+111>: movss  (%r12,%rbx,8),%xmm2
   0x0000000000402a45 <+117>: movss  0x8(%rsp),%xmm0
   0x0000000000402a4b <+123>: movss  0xc(%rsp),%xmm1
   0x0000000000402a51 <+129>: callq  0x400aa0 <__mulsc3@plt>
   0x0000000000402a56 <+134>: movq   %xmm0,0x18(%rsp)
   0x0000000000402a5c <+140>: movss  0x4(%rsp),%xmm3
   0x0000000000402a62 <+146>: movss  0x4(%rbp,%rbx,8),%xmm1
   0x0000000000402a68 <+152>: movss  0x0(%rbp,%rbx,8),%xmm0
   0x0000000000402a6e <+158>: movss  (%rsp),%xmm2
   0x0000000000402a73 <+163>: callq  0x400aa0 <__mulsc3@plt>
   0x0000000000402a78 <+168>: movq   %xmm0,0x10(%rsp)
   0x0000000000402a7e <+174>: movss  0x18(%rsp),%xmm1
   0x0000000000402a84 <+180>: movss  0x1c(%rsp),%xmm0
   0x0000000000402a8a <+186>: addss  0x14(%rsp),%xmm0
   0x0000000000402a90 <+192>: addss  0x10(%rsp),%xmm1
   0x0000000000402a96 <+198>: addss  0x4(%r14,%rbx,8),%xmm0
   0x0000000000402a9d <+205>: addss  (%r14,%rbx,8),%xmm1
   0x0000000000402aa3 <+211>: movss  %xmm0,0x4(%r15,%rbx,8)
   0x0000000000402aaa <+218>: movss  %xmm1,(%r15,%rbx,8)
   0x0000000000402ab0 <+224>: add    $0x1,%rbx
   0x0000000000402ab4 <+228>: cmp    %r13,%rbx
   0x0000000000402ab7 <+231>: jne    0x402a38
<benchcore<std::complex<float> >(std::complex<float> const*,
std::complex<float> const*, std::complex<float> const*,
std::complex<float>*, std::complex<float>, std::complex<float>,
unsigned long)+104>
   0x0000000000402abd <+237>: add    $0x38,%rsp
   0x0000000000402ac1 <+241>: pop    %rbx
   0x0000000000402ac2 <+242>: pop    %rbp
   0x0000000000402ac3 <+243>: pop    %r12
   0x0000000000402ac5 <+245>: pop    %r13
   0x0000000000402ac7 <+247>: pop    %r14
   0x0000000000402ac9 <+249>: pop    %r15
   0x0000000000402acb <+251>: retq
End of assembler dump.

The interesting part is the two calls to __mulsc3, which the docs
indicate computes complex multiplication according to Annex G of the
C99 standard.  This leads me to two questions.

First, the disassembly of __mulsc3 doesn't seem to contain anything:

(gdb) disassemble __mulsc3
Dump of assembler code for function __mulsc3@plt:
   0x0000000000400aa0 <+0>: jmpq   *0x2035d2(%rip)        # 0x604078
   0x0000000000400aa6 <+6>: pushq  $0xc
   0x0000000000400aab <+11>: jmpq   0x4009d0
End of assembler dump.

What's the cause of this?

Second, since I don't think I'll convince anyone to generate
non-standard-conforming code by default, could the default performance
of complex multiplication be enhanced significantly by performing the
isnan() checks required by Annex G inline and only calling the function
to fix the results if those checks fail?  That would move the function
call overhead out of the critical path at least.

Complex multiplication in gcc

Reply via email to