When generating code for a simple inner loop (instantiated with
std::complex<float>)

/// Element-wise kernel: for every index i in [0, nn) stores
///     dd[i] = aa[i]*uu + bb[i]*vv + cc[i]
///
/// @tparam cx  element type (the benchmark instantiates it with
///             std::complex<float>)
/// @param aa   first input array, scaled by uu
/// @param bb   second input array, scaled by vv
/// @param cc   third input array, added unscaled
/// @param dd   output array
/// @param uu   scalar multiplied with each aa[i]
/// @param vv   scalar multiplied with each bb[i]
/// @param nn   number of elements to process; nothing is written when 0
///
/// All four pointers are __restrict__-qualified, so callers must pass
/// non-overlapping arrays.  noinline keeps the loop in a function of its
/// own, so its generated code can be inspected in isolation.
template <typename cx>
void __attribute__((noinline)) benchcore(const cx* __restrict__ aa,
                                         const cx* __restrict__ bb,
                                         const cx* __restrict__ cc,
                                         cx* __restrict__ dd,
                                         cx uu, cx vv, size_t nn) {
    // The index has the same unsigned type as nn: this avoids the
    // signed/unsigned comparison and the dependency on POSIX ssize_t.
    for (size_t ii = 0; ii < nn; ++ii) {
        dd[ii] = aa[ii] * uu + bb[ii] * vv + cc[ii];
    }
}

g++ generates the following assembly code (g++ 7.1.0) (compiled with:
g++ -I. test.cc -O3 -ggdb3 -o test)

Dump of assembler code for function benchcore<std::complex<float>
>(std::complex<float> const*, std::complex<float> const*,
std::complex<float> const*, std::complex<float>*, std::complex<float>,
std::complex<float>, unsigned long):
   0x00000000004029d0 <+0>: push   %r15
   0x00000000004029d2 <+2>: push   %r14
   0x00000000004029d4 <+4>: push   %r13
   0x00000000004029d6 <+6>: push   %r12
   0x00000000004029d8 <+8>: push   %rbp
   0x00000000004029d9 <+9>: push   %rbx
   0x00000000004029da <+10>: sub    $0x38,%rsp
   0x00000000004029de <+14>: test   %r8,%r8
   0x00000000004029e1 <+17>: movq   %xmm0,0x28(%rsp)
   0x00000000004029e7 <+23>: movq   %xmm1,0x20(%rsp)
   0x00000000004029ed <+29>: movss  0x28(%rsp),%xmm4
   0x00000000004029f3 <+35>: movss  0x2c(%rsp),%xmm5
   0x00000000004029f9 <+41>: movss  0x20(%rsp),%xmm6
   0x00000000004029ff <+47>: movss  0x24(%rsp),%xmm7
   0x0000000000402a05 <+53>: movss  %xmm4,(%rsp)
   0x0000000000402a0a <+58>: movss  %xmm5,0x4(%rsp)
   0x0000000000402a10 <+64>: movss  %xmm6,0x8(%rsp)
   0x0000000000402a16 <+70>: movss  %xmm7,0xc(%rsp)
   0x0000000000402a1c <+76>: je     0x402abd
<benchcore<std::complex<float> >(std::complex<float> const*,
std::complex<float> const*, std::complex<float> const*,
std::complex<float>*, std::complex<float>, std::complex<float>,
unsigned long)+237>
   0x0000000000402a22 <+82>: mov    %r8,%r13
   0x0000000000402a25 <+85>: mov    %rcx,%r15
   0x0000000000402a28 <+88>: mov    %rdx,%r14
   0x0000000000402a2b <+91>: mov    %rsi,%r12
   0x0000000000402a2e <+94>: mov    %rdi,%rbp
   0x0000000000402a31 <+97>: xor    %ebx,%ebx
   0x0000000000402a33 <+99>: nopl   0x0(%rax,%rax,1)
   0x0000000000402a38 <+104>: movss  0x4(%r12,%rbx,8),%xmm3
   0x0000000000402a3f <+111>: movss  (%r12,%rbx,8),%xmm2
   0x0000000000402a45 <+117>: movss  0x8(%rsp),%xmm0
   0x0000000000402a4b <+123>: movss  0xc(%rsp),%xmm1
   0x0000000000402a51 <+129>: callq  0x400aa0 <__mulsc3@plt>
   0x0000000000402a56 <+134>: movq   %xmm0,0x18(%rsp)
   0x0000000000402a5c <+140>: movss  0x4(%rsp),%xmm3
   0x0000000000402a62 <+146>: movss  0x4(%rbp,%rbx,8),%xmm1
   0x0000000000402a68 <+152>: movss  0x0(%rbp,%rbx,8),%xmm0
   0x0000000000402a6e <+158>: movss  (%rsp),%xmm2
   0x0000000000402a73 <+163>: callq  0x400aa0 <__mulsc3@plt>
   0x0000000000402a78 <+168>: movq   %xmm0,0x10(%rsp)
   0x0000000000402a7e <+174>: movss  0x18(%rsp),%xmm1
   0x0000000000402a84 <+180>: movss  0x1c(%rsp),%xmm0
   0x0000000000402a8a <+186>: addss  0x14(%rsp),%xmm0
   0x0000000000402a90 <+192>: addss  0x10(%rsp),%xmm1
   0x0000000000402a96 <+198>: addss  0x4(%r14,%rbx,8),%xmm0
   0x0000000000402a9d <+205>: addss  (%r14,%rbx,8),%xmm1
   0x0000000000402aa3 <+211>: movss  %xmm0,0x4(%r15,%rbx,8)
   0x0000000000402aaa <+218>: movss  %xmm1,(%r15,%rbx,8)
   0x0000000000402ab0 <+224>: add    $0x1,%rbx
   0x0000000000402ab4 <+228>: cmp    %r13,%rbx
   0x0000000000402ab7 <+231>: jne    0x402a38
<benchcore<std::complex<float> >(std::complex<float> const*,
std::complex<float> const*, std::complex<float> const*,
std::complex<float>*, std::complex<float>, std::complex<float>,
unsigned long)+104>
   0x0000000000402abd <+237>: add    $0x38,%rsp
   0x0000000000402ac1 <+241>: pop    %rbx
   0x0000000000402ac2 <+242>: pop    %rbp
   0x0000000000402ac3 <+243>: pop    %r12
   0x0000000000402ac5 <+245>: pop    %r13
   0x0000000000402ac7 <+247>: pop    %r14
   0x0000000000402ac9 <+249>: pop    %r15
   0x0000000000402acb <+251>: retq
End of assembler dump.

The interesting part is the two calls to __mulsc3, which the docs
indicate computes complex multiplication according to Annex G of the
C99 standard.  This leads me to two questions.

First, disassembling __mulsc3 doesn't seem to show any actual implementation:

(gdb) disassemble __mulsc3
Dump of assembler code for function __mulsc3@plt:
   0x0000000000400aa0 <+0>: jmpq   *0x2035d2(%rip)        # 0x604078
   0x0000000000400aa6 <+6>: pushq  $0xc
   0x0000000000400aab <+11>: jmpq   0x4009d0
End of assembler dump.

What's the cause of this?

Second, since I don't think I'll convince anyone to generate
non-standard-conforming code by default, could the default performance
of complex multiplication be improved significantly by performing the
isnan() checks required by Annex G inline and calling the function
only to fix up the results when those checks fail?  That would at
least move the function call overhead off the critical path.

Reply via email to