When generating code for the following simple inner loop (instantiated with std::complex<float>):
template <typename cx> void __attribute__((noinline)) benchcore(const cx* __restrict__ aa, const cx* __restrict__ bb, const cx* __restrict__ cc, cx* __restrict__ dd, cx uu, cx vv, size_t nn) { for (ssize_t ii=0; ii < nn; ii++) { dd[ii] = ( aa[ii]*uu + bb[ii]*vv + cc[ii] ); } } g++ generates the following assembly code (g++ 7.1.0) (compiled with: g++ -I. test.cc -O3 -ggdb3 -o test) Dump of assembler code for function benchcore<std::complex<float> >(std::complex<float> const*, std::complex<float> const*, std::complex<float> const*, std::complex<float>*, std::complex<float>, std::complex<float>, unsigned long): 0x00000000004029d0 <+0>: push %r15 0x00000000004029d2 <+2>: push %r14 0x00000000004029d4 <+4>: push %r13 0x00000000004029d6 <+6>: push %r12 0x00000000004029d8 <+8>: push %rbp 0x00000000004029d9 <+9>: push %rbx 0x00000000004029da <+10>: sub $0x38,%rsp 0x00000000004029de <+14>: test %r8,%r8 0x00000000004029e1 <+17>: movq %xmm0,0x28(%rsp) 0x00000000004029e7 <+23>: movq %xmm1,0x20(%rsp) 0x00000000004029ed <+29>: movss 0x28(%rsp),%xmm4 0x00000000004029f3 <+35>: movss 0x2c(%rsp),%xmm5 0x00000000004029f9 <+41>: movss 0x20(%rsp),%xmm6 0x00000000004029ff <+47>: movss 0x24(%rsp),%xmm7 0x0000000000402a05 <+53>: movss %xmm4,(%rsp) 0x0000000000402a0a <+58>: movss %xmm5,0x4(%rsp) 0x0000000000402a10 <+64>: movss %xmm6,0x8(%rsp) 0x0000000000402a16 <+70>: movss %xmm7,0xc(%rsp) 0x0000000000402a1c <+76>: je 0x402abd <benchcore<std::complex<float> >(std::complex<float> const*, std::complex<float> const*, std::complex<float> const*, std::complex<float>*, std::complex<float>, std::complex<float>, unsigned long)+237> 0x0000000000402a22 <+82>: mov %r8,%r13 0x0000000000402a25 <+85>: mov %rcx,%r15 0x0000000000402a28 <+88>: mov %rdx,%r14 0x0000000000402a2b <+91>: mov %rsi,%r12 0x0000000000402a2e <+94>: mov %rdi,%rbp 0x0000000000402a31 <+97>: xor %ebx,%ebx 0x0000000000402a33 <+99>: nopl 0x0(%rax,%rax,1) 0x0000000000402a38 <+104>: movss 0x4(%r12,%rbx,8),%xmm3 0x0000000000402a3f <+111>: 
movss (%r12,%rbx,8),%xmm2 0x0000000000402a45 <+117>: movss 0x8(%rsp),%xmm0 0x0000000000402a4b <+123>: movss 0xc(%rsp),%xmm1 0x0000000000402a51 <+129>: callq 0x400aa0 <__mulsc3@plt> 0x0000000000402a56 <+134>: movq %xmm0,0x18(%rsp) 0x0000000000402a5c <+140>: movss 0x4(%rsp),%xmm3 0x0000000000402a62 <+146>: movss 0x4(%rbp,%rbx,8),%xmm1 0x0000000000402a68 <+152>: movss 0x0(%rbp,%rbx,8),%xmm0 0x0000000000402a6e <+158>: movss (%rsp),%xmm2 0x0000000000402a73 <+163>: callq 0x400aa0 <__mulsc3@plt> 0x0000000000402a78 <+168>: movq %xmm0,0x10(%rsp) 0x0000000000402a7e <+174>: movss 0x18(%rsp),%xmm1 0x0000000000402a84 <+180>: movss 0x1c(%rsp),%xmm0 0x0000000000402a8a <+186>: addss 0x14(%rsp),%xmm0 0x0000000000402a90 <+192>: addss 0x10(%rsp),%xmm1 0x0000000000402a96 <+198>: addss 0x4(%r14,%rbx,8),%xmm0 0x0000000000402a9d <+205>: addss (%r14,%rbx,8),%xmm1 0x0000000000402aa3 <+211>: movss %xmm0,0x4(%r15,%rbx,8) 0x0000000000402aaa <+218>: movss %xmm1,(%r15,%rbx,8) 0x0000000000402ab0 <+224>: add $0x1,%rbx 0x0000000000402ab4 <+228>: cmp %r13,%rbx 0x0000000000402ab7 <+231>: jne 0x402a38 <benchcore<std::complex<float> >(std::complex<float> const*, std::complex<float> const*, std::complex<float> const*, std::complex<float>*, std::complex<float>, std::complex<float>, unsigned long)+104> 0x0000000000402abd <+237>: add $0x38,%rsp 0x0000000000402ac1 <+241>: pop %rbx 0x0000000000402ac2 <+242>: pop %rbp 0x0000000000402ac3 <+243>: pop %r12 0x0000000000402ac5 <+245>: pop %r13 0x0000000000402ac7 <+247>: pop %r14 0x0000000000402ac9 <+249>: pop %r15 0x0000000000402acb <+251>: retq End of assembler dump. The interesting part is the two calls to __mulsc3, which the docs indicate computes complex multiplication according to Annex G of the C99 standard. This leads me to two questions. 
First, the disassembly of __mulsc3 doesn't seem to contain anything: (gdb) disassemble __mulsc3 Dump of assembler code for function __mulsc3@plt: 0x0000000000400aa0 <+0>: jmpq *0x2035d2(%rip) # 0x604078 0x0000000000400aa6 <+6>: pushq $0xc 0x0000000000400aab <+11>: jmpq 0x4009d0 End of assembler dump. What's the cause of this? Second, since I don't think I'll convince anyone to generate code that does not conform to the standard by default, could the default performance of complex multiplication be improved significantly by performing the isnan() checks required by Annex G inline and only calling the function to fix up the results if those checks fail? That would at least move the function call overhead out of the critical path.