On 8/13/21 8:58 PM, Stefan Kanthak wrote:
Hi,
compile the following naive implementation of nextafter() for AMD64:
JFTR: ignore the aliasing casts, they don't matter here!
$ cat repro.c
double nextafter(double from, double to)
{
    if (to != to)
        return to;                  // to is NAN
    if (from != from)
        return from;                // from is NAN
    if (from == to)                 // neither from nor to can be NAN here!
        return to;
    if (from == 0.0)                // ditto!
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull = *(unsigned long long *) &from;
    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;
    return *(double *) &ull;
}
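JFTR, part 2: if the aliasing casts bother anyone, the same bit twiddling can be
written with memcpy(), which GCC folds back into plain register moves at -O1 and
above; a purely illustrative sketch (the name nextafter_punned is mine, not part
of the repro):

#include <stdint.h>
#include <string.h>

// sketch only: same algorithm as repro.c, but the type punning goes through memcpy()
double nextafter_punned(double from, double to)
{
    if (to != to)
        return to;                          // to is NAN
    if (from != from)
        return from;                        // from is NAN
    if (from == to)
        return to;
    if (from == 0.0)
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    uint64_t ull;
    memcpy(&ull, &from, sizeof ull);        // well-defined bit copy
    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;
    memcpy(&from, &ull, sizeof from);       // and back again
    return from;
}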
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
ucomisd %xmm1, %xmm1 // sets PF for unordered result, i.e. when at
jp .L10 // least one operand is NAN
ucomisd %xmm0, %xmm0 // same here
jp .L1
ucomisd %xmm0, %xmm1
jnp .L14 // OUCH: PF can't be set here!
// OUCH: and if it were, it's MORE LIKELY to be
// clear, so this branch would be taken
// ... against the branch prediction
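// (sketch, not GCC's output: with NAN already excluded, the jnp .L14 / jne .L11
// pair could collapse into a single "je .L10" here, i.e. return to when
// from == to, and simply fall through into .L11 otherwise)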
.L11:
pxor %xmm2, %xmm2 // OUCH: switching from FP SSE to integer SSE and
// vice versa incurs a penalty of 1 cycle
// on quite a lot of Intel Core processors!
// Better use XORPD or XORPS instead (XORPS is even 1 byte shorter)!
ucomisd %xmm2, %xmm0
jnp .L15 // OUCH: there's still no need to check PF here!
.L4:
comisd %xmm0, %xmm1
movq %xmm0, %rdx
leaq -1(%rdx), %rax // ull - 1
seta %r8b // from < to
comisd %xmm0, %xmm2
seta %cl // from < 0.0
addq $1, %rdx // ull + 1
cmpb %cl, %r8b
cmovne %rdx, %rax // take ull + 1 when (from < to) != (from < 0.0)
movq %rax, %xmm0
.L1:
ret
.L14:
jne .L11
.L10:
movapd %xmm1, %xmm0
ret
.L15:
jne .L4
movabsq $-9223372036854775808, %rdx // 0x8000000000000000: sign-bit mask
movq %xmm1, %rax // bits of to
andq %rdx, %rax // keep just the sign of to
orq $1, %rax // +/-0x1.0p-1074 with the sign of to
movq %rax, %xmm0
ret
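For completeness, here's a tiny harness to compare the repro against libm (not
part of the report; my_nextafter stands for a renamed copy of the function from
repro.c, so it doesn't clash with the declaration in <math.h>):

#include <float.h>
#include <math.h>
#include <stddef.h>
#include <stdio.h>

extern double my_nextafter(double, double); // renamed copy of repro.c's nextafter()

int main(void)
{
    static const double cases[][2] = {
        {  0.0,  1.0 }, {  0.0, -1.0 }, { 1.0, 2.0 }, { 1.0, 0.0 },
        { -1.0,  0.0 }, { DBL_MAX, INFINITY }, { 1.0, 1.0 },
    };
    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        double a = cases[i][0], b = cases[i][1];
        printf("nextafter(%a, %a) = %a, libm says %a\n",
               a, b, my_nextafter(a, b), nextafter(a, b));
    }
    return 0;
}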
Stefan
Shouldn't this kind of stuff go to Bugzilla?