On 8/13/21 8:58 PM, Stefan Kanthak wrote:
Hi,

compile the following naive implementation of nextafter() for AMD64:

JFTR: ignore the aliasing casts, they don't matter here!

$ cat repro.c
double nextafter(double from, double to)
{
     if (to != to)
         return to;        // to is NAN

     if (from != from)
         return from;      // from is NAN

     if (from == to)       // neither from nor to can be NAN here!
         return to;

     if (from == 0.0)      // ditto!
         return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

     unsigned long long ull = *(unsigned long long *) &from;

     if ((from < to) == (from < 0.0))
         ull--;
     else
         ull++;

     return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
         ucomisd %xmm1, %xmm1    // sets PF for unordered result, i.e. when at
         jp      .L10            //  least one operand is NAN
         ucomisd %xmm0, %xmm0    // same here
         jp      .L1
         ucomisd %xmm0, %xmm1
         jnp     .L14            // OUCH: PF can't be set here!
                                 // OUCH: and even if it could be, it's MORE LIKELY
                                 //        to be clear, so this branch would be taken
                                 //         ... against the static branch prediction
.L11:
         pxor    %xmm2, %xmm2    // OUCH: switching from FP SSE to integer SSE and
                                 //        vice versa incurs a penalty of 1 cycle
                                 //         on quite a lot of Intel Core processors!
                                 // Better use XORPS instead (which is even 1 byte
                                 //  shorter)!
         ucomisd %xmm2, %xmm0
         jnp     .L15            // OUCH: there's still no need to check PF here!
.L4:
         comisd  %xmm0, %xmm1
         movq    %xmm0, %rdx
         leaq    -1(%rdx), %rax
         seta    %r8b
         comisd  %xmm0, %xmm2
         seta    %cl
         addq    $1, %rdx
         cmpb    %cl, %r8b
         cmovne  %rdx, %rax
         movq    %rax, %xmm0
.L1:
         ret
.L14:
         jne     .L11
.L10:
         movapd  %xmm1, %xmm0
         ret
.L15:
         jne     .L4
         movabsq $-9223372036854775808, %rdx
         movq    %xmm1, %rax
         andq    %rdx, %rax
         orq     $1, %rax
         movq    %rax, %xmm0
         ret
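
JFTR: if you want the same bit fiddling without the aliasing casts, the
sanctioned spelling is a fixed-size memcpy(), which GCC folds into the very
same MOVQ register moves; a minimal sketch (the helper names are mine), plus
a quick check that bumping the bit pattern really steps one ULP:

#include <stdio.h>
#include <string.h>

// aliasing-safe type puns: GCC compiles these fixed-size memcpy() calls
// into plain register moves at -O1 and above
static unsigned long long to_bits(double d)
{
    unsigned long long ull;
    memcpy(&ull, &d, sizeof ull);
    return ull;
}

static double to_double(unsigned long long ull)
{
    double d;
    memcpy(&d, &ull, sizeof d);
    return d;
}

int main(void)
{
    // 1.0 has the bit pattern 0x3FF0000000000000; adding 1 yields
    // 0x3FF0000000000001, the next representable double towards +inf
    printf("%a\n", to_double(to_bits(1.0) + 1));  // prints 0x1.0000000000001p+0
    return 0;
}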


Stefan
Shouldn't this kind of stuff go to Bugzilla?
