Worse code generation for FPU on versions after 6

katsunori.kumatani at gmail dot com Sat, 18 Feb 2017 15:37:56 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79593


            Bug ID: 79593
           Summary: [Regression] Poor/Worse code generation for FPU on
                    versions after 6
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: katsunori.kumatani at gmail dot com
  Target Milestone: ---

First of all sorry if the "Component" is set wrong, I didn't know what to pick
(in respect to worse code generation than former versions) :)

Newer GCC versions seem to have regressed in their x87 code generation and now
make some very poor choices. This is about a regression in code generation for
x87 code, not a "wishlist" of better code (i.e. it's about the possibility of
having it reverted back to v5, at least in respect to x87 code gen). Sometimes
I want to use it for various reasons, for example for "long double" high
intermediate precision.

So technically this is a "performance bug" regression: In fact GCC version 5
generates quite good x87 code! So I'd simply want that generation reverted as a
goal with this report (if the culprit is found of course), so don't
misunderstand my intention please.

For example, since GCC 6, when you "convert" from say a "float" to a "long
double", it generates spurious instructions like this sequence:

        fld     st(0)
        fcomip  st, st(2)

Instead of just:

        fcomip  st, st(1)

Which is IMHO killing one of the few nice features of x87: automatic
conversions to 80-bit all the time at no extra cost.

Bear in mind, GCC 5 is just fine and does the latter which is optimal. My C++
code actually does all calculations in "long double" on purpose, but it has to
interface with data found in a "float" on a constant basis (reading it), and
also returning some of it as float or double. The problem is, as you can see,
newer versions of GCC do pointless "conversions" or something like that (I'm
not sure myself why it does those pointless loads) when you load a "float" into
a "long double" or something similar to that effect.

This is a two-fold problem: not only are there extra instructions, but they also
require one more "space" in the register stack, resulting in more pointless
spills (of long doubles!).


Now to illustrate what I mean so you can verify yourself, I've included a
stupid little testcase that doesn't do much of anything, but it seems to
exhibit this "poor code" generation, here's the code:



#include <stdint.h>

// Drop any existing MIN/MAX macros so the names can be reused for the
// function templates below.
#undef MIN
#undef MAX
// Returns a when a < b, otherwise b.
template<typename T> T inline MIN(T a, T b) { return a < b ? a : b; }
// Returns a when a > b, otherwise b.
template<typename T> T inline MAX(T a, T b) { return a > b ? a : b; }

// Input record for bar(): a count plus one 32-bit slot that can be read either
// as an unsigned integer or as a float.
struct foo
{
  // bar() returns 0.0f unless id < num.
  uint32_t num;
  // i and f share the same storage; bar() reads i when delta < 0 and f otherwise.
  union { uint32_t i; float f; };
};

extern float global_data[1024];

// Reproducer for the x87 code-generation regression: the inputs and the return
// value are floats, while every intermediate is a long double.
//
// Returns 0.0f when id >= e->num.  Otherwise computes
//   (delta < 0 ? min - e->i : e->f - min) / delta
// with delta = global_data[0] and min = global_data[1], and returns that
// quotient clamped to the range [0, 1].
float bar(foo* __restrict__ e, uint32_t id)
{
  if(id >= e->num) return 0.0f;
  // float -> long double on load; the report attributes the extra "fld st(0)"
  // in GCC 6/7 output to this kind of conversion.
  long double delta = (global_data[0]), min = (global_data[1]);
  // e->i is a uint32_t, so it is converted to long double as an unsigned value.
  delta = ((delta < 0.0l) ? (min-((long double)e->i)) : ((e->f)-min)) / delta;
  // MAX and MIN are both called with long double arguments; the clamped result
  // is narrowed to float by the return.
  return (MIN(MAX(delta, 0.0l), 1.0l));
}



I compiled with e.g. -m32 -Ofast -mfpmath=387    (to have the return value in
st(0) to show this issue better)

Here are the outputs for GCC versions 5, 6 and 7, with comments from me showing
the obviously poor code:


GCC 5:
   # Compiled body of bar(e, id): 32-bit code in Intel syntax.  After the
   # 12-byte frame is reserved, e is read from [esp+16] and id from [esp+20];
   # the result is left in st(0).
   sub     esp, 12
   # st(0) = 0.0: this is both the early-return value and the 0.0 that delta
   # is compared against below.
   fldz
   mov     eax, DWORD PTR [esp+16]
   mov     ecx, DWORD PTR [esp+20]
   # Unsigned compare: leave with 0.0 in st(0) when e->num <= id.
   cmp     DWORD PTR [eax], ecx
   jbe     .L2
   # Load delta and min, bring 0.0 to the top, compare it with delta and pop
   # it.  Afterwards st(0) = delta, st(1) = min.
   fld     DWORD PTR global_data
   fld     DWORD PTR global_data+4
   fxch    st(2)
   fcomip  st, st(1)
   # 0.0 > delta, i.e. delta < 0: take the integer path at .L12.
   ja      .L12
   # delta >= 0: st(0) = e->f - min, st(1) = delta.
   fxch    st(1)
   fsubr   DWORD PTR [eax+4]
 .L5:
   # On entry st(0) = numerator, st(1) = delta.  st(0) = numerator / delta.
   fdivrp  st(1), st
   # Lower clamp: keep the quotient unless it is below 0.0, else use 0.0.
   fldz
   fxch    st(1)
   fcomi   st, st(1)
   fcmovb  st, st(1)
   fstp    st(1)
   # Upper clamp: keep 1.0 unless 1.0 >= value, in which case take the value.
   fld1
   fcomi   st, st(1)
   fcmovnb st, st(1)
   fstp    st(1)
 .L2:
   add     esp, 12
   ret
 .L12:
   # NOTE(review): e->i is a uint32_t and fild interprets its operand as a
   # signed integer, so a 32-bit fild would misread values >= 2^31.  Building
   # a zero-extended 64-bit copy on the stack and loading that with a 64-bit
   # fild looks like the reason for this sequence.
   mov     eax, DWORD PTR [eax+4] # here the only issue I have with GCC 5
   xor     edx, edx
   mov     DWORD PTR [esp+4], edx # I don't understand what's this spill for?
   mov     DWORD PTR [esp], eax   # can't it load directly from [eax+4]?
   fild    QWORD PTR [esp]        # fild DWORD PTR [eax+4] or am I wrong?
   # st(2) = min - (long double)e->i, then pop and exchange so that
   # st(0) = numerator and st(1) = delta, as .L5 expects.
   fsubp   st(2), st
   fxch    st(1)
   jmp     .L5



GCC 6:
   # Compiled body of bar(e, id): 32-bit code in Intel syntax.  After the
   # 12-byte frame is reserved, e is read from [esp+16] and id from [esp+20];
   # the result is left in st(0).
   sub     esp, 12
   # st(0) = 0.0: this is both the early-return value and the 0.0 that delta
   # is compared against below.
   fldz
   mov     eax, DWORD PTR [esp+16]
   mov     ecx, DWORD PTR [esp+20]
   # Unsigned compare: leave with 0.0 in st(0) when e->num <= id.
   cmp     DWORD PTR [eax], ecx
   jbe     .L1
   # Register stack after these three loads, top first:
   # min, delta, delta (duplicate), 0.0.
   fld     DWORD PTR global_data
   fld     st(0)                    # this is the poor "conversion" mentioned
   fld     DWORD PTR global_data+4
   # Bring 0.0 to the top, compare it with delta and pop it; the fstp then
   # discards the duplicate, leaving st(0) = delta, st(1) = min.
   fxch    st(3)
   fcomip  st, st(2)                # here's its "pop" (unneeded otherwise)
   fstp    st(1)
   # fstp leaves EFLAGS alone, so ja still tests the fcomip result:
   # 0.0 > delta, i.e. delta < 0, goes to the integer path at .L12.
   ja      .L12
   # delta >= 0: st(0) = e->f - min, st(1) = delta.
   fxch    st(1)
   fsubr   DWORD PTR [eax+4]
 .L5:
   # On entry st(0) = numerator, st(1) = delta.  st(0) = numerator / delta.
   fdivrp  st(1), st
   # Lower clamp: keep the quotient unless it is below 0.0, else use 0.0.
   fldz
   fxch    st(1)
   fcomi   st, st(1)
   fcmovb  st, st(1)
   fstp    st(1)
   # Upper clamp: keep 1.0 unless 1.0 >= value, in which case take the value.
   fld1
   fcomi   st, st(1)
   fcmovnb st, st(1)
   fstp    st(1)
 .L1:
   add     esp, 12
   ret
 .L12:
   # delta < 0 path.  A zero-extended 64-bit copy of e->i is built on the
   # stack and loaded with a 64-bit fild.
   # NOTE(review): fild is a signed load, so this is presumably what keeps
   # uint32_t values >= 2^31 correct.
   mov     eax, DWORD PTR [eax+4]
   xor     edx, edx
   mov     DWORD PTR [esp+4], edx
   mov     DWORD PTR [esp], eax
   fild    QWORD PTR [esp]
   # st(2) = min - (long double)e->i, then pop and exchange so that
   # st(0) = numerator and st(1) = delta, as .L5 expects.
   fsubp   st(2), st
   fxch    st(1)
   jmp     .L5



GCC 7:
   # Compiled body of bar(e, id): 32-bit code in Intel syntax.  After the
   # 12-byte frame is reserved, e is read from [esp+16] and id from [esp+20];
   # the result is left in st(0).
   sub     esp, 12
   # st(0) = 0.0: this is both the early-return value and the 0.0 that delta
   # is compared against below.
   fldz
   mov     eax, DWORD PTR [esp+16]
   mov     edx, DWORD PTR [esp+20]
   # Unsigned compare: leave with 0.0 in st(0) when e->num <= id.
   cmp     DWORD PTR [eax], edx
   jbe     .L1
   fld     DWORD PTR global_data
   # eax now holds the raw 32 bits shared by e->i and e->f; both branches
   # below store it to the stack before loading it into the FPU.
   mov     eax, DWORD PTR [eax+4]
   # Register stack after the next two loads, top first:
   # min, delta, delta (duplicate), 0.0.
   fld     st(0)                    # same pointless instruction as v6
   fld     DWORD PTR global_data+4
   # Bring 0.0 to the top, compare it with delta and pop it; the fstp then
   # discards the duplicate, leaving st(0) = delta, st(1) = min.
   fxch    st(3)
   fcomip  st, st(2)
   fstp    st(1)
   # fstp leaves EFLAGS alone, so ja still tests the fcomip result:
   # 0.0 > delta, i.e. delta < 0, goes to the integer path at .L12.
   ja      .L12
   mov     DWORD PTR [esp], eax     # worse than v6: spills to stack!
   fld     DWORD PTR [esp]          # instead of 'fsubr DWORD PTR [eax+4]'
   # st(2) = e->f - min, then pop: st(0) = delta, st(1) = numerator.
   fsubrp  st(2), st
 .L5:
   # On entry st(0) = delta, st(1) = numerator.  st(0) = numerator / delta.
   fdivp   st(1), st
   # Lower clamp: keep the quotient unless it is below 0.0, else use 0.0.
   fldz
   fxch    st(1)
   fcomi   st, st(1)
   fcmovb  st, st(1)
   fstp    st(1)
   # Upper clamp done with a branch: compare a copy of 1.0 with the value and
   # pop the copy, leaving st(0) = 1.0, st(1) = value.
   fld1
   fld     st(0)                    # here it "converts" the constant 1.0l?
   fcomip  st, st(2)
   # 1.0 >= value: go to .L13, which pops the 1.0 and keeps the value.
   jnb     .L13
   # 1.0 < value: overwrite the value with 1.0 and pop.
   fstp    st(1)
   jmp     .L6
 .L13:
   fstp    st(0)
 .L6:
 .L1:
   add     esp, 12
   ret
 .L12:
   # delta < 0 path.  A zero-extended 64-bit copy of e->i is built on the
   # stack and loaded with a 64-bit fild.
   # NOTE(review): fild is a signed load, so this is presumably what keeps
   # uint32_t values >= 2^31 correct.
   mov     DWORD PTR [esp], eax
   mov     DWORD PTR [esp+4], 0
   fild    QWORD PTR [esp]
   # st(2) = min - (long double)e->i, then pop: st(0) = delta,
   # st(1) = numerator, as .L5 expects.
   fsubp   st(2), st
   jmp     .L5



As you can see, each new version made it even worse than it was. GCC 7 is
especially bad, it even refuses to do "fsubr DWORD PTR [eax+4]" directly, and
opts for a spill and a load. Furthermore, it loads a constant twice for no
reason, even when I explicitly declared the constant literal as "long double".
Am I missing something obvious?

Can this behavior be reverted? Version 5 obviously produces much better x87
code. Please note that I don't know anything about GCC's internals, but IMO
code gen of version 5 is "quite good" and should be kept instead of breaking it
like now...

Alternatively, I'd love to hear if this new behavior is a consequence of a
command-line setting that is now enabled by default when optimizing, so I can
unset it myself. Am I missing some command-line setting here?

Let me know if you need more info so I can see what to provide.

[Bug rtl-optimization/79593] New: [Regression] Poor/Worse code generation for FPU on versions after 6

Reply via email to