https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64191

            Bug ID: 64191
           Summary: -march=native messes up dead code elimination in loop
                    calling dtor
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: petschy at gmail dot com

Without -march=native, the loops in the 3 fns are eliminated as expected,
resulting in single retq's.

With -march=native, the loop which calls the defined, but empty dtor is
compiled into something rather weird. However, the other empty Nop() call is
optimized away as expected.

g++-5.0.0 -g -O3 -Wall -Wextra -c 20141205-dtor_loop.cpp 
g++-5.0.0 -g -O3 -Wall -Wextra -o 20141205-dtor_loop 20141205-dtor_loop.o

Dump of assembler code for function foo_dtor_loop(Foo*, unsigned int):
   0x0000000000400570 <+0>:    repz retq   
Dump of assembler code for function bar_dtor_loop(Bar*, unsigned int):
   0x0000000000400580 <+0>:    repz retq 
Dump of assembler code for function bar_nop_loop(Bar*, unsigned int):
   0x0000000000400590 <+0>:    repz retq 

So far so good.

g++-5.0.0 -g -O3 -march=native -Wall -Wextra -c 20141205-dtor_loop.cpp 
g++-5.0.0 -g -O3 -march=native -Wall -Wextra -o 20141205-dtor_loop
20141205-dtor_loop.o

Dump of assembler code for function foo_dtor_loop(Foo*, unsigned int):
   0x0000000000400570 <+0>:    retq   

Dump of assembler code for function bar_dtor_loop(Bar*, unsigned int):
   0x0000000000400578 <+0>:     test   %rdi,%rdi
   0x000000000040057b <+3>:     je     0x4005b8 <bar_dtor_loop(Bar*, unsigned
int)+64>
   0x000000000040057d <+5>:     mov    %esi,%esi
   0x000000000040057f <+7>:     lea    (%rdi,%rsi,4),%rax
   0x0000000000400583 <+11>:    cmp    %rax,%rdi
   0x0000000000400586 <+14>:    jae    0x4005b8 <bar_dtor_loop(Bar*, unsigned
int)+64>
   0x0000000000400588 <+16>:    mov    $0x3,%edx
   0x000000000040058d <+21>:    lea    -0x4(%rax),%rsi
   0x0000000000400591 <+25>:    sub    %rdi,%rdx
   0x0000000000400594 <+28>:    add    %rsi,%rdx
   0x0000000000400597 <+31>:    mov    %rdx,%rcx
   0x000000000040059a <+34>:    shr    $0x2,%rcx
   0x000000000040059e <+38>:    lea    0x1(%rcx),%r8
   0x00000000004005a2 <+42>:    dec    %rcx
   0x00000000004005a5 <+45>:    shr    %rcx
   0x00000000004005a8 <+48>:    lea    0x2(%rcx,%rcx,1),%rcx
   0x00000000004005ad <+53>:    cmp    $0x2f,%rdx
   0x00000000004005b1 <+57>:    jbe    0x4005b8 <bar_dtor_loop(Bar*, unsigned
int)+64>
   0x00000000004005b3 <+59>:    cmp    %rcx,%r8
   0x00000000004005b6 <+62>:    je     0x4005b8 <bar_dtor_loop(Bar*, unsigned
int)+64>
   0x00000000004005b8 <+64>:    retq   

Dump of assembler code for function bar_nop_loop(Bar*, unsigned int):
   0x00000000004005c0 <+0>:     retq   

The bar_dtor_loop() fn is clearly a mess; unfortunately, I can't follow the
computation.

The bar_inc_loop() does a single int increment on each object, to see what loop
code is generated when non-empty fns are called. It is as expected: the loop is
unrolled 16x, and the residual part is executed in a tight loop:
   0x0000000000400648 <+120>:    sub    $0x4,%rdx
   0x000000000040064c <+124>:    incl   (%rdx)
   0x000000000040064e <+126>:    cmp    %rdx,%rdi
   0x0000000000400651 <+129>:    jb     0x400648 <bar_inc_loop(Bar*, unsigned
int)+120>

g++-5.0.0 -v
Using built-in specs.
COLLECT_GCC=g++-5.0.0
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/5.0.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ./configure --enable-languages=c,c++ --disable-multilib
--program-suffix=-5.0.0
Thread model: posix
gcc version 5.0.0 20141027 (experimental) (GCC)

cat /proc/cpuinfo 
processor    : 0
vendor_id    : AuthenticAMD
cpu family    : 21
model        : 1
model name    : AMD FX(tm)-8150 Eight-Core Processor
stepping    : 2
microcode    : 0x6000626
cpu MHz        : 1400.000
cache size    : 2048 KB
physical id    : 0
siblings    : 8
core id        : 0
cpu cores    : 4
apicid        : 16
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 13
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb
rdtscp lm constant_tsc rep_good nopl nonstop_tsc extd_apicid aperfmperf pni
pclmulqdq monitor ssse3 cx16 sse4_1 sse4_2 popcnt aes xsave avx lahf_lm
cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs
xop skinit wdt lwp fma4 nodeid_msr topoext perfctr_core perfctr_nb arat cpb
hw_pstate npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid
decodeassists pausefilter pfthreshold
bugs        : fxsave_leak
bogomips    : 7624.63
TLB size    : 1536 4K pages
clflush size    : 64
cache_alignment    : 64
address sizes    : 48 bits physical, 48 bits virtual
power management: ts ttp tm 100mhzsteps hwpstate cpb

Unfortunately, I couldn't test with the latest version since the build has been
failing with
../.././libcc1/findcomp.cc:20:20: fatal error: config.h: No such file or
directory
for a while now, even after deleting everything and doing a git reset --hard HEAD.

----8<----8<----8<----
// Plain aggregate holding one int. It declares no destructor, so the
// destructor is the implicitly-defined one.
struct Foo
{
        int i;
};
// Walks the range [p, p + n) from the back to the front and invokes ~Foo()
// explicitly on every element. Does nothing when p is null.
// Foo has no user-declared destructor, so each iteration has no effect.
void foo_dtor_loop(Foo* p, unsigned int n)
{
        if (p) {
                // e starts one past the last element and is decremented
                // before each use, so elements are visited last-to-first.
                Foo* e = p + n;
                while (e > p) {
                        --e;
                        e->~Foo();
                }
        }
}

// Same single-int layout as a bare struct with one int member, but with a
// user-provided destructor and two member functions used by the loops below.
struct Bar
{
        int i;
        // User-provided destructor with an empty body.
        ~Bar() { }
        // Member function with an empty body.
        void Nop() { }
        // Increments the stored int by one.
        void Inc() { ++i; }
};
// Walks the range [p, p + n) from the back to the front and invokes ~Bar()
// explicitly on every element. Does nothing when p is null.
// ~Bar() is user-provided but has an empty body, so each iteration has no
// effect.
void bar_dtor_loop(Bar* p, unsigned int n)
{
        if (p) {
                // e starts one past the last element and is decremented
                // before each use, so elements are visited last-to-first.
                Bar* e = p + n;
                while (e > p) {
                        --e;
                        e->~Bar();
                }
        }
}
// Walks the range [p, p + n) from the back to the front and calls Nop() on
// every element. Does nothing when p is null.
// Bar::Nop() has an empty body, so each iteration has no effect.
void bar_nop_loop(Bar* p, unsigned int n)
{
        if (p) {
                // e starts one past the last element and is decremented
                // before each use, so elements are visited last-to-first.
                Bar* e = p + n;
                while (e > p) {
                        --e;
                        e->Nop();
                }
        }
}
// Walks the range [p, p + n) from the back to the front and calls Inc() on
// every element, which increments that element's int member by one.
// Does nothing when p is null.
void bar_inc_loop(Bar* p, unsigned int n)
{
        if (p) {
                // e starts one past the last element and is decremented
                // before each use, so elements are visited last-to-first.
                Bar* e = p + n;
                while (e > p) {
                        --e;
                        e->Inc();
                }
        }
}

// Empty entry point: none of the loop functions above are called from here.
int main()
{
}

Reply via email to