[Bug c++/85640] New: Code size regression vs 7.3.1

petschy at gmail dot com Thu, 03 May 2018 14:22:42 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85640


            Bug ID: 85640
           Summary: Code size regression vs 7.3.1
           Product: gcc
           Version: 8.1.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: petschy at gmail dot com
  Target Milestone: ---

Created attachment 44062
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=44062&action=edit
source

Attached is the source of a simple Adler32 checksum class. With 8.1.1, the code
generated for the Update() function is 32 bytes longer than with 7.3.1.

Dump of assembler code for function Adler32::Update(void const*, unsigned int):
7.3.1 0x0000000000400500 <+0>:     test   %edx,%edx
7.3.1 0x0000000000400502 <+2>:     je     0x400578 <Adler32::Update(void
const*, unsigned int)+120>
7.3.1 0x0000000000400504 <+4>:     mov    (%rdi),%ecx
7.3.1 0x0000000000400506 <+6>:     mov    0x4(%rdi),%r8d
7.3.1 0x000000000040050a <+10>:    mov    $0x80078071,%r10d
7.3.1 0x0000000000400510 <+16>:    xor    %r9d,%r9d
7.3.1 0x0000000000400513 <+19>:    cmp    $0x15af,%edx
7.3.1 0x0000000000400519 <+25>:    jbe    0x400527 <Adler32::Update(void
const*, unsigned int)+39>
7.3.1 0x000000000040051b <+27>:    lea    -0x15b0(%rdx),%r9d
7.3.1 0x0000000000400522 <+34>:    mov    $0x15b0,%edx
7.3.1 0x0000000000400527 <+39>:    lea    -0x1(%rdx),%eax
7.3.1 0x000000000040052a <+42>:    lea    0x1(%rsi,%rax,1),%rdx
7.3.1 0x000000000040052f <+47>:    nop
7.3.1 0x0000000000400530 <+48>:    add    $0x1,%rsi
7.3.1 0x0000000000400534 <+52>:    movzbl -0x1(%rsi),%eax
7.3.1 0x0000000000400538 <+56>:    add    %eax,%ecx
7.3.1 0x000000000040053a <+58>:    add    %ecx,%r8d
7.3.1 0x000000000040053d <+61>:    cmp    %rdx,%rsi
7.3.1 0x0000000000400540 <+64>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400542 <+66>:    mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400546 <+70>:    jne    0x400530 <Adler32::Update(void
const*, unsigned int)+48>
7.3.1 0x0000000000400548 <+72>:    mov    %ecx,%eax
7.3.1 0x000000000040054a <+74>:    mul    %r10d
7.3.1 0x000000000040054d <+77>:    mov    %r8d,%eax
7.3.1 0x0000000000400550 <+80>:    shr    $0xf,%edx
7.3.1 0x0000000000400553 <+83>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400559 <+89>:    sub    %edx,%ecx
7.3.1 0x000000000040055b <+91>:    mul    %r10d
7.3.1 0x000000000040055e <+94>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400560 <+96>:    shr    $0xf,%edx
7.3.1 0x0000000000400563 <+99>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400569 <+105>:   sub    %edx,%r8d
7.3.1 0x000000000040056c <+108>:   test   %r9d,%r9d
7.3.1 0x000000000040056f <+111>:   mov    %r9d,%edx
7.3.1 0x0000000000400572 <+114>:   mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400576 <+118>:   jne    0x400510 <Adler32::Update(void
const*, unsigned int)+16>
7.3.1 0x0000000000400578 <+120>:   repz retq 

Dump of assembler code for function Adler32::Update(void const*, unsigned int):
8.1.1 0x0000000000400500 <+0>:     test   %edx,%edx
8.1.1 0x0000000000400502 <+2>:     je     0x400598 <Adler32::Update(void
const*, unsigned int)+152>
8.1.1 0x0000000000400508 <+8>:     mov    (%rdi),%ecx
8.1.1 0x000000000040050a <+10>:    mov    0x4(%rdi),%r8d
8.1.1 0x000000000040050e <+14>:    push   %rbx
8.1.1 0x000000000040050f <+15>:    mov    $0x80078071,%ebx
8.1.1 0x0000000000400514 <+20>:    nopl   0x0(%rax)
8.1.1 0x0000000000400518 <+24>:    xor    %r11d,%r11d
8.1.1 0x000000000040051b <+27>:    cmp    $0x15af,%edx
8.1.1 0x0000000000400521 <+33>:    jbe    0x40052f <Adler32::Update(void
const*, unsigned int)+47>
8.1.1 0x0000000000400523 <+35>:    lea    -0x15b0(%rdx),%r11d
8.1.1 0x000000000040052a <+42>:    mov    $0x15b0,%edx
8.1.1 0x000000000040052f <+47>:    mov    %edx,%r10d
8.1.1 0x0000000000400532 <+50>:    mov    %rsi,%rax
8.1.1 0x0000000000400535 <+53>:    add    %rsi,%r10
8.1.1 0x0000000000400538 <+56>:    nopl   0x0(%rax,%rax,1)
8.1.1 0x0000000000400540 <+64>:    add    $0x1,%rax
8.1.1 0x0000000000400544 <+68>:    movzbl -0x1(%rax),%r9d
8.1.1 0x0000000000400549 <+73>:    add    %r9d,%ecx
8.1.1 0x000000000040054c <+76>:    add    %ecx,%r8d
8.1.1 0x000000000040054f <+79>:    mov    %ecx,(%rdi)
8.1.1 0x0000000000400551 <+81>:    mov    %r8d,0x4(%rdi)
8.1.1 0x0000000000400555 <+85>:    cmp    %r10,%rax
8.1.1 0x0000000000400558 <+88>:    jne    0x400540 <Adler32::Update(void
const*, unsigned int)+64>
8.1.1 0x000000000040055a <+90>:    lea    -0x1(%rdx),%eax
8.1.1 0x000000000040055d <+93>:    lea    0x1(%rsi,%rax,1),%rsi
8.1.1 0x0000000000400562 <+98>:    mov    %ecx,%eax
8.1.1 0x0000000000400564 <+100>:   mul    %ebx
8.1.1 0x0000000000400566 <+102>:   mov    %r8d,%eax
8.1.1 0x0000000000400569 <+105>:   shr    $0xf,%edx
8.1.1 0x000000000040056c <+108>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400572 <+114>:   sub    %edx,%ecx
8.1.1 0x0000000000400574 <+116>:   mul    %ebx
8.1.1 0x0000000000400576 <+118>:   mov    %ecx,(%rdi)
8.1.1 0x0000000000400578 <+120>:   shr    $0xf,%edx
8.1.1 0x000000000040057b <+123>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400581 <+129>:   sub    %edx,%r8d
8.1.1 0x0000000000400584 <+132>:   mov    %r11d,%edx
8.1.1 0x0000000000400587 <+135>:   mov    %r8d,0x4(%rdi)
8.1.1 0x000000000040058b <+139>:   test   %r11d,%r11d
8.1.1 0x000000000040058e <+142>:   jne    0x400518 <Adler32::Update(void
const*, unsigned int)+24>
8.1.1 0x0000000000400590 <+144>:   pop    %rbx
8.1.1 0x0000000000400591 <+145>:   retq   
8.1.1 0x0000000000400592 <+146>:   nopw   0x0(%rax,%rax,1)
8.1.1 0x0000000000400598 <+152>:   retq   

Here is an interwoven version, hopefully easier to follow:
7.3.1 0x0000000000400500 <+0>:     test   %edx,%edx
7.3.1 0x0000000000400502 <+2>:     je     0x400578 <Adler32::Update(void
const*, unsigned int)+120>
7.3.1 0x0000000000400504 <+4>:     mov    (%rdi),%ecx
7.3.1 0x0000000000400506 <+6>:     mov    0x4(%rdi),%r8d
7.3.1 0x000000000040050a <+10>:    mov    $0x80078071,%r10d

8.1.1 0x0000000000400500 <+0>:     test   %edx,%edx
8.1.1 0x0000000000400502 <+2>:     je     0x400598 <Adler32::Update(void
const*, unsigned int)+152>
8.1.1 0x0000000000400508 <+8>:     mov    (%rdi),%ecx
8.1.1 0x000000000040050a <+10>:    mov    0x4(%rdi),%r8d
8.1.1 0x000000000040050e <+14>:    push   %rbx
8.1.1 0x000000000040050f <+15>:    mov    $0x80078071,%ebx
8.1.1 0x0000000000400514 <+20>:    nopl   0x0(%rax)

Two things so far:
- the je is 6 bytes in 8.1.1 vs 2 bytes in 7.3.1 because the jump offset no
longer fits in a signed byte
- in 8.1.1 the modulo magic constant is kept in ebx, which is callee-saved, so
it has to be pushed before use (and popped before returning)



7.3.1 0x0000000000400510 <+16>:    xor    %r9d,%r9d
7.3.1 0x0000000000400513 <+19>:    cmp    $0x15af,%edx
7.3.1 0x0000000000400519 <+25>:    jbe    0x400527 <Adler32::Update(void
const*, unsigned int)+39>
7.3.1 0x000000000040051b <+27>:    lea    -0x15b0(%rdx),%r9d
7.3.1 0x0000000000400522 <+34>:    mov    $0x15b0,%edx
7.3.1 0x0000000000400527 <+39>:    lea    -0x1(%rdx),%eax
7.3.1 0x000000000040052a <+42>:    lea    0x1(%rsi,%rax,1),%rdx
7.3.1 0x000000000040052f <+47>:    nop

8.1.1 0x0000000000400518 <+24>:    xor    %r11d,%r11d
8.1.1 0x000000000040051b <+27>:    cmp    $0x15af,%edx
8.1.1 0x0000000000400521 <+33>:    jbe    0x40052f <Adler32::Update(void
const*, unsigned int)+47>
8.1.1 0x0000000000400523 <+35>:    lea    -0x15b0(%rdx),%r11d
8.1.1 0x000000000040052a <+42>:    mov    $0x15b0,%edx
8.1.1 0x000000000040052f <+47>:    mov    %edx,%r10d
8.1.1 0x0000000000400532 <+50>:    mov    %rsi,%rax
8.1.1 0x0000000000400535 <+53>:    add    %rsi,%r10
8.1.1 0x0000000000400538 <+56>:    nopl   0x0(%rax,%rax,1)

This is the inner loop setup, which is pretty similar in both versions.



7.3.1 0x0000000000400530 <+48>:    add    $0x1,%rsi
7.3.1 0x0000000000400534 <+52>:    movzbl -0x1(%rsi),%eax
7.3.1 0x0000000000400538 <+56>:    add    %eax,%ecx
7.3.1 0x000000000040053a <+58>:    add    %ecx,%r8d
7.3.1 0x000000000040053d <+61>:    cmp    %rdx,%rsi
7.3.1 0x0000000000400540 <+64>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400542 <+66>:    mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400546 <+70>:    jne    0x400530 <Adler32::Update(void
const*, unsigned int)+48>

8.1.1 0x0000000000400540 <+64>:    add    $0x1,%rax
8.1.1 0x0000000000400544 <+68>:    movzbl -0x1(%rax),%r9d
8.1.1 0x0000000000400549 <+73>:    add    %r9d,%ecx
8.1.1 0x000000000040054c <+76>:    add    %ecx,%r8d
8.1.1 0x000000000040054f <+79>:    mov    %ecx,(%rdi)
8.1.1 0x0000000000400551 <+81>:    mov    %r8d,0x4(%rdi)
8.1.1 0x0000000000400555 <+85>:    cmp    %r10,%rax
8.1.1 0x0000000000400558 <+88>:    jne    0x400540 <Adler32::Update(void
const*, unsigned int)+64>

These are the same, except that the cmp comes before the two stores in 7.3.1
and after them in 8.1.1, and the movzbl and the first add are each one byte
shorter in 7.3.1 (8.1.1 uses r9d instead of eax, which needs a REX prefix).


7.3.1 0x0000000000400548 <+72>:    mov    %ecx,%eax
7.3.1 0x000000000040054a <+74>:    mul    %r10d
7.3.1 0x000000000040054d <+77>:    mov    %r8d,%eax
7.3.1 0x0000000000400550 <+80>:    shr    $0xf,%edx
7.3.1 0x0000000000400553 <+83>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400559 <+89>:    sub    %edx,%ecx
7.3.1 0x000000000040055b <+91>:    mul    %r10d
7.3.1 0x000000000040055e <+94>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400560 <+96>:    shr    $0xf,%edx
7.3.1 0x0000000000400563 <+99>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400569 <+105>:   sub    %edx,%r8d
7.3.1 0x000000000040056c <+108>:   test   %r9d,%r9d
7.3.1 0x000000000040056f <+111>:   mov    %r9d,%edx
7.3.1 0x0000000000400572 <+114>:   mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400576 <+118>:   jne    0x400510 <Adler32::Update(void
const*, unsigned int)+16>
7.3.1 0x0000000000400578 <+120>:   repz retq 

8.1.1 0x000000000040055a <+90>:    lea    -0x1(%rdx),%eax
8.1.1 0x000000000040055d <+93>:    lea    0x1(%rsi,%rax,1),%rsi
8.1.1 0x0000000000400562 <+98>:    mov    %ecx,%eax
8.1.1 0x0000000000400564 <+100>:   mul    %ebx
8.1.1 0x0000000000400566 <+102>:   mov    %r8d,%eax
8.1.1 0x0000000000400569 <+105>:   shr    $0xf,%edx
8.1.1 0x000000000040056c <+108>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400572 <+114>:   sub    %edx,%ecx
8.1.1 0x0000000000400574 <+116>:   mul    %ebx
8.1.1 0x0000000000400576 <+118>:   mov    %ecx,(%rdi)
8.1.1 0x0000000000400578 <+120>:   shr    $0xf,%edx
8.1.1 0x000000000040057b <+123>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400581 <+129>:   sub    %edx,%r8d
8.1.1 0x0000000000400584 <+132>:   mov    %r11d,%edx
8.1.1 0x0000000000400587 <+135>:   mov    %r8d,0x4(%rdi)
8.1.1 0x000000000040058b <+139>:   test   %r11d,%r11d
8.1.1 0x000000000040058e <+142>:   jne    0x400518 <Adler32::Update(void
const*, unsigned int)+24>
8.1.1 0x0000000000400590 <+144>:   pop    %rbx
8.1.1 0x0000000000400591 <+145>:   retq   
8.1.1 0x0000000000400592 <+146>:   nopw   0x0(%rax,%rax,1)
8.1.1 0x0000000000400598 <+152>:   retq   

The loop variables are handled a bit differently in 8.1.1, which accounts for
the minor size increase. The two lea's here in the 8.1.1 version are very
similar to the ones before the inner loop in 7.3.1, which calculate the ptr
range of the inner loop. Here they calculate the start ptr for the next inner
loop run; however, this ptr is already present in rax (and r10) when the inner
loop exits. I don't know if this is some special optimization, or whether rax
should have been used but the compiler missed it.

The double retq at the end is interesting, but maybe the debug info is to
blame.

cmdline used to compile:
$ g++-7.3.1 -g -O3 -Wall a32.cpp
$ g++-8.1.1 -g -O3 -Wall a32.cpp

Platform is AMD64 (FX-8150), Debian 9.4

$ g++-7.3.1 -v
Using built-in specs.
COLLECT_GCC=g++-7.3.1
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/7.3.1/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-multilib
--program-suffix=-7.3.1 --disable-bootstrap CFLAGS='-O2 -march=native
-mtune=native' CXXFLAGS='-O2 -march=native -mtune=native'
Thread model: posix
gcc version 7.3.1 20180429 (GCC)

$ g++-8.1.1 -v
Using built-in specs.
COLLECT_GCC=g++-8.1.1
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/8.1.1/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-multilib
--program-suffix=-8.1.1 --disable-bootstrap CFLAGS='-O2 -march=native
-mtune=native' CXXFLAGS='-O2 -march=native -mtune=native'
Thread model: posix
gcc version 8.1.1 20180502 (GCC)

[Bug c++/85640] New: Code size regression vs 7.3.1

Reply via email to