https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81625
Bug ID: 81625
Summary: GCC v4.7 ... v8 is bloating code by > 25% compared to
v3.4
Product: gcc
Version: 8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: gjl at gcc dot gnu.org
Target Milestone: ---
Created attachment 41867
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41867&action=edit
snake-i.c: C test case.
The attached test case, compiled for code size
$ avr-gcc snake-i.c -mmcu=atmega168 -Os -S -dp -ffunction-sections -o
snake-i_${v}.s
Gives the following sizes with different compiler versions:
avr-gcc (GCC) 3.4.6
text data bss dec hex filename
672 0 0 672 2a0 snake-i_20060421.o
avr-gcc (GCC) 4.7.2
text data bss dec hex filename
854 0 0 854 356 snake-i_4.7.2.o
avr-gcc (GCC) 4.9.2 20140912 (prerelease)
text data bss dec hex filename
894 0 0 894 37e snake-i_4.9.2-pre1.o
avr-gcc (GCC) 5.2.1 20150816
text data bss dec hex filename
876 0 0 876 36c snake-i_5.2.1.o
avr-gcc (GCC) 6.4.1 20170726
text data bss dec hex filename
852 0 0 852 354 snake-i_6.4.1.o
avr-gcc (GCC) 7.1.1 20170725
text data bss dec hex filename
850 0 0 850 352 snake-i_7.1.1.o
avr-gcc (GCC) 8.0.0 20170718 (experimental)
text data bss dec hex filename
852 0 0 852 354 snake-i_8.0_2017-07-19.o
Hence, compared to 3.4.6, we have the following bloat factor:
3.4.6: 672
4.7.2: 854 = +27%
4.9.2: 894 = +33%
5.2.1: 876 = +30%
6.4.1: 852 = +26%
7.1.1: 850 = +26%
8.0.0: 852 = +26%
Mostly due to bad register selection; multiple expensive address computations
(for an address that is just 1 past the already computed address), missed
post-increment opportunities, ...
Note that the code from 3.4.6 is already sub-optimal so there is even more room
for improvement.
Just some samples:
if (s->changed.text)
{
s->changed.text = 0;
sb->str[0] = s->game.level + '0';
sb->str[1] = '\n';
u16_to_string (sb->str+2, s->game.score);
}
3.4.6:
tst r24 ; 421 tstqi [length = 1]
breq .L20 ; 422 branch [length = 1]
std Z+6,__zero_reg__ ; 426 *movqi/3 [length = 1]
; Compute address of sb->str to Y=r28.
subi r28,lo8(-(67)) ; 428 *addhi3/4 [length = 2]
sbci r29,hi8(-(67))
ldd r24,Z+7 ; 429 *movqi/4 [length = 1]
; Using post-increment to store '0' + ...
subi r24,lo8(-(48)) ; 430 addqi3/2 [length = 1]
st Y+,r24 ; 431 *movqi/3 [length = 1]
ldi r24,lo8(10) ; 434 *movqi/2 [length = 1]
; Ditto to store '\n'.
st Y+,r24 ; 435 *movqi/3 [length = 1]
ldd r22,Z+8 ; 438 *movhi/2 [length = 2]
ldd r23,Z+9
; Now has sb->str + 2 to pass in r24.
movw r24,r28 ; 439 *movhi/1 [length = 1]
call u16_to_string ; 440 call_value_insn/3 [length = 2]
.L20:
/* epilogue: frame size=0 */
8.0.0:
tst r24 ; 296 cmpqi3/1 [length = 1]
brne .+2 ; 297 branch [length = 2]
rjmp .L20
; Using reg X=r26 which doesn't support X+const addressing, all described
; in LEGITIMIZE_RELOAD_ADDRESS. So it adds 6 and after access has to
; subtract 6 again
adiw r26,6 ; 299 movqi_insn/3 [length = 3]
st X,__zero_reg__
sbiw r26,6
; Computes address in Z=r30 as Y+67
movw r30,r28 ; 397 *movhi/1 [length = 1]
subi r30,-67 ; 300 addhi3_clobber/2 [length = 2]
sbci r31,-1
; Still using X.
adiw r26,7 ; 301 movqi_insn/4 [length = 3]
ld r24,X
sbiw r26,7
subi r24,lo8(-(48)) ; 302 addqi3/2 [length = 1]
; Store '0' +...
st Z,r24 ; 303 movqi_insn/3 [length = 1]
; What the dickens? Z++ after store to Z, why not just Z+ above?
adiw r30,1 ; 304 *addhi3/3 [length = 1]
ldi r24,lo8(10) ; 305 movqi_insn/2 [length = 1]
st Z,r24 ; 306 movqi_insn/3 [length = 1]
; Still using X
adiw r26,8 ; 307 *movhi/3 [length = 3]
ld r22,X+
ld r23,X
; Moving Y to r24 and computing Y+67 *again*
movw r24,r28 ; 399 *movhi/1 [length = 1]
subi r24,-69 ; 310 *addhi3/4 [length = 2]
sbci r25,-1
/* epilogue start */
; 7 * POP for epilogue
jmp u16_to_string ; 311 call_value_insn/4 [length = 2]
A second spot with crazy expensive code; both code bloat and slow execution:
start--;
sb->body.start = start;
sb->body.seg[start].len = 0;
sb->body.seg[start].dir = 2 ^ dir;
3.4.6:
.L34:
dec r14 ; 178 addqi3/4 [length = 1]
std Y+15,r14 ; 180 *movqi/3 [length = 1]
mov r30,r14 ; 182 zero_extendqihi2/2 [length = 2]
clr r31
add r30,r30 ; 184 *addhi3/1 [length = 2]
adc r31,r31
add r30,r28 ; 185 *addhi3/1 [length = 2]
adc r31,r29
std Z+18,__zero_reg__ ; 187 *movqi/3 [length = 1]
ldi r24,lo8(2) ; 194 *movqi/2 [length = 1]
eor r24,r15 ; 195 xorqi3 [length = 1]
std Z+17,r24 ; 196 *movqi/3 [length = 1]
8.0.0:
.L34:
dec r15 ; 131 addqi3/4 [length = 1]
std Y+15,r15 ; 132 movqi_insn/3 [length = 1]
; Zero-extend r15 to r24 ...
mov r24,r15 ; 404 movqi_insn/1 [length = 1]
ldi r25,0 ; 405 movqi_insn/1 [length = 1]
; but we need the result in r30. Why go through r24???
movw r30,r24 ; 382 *movhi/1 [length = 1]
; Add 9 because it wants to access Z+18
adiw r30,9 ; 134 addhi3_clobber/1 [length = 1]
lsl r30 ; 445 *ashlhi3_const/2 [length = 2]
rol r31
add r30,r28 ; 136 *addhi3/1 [length = 2]
adc r31,r29
; Why not just Z+18 ?
st Z,__zero_reg__ ; 137 movqi_insn/3 [length = 1]
; Use the stored zero-extended value to compute Z+17, redoing all the
; shifts and additions *again*
movw r30,r24 ; 383 *movhi/1 [length = 1]
adiw r30,1 ; 138 addhi3_clobber/1 [length = 1]
lsl r30 ; 446 *ashlhi3_const/2 [length = 2]
rol r31
add r30,r28 ; 140 *addhi3/1 [length = 2]
adc r31,r29
ldi r24,lo8(2) ; 142 movqi_insn/2 [length = 1]
eor r13,r24 ; 143 xorqi3 [length = 1]
std Z+15,r13 ; 144 movqi_insn/3 [length = 1]
Some of the register selection nonsense can be mitigated by -mstrict-X, but
that option may ICE the register allocator, so it is not enabled by default. And
it still gives code more than 10% behind the effectiveness of 3.4.6:
4.7.2: 722 = +7%
4.9.2: 750 = +11%
5.2.1: 752 = +12%
6.4.1: 764 = +13%
7.1.1: 760 = +13%
8.0.0: 762 = +13%