https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81625
Bug ID: 81625
Summary: GCC v4.7 ... v8 is bloating code by > 25% compared to
v3.4
Product: gcc
Version: 8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: gjl at gcc dot gnu.org
Target Milestone: ---
Created attachment 41867
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41867&action=edit
snake-i.c: C test case.
The attached test case, compiled for code size
$ avr-gcc snake-i.c -mmcu=atmega168 -Os -S -dp -ffunction-sections -o
snake-i_${v}.s
Gives the following sizes with different compiler versions:
avr-gcc (GCC) 3.4.6
text data bss dec hex filename
672 0 0 672 2a0 snake-i_20060421.o
avr-gcc (GCC) 4.7.2
text data bss dec hex filename
854 0 0 854 356 snake-i_4.7.2.o
avr-gcc (GCC) 4.9.2 20140912 (prerelease)
text data bss dec hex filename
894 0 0 894 37e snake-i_4.9.2-pre1.o
avr-gcc (GCC) 5.2.1 20150816
text data bss dec hex filename
876 0 0 876 36c snake-i_5.2.1.o
avr-gcc (GCC) 6.4.1 20170726
text data bss dec hex filename
852 0 0 852 354 snake-i_6.4.1.o
avr-gcc (GCC) 7.1.1 20170725
text data bss dec hex filename
850 0 0 850 352 snake-i_7.1.1.o
avr-gcc (GCC) 8.0.0 20170718 (experimental)
text data bss dec hex filename
852 0 0 852 354 snake-i_8.0_2017-07-19.o
Hence, compared to 3.4.6, we have the following bloat factor:
3.4.6: 672
4.7.2: 854 = +27%
4.9.2: 894 = +33%
5.2.1: 876 = +30%
6.4.1: 852 = +26%
7.1.1: 850 = +26%
8.0.0: 852 = +26%
Mostly due to bad register selection; multiple expensive address computations
(for an address that is just 1 past the already computed address), missed
post-increment opportunities, ...
Note that the code from 3.4.6 is already sub-optimal so there is even more room
for improvement.
Just some samples:
if (s->changed.text)
{
s->changed.text = 0;
sb->str[0] = s->game.level + '0';
sb->str[1] = '\n';
u16_to_string (sb->str+2, s->game.score);
}
3.4.6:
tst r24 ; 421 tstqi [length = 1]
breq .L20 ; 422 branch [length = 1]
std Z+6,__zero_reg__ ; 426 *movqi/3 [length = 1]
; Compute address of sb->str to Y=r28.
subi r28,lo8(-(67)) ; 428 *addhi3/4 [length = 2]
sbci r29,hi8(-(67))
ldd r24,Z+7 ; 429 *movqi/4 [length = 1]
; Using post-increment to store '0' + ...
subi r24,lo8(-(48)) ; 430 addqi3/2 [length = 1]
st Y+,r24 ; 431 *movqi/3 [length = 1]
ldi r24,lo8(10) ; 434 *movqi/2 [length = 1]
; Ditto to store '\n'.
st Y+,r24 ; 435 *movqi/3 [length = 1]
ldd r22,Z+8 ; 438 *movhi/2 [length = 2]
ldd r23,Z+9
; Now has sb->str + 2 to pass in r24.
movw r24,r28 ; 439 *movhi/1 [length = 1]
call u16_to_string ; 440 call_value_insn/3 [length = 2]
.L20:
/* epilogue: frame size=0 */
8.0.0:
tst r24 ; 296 cmpqi3/1 [length = 1]
brne .+2 ; 297 branch [length = 2]
rjmp .L20
; Using reg X=r26 which doesn't support X+const addressing, all described
; in LEGITIMIZE_RELOAD_ADDRESS. So it adds 6 and after access has to
; subtract 6 again
adiw r26,6 ; 299 movqi_insn/3 [length = 3]
st X,__zero_reg__
sbiw r26,6
; Computes address in Z=r30 as Y+67
movw r30,r28 ; 397 *movhi/1 [length = 1]
subi r30,-67 ; 300 addhi3_clobber/2 [length = 2]
sbci r31,-1
; Still using X.
adiw r26,7 ; 301 movqi_insn/4 [length = 3]
ld r24,X
sbiw r26,7
subi r24,lo8(-(48)) ; 302 addqi3/2 [length = 1]
; Store '0' +...
st Z,r24 ; 303 movqi_insn/3 [length = 1]
; What the dickens? Z++ after store to Z, why not just Z+ above?
adiw r30,1 ; 304 *addhi3/3 [length = 1]
ldi r24,lo8(10) ; 305 movqi_insn/2 [length = 1]
st Z,r24 ; 306 movqi_insn/3 [length = 1]
; Still using X
adiw r26,8 ; 307 *movhi/3 [length = 3]
ld r22,X+
ld r23,X
; Moving Y to r24 and computing Y+67 *again*
movw r24,r28 ; 399 *movhi/1 [length = 1]
subi r24,-69 ; 310 *addhi3/4 [length = 2]
sbci r25,-1
/* epilogue start */
; 7 * POP for epilogue
jmp u16_to_string ; 311 call_value_insn/4 [length = 2]
A second spot with crazy expensive code; both code bloat and slow execution:
start--;
sb->body.start = start;
sb->body.seg[start].len = 0;
sb->body.seg[start].dir = 2 ^ dir;
3.4.6:
.L34:
dec r14 ; 178 addqi3/4 [length = 1]
std Y+15,r14 ; 180 *movqi/3 [length = 1]
mov r30,r14 ; 182 zero_extendqihi2/2 [length = 2]
clr r31
add r30,r30 ; 184 *addhi3/1 [length = 2]
adc r31,r31
add r30,r28 ; 185 *addhi3/1 [length = 2]
adc r31,r29
std Z+18,__zero_reg__ ; 187 *movqi/3 [length = 1]
ldi r24,lo8(2) ; 194 *movqi/2 [length = 1]
eor r24,r15 ; 195 xorqi3 [length = 1]
std Z+17,r24 ; 196 *movqi/3 [length = 1]
8.0.0:
.L34:
dec r15 ; 131 addqi3/4 [length = 1]
std Y+15,r15 ; 132 movqi_insn/3 [length = 1]
; Zero-extend r15 to r24 ...
mov r24,r15 ; 404 movqi_insn/1 [length = 1]
ldi r25,0 ; 405 movqi_insn/1 [length = 1]
; but we need the result in r30. Why go through r24???
movw r30,r24 ; 382 *movhi/1 [length = 1]
; Add 9 because it wants to access Z+18
adiw r30,9 ; 134 addhi3_clobber/1 [length = 1]
lsl r30 ; 445 *ashlhi3_const/2 [length = 2]
rol r31
add r30,r28 ; 136 *addhi3/1 [length = 2]
adc r31,r29
; Why not just Z+18 ?
st Z,__zero_reg__ ; 137 movqi_insn/3 [length = 1]
; Use the stored zero-extended value to compute Z+17, redoing all the
; shifts and additions *again*
movw r30,r24 ; 383 *movhi/1 [length = 1]
adiw r30,1 ; 138 addhi3_clobber/1 [length = 1]
lsl r30 ; 446 *ashlhi3_const/2 [length = 2]
rol r31
add r30,r28 ; 140 *addhi3/1 [length = 2]
adc r31,r29
ldi r24,lo8(2) ; 142 movqi_insn/2 [length = 1]
eor r13,r24 ; 143 xorqi3 [length = 1]
std Z+15,r13 ; 144 movqi_insn/3 [length = 1]
Some of the register selection nonsense can be mitigated by -mstrict-X, but
that option may ICE the register allocator, so it is not enabled by default. And
it still gives code more than 10% behind the effectiveness of 3.4.6:
4.7.2: 722 = +7%
4.9.2: 750 = +11%
5.2.1: 752 = +12%
6.4.1: 764 = +13%
7.1.1: 760 = +13%
8.0.0: 762 = +13%