https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90706
Vladimir Makarov <vmakarov at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |vmakarov at gcc dot gnu.org
--- Comment #14 from Vladimir Makarov <vmakarov at gcc dot gnu.org> ---
What I see is the input to RA was significantly changed since gcc-8 (see
insns marked by !). A lot of subregs are generated now and there is no
promotion of (argument) hard regs (insns 44-47) because of
https://gcc.gnu.org/legacy-ml/gcc-patches/2018-10/msg01356.html.
1: NOTE_INSN_DELETED 1: NOTE_INSN_DELETED
4: NOTE_INSN_BASIC_BLOCK 2 4: NOTE_INSN_BASIC_BLOCK 2
2: r44:SF=r22:SF 44: r56:QI=r22:QI
REG_DEAD r22:SF REG_DEAD r22:QI
3: NOTE_INSN_FUNCTION_BEG 45: r57:QI=r23:QI
6: r45:QI=0x1 REG_DEAD r23:QI
REG_EQUAL 0x1 46: r58:QI=r24:QI
7: r18:SF=0.0 REG_DEAD r24:QI
! 8: r22:SF=r44:SF 47: r59:QI=r25:QI
REG_DEAD r44:SF REG_DEAD r25:QI
9: r24:QI=call [`__gtsf2'] argc:0 48: r52:QI=r56:QI
REG_DEAD r25:QI REG_DEAD r56:QI
REG_DEAD r23:QI 49: r53:QI=r57:QI
REG_DEAD r22:QI REG_DEAD r57:QI
REG_DEAD r18:SF 50: r54:QI=r58:QI
REG_CALL_DECL `__gtsf2' REG_DEAD r58:QI
REG_EH_REGION 0xffffffff80000000 51: r55:QI=r59:QI
10: NOTE_INSN_DELETED REG_DEAD r59:QI
11: cc0=cmp(r24:QI,0) 3: NOTE_INSN_FUNCTION_BEG
REG_DEAD r24:QI 6: r46:QI=0x1
12: pc={(cc0>0)?L14:pc} REG_EQUAL 0x1
REG_BR_PROB 633507684 7: r18:SF=0.0
22: NOTE_INSN_BASIC_BLOCK 3 ! 52: clobber r60:SI
13: r45:QI=0 ! 53: r60:SI#0=r52:QI
REG_EQUAL 0 REG_DEAD r52:QI
14: L14: ! 54: r60:SI#1=r53:QI
23: NOTE_INSN_BASIC_BLOCK 4 REG_DEAD r53:QI
19: r24:QI=r45:QI ! 55: r60:SI#2=r54:QI
REG_DEAD r45:QI REG_DEAD r54:QI
20: use r24:QI ! 56: r60:SI#3=r55:QI
REG_DEAD r55:QI
! 57: r22:SF=r60:SI#0
REG_DEAD r60:SI
9: r24:QI=call [`__gtsf2']
argc:0
REG_DEAD r25:QI
REG_DEAD r23:QI
REG_DEAD r22:QI
REG_DEAD r18:SF
REG_CALL_DECL `__gtsf2'
REG_EH_REGION
0xffffffff80000000
34: r50:QI=r24:QI
REG_DEAD r24:QI
10: NOTE_INSN_DELETED
11: pc={(r50:QI>0)?L13:pc}
REG_DEAD r50:QI
REG_BR_PROB 633507684
21: NOTE_INSN_BASIC_BLOCK 3
12: r46:QI=0
REG_EQUAL 0
13: L13:
22: NOTE_INSN_BASIC_BLOCK 4
18: r24:QI=r46:QI
REG_DEAD r46:QI
19: use r24:QI
Currently, GCC generates the following AVR code:
check:
push r28
push r29
rcall .
rcall .
push __tmp_reg__
in r28,__SP_L__
in r29,__SP_H__
/* prologue: function */
/* frame size = 5 */
/* stack size = 7 */
.L__stack_usage = 7
ldi r18,lo8(1)
std Y+5,r18
ldi r18,0
ldi r19,0
ldi r20,0
ldi r21,0
! std Y+1,r22
! std Y+2,r23
! std Y+3,r24
! std Y+4,r25
! ldd r22,Y+1
! ldd r23,Y+2
! ldd r24,Y+3
! ldd r25,Y+4
rcall __gtsf2
cp __zero_reg__,r24
brlt .L2
std Y+5,__zero_reg__
.L2:
ldd r24,Y+5
/* epilogue start */
pop __tmp_reg__
pop __tmp_reg__
pop __tmp_reg__
pop __tmp_reg__
pop __tmp_reg__
pop r29
pop r28
ret
There are a lot of loads and stores. That is because p60 got memory:
a2(r60,l0) costs: ADDW_REGS:32000 SIMPLE_LD_REGS:32000 LD_REGS:32000
NO_LD_REGS:32000 GENERAL_REGS:32000 MEM:12000
r60: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
After some investigation I found that IRA calculates a wrong cost for moving
general hard regs of SFmode.
The following patch solves the problem:
diff --git a/gcc/ira.cc b/gcc/ira.cc
index d28a67b2546..cb4bfca739d 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -1627,14 +1627,22 @@ ira_init_register_move_cost (machine_mode mode)
*p2 != LIM_REG_CLASSES; p2++)
if (ira_class_hard_regs_num[*p2] > 0
&& (ira_reg_class_max_nregs[*p2][mode]
- <= ira_class_hard_regs_num[*p2]))
+ <= ira_class_hard_regs_num[*p2])
+ && hard_reg_set_intersect_p (ok_regs,
+ reg_class_contents[cl1])
+ && hard_reg_set_intersect_p (ok_regs,
+ reg_class_contents[*p2]))
cost = MAX (cost, ira_register_move_cost[mode][cl1][*p2]);
for (p1 = &reg_class_subclasses[cl1][0];
*p1 != LIM_REG_CLASSES; p1++)
if (ira_class_hard_regs_num[*p1] > 0
&& (ira_reg_class_max_nregs[*p1][mode]
- <= ira_class_hard_regs_num[*p1]))
+ <= ira_class_hard_regs_num[*p1])
+ && hard_reg_set_intersect_p (ok_regs,
+ reg_class_contents[cl2])
+ && hard_reg_set_intersect_p (ok_regs,
+ reg_class_contents[*p1]))
cost = MAX (cost, ira_register_move_cost[mode][*p1][cl2]);
ira_assert (cost <= 65535);
With this patch RA generates the following better code:
check:
push r12
push r13
push r14
push r15
push r28
/* prologue: function */
/* frame size = 0 */
/* stack size = 5 */
.L__stack_usage = 5
ldi r28,lo8(1)
ldi r18,0
ldi r19,0
ldi r20,0
ldi r21,0
! mov r12,r22
! mov r13,r23
! mov r14,r24
! mov r15,r25
! mov r25,r15
! mov r24,r14
! mov r23,r13
! mov r22,r12
rcall __gtsf2
cp __zero_reg__,r24
brlt .L2
ldi r28,0
.L2:
mov r24,r28
/* epilogue start */
pop r28
pop r15
pop r14
pop r13
pop r12
ret
Still there are a lot of moves in the generated code. I'll think how
to solve the problem. I think coalescing could do this.
Unfortunately, IRA/LRA do not coalesce moves involving subregs. May
be implementing coalescing at end of LRA could be a solution.
In any case, the full PR solution would take some time. First, I am
going to submit the patch above after thorough testing on a few major
targets. Then I'll work on removing redundant moves. I'll
periodically publish updates on the PR progress.