Improve stack adjustment by reusing a temporary move immediate from the epilog if the register is still valid in the epilog. This generates smaller code for leaf functions:
mov x16, 40000 sub sp, sp, x16 ldr w0, [sp, w0, sxtw 2] add sp, sp, x16 ret Passes GCC regression tests. ChangeLog: 2016-08-04 Wilco Dijkstra <wdijk...@arm.com> gcc/ * config/aarch64/aarch64.c (aarch64_add_constant): Add extra argument to allow emitting the move immediate. Use add/sub with positive immediate. (aarch64_expand_epilogue): Decide when to leave out move. testsuite/ * gcc.target/aarch64/test_frame_17.c: New test. -- diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index ce2cc5ae3e1291f4ef4a8408461678c9397b06bd..5b59e4dd157351f301fc563a724cefe8a9be132c 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1941,15 +1941,21 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) } /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to held - intermediate value if necessary. + intermediate value if necessary. FRAME_RELATED_P should be true if + the RTX_FRAME_RELATED flag should be set and CFA adjustments added + to the generated instructions. If SCRATCHREG is known to hold + abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the + immediate again. - This function is sometimes used to adjust the stack pointer, so we must - ensure that it can never cause transient stack deallocation by writing an - invalid value into REGNUM. */ + Since this function may be used to adjust the stack pointer, we must + ensure that it cannot cause transient stack deallocation (for example + by first incrementing SP and then decrementing when adjusting by a + large immediate). */ static void aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, - HOST_WIDE_INT delta, bool frame_related_p) + HOST_WIDE_INT delta, bool frame_related_p, + bool emit_move_imm = true) { HOST_WIDE_INT mdelta = abs_hwi (delta); rtx this_rtx = gen_rtx_REG (mode, regnum); @@ -1967,11 +1973,11 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, return; } - /* We need two add/sub instructions, each one performing part of the - calculation. Don't do this if the addend can be loaded into register with - a single instruction, in that case we prefer a move to a scratch register - following by an addition. */ - if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode)) + /* We need two add/sub instructions, each one perform part of the + addition/subtraction, but don't this if the addend can be loaded into + register by single instruction, in that case we prefer a move to scratch + register following by addition. */ + if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode)) { HOST_WIDE_INT low_off = mdelta & 0xfff; @@ -1985,8 +1991,10 @@ aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, /* Otherwise use generic function to handle all other situations. */ rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); - aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode); - insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); + if (emit_move_imm) + aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode); + insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx) + : gen_add2_insn (this_rtx, scratch_rtx)); if (frame_related_p) { RTX_FRAME_RELATED_P (insn) = frame_related_p; @@ -3288,7 +3296,8 @@ aarch64_expand_epilogue (bool for_sibcall) RTX_FRAME_RELATED_P (insn) = callee_adjust == 0; } else - aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true); + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true, + df_regs_ever_live_p (IP1_REGNUM)); aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, callee_adjust != 0, &cfi_ops); @@ -3311,7 +3320,8 @@ aarch64_expand_epilogue (bool for_sibcall) cfi_ops = NULL; } - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true); + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true, + df_regs_ever_live_p (IP0_REGNUM)); if (cfi_ops) { diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_17.c b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c new file mode 100644 index 0000000000000000000000000000000000000000..c214431999b60cce8a75204876a8c73ec6304128 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --save-temps" } */ + +/* Test reuse of stack adjustment temporaries. */ + +void foo (); + +int reuse_mov (int i) +{ + int arr[1025]; + return arr[i]; +} + +int no_reuse_mov (int i) +{ + int arr[1025]; + foo (); + return arr[i]; +} + +/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */