On 29/07/16 12:49, Wilco Dijkstra wrote:
> This patch optimizes the prolog and epilog code to reduce the number of
> instructions and avoid multiple writes to SP. The key idea is that epilogs
> are almost exact reverses of prologs, and thus all the decisions only need
> to be taken once. The frame layout is decided in aarch64_layout_frame()
> and decisions recorded in the new aarch64_frame fields initial_adjust,
> callee_adjust, callee_offset and final_adjust.
>
> A generic frame setup consists of 5 basic steps:
>
> 1. sub sp, sp, initial_adjust
> 2. stp reg1, reg2, [sp, -callee_adjust]! (push if callee_adjust != 0)
> 3. add fp, sp, callee_offset (if frame_pointer_needed)
> 4. stp reg3, reg4, [sp, callee_offset + N*16] (store remaining callee-saves)
> 5. sub sp, sp, final_adjust
>
> The epilog reverses this, and may omit step 3 if alloca wasn't used.
>
> Bootstrap, GCC & gdb regression OK.
>
> ChangeLog:
> 2016-07-29 Wilco Dijkstra <[email protected]>
>
> gcc/
> * config/aarch64/aarch64.h (aarch64_frame):
> Remove padding0 and hardfp_offset. Add locals_offset,
> initial_adjust, callee_adjust, callee_offset and final_adjust.
> * config/aarch64/aarch64.c (aarch64_layout_frame):
> Remove unused padding0 and hardfp_offset initializations.
> Choose frame layout and set frame variables accordingly.
> Use INVALID_REGNUM instead of FIRST_PSEUDO_REGISTER.
> (aarch64_push_regs): Use INVALID_REGNUM, not FIRST_PSEUDO_REGISTER.
> (aarch64_pop_regs): Likewise.
> (aarch64_expand_prologue): Remove all decision code, just emit
> prolog according to frame variables.
> (aarch64_expand_epilogue): Remove all decision code, just emit
> epilog according to frame variables.
> (aarch64_initial_elimination_offset): Use offset to local/arg area.
>
> testsuite/
> * gcc.target/aarch64/test_frame_10.c: Fix test to check for a
> single stack adjustment, no writeback.
> * gcc.target/aarch64/test_frame_12.c: Likewise.
> * gcc.target/aarch64/test_frame_13.c: Likewise.
> * gcc.target/aarch64/test_frame_15.c: Likewise.
> * gcc.target/aarch64/test_frame_6.c: Likewise.
> * gcc.target/aarch64/test_frame_7.c: Likewise.
> * gcc.target/aarch64/test_frame_8.c: Likewise.
> * gcc.target/aarch64/test_frame_16.c: New test.
Two minor formatting nits (see the inline comments below), but otherwise OK.
R.
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index
> 58959229c004e58405076b0e691b6b5634720140..455869f074dd72a38b6f8e1b199d83aa75b408b1
> 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -550,11 +550,14 @@ struct GTY (()) aarch64_frame
> STACK_BOUNDARY. */
> HOST_WIDE_INT saved_varargs_size;
>
> + /* The size of the saved callee-save int/FP registers. */
> +
> HOST_WIDE_INT saved_regs_size;
> - /* Padding if needed after the all the callee save registers have
> - been saved. */
> - HOST_WIDE_INT padding0;
> - HOST_WIDE_INT hardfp_offset; /* HARD_FRAME_POINTER_REGNUM */
> +
> + /* Offset from the base of the frame (incomming SP) to the
> + top of the locals area. This value is always a multiple of
> + STACK_BOUNDARY. */
> + HOST_WIDE_INT locals_offset;
>
> /* Offset from the base of the frame (incomming SP) to the
> hard_frame_pointer. This value is always a multiple of
> @@ -564,12 +567,25 @@ struct GTY (()) aarch64_frame
> /* The size of the frame. This value is the offset from base of the
> * frame (incomming SP) to the stack_pointer. This value is always
> * a multiple of STACK_BOUNDARY. */
> + HOST_WIDE_INT frame_size;
> +
> + /* The size of the initial stack adjustment before saving callee-saves. */
> + HOST_WIDE_INT initial_adjust;
> +
> + /* The writeback value when pushing callee-save registers.
> + It is zero when no push is used. */
> + HOST_WIDE_INT callee_adjust;
> +
> + /* The offset from SP to the callee-save registers after initial_adjust.
> + It may be non-zero if no push is used (ie. callee_adjust == 0). */
> + HOST_WIDE_INT callee_offset;
> +
> + /* The size of the stack adjustment after saving callee-saves. */
> + HOST_WIDE_INT final_adjust;
>
> unsigned wb_candidate1;
> unsigned wb_candidate2;
>
> - HOST_WIDE_INT frame_size;
> -
> bool laid_out;
> };
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index
> 2023cb45fc8e87d94b48ae894bea78235056d4a4..7179dac29736409e1679e4bc932b95ba4c9aa1a5
> 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -2728,8 +2728,8 @@ aarch64_layout_frame (void)
> #define SLOT_NOT_REQUIRED (-2)
> #define SLOT_REQUIRED (-1)
>
> - cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
> - cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
> + cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
> + cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
>
> /* First mark all the registers that really need to be saved... */
> for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
> @@ -2763,7 +2763,6 @@ aarch64_layout_frame (void)
> cfun->machine->frame.wb_candidate1 = R29_REGNUM;
> cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
> cfun->machine->frame.wb_candidate2 = R30_REGNUM;
> - cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
> offset += 2 * UNITS_PER_WORD;
> }
>
> @@ -2772,9 +2771,9 @@ aarch64_layout_frame (void)
> if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
> {
> cfun->machine->frame.reg_offset[regno] = offset;
> - if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
> + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
> cfun->machine->frame.wb_candidate1 = regno;
> - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
> + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
> cfun->machine->frame.wb_candidate2 = regno;
> offset += UNITS_PER_WORD;
> }
> @@ -2783,24 +2782,23 @@ aarch64_layout_frame (void)
> if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
> {
> cfun->machine->frame.reg_offset[regno] = offset;
> - if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
> + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
> cfun->machine->frame.wb_candidate1 = regno;
> - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
> + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
> && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
> cfun->machine->frame.wb_candidate2 = regno;
> offset += UNITS_PER_WORD;
> }
>
> - cfun->machine->frame.padding0 =
> - (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
> offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
>
> cfun->machine->frame.saved_regs_size = offset;
>
> + HOST_WIDE_INT varargs_and_saved_regs_size = offset
> + + cfun->machine->frame.saved_varargs_size;
This should be written either as

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

or as

  HOST_WIDE_INT varargs_and_saved_regs_size = (offset
                                               + cfun->machine->frame.saved_varargs_size);

Which form you use may depend on the overall line length. In this case the
second form overflows the line, so I think the former is preferable.
> +
> cfun->machine->frame.hard_fp_offset
> - = ROUND_UP (cfun->machine->frame.saved_varargs_size
> - + get_frame_size ()
> - + cfun->machine->frame.saved_regs_size,
> + = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
> STACK_BOUNDARY / BITS_PER_UNIT);
>
> cfun->machine->frame.frame_size
> @@ -2808,6 +2806,77 @@ aarch64_layout_frame (void)
> + crtl->outgoing_args_size,
> STACK_BOUNDARY / BITS_PER_UNIT);
>
> + cfun->machine->frame.locals_offset =
> cfun->machine->frame.saved_varargs_size;
> +
> + cfun->machine->frame.initial_adjust = 0;
> + cfun->machine->frame.final_adjust = 0;
> + cfun->machine->frame.callee_adjust = 0;
> + cfun->machine->frame.callee_offset = 0;
> +
> + HOST_WIDE_INT max_push_offset = 0;
> + if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
> + max_push_offset = 512;
> + else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
> + max_push_offset = 256;
> +
> + if (cfun->machine->frame.frame_size < max_push_offset
> + && crtl->outgoing_args_size == 0)
> + {
> + /* Simple, small frame with no outgoing arguments:
> + stp reg1, reg2, [sp, -frame_size]!
> + stp reg3, reg4, [sp, 16] */
> + cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
> + }
> + else if (crtl->outgoing_args_size
> + + cfun->machine->frame.saved_regs_size < 512
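This sub-expression should be wrapped in parentheses, so that the '+' is
indented more deeply than the '&&' that follows. Otherwise the logic can be
slightly confusing. For example:

  else if ((crtl->outgoing_args_size
            + cfun->machine->frame.saved_regs_size < 512)
           && !(cfun->calls_alloca
                && cfun->machine->frame.hard_fp_offset < max_push_offset))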
> + && !(cfun->calls_alloca
> + && cfun->machine->frame.hard_fp_offset < max_push_offset))
> + {
> + /* Frame with small outgoing arguments:
> + sub sp, sp, frame_size
> + stp reg1, reg2, [sp, outgoing_args_size]
> + stp reg3, reg4, [sp, outgoing_args_size + 16] */
> + cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
> + cfun->machine->frame.callee_offset
> + = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
> + }
> + else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
> + {
> + /* Frame with large outgoing arguments but a small local area:
> + stp reg1, reg2, [sp, -hard_fp_offset]!
> + stp reg3, reg4, [sp, 16]
> + sub sp, sp, outgoing_args_size */
> + cfun->machine->frame.callee_adjust =
> cfun->machine->frame.hard_fp_offset;
> + cfun->machine->frame.final_adjust
> + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
> + }
> + else if (!frame_pointer_needed
> + && varargs_and_saved_regs_size < max_push_offset)
> + {
> + /* Frame with large local area and outgoing arguments (this pushes the
> + callee-saves first, followed by the locals and outgoing area):
> + stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
> + stp reg3, reg4, [sp, 16]
> + sub sp, sp, frame_size - varargs_and_saved_regs_size */
> + cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
> + cfun->machine->frame.final_adjust
> + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
> + cfun->machine->frame.hard_fp_offset =
> cfun->machine->frame.callee_adjust;
> + cfun->machine->frame.locals_offset =
> cfun->machine->frame.hard_fp_offset;
> + }
> + else
> + {
> + /* Frame with large local area and outgoing arguments using frame
> pointer:
> + sub sp, sp, hard_fp_offset
> + stp x29, x30, [sp, 0]
> + add x29, sp, 0
> + stp reg3, reg4, [sp, 16]
> + sub sp, sp, outgoing_args_size */
> + cfun->machine->frame.initial_adjust =
> cfun->machine->frame.hard_fp_offset;
> + cfun->machine->frame.final_adjust
> + = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
> + }
> +
> cfun->machine->frame.laid_out = true;
> }
>
> @@ -2866,7 +2935,7 @@ aarch64_push_regs (unsigned regno1, unsigned regno2,
> HOST_WIDE_INT adjustment)
> rtx_insn *insn;
> machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
>
> - if (regno2 == FIRST_PSEUDO_REGISTER)
> + if (regno2 == INVALID_REGNUM)
> return aarch64_pushwb_single_reg (mode, regno1, adjustment);
>
> rtx reg1 = gen_rtx_REG (mode, regno1);
> @@ -2905,7 +2974,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2,
> HOST_WIDE_INT adjustment,
>
> *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
>
> - if (regno2 == FIRST_PSEUDO_REGISTER)
> + if (regno2 == INVALID_REGNUM)
> {
> rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
> mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
> @@ -3106,23 +3175,16 @@ aarch64_restore_callee_saves (machine_mode mode,
> void
> aarch64_expand_prologue (void)
> {
> - /* sub sp, sp, #<frame_size>
> - stp {fp, lr}, [sp, #<frame_size> - 16]
> - add fp, sp, #<frame_size> - hardfp_offset
> - stp {cs_reg}, [fp, #-16] etc.
> -
> - sub sp, sp, <final_adjustment_if_any>
> - */
> - HOST_WIDE_INT frame_size, offset;
> - HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
> - HOST_WIDE_INT hard_fp_offset;
> - rtx_insn *insn;
> -
> aarch64_layout_frame ();
>
> - offset = frame_size = cfun->machine->frame.frame_size;
> - hard_fp_offset = cfun->machine->frame.hard_fp_offset;
> - fp_offset = frame_size - hard_fp_offset;
> + HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
> + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
> + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
> + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
> + unsigned reg1 = cfun->machine->frame.wb_candidate1;
> + unsigned reg2 = cfun->machine->frame.wb_candidate2;
> + rtx_insn *insn;
>
> if (flag_stack_usage_info)
> current_function_static_stack_size = frame_size;
> @@ -3139,94 +3201,29 @@ aarch64_expand_prologue (void)
> aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
> }
>
> - /* Store pairs and load pairs have a range only -512 to 504. */
> - if (offset >= 512)
> - {
> - /* When the frame has a large size, an initial decrease is done on
> - the stack pointer to jump over the callee-allocated save area for
> - register varargs, the local variable area and/or the callee-saved
> - register area. This will allow the pre-index write-back
> - store pair instructions to be used for setting up the stack frame
> - efficiently. */
> - offset = hard_fp_offset;
> - if (offset >= 512)
> - offset = cfun->machine->frame.saved_regs_size;
> -
> - frame_size -= (offset + crtl->outgoing_args_size);
> - fp_offset = 0;
> + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
>
> - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true);
> - }
> - else
> - frame_size = -1;
> + if (callee_adjust != 0)
> + aarch64_push_regs (reg1, reg2, callee_adjust);
>
> - if (offset > 0)
> + if (frame_pointer_needed)
> {
> - bool skip_wb = false;
> -
> - if (frame_pointer_needed)
> - {
> - skip_wb = true;
> -
> - if (fp_offset)
> - {
> - insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
> - GEN_INT (-offset)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> -
> - aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
> - R30_REGNUM, false);
> - }
> - else
> - aarch64_push_regs (R29_REGNUM, R30_REGNUM, offset);
> -
> - /* Set up frame pointer to point to the location of the
> - previous frame pointer on the stack. */
> - insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
> - stack_pointer_rtx,
> - GEN_INT (fp_offset)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
> - }
> - else
> - {
> - unsigned reg1 = cfun->machine->frame.wb_candidate1;
> - unsigned reg2 = cfun->machine->frame.wb_candidate2;
> -
> - if (fp_offset
> - || reg1 == FIRST_PSEUDO_REGISTER
> - || (reg2 == FIRST_PSEUDO_REGISTER
> - && offset >= 256))
> - {
> - insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
> - GEN_INT (-offset)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - }
> - else
> - {
> - aarch64_push_regs (reg1, reg2, offset);
> - skip_wb = true;
> - }
> - }
> -
> - aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
> - skip_wb);
> - aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
> - skip_wb);
> + if (callee_adjust == 0)
> + aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
> + R30_REGNUM, false);
> + insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
> + stack_pointer_rtx,
> + GEN_INT (callee_offset)));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
> }
>
> - /* when offset >= 512,
> - sub sp, sp, #<outgoing_args_size> */
> - if (frame_size > -1)
> - {
> - if (crtl->outgoing_args_size > 0)
> - {
> - insn = emit_insn (gen_add2_insn
> - (stack_pointer_rtx,
> - GEN_INT (- crtl->outgoing_args_size)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - }
> - }
> + aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
> + callee_adjust != 0 || frame_pointer_needed);
> + aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
> + callee_adjust != 0 || frame_pointer_needed);
> + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
> + !frame_pointer_needed);
> }
>
> /* Return TRUE if we can use a simple_return insn.
> @@ -3249,104 +3246,80 @@ aarch64_use_return_insn_p (void)
> return cfun->machine->frame.frame_size == 0;
> }
>
> -/* Generate the epilogue instructions for returning from a function. */
> +/* Generate the epilogue instructions for returning from a function.
> + This is almost exactly the reverse of the prolog sequence, except
> + that we need to insert barriers to avoid scheduling loads that read
> + from a deallocated stack, and we optimize the unwind records by
> + emitting them all together if possible. */
> void
> aarch64_expand_epilogue (bool for_sibcall)
> {
> - HOST_WIDE_INT frame_size, offset;
> - HOST_WIDE_INT fp_offset;
> - HOST_WIDE_INT hard_fp_offset;
> - rtx_insn *insn;
> - /* We need to add memory barrier to prevent read from deallocated stack.
> */
> - bool need_barrier_p = (get_frame_size () != 0
> - || cfun->machine->frame.saved_varargs_size);
> -
> aarch64_layout_frame ();
>
> - offset = frame_size = cfun->machine->frame.frame_size;
> - hard_fp_offset = cfun->machine->frame.hard_fp_offset;
> - fp_offset = frame_size - hard_fp_offset;
> + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
> + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
> + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
> + unsigned reg1 = cfun->machine->frame.wb_candidate1;
> + unsigned reg2 = cfun->machine->frame.wb_candidate2;
> + rtx cfi_ops = NULL;
> + rtx_insn *insn;
>
> - /* Store pairs and load pairs have a range only -512 to 504. */
> - if (offset >= 512)
> - {
> - offset = hard_fp_offset;
> - if (offset >= 512)
> - offset = cfun->machine->frame.saved_regs_size;
> + /* We need to add memory barrier to prevent read from deallocated stack.
> */
> + bool need_barrier_p = (get_frame_size ()
> + + cfun->machine->frame.saved_varargs_size) != 0;
>
> - frame_size -= (offset + crtl->outgoing_args_size);
> - fp_offset = 0;
> - if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
> - {
> - insn = emit_insn (gen_add2_insn
> - (stack_pointer_rtx,
> - GEN_INT (crtl->outgoing_args_size)));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - }
> + /* Emit a barrier to prevent loads from a deallocated stack. */
> + if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
> + {
> + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> + need_barrier_p = false;
> }
> - else
> - frame_size = -1;
>
> - /* If there were outgoing arguments or we've done dynamic stack
> - allocation, then restore the stack pointer from the frame
> - pointer. This is at most one insn and more efficient than using
> - GCC's internal mechanism. */
> - if (frame_pointer_needed
> - && (crtl->outgoing_args_size || cfun->calls_alloca))
> + /* Restore the stack pointer from the frame pointer if it may not
> + be the same as the stack pointer. */
> + if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
> {
> - if (cfun->calls_alloca)
> - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> -
> insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
> hard_frame_pointer_rtx,
> - GEN_INT (0)));
> - offset = offset - fp_offset;
> + GEN_INT (-callee_offset)));
> + /* If writeback is used when restoring callee-saves, the CFA
> + is restored on the instruction doing the writeback. */
> + RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
> }
> + else
> + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
>
> - if (offset > 0)
> - {
> - unsigned reg1 = cfun->machine->frame.wb_candidate1;
> - unsigned reg2 = cfun->machine->frame.wb_candidate2;
> - bool skip_wb = true;
> - rtx cfi_ops = NULL;
> -
> - if (frame_pointer_needed)
> - fp_offset = 0;
> - else if (fp_offset
> - || reg1 == FIRST_PSEUDO_REGISTER
> - || (reg2 == FIRST_PSEUDO_REGISTER
> - && offset >= 256))
> - skip_wb = false;
> -
> - aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
> - skip_wb, &cfi_ops);
> - aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
> - skip_wb, &cfi_ops);
> + aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
> + callee_adjust != 0, &cfi_ops);
> + aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
> + callee_adjust != 0, &cfi_ops);
>
> - if (need_barrier_p)
> - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> + if (need_barrier_p)
> + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
>
> - if (skip_wb)
> - aarch64_pop_regs (reg1, reg2, offset, &cfi_ops);
> - else
> - emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (offset)));
> + if (callee_adjust != 0)
> + aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
>
> - /* Reset the CFA to be SP + FRAME_SIZE. */
> - rtx new_cfa = stack_pointer_rtx;
> - if (frame_size > 0)
> - new_cfa = plus_constant (Pmode, new_cfa, frame_size);
> - cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
> + if (callee_adjust != 0 || initial_adjust > 65536)
> + {
> + /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
> insn = get_last_insn ();
> - REG_NOTES (insn) = cfi_ops;
> + rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
> + REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
> RTX_FRAME_RELATED_P (insn) = 1;
> + cfi_ops = NULL;
> }
>
> - if (frame_size > 0)
> - {
> - if (need_barrier_p)
> - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
>
> - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true);
> + if (cfi_ops)
> + {
> + /* Emit delayed restores and reset the CFA to be SP. */
> + insn = get_last_insn ();
> + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
> + REG_NOTES (insn) = cfi_ops;
> + RTX_FRAME_RELATED_P (insn) = 1;
> }
>
> /* Stack adjustment for exception handler. */
> @@ -5173,18 +5146,18 @@ aarch64_initial_elimination_offset (unsigned from,
> unsigned to)
> if (to == HARD_FRAME_POINTER_REGNUM)
> {
> if (from == ARG_POINTER_REGNUM)
> - return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
> + return cfun->machine->frame.hard_fp_offset;
>
> if (from == FRAME_POINTER_REGNUM)
> - return (cfun->machine->frame.hard_fp_offset
> - - cfun->machine->frame.saved_varargs_size);
> + return cfun->machine->frame.hard_fp_offset
> + - cfun->machine->frame.locals_offset;
> }
>
> if (to == STACK_POINTER_REGNUM)
> {
> if (from == FRAME_POINTER_REGNUM)
> - return (cfun->machine->frame.frame_size
> - - cfun->machine->frame.saved_varargs_size);
> + return cfun->machine->frame.frame_size
> + - cfun->machine->frame.locals_offset;
> }
>
> return cfun->machine->frame.frame_size;
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> index
> 70dd6539af93a034ae64f8603089c6d6f59a6b53..e23a4a83528b71a0de0c95752a9e530bf4ca79e5
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> @@ -4,8 +4,7 @@
> * total frame size > 512.
> area except outgoing <= 512
> * number of callee-saved reg >= 2.
> - * Split stack adjustment into two subtractions.
> - the first subtractions could be optimized into "stp !". */
> + * Use a single stack adjustment, no writeback. */
>
> /* { dg-do run } */
> /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -15,6 +14,6 @@
> t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10])
> t_frame_run (test10)
>
> -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!"
> 1 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1
> } } */
> +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1
> } } */
> +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1
> } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> index
> 2353477c29ea99c56e73a34cf0449cf6c669e973..3d7d3594610c645d2d6f449b6ee0400fdd395849
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> @@ -13,6 +13,6 @@ t_frame_run (test12)
>
> /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
>
> -/* Check epilogue using write-back. */
> -/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3
> } } */
> +/* Check epilogue using no write-back. */
> +/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1
> } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> index
> f3aa263929421db12b78abc733e2b011db3a4e48..74b3370fa463b652265e00fff80cc8856524d509
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> @@ -2,8 +2,7 @@
> * without outgoing.
> * total frame size > 512.
> * number of callee-save reg >= 2.
> - * split the stack adjustment into two substractions,
> - the second could be optimized into "stp !". */
> + * Use a single stack adjustment, no writeback. */
>
> /* { dg-do run } */
> /* { dg-options "-O2 --save-temps" } */
> @@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, )
> t_frame_run (test13)
>
> /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!"
> 2 } } */
> +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> index
> fc6f713232de52b72ba5c3eef92e1aea6526199d..bed6714b4fe529a3b81ad8c5253924aa97bf8806
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> @@ -3,8 +3,7 @@
> * total frame size > 512.
> area except outgoing <= 512
> * number of callee-save reg >= 2.
> - * split the stack adjustment into two substractions,
> - the first could be optimized into "stp !". */
> + * Use a single stack adjustment, no writeback. */
>
> /* { dg-do run } */
> /* { dg-options "-O2 --save-temps" } */
> @@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8])
> t_frame_run (test15)
>
> /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!"
> 3 } } */
> +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1
> } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..28f3826adadd5eaa6486659e4d6b6d7c5960b9d2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> @@ -0,0 +1,25 @@
> +/* Verify:
> + * with outgoing.
> + * single int register push.
> + * varargs and callee-save size >= 256
> + * Use 2 stack adjustments. */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> +
> +#define REP8(X) X,X,X,X,X,X,X,X
> +#define REP64(X) REP8(REP8(X))
> +
> +void outgoing (__builtin_va_list, ...);
> +
> +double vararg_outgoing (int x1, ...)
> +{
> + double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 =
> x1 * 6;
> + __builtin_va_list vl;
> + __builtin_va_start (vl, x1);
> + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
> + __builtin_va_end (vl);
> + return a1 + a2 + a3 + a4 + a5 + a6;
> +}
> +
> +/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> index
> d8481346c58458934deecb4b7f38fb5821517b56..6a753dff87e28fa71a2f69df5fb95559163fa6cd
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> @@ -3,8 +3,7 @@
> * without outgoing.
> * total frame size > 512.
> * number of callee-saved reg == 1.
> - * split stack adjustment into two subtractions.
> - the second subtraction should use "str !". */
> + * use a single stack adjustment, no writeback. */
>
> /* { dg-do run } */
> /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -14,6 +13,7 @@
> t_frame_pattern (test6, 700, )
> t_frame_run (test6)
>
> -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 }
> } */
> -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } }
> */
> +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> index
> d87d68b3eec72dd23b279ea94391a400c9ae5a9a..f2a8713d19d9f7df49073e9588c5d74661491fb6
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> @@ -3,8 +3,7 @@
> * without outgoing.
> * total frame size > 512.
> * number of callee-saved reg == 2.
> - * split stack adjustment into two subtractions.
> - the second subtraction should use "stp !". */
> + * use a single stack adjustment, no writeback. */
>
> /* { dg-do run } */
> /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -14,6 +13,6 @@
> t_frame_pattern (test7, 700, "x19")
> t_frame_run (test7)
>
> -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!"
> 1 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1
> } } */
> +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> index
> 435d9d59e68d71b1d4c56f1beca5fb1bce4f39b8..9b6c6939eb5c3ae1bdcab7fb854b6c519f054c20
> 100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> @@ -12,6 +12,6 @@
> t_frame_pattern_outgoing (test8, 700, , 8, a[8])
> t_frame_run (test8)
>
> -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 }
> } */
> -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } }
> */
> +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } }
> */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } }
> */
>