On 07/11/2016 16:59, Adhemerval Zanella wrote:
> On 14/10/2016 15:59, Wilco Dijkstra wrote:
> There is no limit afaik on gold split stack allocation handling,
> and I think one could be added for each backend (in the method
> override require to implement it).
>
> In fact it is not really required to tie the nop generation with the
> instruction generated by 'aarch64_internal_mov_immediate', it is
> just a matter to simplify linker code.
If there is no easy limit and you'll still require a nop, I think it is best
then to emit mov N + movk #0. Then the scheduler won't be able to reorder
them with the add/sub.
>> Is there any need to detect underflow of x10 or is there a guarantee that
>> stacks are never allocated in the low 2GB (given the maximum adjustment is
>> 2GB)? It's safe to do a signed comparison.
>
> I do not think so, at least none of current backend that implements
> split stack do so.
OK, well a signed comparison like in your new version works for underflow.
Now to the patch:
@@ -3316,6 +3339,28 @@ aarch64_expand_prologue (void)
aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
callee_adjust != 0 || frame_pointer_needed);
aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+ if (split_stack_arg_pointer_used_p ())
+ {
+ /* Setup the argument pointer (x10) for -fsplit-stack code. If
+ __morestack was called, it will left the arg pointer to the
+ old stack in x28. Otherwise, the argument pointer is the top
+ of current frame. */
+ rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);
+ rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
+ rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+
+ rtx not_more = gen_label_rtx ();
+
+ rtx cmp = gen_rtx_fmt_ee (LT, VOIDmode, cc_reg, const0_rtx);
+ rtx jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more));
+ JUMP_LABEL (jump) = not_more;
+ LABEL_NUSES (not_more) += 1;
+
+ emit_move_insn (x11, x28);
+
+ emit_label (not_more);
+ }
If you pass the old sp in x11 when called from __morestack you can remove
the above thunk completely.
+ /* It limits total maximum stack allocation on 2G so its value can be
+ materialized using two instructions at most (movn/movk). It might be
+ used by the linker to add some extra space for split calling non split
+ stack functions. */
+ allocate = cfun->machine->frame.frame_size;
+ if (allocate > ((HOST_WIDE_INT) 1 << 31))
+ {
+ sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");
+ return;
+ }
Note a 2-instruction mov/movk can generate any immediate up to 4GB, and if
we need even larger sizes, we could round up to a multiple of 64KB so that 2
instructions are enough for a 48-bit stack size...
+ int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),
+ true, Pmode);
+ gcc_assert (ninsn == 1 || ninsn == 2);
+ if (ninsn == 1)
+ emit_insn (gen_nop ());
To avoid any issues with the nop being scheduled, it's best to emit an explicit
movk here (0xffff0000 if allocate > 0, or 0 if zero) using gen_insv_immdi.
+void
+aarch64_split_stack_space_check (rtx size, rtx label)
Isn't very similar code used in aarch64_expand_split_stack_prologue? Is there
any possibility to share/reuse it?
+static void
+aarch64_live_on_entry (bitmap regs)
+{
+ if (flag_split_stack)
+ bitmap_set_bit (regs, R11_REGNUM);
+}
I'm wondering whether you need extra code in aarch64_can_eliminate to deal
with the argument pointer? Also, do we need to define a fixed register, or will
GCC automatically allocate it to a callee-save if necessary?
+++ b/libgcc/config/aarch64/morestack.S
+/* Offset from __morestack frame where the arguments size saved and
+ passed to __generic_morestack. */
+#define ARGS_SIZE_SAVE 80
This define is unused.
+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer if required (the prolog):
+#
+# [...] # default function prologue
+# b.lt function:
+# mov x11, x28
We don't need this if we pass sp in x11 when calling back to the original
function.
+ stp x8, x10, [sp, 80]
+ stp x11, x12, [sp, 96]
No need to save x11 - it just contains original sp.
+ str x28, [sp, 112]
+ .cfi_offset 28, -112
+
+ # Setup on x28 the function initial frame pointer.
+ add x28, sp, MORESTACK_FRAMESIZE
Why save x28 when x28 = x29 + MORESTACK_FRAMESIZE? You can use x29
throughout the code as it is preserved by calls.
+ # Start using new stack
+ str x29, [x0, -16]!
This has no use.
+ mov sp, x0
+
+ # Set __private_ss stack guard for the new stack.
+ ldr x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
+ add x0, x0, BACKOFF
+ sub x0, x0, 16
Neither has this.
+ ldp x11, x12, [x28, STACKFRAME_BASE + 96]
+ # Indicate __morestack was called.
+ cmp x12, 0
+ blr x12
There is no need to restore x11, all we need to do is restore x12 and branch:
ldr x12, [x28, STACKFRAME_BASE + ...]
add x11, x29, MORESTACK_FRAMESIZE
blr x12
+ # Use old stack again.
+ #sub sp, x28, 16
+ mov sp, x28
Use:
add sp, x29, MORESTACK_FRAMESIZE
+ ldp x0, x1, [x28, STACKFRAME_BASE + 16]
+ ldp x2, x3, [x28, STACKFRAME_BASE + 32]
+ ldp x4, x5, [x28, STACKFRAME_BASE + 48]
+ ldp x6, x7, [x28, STACKFRAME_BASE + 64]
+ ldp x29, x30, [x28, STACKFRAME_BASE]
+ ldr x28, [x28, STACKFRAME_BASE + 112]
+
+ .cfi_remember_state
+ .cfi_restore 30
+ .cfi_restore 29
+ .cfi_restore 28
+ .cfi_def_cfa 31, 0
This needs to restore x29/x30 last to get correct unwinding:
ldp x29, x30, [sp], MORESTACK_FRAMESIZE
.cfi_remember_state
.cfi_restore 30
.cfi_restore 29
.cfi_def_cfa 31, 0
Wilco