https://gcc.gnu.org/g:fa61afef18a8566d1907a5ae0e7754e1eac207d9
commit r16-112-gfa61afef18a8566d1907a5ae0e7754e1eac207d9 Author: Richard Sandiford <richard.sandif...@arm.com> Date: Thu Apr 24 14:31:49 2025 +0100 aarch64: Fix CFA offsets in non-initial stack probes [PR119610] PR119610 is about incorrect CFI output for a stack probe when that probe is not the initial allocation. The main aarch64 stack probe function, aarch64_allocate_and_probe_stack_space, implicitly assumed that the incoming stack pointer pointed to the top of the frame, and thus held the CFA. aarch64_save_callee_saves and aarch64_restore_callee_saves use a parameter called bytes_below_sp to track how far the stack pointer is above the base of the static frame. This patch does the same thing for aarch64_allocate_and_probe_stack_space. Also, I noticed that the SVE path was attaching the first CFA note to the wrong instruction: it was attaching the note to the calculation of the stack size, rather than to the r11<-sp copy. gcc/ PR target/119610 * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): Add a bytes_below_sp parameter and use it to calculate the CFA offsets. Attach the first SVE CFA note to the move into the associated temporary register. (aarch64_allocate_and_probe_stack_space): Update calls accordingly. Start out with bytes_per_sp set to the frame size and decrement it after each allocation. gcc/testsuite/ PR target/119610 * g++.dg/torture/pr119610.C: New test. * g++.target/aarch64/sve/pr119610-sve.C: Likewise. Diff: --- gcc/config/aarch64/aarch64.cc | 66 +++++++++++++--------- gcc/testsuite/g++.dg/torture/pr119610.C | 18 ++++++ .../g++.target/aarch64/sve/pr119610-sve.C | 20 +++++++ 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 38c112cc92fd..f7bccf532f89 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -9417,13 +9417,16 @@ aarch64_emit_stack_tie (rtx reg) } /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch - registers. If POLY_SIZE is not large enough to require a probe this function - will only adjust the stack. When allocating the stack space - FRAME_RELATED_P is then used to indicate if the allocation is frame related. - FINAL_ADJUSTMENT_P indicates whether we are allocating the area below - the saved registers. If we are then we ensure that any allocation - larger than the ABI defined buffer needs a probe so that the - invariant of having a 1KB buffer is maintained. + registers, given that the stack pointer is currently BYTES_BELOW_SP bytes + above the bottom of the static frame. + + If POLY_SIZE is not large enough to require a probe this function will only + adjust the stack. When allocating the stack space FRAME_RELATED_P is then + used to indicate if the allocation is frame related. FINAL_ADJUSTMENT_P + indicates whether we are allocating the area below the saved registers. + If we are then we ensure that any allocation larger than the ABI defined + buffer needs a probe so that the invariant of having a 1KB buffer is + maintained. We emit barriers after each stack adjustment to prevent optimizations from breaking the invariant that we never drop the stack more than a page. This @@ -9440,6 +9443,7 @@ aarch64_emit_stack_tie (rtx reg) static void aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, poly_int64 poly_size, + poly_int64 bytes_below_sp, aarch64_isa_mode force_isa_mode, bool frame_related_p, bool final_adjustment_p) @@ -9503,8 +9507,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, poly_size, temp1, temp2, force_isa_mode, false, true); - rtx_insn *insn = get_last_insn (); - + auto initial_cfa_offset = frame.frame_size - bytes_below_sp; + auto final_cfa_offset = initial_cfa_offset + poly_size; if (frame_related_p) { /* This is done to provide unwinding information for the stack @@ -9514,28 +9518,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, The tie will expand to nothing but the optimizers will not touch the instruction. */ rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); - emit_move_insn (stack_ptr_copy, stack_pointer_rtx); + auto *insn = emit_move_insn (stack_ptr_copy, stack_pointer_rtx); aarch64_emit_stack_tie (stack_ptr_copy); /* We want the CFA independent of the stack pointer for the duration of the loop. */ - add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_ptr_copy, + initial_cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } rtx probe_const = gen_int_mode (min_probe_threshold, Pmode); rtx guard_const = gen_int_mode (guard_size, Pmode); - insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, - stack_pointer_rtx, temp1, - probe_const, guard_const)); + auto *insn + = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, + stack_pointer_rtx, temp1, + probe_const, guard_const)); /* Now reset the CFA register if needed. */ if (frame_related_p) { add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - gen_int_mode (poly_size, Pmode))); + plus_constant (Pmode, stack_pointer_rtx, + final_cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9581,12 +9588,13 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, We can determine which allocation we are doing by looking at the value of FRAME_RELATED_P since the final allocations are not frame related. */ + auto cfa_offset = frame.frame_size - (bytes_below_sp - rounded_size); if (frame_related_p) { /* We want the CFA independent of the stack pointer for the duration of the loop. */ add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, temp1, rounded_size)); + plus_constant (Pmode, temp1, cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9608,7 +9616,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, if (frame_related_p) { add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, rounded_size)); + plus_constant (Pmode, stack_pointer_rtx, cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9916,17 +9924,22 @@ aarch64_expand_prologue (void) code below does not handle it for -fstack-clash-protection. */ gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0); + /* The offset of the current SP from the bottom of the static frame. */ + poly_int64 bytes_below_sp = frame_size; + /* Will only probe if the initial adjustment is larger than the guard less the amount of the guard reserved for use by the caller's outgoing args. */ aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust, - force_isa_mode, true, false); + bytes_below_sp, force_isa_mode, + true, false); + bytes_below_sp -= initial_adjust; if (callee_adjust != 0) - aarch64_push_regs (reg1, reg2, callee_adjust); - - /* The offset of the current SP from the bottom of the static frame. */ - poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + { + aarch64_push_regs (reg1, reg2, callee_adjust); + bytes_below_sp -= callee_adjust; + } if (emit_frame_chain) { @@ -9994,7 +10007,7 @@ aarch64_expand_prologue (void) || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, sve_callee_adjust, - force_isa_mode, + bytes_below_sp, force_isa_mode, !frame_pointer_needed, false); bytes_below_sp -= sve_callee_adjust; } @@ -10005,10 +10018,11 @@ aarch64_expand_prologue (void) /* We may need to probe the final adjustment if it is larger than the guard that is assumed by the called. */ - gcc_assert (known_eq (bytes_below_sp, final_adjust)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, - force_isa_mode, + bytes_below_sp, force_isa_mode, !frame_pointer_needed, true); + bytes_below_sp -= final_adjust; + gcc_assert (known_eq (bytes_below_sp, 0)); if (emit_frame_chain && maybe_ne (final_adjust, 0)) aarch64_emit_stack_tie (hard_frame_pointer_rtx); diff --git a/gcc/testsuite/g++.dg/torture/pr119610.C b/gcc/testsuite/g++.dg/torture/pr119610.C new file mode 100644 index 000000000000..9998026c4812 --- /dev/null +++ b/gcc/testsuite/g++.dg/torture/pr119610.C @@ -0,0 +1,18 @@ +// { dg-do run } +// { dg-additional-options "-fstack-protector-strong" { target fstack_protector } } +// { dg-additional-options "-fstack-clash-protection" { target supports_stack_clash_protection } } + +int *ptr; +void foo() { + int c[1024*128]; + ptr = c; + throw 1; +} +int main() +{ + try { + foo(); + } catch(int x) { + return 0; + } +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C b/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C new file mode 100644 index 000000000000..0044e5121756 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C @@ -0,0 +1,20 @@ +// { dg-do run { target aarch64_sve_hw } } +// { dg-additional-options "-fstack-protector-strong" { target fstack_protector } } +// { dg-additional-options "-fstack-clash-protection" { target supports_stack_clash_protection } } + +void *a_ptr, *b_ptr; +void foo() { + __SVInt32_t a; + int b[1024*128]; + a_ptr = &a; + b_ptr = b; + throw 1; +} +int main() +{ + try { + foo(); + } catch(int x) { + return 0; + } +}