On 06/10/15 11:11, Eric Botcazou wrote:
>> Thanks - I have no further comments on this patch. We probably need to
>> implement the same on AArch64 too in order to avoid similar problems.
>
> Here's the implementation for aarch64, very similar but simpler since there
> is
> no shortage of scratch registers; the only thing to note is the new blockage
> pattern. This was tested on real hardware but not with Linux, instead with
> Darwin (experimental port of the toolchain to iOS) and makes it possible to
> pass ACATS (Ada conformance testsuite which requires stack checking).
>
> There are also a couple of tweaks for the ARM implementation: a cosmetic one
> for the probe_stack pattern and one for the output_probe_stack_range loop.
>
>
> 2015-10-06 Tristan Gingold <[email protected]>
> Eric Botcazou <[email protected]>
>
> PR middle-end/65958
> PR middle-end/65958
> * config/aarch64/aarch64-protos.h (aarch64_output_probe_stack_range):
> Declare.
> * config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
> UNSPEC_PROBE_STACK_RANGE.
> (blockage): New instruction.
> (probe_stack_range): Likewise.
> * config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
> function.
> (aarch64_output_probe_stack_range): Likewise.
> (aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
> static builtin stack checking is enabled.
> * config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
> Define.
>
> * config/arm/arm.c (arm_emit_probe_stack_range): Adjust comment.
> (output_probe_stack_range): Rotate the loop and simplify.
> (thumb1_expand_prologue): Tweak sorry message.
> * config/arm/arm.md (probe_stack): Use bare string.
>
>
> 2015-10-06 Eric Botcazou <[email protected]>
>
> * gcc.target/aarch64/stack-checking.c: New test.
>
Unless there really is common code between the two patches, this should
be separated out into two posts, one for ARM and one for AArch64.
More comments inline.
>
> pr65958-2.diff
>
>
> Index: config/aarch64/aarch64-linux.h
> ===================================================================
> --- config/aarch64/aarch64-linux.h (revision 228512)
> +++ config/aarch64/aarch64-linux.h (working copy)
> @@ -88,4 +88,7 @@
> #undef TARGET_BINDS_LOCAL_P
> #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
>
> +/* Define this to be nonzero if static stack checking is supported. */
> +#define STACK_CHECK_STATIC_BUILTIN 1
> +
> #endif /* GCC_AARCH64_LINUX_H */
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h (revision 228512)
> +++ config/aarch64/aarch64-protos.h (working copy)
> @@ -316,6 +316,7 @@ void aarch64_asm_output_labelref (FILE *
> void aarch64_cpu_cpp_builtins (cpp_reader *);
> void aarch64_elf_asm_named_section (const char *, unsigned, tree);
> const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
> +const char * aarch64_output_probe_stack_range (rtx, rtx);
> void aarch64_err_no_fpadvsimd (machine_mode, const char *);
> void aarch64_expand_epilogue (bool);
> void aarch64_expand_mov_immediate (rtx, rtx);
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c (revision 228512)
> +++ config/aarch64/aarch64.c (working copy)
> @@ -76,6 +76,7 @@
> #include "sched-int.h"
> #include "cortex-a57-fma-steering.h"
> #include "target-globals.h"
> +#include "common/common-target.h"
>
> /* This file should be included last. */
> #include "target-def.h"
> @@ -2144,6 +2145,167 @@ aarch64_libgcc_cmp_return_mode (void)
> return SImode;
> }
>
> +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
> +
> +#if (PROBE_INTERVAL % 4096) != 0
> +#error Cannot use indexed address calculation for stack probing
> +#endif
> +
> +#if PROBE_INTERVAL > 4096
> +#error Cannot use indexed addressing mode for stack probing
> +#endif
> +
Hmm, so if PROBE_INTERVAL != 4096 we barf!
While that's safe and probably right for Linux, on some OSes there might
be a minimum page size of 16k or even 64k. It would be nice if we could
support that.
> +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
> + inclusive. These are offsets from the current stack pointer. */
> +
> +static void
> +aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
> +{
> + rtx reg9 = gen_rtx_REG (Pmode, 9);
Ug! Manifest constants should be moved to pre-defines.
PROBE_STACK_BASE_REG?
> +
> + /* The following code uses indexed address calculation on FIRST. */
> + gcc_assert ((first % 4096) == 0);
where's 4096 come from?
> +
> + /* See if we have a constant small number of probes to generate. If so,
> + that's the easy case. */
> + if (size <= PROBE_INTERVAL)
> + {
> + emit_set_insn (reg9,
> + plus_constant (Pmode, stack_pointer_rtx,
> + -(first + PROBE_INTERVAL)));
> + emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - size));
> + }
> +
> + /* The run-time loop is made up of 8 insns in the generic case while the
> + compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.
> */
> + else if (size <= 4 * PROBE_INTERVAL)
> + {
> + HOST_WIDE_INT i, rem;
> +
> + emit_set_insn (reg9,
> + plus_constant (Pmode, stack_pointer_rtx,
> + -(first + PROBE_INTERVAL)));
> + emit_stack_probe (reg9);
> +
> + /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
> + it exceeds SIZE. If only two probes are needed, this will not
> + generate any code. Then probe at FIRST + SIZE. */
> + for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
> + {
> + emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> + emit_stack_probe (reg9);
> + }
> +
> + rem = size - (i - PROBE_INTERVAL);
> + if (rem > 256)
> + {
> + emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> + emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - rem));
> + }
> + else
> + emit_stack_probe (plus_constant (Pmode, reg9, -rem));
> + }
> +
> + /* Otherwise, do the same as above, but in a loop. Note that we must be
> + extra careful with variables wrapping around because we might be at
> + the very top (or the very bottom) of the address space and we have
> + to be able to handle this case properly; in particular, we use an
> + equality test for the loop condition. */
> + else
> + {
> + HOST_WIDE_INT rounded_size;
> + rtx reg10 = gen_rtx_REG (Pmode, 10);
More manifest constants.
> +
> + /* Step 1: round SIZE to the previous multiple of the interval. */
> +
> + rounded_size = size & -PROBE_INTERVAL;
> +
> +
> + /* Step 2: compute initial and final value of the loop counter. */
> +
> + /* TEST_ADDR = SP + FIRST. */
> + emit_set_insn (reg9,
> + plus_constant (Pmode, stack_pointer_rtx, -first));
> +
> + /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
> + emit_set_insn (reg10,
> + plus_constant (Pmode, stack_pointer_rtx,
> + -(first + rounded_size)));
> +
> +
> + /* Step 3: the loop
> +
> + do
> + {
> + TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
> + probe at TEST_ADDR
> + }
> + while (TEST_ADDR != LAST_ADDR)
> +
> + probes at FIRST + N * PROBE_INTERVAL for values of N from 1
> + until it is equal to ROUNDED_SIZE. */
> +
> + emit_insn (gen_probe_stack_range (reg9, reg9, reg10));
> +
> +
> + /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
> + that SIZE is equal to ROUNDED_SIZE. */
> +
> + if (size != rounded_size)
> + {
> + HOST_WIDE_INT rem = size - rounded_size;
> +
> + if (rem > 256)
> + {
> + emit_set_insn (reg10,
> + plus_constant (Pmode, reg10, -PROBE_INTERVAL));
> + emit_stack_probe (plus_constant (Pmode, reg10,
> + PROBE_INTERVAL - rem));
> + }
> + else
> + emit_stack_probe (plus_constant (Pmode, reg10, -rem));
> + }
> + }
> +
> + /* Make sure nothing is scheduled before we are done. */
> + emit_insn (gen_blockage ());
> +}
> +
> +/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
> + absolute addresses. */
> +
> +const char *
> +aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> +{
> + static int labelno = 0;
> + char loop_lab[32];
> + rtx xops[2];
> +
> + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
> +
> + /* Loop. */
> + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
> +
> + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
> + xops[0] = reg1;
> + xops[1] = GEN_INT (PROBE_INTERVAL);
> + output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> + /* Probe at TEST_ADDR. */
> + output_asm_insn ("str\txzr, [%0]", xops);
> +
> + /* Test if TEST_ADDR == LAST_ADDR. */
> + xops[1] = reg2;
> + output_asm_insn ("cmp\t%0, %1", xops);
> +
> + /* Branch. */
> + fputs ("\tb.ne\t", asm_out_file);
> + assemble_name_raw (asm_out_file, loop_lab);
> + fputc ('\n', asm_out_file);
> +
> + return "";
> +}
> +
> static bool
> aarch64_frame_pointer_required (void)
> {
> @@ -2544,6 +2706,18 @@ aarch64_expand_prologue (void)
> if (flag_stack_usage_info)
> current_function_static_stack_size = frame_size;
>
> + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
> + {
> + if (crtl->is_leaf && !cfun->calls_alloca)
> + {
> + if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
> + aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
> + frame_size - STACK_CHECK_PROTECT);
> + }
> + else if (frame_size > 0)
> + aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
> + }
> +
> /* Store pairs and load pairs have a range only -512 to 504. */
> if (offset >= 512)
> {
> Index: config/aarch64/aarch64.md
> ===================================================================
> --- config/aarch64/aarch64.md (revision 228512)
> +++ config/aarch64/aarch64.md (working copy)
> @@ -104,6 +104,7 @@ (define_c_enum "unspec" [
> UNSPEC_MB
> UNSPEC_NOP
> UNSPEC_PRLG_STK
> + UNSPEC_PROBE_STACK_RANGE
> UNSPEC_RBIT
> UNSPEC_SISD_NEG
> UNSPEC_SISD_SSHL
> @@ -134,6 +135,7 @@ (define_c_enum "unspecv" [
> UNSPECV_SET_FPCR ; Represent assign of FPCR content.
> UNSPECV_GET_FPSR ; Represent fetch of FPSR content.
> UNSPECV_SET_FPSR ; Represent assign of FPSR content.
> + UNSPECV_BLOCKAGE ; Represent a blockage
> ]
> )
>
> @@ -4807,6 +4809,27 @@ (define_insn "stack_tie"
> [(set_attr "length" "0")]
> )
>
> +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
> +;; all of memory. This blocks insns from being moved across this point.
> +
> +(define_insn "blockage"
> + [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
> + ""
> + ""
> + [(set_attr "length" "0")
> + (set_attr "type" "block")]
> +)
> +
> +(define_insn "probe_stack_range"
> + [(set (match_operand:DI 0 "register_operand" "=r")
> + (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> + (match_operand:DI 2 "register_operand" "r")]
> + UNSPEC_PROBE_STACK_RANGE))]
> + ""
> +{
> + return aarch64_output_probe_stack_range (operands[0], operands[2]);
> +})
> +
This should be annotated with the sequence length.
> ;; Named pattern for expanding thread pointer reference.
> (define_expand "get_thread_pointerdi"
> [(match_operand:DI 0 "register_operand" "=r")]
> Index: config/arm/arm.c
> ===================================================================
> --- config/arm/arm.c (revision 228512)
> +++ config/arm/arm.c (working copy)
> @@ -21262,11 +21262,12 @@ arm_emit_probe_stack_range (HOST_WIDE_IN
>
> /* Step 3: the loop
>
> - while (TEST_ADDR != LAST_ADDR)
> + do
> {
> TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
> probe at TEST_ADDR
> }
> + while (TEST_ADDR != LAST_ADDR)
>
> probes at FIRST + N * PROBE_INTERVAL for values of N from 1
> until it is equal to ROUNDED_SIZE. */
> @@ -21311,22 +21312,22 @@ output_probe_stack_range (rtx reg1, rtx
>
> ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
>
> + /* Loop. */
> ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
>
> - /* Test if TEST_ADDR == LAST_ADDR. */
> + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
> xops[0] = reg1;
> - xops[1] = reg2;
> - output_asm_insn ("cmp\t%0, %1", xops);
> + xops[1] = GEN_INT (PROBE_INTERVAL);
> + output_asm_insn ("sub\t%0, %0, %1", xops);
>
> - if (TARGET_THUMB2)
> - fputs ("\tittt\tne\n", asm_out_file);
> + /* Probe at TEST_ADDR. */
> + output_asm_insn ("str\tr0, [%0, #0]", xops);
>
> - /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
> - xops[1] = GEN_INT (PROBE_INTERVAL);
> - output_asm_insn ("subne\t%0, %0, %1", xops);
> + /* Test if TEST_ADDR == LAST_ADDR. */
> + xops[1] = reg2;
> + output_asm_insn ("cmp\t%0, %1", xops);
>
> - /* Probe at TEST_ADDR and branch. */
> - output_asm_insn ("strne\tr0, [%0, #0]", xops);
> + /* Branch. */
> fputs ("\tbne\t", asm_out_file);
> assemble_name_raw (asm_out_file, loop_lab);
> fputc ('\n', asm_out_file);
> @@ -24869,7 +24870,7 @@ thumb1_expand_prologue (void)
>
> /* If we have a frame, then do stack checking. FIXME: not implemented. */
> if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size)
> - sorry ("-fstack-check=specific for THUMB1");
> + sorry ("-fstack-check=specific for Thumb-1");
>
> amount = offsets->outgoing_args - offsets->saved_regs;
> amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
> Index: config/arm/arm.md
> ===================================================================
> --- config/arm/arm.md (revision 228512)
> +++ config/arm/arm.md (working copy)
> @@ -8262,9 +8262,7 @@ (define_insn "probe_stack"
> [(set (match_operand 0 "memory_operand" "=m")
> (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
> "TARGET_32BIT"
> -{
> - return "str%?\\tr0, %0";
> -}
> + "str%?\\tr0, %0"
> [(set_attr "type" "store1")
> (set_attr "predicable" "yes")]
> )
>
>
> stack-checking.c
>
>
> /* { dg-do run { target { *-*-linux* } } } */
> /* { dg-options "-fstack-check" } */
>
> int main(void)
> {
> char *p;
> if (1)
> {
> char i[48];
> p = __builtin_alloca(8);
> p[0] = 1;
> }
>
> if (1)
> {
> char i[48], j[64];
> j[48] = 0;
> }
>
> return !p[0];
> }
>
R.