I'll leave the AArch64 maintainers to review, but some comments. Tamar Christina <tamar.christ...@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) > return ""; > } > +/* Emit the probe loop for doing stack clash probes and stack adjustments for > + SVE. This emits probes from BASE to BASE + ADJUSTMENT based on a guard > size > + of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated. > By > + the end of this function BASE = BASE + ADJUSTMENT. */ > + > +const char * > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit, > + rtx guard_size) > +{ > + /* This function is not allowed to use any instruction generation function > + like gen_ and friends. If you do you'll likely ICE during CFG > validation, > + so instead emit the code you want using output_asm_insn. */ > + gcc_assert (flag_stack_clash_protection); > + gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size)); > + gcc_assert (aarch64_uimm12_shift (INTVAL (limit))); > + gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size))); > + > + static int labelno = 0; > + char loop_start_lab[32]; > + char loop_res_lab[32]; > + char loop_end_lab[32]; > + rtx xops[2]; > + > + ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno); > + ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno); > + ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++); > + > + /* Emit loop start label. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab); > + > + /* Test if ADJUSTMENT < GUARD_SIZE. */ > + xops[0] = adjustment; > + xops[1] = guard_size; > + output_asm_insn ("cmp\t%0, %1", xops); > + > + /* Branch to residual loop if it is. 
*/ > + fputs ("\tb.lt\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_res_lab); > + fputc ('\n', asm_out_file); > + > + /* BASE = BASE - GUARD_SIZE. */ > + xops[0] = base; > + xops[1] = guard_size; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Probe at BASE + LIMIT. */ > + xops[1] = limit; > + output_asm_insn ("str\txzr, [%0, %1]", xops); > + > + /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE. */ > + xops[0] = adjustment; > + xops[1] = guard_size; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Branch to loop start. */ > + fputs ("\tb\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_start_lab); > + fputc ('\n', asm_out_file); > + > + /* Emit residual check label. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab); > + > + /* BASE = BASE - ADJUSTMENT. */ > + xops[0] = base; > + xops[1] = adjustment; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Test if BASE < LIMIT. */ > + xops[1] = limit; > + output_asm_insn ("cmp\t%0, %1", xops);
Think this should be ADJUSTMENT < LIMIT. > + /* Branch to end. */ > + fputs ("\tb.lt\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_end_lab); > + fputc ('\n', asm_out_file); > + > + /* Probe at BASE + LIMIT. */ > + output_asm_insn ("str\txzr, [%0, %1]", xops); It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT, which could clobber the caller's frame. > + > + /* No probe leave. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab); > + return ""; With the CFA stuff and constant load, I think this works out as: --------------------------------------------- # 12 insns mov r15, base mov adjustment, N 1: cmp adjustment, guard_size b.lt 2f sub base, base, guard_size str xzr, [base, limit] sub adjustment, adjustment, guard_size b 1b 2: sub base, base, adjustment cmp adjustment, limit b.le 3f str xzr, [base, limit] 3: --------------------------------------------- What do you think about something like: --------------------------------------------- # 10 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, 16 and adjustment, adjustment, ~(guard_size-1) b.hs 1b 2: mov base, r15 --------------------------------------------- or (with different trade-offs): --------------------------------------------- # 11 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f # Might be 0, leading to a double probe and r14, adjustment, guard_size-1 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, r14 mov r14, guard_size b.hs 1b 2: mov base, r15 --------------------------------------------- or (longer, but with a simpler loop): --------------------------------------------- # 12 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f str xzr, [base, -16]! 
sub adjustment, adjustment, 32 and adjustment, adjustment, ~(guard_size-1) 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, guard_size b.hs 1b 2: mov base, r15 --------------------------------------------- with the CFA based on r15+offset? These loops probe more often than necessary in some cases, but they only need a single branch in the common case that ADJUSTMENT <= MIN_PROBE_THRESHOLD. > @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > } > } > > - HOST_WIDE_INT size; > + /* GCC's initialization analysis is broken so initialize size. */ > + HOST_WIDE_INT size = 0; It's not broken in this case. :-) is_constant only modifies its argument when returning true. In other cases the variable keeps whatever value it had originally. And the code does use "size" when !is_constant, so an explicit initialisation is necessary. > /* If SIZE is not large enough to require probing, just adjust the stack > and > exit. */ > - if (!poly_size.is_constant (&size) > - || known_lt (poly_size, min_probe_threshold) > + if ((poly_size.is_constant (&size) > + && known_lt (poly_size, min_probe_threshold)) > || !flag_stack_clash_protection) No need for the is_constant here, just known_lt is enough. > { > aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); > return; > } > > - if (dump_file) > + if (dump_file && poly_size.is_constant ()) > fprintf (dump_file, > "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes" > ", probing will be required.\n", size); > > + if (dump_file && !poly_size.is_constant ()) > + { > + fprintf (dump_file, "Stack clash SVE prologue: "); > + dump_dec (MSG_NOTE, poly_size); This should be print_dec (poly_size, dump_file); > + fprintf (dump_file, " bytes, dynamic probing will be required.\n"); > + } > + > /* Round size to the nearest multiple of guard_size, and calculate the > residual as the difference between the original size and the rounded > size. 
*/ > @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > > /* We can handle a small number of allocations/probes inline. Otherwise > punt to a loop. */ > - if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size) > + if (poly_size.is_constant () > + && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size) > { > for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size) > { > @@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > } > dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size); > } > - else > + else if (poly_size.is_constant ()) > { > /* Compute the ending address. */ > aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size, > @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > emit_insn (gen_blockage ()); > dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); > } > + else > + { It would probably be better to handle "!poly_size.is_constant ()" after the "!flag_stack_clash_protection" if statement and exit early, so that we don't do calculations based on "size" when "size" has a fairly meaningless value. It would also avoid repeated checks for is_constant. > + rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD); > + rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size); CONST_INTs don't have a recorded mode, so this should either be GEN_INT or (better) gen_int_mode. Thanks, Richard