Hi Richard,
Here's the updated patch with all the feedback addressed.
I have also run the compile tests with -mabi=ilp32.
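
To illustrate the rounding change you asked for: the clamp now rounds down
to a value encodable as a 12-bit immediate with shift 0 or 12, rather than
to the nearest one.  A standalone sketch (not part of the patch, names are
local to the sketch) mirroring aarch64_clamp_to_uimm12_shift:

  #include <assert.h>

  typedef long long HWI;   /* stand-in for HOST_WIDE_INT in this sketch */

  /* Keep VAL if it already fits in 12 bits, otherwise round it down to a
     12-bit immediate shifted left by 12.  VAL must fit in 24 bits.  */
  static HWI
  clamp_to_uimm12_shift (HWI val)
  {
    assert ((val & 0xffffff) == val);
    if ((val & 0xfff) == val)
      return val;
    return val & (0xfff << 12);
  }

  int
  main (void)
  {
    assert (clamp_to_uimm12_shift (0xfff) == 0xfff);      /* already encodable */
    assert (clamp_to_uimm12_shift (0x1ff0) == 0x1000);    /* down, not up to 0x2000 */
    assert (clamp_to_uimm12_shift (0x12345) == 0x12000);
    return 0;
  }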
Ok for trunk?
Thanks,
Tamar
On 09/27/2018 12:11, Richard Sandiford wrote:
> > It turns out the testsuite didn't have a case in it which would cause a
> > significant enough spill to enter the loop. After creating one I noticed a
> > bug
> > in the loop and fixed it.
> >
> > The loops are now
> >
> > .cfi_startproc
> > mov x15, sp
> > cntb x16, all, mul #11
> > add x16, x16, 304
> > .cfi_def_cfa_register 15
> > .SVLPSPL0:
> > cmp x16, 61440
> > b.lt .SVLPEND0
> > sub sp, sp, 61440
> > str xzr, [sp, 0]
> > subs x16, x16, 61440
>
> (The code uses sub rather than subs here)
>
> > b .SVLPSPL0
> > .SVLPEND0:
> > sub sp, sp, x16
> > .cfi_escape
> > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
> >
> > for a 64KB guard size.
>
> That's OK with me. Like you say, the main goal was to make the common
> case of no probe as fast as possible.
>
> > diff --git a/gcc/config/aarch64/aarch64-protos.h
> > b/gcc/config/aarch64/aarch64-protos.h
> > index
> > ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4
> > 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
> > void aarch64_cpu_cpp_builtins (cpp_reader *);
> > const char * aarch64_gen_far_branch (rtx *, int, const char *, const char
> > *);
> > const char * aarch64_output_probe_stack_range (rtx, rtx);
> > +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
> > void aarch64_err_no_fpadvsimd (machine_mode);
> > void aarch64_expand_epilogue (bool);
> > void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index
> > d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935
> > 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment
> > (machine_mode mode,
> > static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
> > static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
> > aarch64_addr_query_type);
> > +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
>
> > /* Major revision number of the ARM Architecture implemented by the
> > target. */
> > unsigned aarch64_architecture_version;
> > @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> > return "";
> > }
>
> > +/* Emit the probe loop for doing stack clash probes and stack adjustments
> > for
> > + SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard
> > size
> > + of GUARD_SIZE. When a probe is emitted it is done at MIN_PROBE_OFFSET
> > bytes
> > + from the current BASE at an interval of MIN_PROBE_OFFSET. By the end
> > of this
>
> MIN_PROBE_THRESHOLD in both cases (or rename the var to min_probe_offset,
> either's fine). Probably "at most MIN_PROBE..." given the round down.
>
> > + function BASE = BASE - ADJUSTMENT. */
> > +
> > +const char *
> > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> > + rtx min_probe_threshold, rtx guard_size)
> > +{
> > + /* This function is not allowed to use any instruction generation
> > function
> > + like gen_ and friends. If you do you'll likely ICE during CFG
> > validation,
> > + so instead emit the code you want using output_asm_insn. */
> > + gcc_assert (flag_stack_clash_protection);
> > + gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P
> > (guard_size));
> > + gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> > +
> > + /* The minimum required allocation before the residual requires probing.
> > */
> > + HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> > +
> > + /* Clamp the value down to the nearest value that can be used with a
> > cmp. */
> > + residual_probe_guard = aarch64_uimm12_nearest_value
> > (residual_probe_guard);
>
> Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift
> would be better; nearest implies that "0x1ff0" should become "0x2000"
> rather than "0x1000".
>
> > + /* ADJUSTMENT == RESIDUAL_PROBE_GUARD. */
> > + xops[0] = adjustment;
> > + xops[1] = probe_offset_value_rtx;
> > + output_asm_insn ("cmp\t%0, %1", xops);
>
> < rather than == (or just "Compare ...")
>
> > + /* Branch to end if not enough adjustment to probe. */
> > + fputs ("\tb.lt\t", asm_out_file);
> > + assemble_name_raw (asm_out_file, loop_end_lab);
> > + fputc ('\n', asm_out_file);
> > +
> > + /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
> > + xops[0] = base;
> > + xops[1] = gen_int_mode (residual_probe_guard, Pmode);
>
> probe_offset_value_rtx
>
> > + HOST_WIDE_INT size;
> > + /* Handle the SVE non-constant case first. */
> > + if (!poly_size.is_constant (&size))
> > + {
> > +
>
> Excess blank line.
>
> > + if (dump_file)
> > + {
> > + fprintf (dump_file, "Stack clash SVE prologue: ");
> > + print_dec (poly_size, dump_file);
> > + fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > + }
> > +
> > + /* First calculate the amount of bytes we're actually spilling. */
> > + aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
>
> Might as well use Pmode for the CONST0_RTX too, for consistency with the
> first argument to aarch64_add_offset.
>
> > + poly_size, temp1, temp2, false, true);
> > +
> > + rtx_insn *insn = get_last_insn ();
> > +
> > + if (frame_related_p)
> > + {
> > + /* This is done to provide unwinding information for the stack
> > + adjustments we're about to do, however to prevent the optimizers
> > + from removing the R15 move and leaving the CFA note (which would be
> > + very wrong) we tie the old and new stack pointer together.
> > + The tie will expand to nothing but the optimizers will not touch
> > + the instruction. */
> > + rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> > + emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> > + emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> > +
> > + /* We want the CFA independent of the stack pointer for the
> > + duration of the loop. */
> > + add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> > + RTX_FRAME_RELATED_P (insn) = 1;
> > + }
> > +
> > + rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> > + rtx guard_const = gen_int_mode (guard_size, DImode);
>
> Pmode in both cases. (No practical difference, but it makes everything
> agree on the mode.)
>
> > if (dump_file)
> > - fprintf (dump_file,
> > - "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > - ", probing will be required.\n", size);
> > + {
> > + fprintf (dump_file,
> > + "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > + " bytes, probing will be required.\n", size);
> > + }
>
> Not needed (previous formatting without { ... } was right).
>
> > +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned
> > immediate
> > + that can be created with a left shift of 0 or 12. */
> > +static HOST_WIDE_INT
> > +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> > +{
> > + if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> > + return val;
> > +
> > + return val & (((HOST_WIDE_INT) 0xfff) << 12);
> > +}
>
> Are these HOST_WIDE_INT casts needed?
>
> Probably worth asserting that (val & 0xffffff) == val, or handle
> the case in which it isn't by returning 0xfff000.
>
> > +;; This instruction is used to generate the stack clash stack adjustment
> > and
> > +;; probing loop. We can't change the control flow during prologue and
> > epilogue
> > +;; code generation. So we must emit a volatile unspec and expand it later
> > on.
> > +
> > +(define_insn "probe_sve_stack_clash"
> > + [(set (match_operand:DI 0 "register_operand" "=rk")
> > + (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> > + (match_operand:DI 2 "register_operand" "r")
> > + (match_operand:DI 3 "const_int_operand" "n")
> > + (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> > + UNSPECV_PROBE_STACK_RANGE))]
> > + "TARGET_SVE"
> > +{
> > + return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> > + operands[3], operands[4]);
> > +}
> > + [(set_attr "length" "28")]
> > +)
>
> Think this will break for ILP32. We probably need :P instead of :DI and
>
> "@probe_sve_stack_clash_<mode>"
>
> gen_probe_sve_stack_clash (Pmode, ...)
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param
> > stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +
> > +#include <stdint.h>
> > +
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > + for (int i = 0; i < N; i+=8)
> > + {
> > + dest[i] += src[i * 4];
> > + dest[i+1] += src[i * 4 + 1];
> > + dest[i+2] += src[i * 4 + 2];
> > + dest[i+3] += src[i * 4 + 3];
> > + dest[i+4] += src[i * 4 + 4];
> > + dest[i+5] += src[i * 4 + 5];
> > + dest[i+6] += src[i * 4 + 6];
> > + dest[i+7] += src[i * 4 + 7];
> > + }
> > +}
>
> I think we should use something that has a higher guarantee of
> spilling, since we shouldn't really need to spill for the above.
> See g++.target/aarch64/sve/catch_1.C for one possibility.
>
> > +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_escape
> > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> > +
> > +/* Checks that the CFA notes are correct for every sp adjustment, but we
> > also
> > + need to make sure we can unwind correctly before the frame is set up.
> > So
> > + check that we're emitting r15 with a copy of sp an setting the CFA
> > there. */
>
> Think this comment belongs above the dg-finals -- seems odd to have it at
> the end of the file.
>
> I'll take your word that the cfi_escape is correct, but it looks like
> it matches the full calculation, including the VG multiple. It would
> be better to leave out that part of the encoding, since the number of
> SVE vectors spilled could vary quite easily.
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > @@ -0,0 +1,32 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param
> > stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> > +
> > +
> > +#include <stdint.h>
>
> Excess blank line before include.
>
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > + for (int i = 0; i < N; i+=8)
> > + {
> > + dest[i] += src[i * 4];
> > + dest[i+1] += src[i * 4 + 1];
> > + dest[i+2] += src[i * 4 + 2];
> > + dest[i+3] += src[i * 4 + 3];
> > + dest[i+4] += src[i * 4 + 4];
> > + dest[i+5] += src[i * 4 + 5];
> > + dest[i+6] += src[i * 4 + 6];
> > + dest[i+7] += src[i * 4 + 7];
> > + }
> > +}
> > +
> > +
> > +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> > +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> > +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } }
> > */
> > +
> > +/* SVE spill, requires probing as vector size is unknown at compile time.
> > */
>
> Same comments above forcing spilling and putting the comment before
> the dg-finals.
>
> Thanks,
> Richard
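
For reference, the probing loop shown at the top of the quoted mail is
equivalent to the following C-level sketch (my names, not part of the
patch; 61440 is the clamped probe interval for the 64KB guard size in
that example):

  /* Model of the emitted loop: BASE is the stack pointer, ADJUSTMENT the
     outstanding allocation held in x16, GUARD_PROBE the clamped interval.  */
  static char *
  sve_probe_loop_model (char *base, unsigned long adjustment,
                        unsigned long guard_probe)
  {
    while (adjustment >= guard_probe)   /* cmp x16, 61440 ; b.lt .SVLPEND */
      {
        base -= guard_probe;            /* sub sp, sp, 61440 */
        *(volatile char *) base = 0;    /* str xzr, [sp, 0] -- the probe */
        adjustment -= guard_probe;      /* sub x16, x16, 61440 */
      }                                 /* b .SVLPSPL */
    base -= adjustment;                 /* sub sp, sp, x16 */
    return base;                        /* the new stack pointer */
  }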
--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
void aarch64_cpu_cpp_builtins (cpp_reader *);
const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (bool);
void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4b13d48d852a70848fc7c51fd867e776efb5e55..8c901e9d8c00d392a2df62d9b63ce5b865b48e50 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
@@ -3973,6 +3974,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
return "";
}
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+ SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+ of GUARD_SIZE. When a probe is emitted it is done at most
+ MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+ at most MIN_PROBE_THRESHOLD. By the end of this function
+ BASE = BASE - ADJUSTMENT. */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+ rtx min_probe_threshold, rtx guard_size)
+{
+ /* This function is not allowed to use any instruction generation function
+ like gen_ and friends. If you do you'll likely ICE during CFG validation,
+ so instead emit the code you want using output_asm_insn. */
+ gcc_assert (flag_stack_clash_protection);
+ gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+ gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+ /* The minimum required allocation before the residual requires probing. */
+ HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+ /* Clamp the value down to the nearest value that can be used with a cmp. */
+ residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+ rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+ gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+ gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+ static int labelno = 0;
+ char loop_start_lab[32];
+ char loop_end_lab[32];
+ rtx xops[2];
+
+ ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+ ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+ /* Emit loop start label. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+ /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("cmp\t%0, %1", xops);
+
+ /* Branch to end if not enough adjustment to probe. */
+ fputs ("\tb.lt\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_end_lab);
+ fputc ('\n', asm_out_file);
+
+ /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
+ xops[0] = base;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Probe at BASE. */
+ xops[1] = const0_rtx;
+ output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+ /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Branch to start if still more bytes to allocate. */
+ fputs ("\tb\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_start_lab);
+ fputc ('\n', asm_out_file);
+
+ /* No probe leave. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+ /* BASE = BASE - ADJUSTMENT. */
+ xops[0] = base;
+ xops[1] = adjustment;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+ return "";
+}
+
/* Determine whether a frame chain needs to be generated. */
static bool
aarch64_needs_frame_chain (void)
@@ -4835,21 +4914,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
}
}
- HOST_WIDE_INT size;
/* If SIZE is not large enough to require probing, just adjust the stack and
exit. */
- if (!poly_size.is_constant (&size)
- || known_lt (poly_size, min_probe_threshold)
+ if (known_lt (poly_size, min_probe_threshold)
|| !flag_stack_clash_protection)
{
aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
return;
}
+ HOST_WIDE_INT size;
+ /* Handle the SVE non-constant case first. */
+ if (!poly_size.is_constant (&size))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Stack clash SVE prologue: ");
+ print_dec (poly_size, dump_file);
+ fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+ }
+
+ /* First calculate the amount of bytes we're actually spilling. */
+ aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+ poly_size, temp1, temp2, false, true);
+
+ rtx_insn *insn = get_last_insn ();
+
+ if (frame_related_p)
+ {
+ /* This is done to provide unwinding information for the stack
+ adjustments we're about to do.  However, to prevent the optimizers
+ from removing the R15 move and leaving the CFA note (which would be
+ very wrong), we tie the old and new stack pointers together.
+ The tie will expand to nothing, but the optimizers will not touch
+ the instruction. */
+ rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+ emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+ emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+ /* We want the CFA independent of the stack pointer for the
+ duration of the loop. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+ rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+ insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+ stack_pointer_rtx, temp1,
+ probe_const, guard_const));
+
+ /* Now reset the CFA register if needed. */
+ if (frame_related_p)
+ {
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ gen_int_mode (poly_size, Pmode)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ return;
+ }
+
if (dump_file)
fprintf (dump_file,
- "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
- ", probing will be required.\n", size);
+ "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+ " bytes, probing will be required.\n", size);
/* Round size to the nearest multiple of guard_size, and calculate the
residual as the difference between the original size and the rounded
@@ -5458,6 +5589,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
);
}
+/* Rounds VAL down to the nearest value that will fit as a 12-bit unsigned
+ immediate that can be created with a left shift of 0 or 12. */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+ /* Check to see if the value fits in 24 bits, as that is the maximum we can
+ handle correctly. */
+ gcc_assert ((val & 0xffffff) == val);
+
+ if (((val & 0xfff) << 0) == val)
+ return val;
+
+ return val & (0xfff << 12);
+}
/* Return true if val is an immediate that can be loaded into a
register by a MOVZ instruction. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..22eb026f0631958536ab0c33c4d234d0156dc120 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
[(set_attr "length" "32")]
)
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop. We can't change the control flow during prologue and epilogue
+;; code generation. So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+ [(set (match_operand:P 0 "register_operand" "=rk")
+ (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+ (match_operand:P 2 "register_operand" "r")
+ (match_operand:P 3 "const_int_operand" "n")
+ (match_operand:P 4 "aarch64_plus_immediate" "L")]
+ UNSPECV_PROBE_STACK_RANGE))]
+ "TARGET_SVE"
+{
+ return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+ operands[3], operands[4]);
+}
+ [(set_attr "length" "28")]
+)
+
;; Named pattern for expanding thread pointer reference.
(define_expand "get_thread_pointerdi"
[(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..41579f26ba9156f3e500f090d132ba9cf28364d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+ need to make sure we can unwind correctly before the frame is set up. So
+ check that we're emitting r15 with a copy of sp and setting the CFA there. */
+
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d92ef47a57ddda556c563e36ad8aaf4acdeabc57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+/* Invoke X (P##n) for n in [0, 7]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39]. */
+#define REPEAT40(X) \
+ REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers. */
+void
+f2 (int x[40][100], int *y)
+{
+ /* Try to force some spilling. */
+#define DECLARE(N) int y##N = y[N];
+ REPEAT40 (DECLARE);
+#pragma omp simd
+ for (int i = 0; i < 100; ++i)
+ {
+#define INC(N) x[N][i] += y##N;
+ REPEAT40 (INC);
+ }
+}
+
+/* SVE spill, requires probing as vector size is unknown at compile time. */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
new file mode 100644
index 0000000000000000000000000000000000000000..68a9d5e3d2e74cb331dff0ef3bcd612f8bb0d0f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 64 * 1024
+
+/* Invoke X (P##n) for n in [0, 9]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+ X (P##8) X (P##9)
+
+/* Invoke X (n) for all n in [0, 49]. */
+#define REPEAT50(X) \
+ REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+ /* Try to force some spilling. */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE) \
+ void __attribute__ ((noinline, noclone, simd)) \
+ NAME (TYPE *restrict dest, TYPE *restrict src) \
+ { \
+ REPEAT50 (DECLARE); \
+ volatile char foo[S]; \
+ foo[S-1]=1; \
+ for (int i = 0; i < N; i++) \
+ { \
+ REPEAT50 (INC); \
+ } \
+ }
+
+#define TEST(NAME) \
+ TEST_LOOP (NAME##_i32, int32_t) \
+ TEST_LOOP (NAME##_i64, int64_t) \
+ TEST_LOOP (NAME##_f32, float) \
+ TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing. */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..e764476faccded380102dfbc759be7cf6be88345
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_24.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE) \
+ { \
+ TYPE out[N]; \
+ TYPE in[N * 4]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 4; ++i) \
+ { \
+ in[i] = i * 9 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ NAME (out, in); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i * 7 / 2; \
+ if (out[i] != out[0] + expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (0)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}