Hi Richard,
Here's the updated patch with all the feedback addressed.
I have also run the compile tests with -mabi=ilp32.
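
To illustrate the rounding change you asked for: the clamp now rounds down
to a value encodable as a 12-bit immediate with shift 0 or 12, rather than
to the nearest one.  A standalone sketch (not part of the patch, names are
local to the sketch) mirroring aarch64_clamp_to_uimm12_shift:

  #include <assert.h>

  typedef long long HWI;   /* stand-in for HOST_WIDE_INT in this sketch */

  /* Keep VAL if it already fits in 12 bits, otherwise round it down to a
     12-bit immediate shifted left by 12.  VAL must fit in 24 bits.  */
  static HWI
  clamp_to_uimm12_shift (HWI val)
  {
    assert ((val & 0xffffff) == val);
    if ((val & 0xfff) == val)
      return val;
    return val & (0xfff << 12);
  }

  int
  main (void)
  {
    assert (clamp_to_uimm12_shift (0xfff) == 0xfff);      /* already encodable */
    assert (clamp_to_uimm12_shift (0x1ff0) == 0x1000);    /* down, not up to 0x2000 */
    assert (clamp_to_uimm12_shift (0x12345) == 0x12000);
    return 0;
  }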
Ok for trunk?
Thanks,
Tamar
On 09/27/2018 12:11, Richard Sandiford wrote:
> > It turns out the testsuite didn't have a case in it which would cause a
> > significant enough spill to enter the loop. After creating one I noticed a
> > bug
> > in the loop and fixed it.
> >
> > The loops are now
> >
> > .cfi_startproc
> > mov x15, sp
> > cntb x16, all, mul #11
> > add x16, x16, 304
> > .cfi_def_cfa_register 15
> > .SVLPSPL0:
> > cmp x16, 61440
> > b.lt .SVLPEND0
> > sub sp, sp, 61440
> > str xzr, [sp, 0]
> > subs x16, x16, 61440
>
> (The code uses sub rather than subs here)
>
> > b .SVLPSPL0
> > .SVLPEND0:
> > sub sp, sp, x16
> > .cfi_escape
> > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
> >
> > for a 64KB guard size.
>
> That's OK with me. Like you say, the main goal was to make the common
> case of no probe as fast as possible.
>
> > diff --git a/gcc/config/aarch64/aarch64-protos.h
> > b/gcc/config/aarch64/aarch64-protos.h
> > index
> > ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4
> > 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
> > void aarch64_cpu_cpp_builtins (cpp_reader *);
> > const char * aarch64_gen_far_branch (rtx *, int, const char *, const char
> > *);
> > const char * aarch64_output_probe_stack_range (rtx, rtx);
> > +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
> > void aarch64_err_no_fpadvsimd (machine_mode);
> > void aarch64_expand_epilogue (bool);
> > void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index
> > d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935
> > 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment
> > (machine_mode mode,
> > static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
> > static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
> > aarch64_addr_query_type);
> > +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
>
> > /* Major revision number of the ARM Architecture implemented by the
> > target. */
> > unsigned aarch64_architecture_version;
> > @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> > return "";
> > }
>
> > +/* Emit the probe loop for doing stack clash probes and stack adjustments
> > for
> > + SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard
> > size
> > + of GUARD_SIZE. When a probe is emitted it is done at MIN_PROBE_OFFSET
> > bytes
> > + from the current BASE at an interval of MIN_PROBE_OFFSET. By the end
> > of this
>
> MIN_PROBE_THRESHOLD in both cases (or rename the var to min_probe_offset,
> either's fine). Probably "at most MIN_PROBE..." given the round down.
>
> > + function BASE = BASE - ADJUSTMENT. */
> > +
> > +const char *
> > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> > + rtx min_probe_threshold, rtx guard_size)
> > +{
> > + /* This function is not allowed to use any instruction generation
> > function
> > + like gen_ and friends. If you do you'll likely ICE during CFG
> > validation,
> > + so instead emit the code you want using output_asm_insn. */
> > + gcc_assert (flag_stack_clash_protection);
> > + gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P
> > (guard_size));
> > + gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> > +
> > + /* The minimum required allocation before the residual requires probing.
> > */
> > + HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> > +
> > + /* Clamp the value down to the nearest value that can be used with a
> > cmp. */
> > + residual_probe_guard = aarch64_uimm12_nearest_value
> > (residual_probe_guard);
>
> Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift
> would be better; nearest implies that "0x1ff0" should become "0x2000"
> rather than "0x1000".
>
> > + /* ADJUSTMENT == RESIDUAL_PROBE_GUARD. */
> > + xops[0] = adjustment;
> > + xops[1] = probe_offset_value_rtx;
> > + output_asm_insn ("cmp\t%0, %1", xops);
>
> < rather than == (or just "Compare ...")
>
> > + /* Branch to end if not enough adjustment to probe. */
> > + fputs ("\tb.lt\t", asm_out_file);
> > + assemble_name_raw (asm_out_file, loop_end_lab);
> > + fputc ('\n', asm_out_file);
> > +
> > + /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
> > + xops[0] = base;
> > + xops[1] = gen_int_mode (residual_probe_guard, Pmode);
>
> probe_offset_value_rtx
>
> > + HOST_WIDE_INT size;
> > + /* Handle the SVE non-constant case first. */
> > + if (!poly_size.is_constant (&size))
> > + {
> > +
>
> Excess blank line.
>
> > + if (dump_file)
> > + {
> > + fprintf (dump_file, "Stack clash SVE prologue: ");
> > + print_dec (poly_size, dump_file);
> > + fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> > + }
> > +
> > + /* First calculate the amount of bytes we're actually spilling. */
> > + aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
>
> Might as well use Pmode for the CONST0_RTX too, for consistency with the
> first argument to aarch64_add_offset.
>
> > + poly_size, temp1, temp2, false, true);
> > +
> > + rtx_insn *insn = get_last_insn ();
> > +
> > + if (frame_related_p)
> > + {
> > + /* This is done to provide unwinding information for the stack
> > + adjustments we're about to do, however to prevent the optimizers
> > + from removing the R15 move and leaving the CFA note (which would be
> > + very wrong) we tie the old and new stack pointer together.
> > + The tie will expand to nothing but the optimizers will not touch
> > + the instruction. */
> > + rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> > + emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> > + emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> > +
> > + /* We want the CFA independent of the stack pointer for the
> > + duration of the loop. */
> > + add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> > + RTX_FRAME_RELATED_P (insn) = 1;
> > + }
> > +
> > + rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> > + rtx guard_const = gen_int_mode (guard_size, DImode);
>
> Pmode in both cases. (No practical difference, but it makes everything
> agree on the mode.)
>
> > if (dump_file)
> > - fprintf (dump_file,
> > - "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> > - ", probing will be required.\n", size);
> > + {
> > + fprintf (dump_file,
> > + "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> > + " bytes, probing will be required.\n", size);
> > + }
>
> Not needed (previous formatting without { ... } was right).
>
> > +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned
> > immediate
> > + that can be created with a left shift of 0 or 12. */
> > +static HOST_WIDE_INT
> > +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> > +{
> > + if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> > + return val;
> > +
> > + return val & (((HOST_WIDE_INT) 0xfff) << 12);
> > +}
>
> Are these HOST_WIDE_INT casts needed?
>
> Probably worth asserting that (val & 0xffffff) == val, or handle
> the case in which it isn't by returning 0xfff000.
>
> > +;; This instruction is used to generate the stack clash stack adjustment
> > and
> > +;; probing loop. We can't change the control flow during prologue and
> > epilogue
> > +;; code generation. So we must emit a volatile unspec and expand it later
> > on.
> > +
> > +(define_insn "probe_sve_stack_clash"
> > + [(set (match_operand:DI 0 "register_operand" "=rk")
> > + (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> > + (match_operand:DI 2 "register_operand" "r")
> > + (match_operand:DI 3 "const_int_operand" "n")
> > + (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> > + UNSPECV_PROBE_STACK_RANGE))]
> > + "TARGET_SVE"
> > +{
> > + return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> > + operands[3], operands[4]);
> > +}
> > + [(set_attr "length" "28")]
> > +)
>
> Think this will break for ILP32. We probably need :P instead of :DI and
>
> "@probe_sve_stack_clash_<mode>"
>
> gen_probe_sve_stack_clash (Pmode, ...)
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param
> > stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +
> > +#include <stdint.h>
> > +
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > + for (int i = 0; i < N; i+=8)
> > + {
> > + dest[i] += src[i * 4];
> > + dest[i+1] += src[i * 4 + 1];
> > + dest[i+2] += src[i * 4 + 2];
> > + dest[i+3] += src[i * 4 + 3];
> > + dest[i+4] += src[i * 4 + 4];
> > + dest[i+5] += src[i * 4 + 5];
> > + dest[i+6] += src[i * 4 + 6];
> > + dest[i+7] += src[i * 4 + 7];
> > + }
> > +}
>
> I think we should use something that has a higher guarantee of
> spilling, since we shouldn't really need to spill for the above.
> See g++.target/aarch64/sve/catch_1.C for one possibility.
>
> > +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> > +/* { dg-final { scan-assembler-times {\.cfi_escape
> > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> > +
> > +/* Checks that the CFA notes are correct for every sp adjustment, but we
> > also
> > + need to make sure we can unwind correctly before the frame is set up.
> > So
> > + check that we're emitting r15 with a copy of sp an setting the CFA
> > there. */
>
> Think this comment belongs above the dg-finals -- seems odd to have it at
> the end of the file.
>
> I'll take your word that the cfi_escape is correct, but it looks like
> it matches the full calculation, including the VG multiple. It would
> be better to leave out that part of the encoding, since the number of
> SVE vectors spilled could vary quite easily.
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> > @@ -0,0 +1,32 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target supports_stack_clash_protection } */
> > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param
> > stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> > +
> > +
> > +#include <stdint.h>
>
> Excess blank line before include.
>
> > +#define N 20040
> > +
> > +void __attribute__ ((noinline, noclone))
> > +test (int8_t *restrict dest, int8_t *restrict src)
> > +{
> > + for (int i = 0; i < N; i+=8)
> > + {
> > + dest[i] += src[i * 4];
> > + dest[i+1] += src[i * 4 + 1];
> > + dest[i+2] += src[i * 4 + 2];
> > + dest[i+3] += src[i * 4 + 3];
> > + dest[i+4] += src[i * 4 + 4];
> > + dest[i+5] += src[i * 4 + 5];
> > + dest[i+6] += src[i * 4 + 6];
> > + dest[i+7] += src[i * 4 + 7];
> > + }
> > +}
> > +
> > +
> > +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> > +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> > +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } }
> > */
> > +
> > +/* SVE spill, requires probing as vector size is unknown at compile time.
> > */
>
> Same comments above forcing spilling and putting the comment before
> the dg-finals.
>
> Thanks,
> Richard
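
For reference, the probing loop shown at the top of the quoted mail is
equivalent to the following C-level sketch (my names, not part of the
patch; 61440 is the clamped probe interval for the 64KB guard size in
that example):

  /* Model of the emitted loop: BASE is the stack pointer, ADJUSTMENT the
     outstanding allocation held in x16, GUARD_PROBE the clamped interval.  */
  static char *
  sve_probe_loop_model (char *base, unsigned long adjustment,
                        unsigned long guard_probe)
  {
    while (adjustment >= guard_probe)   /* cmp x16, 61440 ; b.lt .SVLPEND */
      {
        base -= guard_probe;            /* sub sp, sp, 61440 */
        *(volatile char *) base = 0;    /* str xzr, [sp, 0] -- the probe */
        adjustment -= guard_probe;      /* sub x16, x16, 61440 */
      }                                 /* b .SVLPSPL */
    base -= adjustment;                 /* sub sp, sp, x16 */
    return base;                        /* the new stack pointer */
  }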
--
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
void aarch64_cpu_cpp_builtins (cpp_reader *);
const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (bool);
void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d4b13d48d852a70848fc7c51fd867e776efb5e55..8c901e9d8c00d392a2df62d9b63ce5b865b48e50 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
+static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
@@ -3973,6 +3974,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
return "";
}
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+ SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+ of GUARD_SIZE. When a probe is emitted it is done at most
+ MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
+ at most MIN_PROBE_THRESHOLD. By the end of this function
+ BASE = BASE - ADJUSTMENT. */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
+ rtx min_probe_threshold, rtx guard_size)
+{
+ /* This function is not allowed to use any instruction generation function
+ like gen_ and friends. If you do you'll likely ICE during CFG validation,
+ so instead emit the code you want using output_asm_insn. */
+ gcc_assert (flag_stack_clash_protection);
+ gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
+ gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
+
+ /* The minimum required allocation before the residual requires probing. */
+ HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
+
+ /* Clamp the value down to the nearest value that can be used with a cmp. */
+ residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
+ rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
+
+ gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
+ gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
+
+ static int labelno = 0;
+ char loop_start_lab[32];
+ char loop_end_lab[32];
+ rtx xops[2];
+
+ ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
+ ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
+
+ /* Emit loop start label. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+ /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("cmp\t%0, %1", xops);
+
+ /* Branch to end if not enough adjustment to probe. */
+ fputs ("\tb.lt\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_end_lab);
+ fputc ('\n', asm_out_file);
+
+ /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
+ xops[0] = base;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Probe at BASE. */
+ xops[1] = const0_rtx;
+ output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+ /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
+ xops[0] = adjustment;
+ xops[1] = probe_offset_value_rtx;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+
+ /* Branch to start if still more bytes to allocate. */
+ fputs ("\tb\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_start_lab);
+ fputc ('\n', asm_out_file);
+
+ /* No probe leave. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+
+ /* BASE = BASE - ADJUSTMENT. */
+ xops[0] = base;
+ xops[1] = adjustment;
+ output_asm_insn ("sub\t%0, %0, %1", xops);
+ return "";
+}
+
/* Determine whether a frame chain needs to be generated. */
static bool
aarch64_needs_frame_chain (void)
@@ -4835,21 +4914,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
}
}
- HOST_WIDE_INT size;
/* If SIZE is not large enough to require probing, just adjust the stack and
exit. */
- if (!poly_size.is_constant (&size)
- || known_lt (poly_size, min_probe_threshold)
+ if (known_lt (poly_size, min_probe_threshold)
|| !flag_stack_clash_protection)
{
aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
return;
}
+ HOST_WIDE_INT size;
+ /* Handle the SVE non-constant case first. */
+ if (!poly_size.is_constant (&size))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Stack clash SVE prologue: ");
+ print_dec (poly_size, dump_file);
+ fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+ }
+
+ /* First calculate the amount of bytes we're actually spilling. */
+ aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+ poly_size, temp1, temp2, false, true);
+
+ rtx_insn *insn = get_last_insn ();
+
+ if (frame_related_p)
+ {
+ /* This is done to provide unwinding information for the stack
+ adjustments we're about to do.  However, to prevent the optimizers
+ from removing the R15 move and leaving the CFA note (which would be
+ very wrong), we tie the old and new stack pointers together.
+ The tie will expand to nothing, but the optimizers will not touch
+ the instruction. */
+ rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+ emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+ emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+ /* We want the CFA independent of the stack pointer for the
+ duration of the loop. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
+ rtx guard_const = gen_int_mode (guard_size, Pmode);
+
+ insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
+ stack_pointer_rtx, temp1,
+ probe_const, guard_const));
+
+ /* Now reset the CFA register if needed. */
+ if (frame_related_p)
+ {
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ gen_int_mode (poly_size, Pmode)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ return;
+ }
+
if (dump_file)
fprintf (dump_file,
- "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
- ", probing will be required.\n", size);
+ "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
+ " bytes, probing will be required.\n", size);
/* Round size to the nearest multiple of guard_size, and calculate the
residual as the difference between the original size and the rounded
@@ -5458,6 +5589,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
);
}
+/* Rounds VAL down to the nearest value that will fit as a 12-bit unsigned
+ immediate that can be created with a left shift of 0 or 12. */
+static HOST_WIDE_INT
+aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
+{
+ /* Check to see if the value fits in 24 bits, as that is the maximum we can
+ handle correctly. */
+ gcc_assert ((val & 0xffffff) == val);
+
+ if (((val & 0xfff) << 0) == val)
+ return val;
+
+ return val & (0xfff << 12);
+}
/* Return true if val is an immediate that can be loaded into a
register by a MOVZ instruction. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..22eb026f0631958536ab0c33c4d234d0156dc120 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
[(set_attr "length" "32")]
)
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop. We can't change the control flow during prologue and epilogue
+;; code generation. So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+ [(set (match_operand:P 0 "register_operand" "=rk")
+ (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+ (match_operand:P 2 "register_operand" "r")
+ (match_operand:P 3 "const_int_operand" "n")
+ (match_operand:P 4 "aarch64_plus_immediate" "L")]
+ UNSPECV_PROBE_STACK_RANGE))]
+ "TARGET_SVE"
+{
+ return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+ operands[3], operands[4]);
+}
+ [(set_attr "length" "28")]
+)
+
;; Named pattern for expanding thread pointer reference.
(define_expand "get_thread_pointerdi"
[(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..41579f26ba9156f3e500f090d132ba9cf28364d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+ need to make sure we can unwind correctly before the frame is set up. So
+ check that we're emitting r15 with a copy of sp and setting the CFA there. */
+
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d92ef47a57ddda556c563e36ad8aaf4acdeabc57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+/* Invoke X (P##n) for n in [0, 7]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39]. */
+#define REPEAT40(X) \
+ REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers. */
+void
+f2 (int x[40][100], int *y)
+{
+ /* Try to force some spilling. */
+#define DECLARE(N) int y##N = y[N];
+ REPEAT40 (DECLARE);
+#pragma omp simd
+ for (int i = 0; i < 100; ++i)
+ {
+#define INC(N) x[N][i] += y##N;
+ REPEAT40 (INC);
+ }
+}
+
+/* SVE spill, requires probing as vector size is unknown at compile time. */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
new file mode 100644
index 0000000000000000000000000000000000000000..68a9d5e3d2e74cb331dff0ef3bcd612f8bb0d0f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 64 * 1024
+
+/* Invoke X (P##n) for n in [0, 9]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+ X (P##8) X (P##9)
+
+/* Invoke X (n) for all n in [0, 49]. */
+#define REPEAT50(X) \
+ REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+ /* Try to force some spilling. */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE) \
+ void __attribute__ ((noinline, noclone, simd)) \
+ NAME (TYPE *restrict dest, TYPE *restrict src) \
+ { \
+ REPEAT50 (DECLARE); \
+ volatile char foo[S]; \
+ foo[S-1]=1; \
+ for (int i = 0; i < N; i++) \
+ { \
+ REPEAT50 (INC); \
+ } \
+ }
+
+#define TEST(NAME) \
+ TEST_LOOP (NAME##_i32, int32_t) \
+ TEST_LOOP (NAME##_i64, int64_t) \
+ TEST_LOOP (NAME##_f32, float) \
+ TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing. */
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
+/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..e764476faccded380102dfbc759be7cf6be88345
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
+
+#include "struct_vect_24.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE) \
+ { \
+ TYPE out[N]; \
+ TYPE in[N * 4]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ out[i] = i * 7 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ for (int i = 0; i < N * 4; ++i) \
+ { \
+ in[i] = i * 9 / 2; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ NAME (out, in); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i * 7 / 2; \
+ if (out[i] != out[0] + expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (0)))
+main (void)
+{
+ TEST (test);
+ return 0;
+}