I'll leave the AArch64 maintainers to review, but some comments. Tamar Christina <tamar.christ...@arm.com> writes: > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) > return ""; > } > +/* Emit the probe loop for doing stack clash probes and stack adjustments for > + SVE. This emits probes from BASE to BASE + ADJUSTMENT based on a guard > size > + of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated. > By > + the end of this function BASE = BASE + ADJUSTMENT. */ > + > +const char * > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit, > + rtx guard_size) > +{ > + /* This function is not allowed to use any instruction generation function > + like gen_ and friends. If you do you'll likely ICE during CFG > validation, > + so instead emit the code you want using output_asm_insn. */ > + gcc_assert (flag_stack_clash_protection); > + gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size)); > + gcc_assert (aarch64_uimm12_shift (INTVAL (limit))); > + gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size))); > + > + static int labelno = 0; > + char loop_start_lab[32]; > + char loop_res_lab[32]; > + char loop_end_lab[32]; > + rtx xops[2]; > + > + ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno); > + ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno); > + ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++); > + > + /* Emit loop start label. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab); > + > + /* Test if ADJUSTMENT < GUARD_SIZE. */ > + xops[0] = adjustment; > + xops[1] = guard_size; > + output_asm_insn ("cmp\t%0, %1", xops); > + > + /* Branch to residual loop if it is. 
*/ > + fputs ("\tb.lt\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_res_lab); > + fputc ('\n', asm_out_file); > + > + /* BASE = BASE - GUARD_SIZE. */ > + xops[0] = base; > + xops[1] = guard_size; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Probe at BASE + LIMIT. */ > + xops[1] = limit; > + output_asm_insn ("str\txzr, [%0, %1]", xops); > + > + /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE. */ > + xops[0] = adjustment; > + xops[1] = guard_size; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Branch to loop start. */ > + fputs ("\tb\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_start_lab); > + fputc ('\n', asm_out_file); > + > + /* Emit residual check label. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab); > + > + /* BASE = BASE - ADJUSTMENT. */ > + xops[0] = base; > + xops[1] = adjustment; > + output_asm_insn ("sub\t%0, %0, %1", xops); > + > + /* Test if BASE < LIMIT. */ > + xops[1] = limit; > + output_asm_insn ("cmp\t%0, %1", xops);
Think this should be ADJUSTMENT < LIMIT. > + /* Branch to end. */ > + fputs ("\tb.lt\t", asm_out_file); > + assemble_name_raw (asm_out_file, loop_end_lab); > + fputc ('\n', asm_out_file); > + > + /* Probe at BASE + LIMIT. */ > + output_asm_insn ("str\txzr, [%0, %1]", xops); It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT, which could clobber the caller's frame. > + > + /* No probe leave. */ > + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab); > + return ""; With the CFA stuff and constant load, I think this works out as: --------------------------------------------- # 12 insns mov r15, base mov adjustment, N 1: cmp adjustment, guard_size b.lt 2f sub base, base, guard_size str xzr, [base, limit] sub adjustment, adjustment, guard_size b 1b 2: sub base, base, adjustment cmp adjustment, limit b.le 3f str xzr, [base, limit] 3: --------------------------------------------- What do you think about something like: --------------------------------------------- # 10 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, 16 and adjustment, adjustment, ~(guard_size-1) b.hs 1b 2: mov base, r15 --------------------------------------------- or (with different trade-offs): --------------------------------------------- # 11 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f # Might be 0, leading to a double probe and r14, adjustment, guard_size-1 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, r14 mov r14, guard_size b.hs 1b 2: mov base, r15 --------------------------------------------- or (longer, but with a simpler loop): --------------------------------------------- # 12 insns mov adjustment, N sub r15, base, adjustment subs adjustment, adjustment, min_probe_threshold b.lo 2f str xzr, [base, -16]! 
sub adjustment, adjustment, 32 and adjustment, adjustment, ~(guard_size-1) 1: add base, x15, adjustment str xzr, [base, 0] subs adjustment, adjustment, guard_size b.hs 1b 2: mov base, r15 --------------------------------------------- with the CFA based on r15+offset? These loops probe more often than necessary in some cases, but they only need a single branch in the common case that ADJUSTMENT <= MIN_PROBE_THRESHOLD. > @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, > rtx temp2, > } > } > > - HOST_WIDE_INT size; > + /* GCC's initialization analysis is broken so initialize size. */ > + HOST_WIDE_INT size = 0; It's not broken in this case. :-) is_constant only modifies its argument when returning true. In other cases the variable keeps whatever value it had originally. And the code does use "size" when !is_constant, so an explicit initialisation is necessary. > /* If SIZE is not large enough to require probing, just adjust the stack > and > exit. */ > - if (!poly_size.is_constant (&size) > - || known_lt (poly_size, min_probe_threshold) > + if ((poly_size.is_constant (&size) > + && known_lt (poly_size, min_probe_threshold)) > || !flag_stack_clash_protection) No need for the is_constant here, just known_lt is enough. > { > aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); > return; > } > > - if (dump_file) > + if (dump_file && poly_size.is_constant ()) > fprintf (dump_file, > "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes" > ", probing will be required.\n", size); > > + if (dump_file && !poly_size.is_constant ()) > + { > + fprintf (dump_file, "Stack clash SVE prologue: "); > + dump_dec (MSG_NOTE, poly_size); This should be print_dec (poly_size, dump_file); > + fprintf (dump_file, " bytes, dynamic probing will be required.\n"); > + } > + > /* Round size to the nearest multiple of guard_size, and calculate the > residual as the difference between the original size and the rounded > size. 
*/ > @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > > /* We can handle a small number of allocations/probes inline. Otherwise > punt to a loop. */ > - if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size) > + if (poly_size.is_constant () > + && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size) > { > for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size) > { > @@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > } > dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size); > } > - else > + else if (poly_size.is_constant ()) > { > /* Compute the ending address. */ > aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size, > @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx > temp2, > emit_insn (gen_blockage ()); > dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); > } > + else > + { It would probably be better to handle "!poly_size.is_constant ()" after the "!flag_stack_clash_protection" if statement and exit early, so that we don't do calculations based on "size" when "size" has a fairly meaningless value. It would also avoid repeated checks for is_constant. > + rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD); > + rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size); CONST_INTs don't have a recorded mode, so this should either be GEN_INT or (better) gen_int_mode. Thanks, Richard