This patch adds s390 support for stack-clash mitigation. The most interesting property of s390 here is that the caller allocates the save area into which the callee stores its registers.
So, much like aarch64, we start with a very conservative assumption about the offset between SP and the most recent stack probe. As we encounter those register saves we may be able to decrease that offset; and, as on aarch64, the offset increases as we allocate space. If the offset crosses PROBE_INTERVAL, we must emit probes.

Because the register saves hit the caller's frame, s390 in some ways generates code more like x86/ppc, though if there aren't any register saves the resulting code looks more like aarch64.

For large frames I did not implement an allocate/probe loop; someone with a better understanding of the architecture is better suited for that work. I'll note that you're going to need another scratch register :-)  This is the cause of the xfail of one test which expects to see a prologue allocate/probe loop.

s390 has a -mbackchain option. I'm not sure where it's used, but we do try to handle it in the initial offset computation. However, we don't handle it in the actual allocations that occur when -fstack-check=clash is enabled.

s390 does not have a -fstack-check=specific implementation, and I have not tried to add one. I have, however, defined STACK_CHECK_STATIC_BUILTIN; I haven't investigated what side effects that might have.

Other than the xfail noted above, s390 uses the same tests as the x86, ppc and aarch64 ports.

I suspect we're going to need further iteration here.

Thoughts/Comments?

Jeff
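P.S. To make the bookkeeping above concrete, here is a tiny standalone C model. It is not GCC code and not part of the patch; the 4 kB probe interval, the save offsets and the frame size are made-up example values. It just mirrors the idea: start from the worst-case distance between SP and the last known probe, tighten it for each register save into the caller-allocated save area, then decide at allocation time whether explicit probes are needed and how large the residual allocation is.

#include <stdio.h>

/* Example probe interval (4 kB); GCC derives the real one from
   STACK_CHECK_PROBE_INTERVAL_EXP.  */
#define PROBE_INTERVAL (1 << 12)
#define WORD_SIZE 8   /* stand-in for the word size on s390x */

int
main (void)
{
  /* Worst-case assumption on entry: the nearest known probe may be
     almost a full PROBE_INTERVAL above SP.  */
  long last_probe_offset = PROBE_INTERVAL - WORD_SIZE;

  /* Each register save into the caller's frame is an implicit probe at a
     known offset from SP, so it can only tighten the bound.  The offsets
     here are made up.  */
  long save_offsets[] = { 48, 56, 64 };
  for (unsigned int i = 0; i < sizeof save_offsets / sizeof *save_offsets; i++)
    if (save_offsets[i] < last_probe_offset)
      last_probe_offset = save_offsets[i];

  /* Allocating the frame moves SP away from the last probe.  If the
     distance would reach PROBE_INTERVAL, explicit probes are required.  */
  long frame_size = 5000;   /* made-up frame size */
  if (last_probe_offset + frame_size < PROBE_INTERVAL)
    printf ("small frame: no explicit probes needed\n");
  else
    {
      long rounded = frame_size & -PROBE_INTERVAL;
      long residual = frame_size - rounded;
      printf ("emit %ld inline probe(s), then a %ld byte residual allocation\n",
              rounded / PROBE_INTERVAL, residual);
    }
  return 0;
}

With the made-up numbers above this prints "emit 1 inline probe(s), then a 904 byte residual allocation", which is the same decision allocate_stack_space makes in the patch: round the allocation down to a multiple of PROBE_INTERVAL, probe each interval as it is allocated, then handle the residual (with one more probe if the residual plus the last probe offset reaches PROBE_INTERVAL).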
	* config/s390/s390.c (PROBE_INTERVAL): Define.
	(allocate_stack_space): New function, partially extracted from
	s390_emit_prologue.
	(s390_emit_prologue): Track offset to most recent stack probe.
	Code to allocate space moved into allocate_stack_space.
	Dump actions when no stack is allocated.
	* config/s390/s390.h (STACK_CHECK_STATIC_BUILTIN): Define.

testsuite/

	* gcc.dg/stack-check-6.c: xfail for s390*-*-*.

commit 56523059d48f55991e7607dbde248f2aabe3e7e3
Author: Jeff Law <l...@devel1.s390.bos.redhat.com>
Date:   Fri Jul 7 17:25:35 2017 +0000

    S390 implementation

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 958ee3b..cddb393 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -10999,6 +10999,107 @@ pass_s390_early_mach::execute (function *fun)
 
 } // anon namespace
 
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+/* Allocate SIZE bytes of stack space, using TEMP_REG as a temporary
+   if necessary.  LAST_PROBE_OFFSET contains the offset of the closest
+   probe relative to the stack pointer.
+
+   Note that SIZE is negative.
+
+   TEMP_REG_IS_LIVE indicates that TEMP_REG actually holds a live
+   value and must be restored if we clobber it.  */
+static void
+allocate_stack_space (rtx size, HOST_WIDE_INT last_probe_offset,
+		      rtx temp_reg, bool temp_reg_is_live)
+{
+  rtx insn;
+
+  /* If we are emitting stack probes and a SIZE allocation would cross
+     the PROBE_INTERVAL boundary, then we need significantly different
+     sequences to allocate and probe the stack.  */
+  if (flag_stack_check == STACK_CLASH_BUILTIN_STACK_CHECK
+      && last_probe_offset + -INTVAL (size) < PROBE_INTERVAL)
+    dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+  else if (flag_stack_check == STACK_CLASH_BUILTIN_STACK_CHECK
+	   && last_probe_offset + -INTVAL (size) >= PROBE_INTERVAL)
+    {
+      rtx memref;
+
+      HOST_WIDE_INT rounded_size = -INTVAL (size) & -PROBE_INTERVAL;
+
+      emit_move_insn (temp_reg, GEN_INT (PROBE_INTERVAL - 8));
+
+      /* We really should have a runtime loop version as well.  */
+      for (unsigned int i = 0; i < rounded_size; i += PROBE_INTERVAL)
+	{
+	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+					   GEN_INT (-PROBE_INTERVAL)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+
+	  /* We just allocated PROBE_INTERVAL bytes of stack space.  Thus,
+	     a probe is mandatory here, but LAST_PROBE_OFFSET does not
+	     change.  */
+	  memref = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, temp_reg,
+						     stack_pointer_rtx));
+	  MEM_VOLATILE_P (memref) = 1;
+	  emit_move_insn (memref, temp_reg);
+	}
+
+      /* Handle any residual allocation request.  */
+      HOST_WIDE_INT residual = -INTVAL (size) - rounded_size;
+      insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+				       GEN_INT (-residual)));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      last_probe_offset += residual;
+      if (last_probe_offset >= PROBE_INTERVAL)
+	{
+	  emit_move_insn (temp_reg, GEN_INT (residual
+					     - GET_MODE_SIZE (word_mode)));
+	  memref = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, temp_reg,
+						     stack_pointer_rtx));
+	  MEM_VOLATILE_P (memref) = 1;
+	  emit_move_insn (memref, temp_reg);
+	}
+
+      /* We clobbered TEMP_REG, but it really isn't a temporary at this point;
+	 restore its value.  */
+      if (temp_reg_is_live)
+	{
+	  emit_move_insn (temp_reg, GEN_INT (-INTVAL (size)));
+	  emit_insn (gen_add2_insn (temp_reg, stack_pointer_rtx));
+	}
+
+      dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
+      emit_insn (gen_blockage ());
+      return;
+    }
+
+  /* Subtract frame size from stack pointer.  */
+
+  if (DISP_IN_RANGE (INTVAL (size)))
+    {
+      insn = gen_rtx_SET (stack_pointer_rtx,
+			  gen_rtx_PLUS (Pmode, stack_pointer_rtx, size));
+      insn = emit_insn (insn);
+    }
+  else
+    {
+      if (!CONST_OK_FOR_K (INTVAL (size)))
+	size = force_const_mem (Pmode, size);
+
+      insn = emit_insn (gen_add2_insn (stack_pointer_rtx, size));
+      annotate_constant_pool_refs (&PATTERN (insn));
+    }
+
+  RTX_FRAME_RELATED_P (insn) = 1;
+  rtx real_frame_off = GEN_INT (-cfun_frame_layout.frame_size);
+  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+		gen_rtx_SET (stack_pointer_rtx,
+			     gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+					   real_frame_off)));
+}
+
 /* Expand the prologue into a bunch of separate insns.  */
 
 void
@@ -11023,6 +11124,16 @@ s390_emit_prologue (void)
   else
     temp_reg = gen_rtx_REG (Pmode, 1);
 
+  /* When probing for stack-clash mitigation, we have to track the distance
+     between the stack pointer and closest known reference.
+
+     Most of the time we have to make a worst case assumption.  The
+     only exception is when TARGET_BACKCHAIN is active, in which case
+     we know *sp (offset 0) was written.  */
+  HOST_WIDE_INT last_probe_offset
+    = (TARGET_BACKCHAIN
+       ? 0 : PROBE_INTERVAL - (STACK_BOUNDARY / UNITS_PER_WORD));
+
   s390_save_gprs_to_fprs ();
 
   /* Save call saved gprs.  */
@@ -11034,6 +11145,14 @@ s390_emit_prologue (void)
				    - cfun_frame_layout.first_save_gpr_slot),
			cfun_frame_layout.first_save_gpr,
			cfun_frame_layout.last_save_gpr);
+
+      /* This is not 100% correct.  If we have more than one register saved,
+	 then LAST_PROBE_OFFSET can move even closer to sp.  */
+      last_probe_offset
+	= (cfun_frame_layout.gprs_offset
+	   + UNITS_PER_LONG * (cfun_frame_layout.first_save_gpr
+			       - cfun_frame_layout.first_save_gpr_slot));
+
       emit_insn (insn);
     }
 
@@ -11050,6 +11169,8 @@ s390_emit_prologue (void)
       if (cfun_fpr_save_p (i))
	{
	  save_fpr (stack_pointer_rtx, offset, i);
+	  if (offset < last_probe_offset)
+	    last_probe_offset = offset;
	  offset += 8;
	}
       else if (!TARGET_PACKED_STACK || cfun->stdarg)
@@ -11063,6 +11184,8 @@ s390_emit_prologue (void)
       if (cfun_fpr_save_p (i))
	{
	  insn = save_fpr (stack_pointer_rtx, offset, i);
+	  if (offset < last_probe_offset)
+	    last_probe_offset = offset;
	  offset += 8;
 
	  /* If f4 and f6 are call clobbered they are saved due to
@@ -11085,6 +11208,8 @@ s390_emit_prologue (void)
	if (cfun_fpr_save_p (i))
	  {
	    insn = save_fpr (stack_pointer_rtx, offset, i);
+	    if (offset < last_probe_offset)
+	      last_probe_offset = offset;
	    RTX_FRAME_RELATED_P (insn) = 1;
	    offset -= 8;
 
@@ -11104,7 +11229,6 @@ s390_emit_prologue (void)
   if (cfun_frame_layout.frame_size > 0)
     {
       rtx frame_off = GEN_INT (-cfun_frame_layout.frame_size);
-      rtx real_frame_off;
 
       if (s390_stack_size)
	{
@@ -11177,31 +11301,8 @@ s390_emit_prologue (void)
       if (TARGET_BACKCHAIN || next_fpr)
	insn = emit_insn (gen_move_insn (temp_reg, stack_pointer_rtx));
 
-      /* Subtract frame size from stack pointer.  */
-
-      if (DISP_IN_RANGE (INTVAL (frame_off)))
-	{
-	  insn = gen_rtx_SET (stack_pointer_rtx,
-			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-					    frame_off));
-	  insn = emit_insn (insn);
-	}
-      else
-	{
-	  if (!CONST_OK_FOR_K (INTVAL (frame_off)))
-	    frame_off = force_const_mem (Pmode, frame_off);
-
-	  insn = emit_insn (gen_add2_insn (stack_pointer_rtx, frame_off));
-	  annotate_constant_pool_refs (&PATTERN (insn));
-	}
-
-      RTX_FRAME_RELATED_P (insn) = 1;
-      real_frame_off = GEN_INT (-cfun_frame_layout.frame_size);
-      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-		    gen_rtx_SET (stack_pointer_rtx,
-				 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-					       real_frame_off)));
-
+      allocate_stack_space (frame_off, last_probe_offset, temp_reg,
+			    TARGET_BACKCHAIN || next_fpr);
       /* Set backchain.  */
 
       if (TARGET_BACKCHAIN)
@@ -11225,6 +11326,8 @@ s390_emit_prologue (void)
	  emit_clobber (addr);
	}
     }
+  else if (flag_stack_check == STACK_CLASH_BUILTIN_STACK_CHECK)
+    dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
 
   /* Save fprs 8 - 15 (64 bit ABI).  */
 
diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index 7847047..8169624 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -1124,4 +1124,5 @@ extern const int processor_flags_table[];
       s390_register_target_pragmas ();			\
     } while (0)
 
+#define STACK_CHECK_STATIC_BUILTIN 1
 #endif /* S390_H */
diff --git a/gcc/testsuite/gcc.dg/stack-check-6.c b/gcc/testsuite/gcc.dg/stack-check-6.c
index 9422e71..4e08074 100644
--- a/gcc/testsuite/gcc.dg/stack-check-6.c
+++ b/gcc/testsuite/gcc.dg/stack-check-6.c
@@ -44,8 +44,8 @@ f7 (void)
   foo (buf);
 }
 
-/* { dg-final { scan-rtl-dump-times "Stack clash inline probes" 2 "pro_and_epilogue" } } */
-/* { dg-final { scan-rtl-dump-times "Stack clash probe loop" 2 "pro_and_epilogue" } } */
+/* { dg-final { scan-rtl-dump-times "Stack clash inline probes" 2 "pro_and_epilogue" { xfail s390*-*-* } } } */
+/* { dg-final { scan-rtl-dump-times "Stack clash probe loop" 2 "pro_and_epilogue" { xfail s390*-*-* } } } */
 /* { dg-final { scan-rtl-dump-times "Stack clash residual allocation in prologue" 4 "pro_and_epilogue" } } */
 /* { dg-final { scan-rtl-dump-times "Stack clash not noreturn" 4 "pro_and_epilogue" } } */