https://gcc.gnu.org/g:4e6291b6aa5c2033a36e0ac92077a55471e64f92
commit 4e6291b6aa5c2033a36e0ac92077a55471e64f92 Author: Michael Matz <m...@suse.de> Date: Tue Jul 9 17:27:37 2024 +0200 x86-ssw: tidy and commentary Diff: --- gcc/config/i386/i386.cc | 310 ++++++++++++++++-------------------------------- gcc/config/i386/i386.h | 1 + 2 files changed, 101 insertions(+), 210 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 20f4dcd61870..8c9505d53a75 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -6970,7 +6970,7 @@ ix86_compute_frame_layout (void) } frame->save_regs_using_mov - = (TARGET_PROLOGUE_USING_MOVE /*|| flag_shrink_wrap_separate*/) && m->use_fast_prologue_epilogue; + = TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue; /* Skip return address and error code in exception handler. */ offset = INCOMING_FRAME_SP_OFFSET; @@ -7133,9 +7133,7 @@ ix86_compute_frame_layout (void) || (flag_stack_clash_protection && !ix86_target_stack_probe () && to_allocate > get_probe_interval ())) - { - frame->cannot_use_moves = true; - } + frame->cannot_use_moves = true; if ((!to_allocate && frame->nregs <= 1) || frame->cannot_use_moves) @@ -9190,6 +9188,11 @@ ix86_expand_prologue (void) m->fs.cfa_reg == stack_pointer_rtx); else { + /* Even when shrink-wrapping separately we call emit_prologue + which will reset the frame-state with the expectation that + we leave this routine with the state valid for the normal + body of the function, i.e. reflecting allocated frame. + So track this by hand. */ if (m->fs.cfa_reg == stack_pointer_rtx) m->fs.cfa_offset -= allocate; m->fs.sp_offset += allocate; @@ -10786,9 +10789,17 @@ ix86_live_on_entry (bitmap regs) } /* Separate shrink-wrapping. */ + +/* On x86 we have one component for each hardreg (a component is handled + if it's a callee saved register), and one additional component for + the frame allocation. */ + #define NCOMPONENTS (FIRST_PSEUDO_REGISTER + 1) #define SW_FRAME FIRST_PSEUDO_REGISTER +/* Returns false when we can't allocate the frame as a separate + component. Otherwise return true. */ + static bool separate_frame_alloc_p (void) { @@ -10801,12 +10812,17 @@ separate_frame_alloc_p (void) return true; } +/* Implements TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. + Returns an sbitmap with all components that we intend to possibly + handle for the current function. */ + static sbitmap ix86_get_separate_components (void) { struct machine_function *m = cfun->machine; struct ix86_frame *frame = &m->frame; sbitmap components; + unsigned min, max; ix86_finalize_stack_frame_flags (); if (frame->cannot_use_moves @@ -10814,24 +10830,42 @@ ix86_get_separate_components (void) || cfun->machine->func_type != TYPE_NORMAL) return NULL; + min = max = INVALID_REGNUM; + components = sbitmap_alloc (NCOMPONENTS); bitmap_clear (components); for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (ix86_save_reg (regno, true, true)) { + if (min == INVALID_REGNUM) + min = regno; + max = regno; bitmap_set_bit (components, regno); } + if (max >= FIRST_PSEUDO_REGISTER) + { + sbitmap_free (components); + return NULL; + } + + m->ssw_min_reg = min; + m->ssw_max_reg = max; + if (separate_frame_alloc_p ()) bitmap_set_bit (components, SW_FRAME); return components; } +/* Implements TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. Given a BB + return all components that are necessary for it. */ + static sbitmap ix86_components_for_bb (basic_block bb) { + struct machine_function *m = cfun->machine; bool need_frame = false; sbitmap components = sbitmap_alloc (NCOMPONENTS); bitmap_clear (components); @@ -10840,7 +10874,7 @@ ix86_components_for_bb (basic_block bb) bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; - for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + for (unsigned regno = m->ssw_min_reg; regno <= m->ssw_max_reg; regno++) if (ix86_save_reg (regno, true, true) && (bitmap_bit_p (in, regno) || bitmap_bit_p (gen, regno) @@ -10881,6 +10915,9 @@ ix86_components_for_bb (basic_block bb) return components; } +/* Implements TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. Filter out + from COMPONENTS those that we can't handle on edge E. */ + static void ix86_disqualify_components (sbitmap components, edge e, sbitmap, bool) { @@ -10890,6 +10927,10 @@ ix86_disqualify_components (sbitmap components, edge e, sbitmap, bool) bitmap_clear_bit (components, SW_FRAME); } +/* Helper for frame allocation. This resets cfun->machine->fs to + reflect the state at the first instruction before prologue (i.e. + the call just happened). */ + static void ix86_init_frame_state (void) { @@ -10909,108 +10950,33 @@ ix86_init_frame_state (void) m->fs.sp_realigned = false; } +/* Helper for shrink wrapping the frame allocation. This emits the + stack pointer adjustment reflecting the allocation. Only easy cases + are handled, more complicated ones must be ruled out by the other + shrink wrapping hooks. */ + static void ix86_alloc_frame (void) { struct machine_function *m = cfun->machine; const struct ix86_frame &frame = m->frame; - rtx insn, t; - bool int_registers_saved = true; - bool sse_registers_saved = true; HOST_WIDE_INT allocate; - memset (&m->fs, 0, sizeof (m->fs)); - - /* Initialize CFA state for before the prologue. */ - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; - - /* Track SP offset to the CFA. We continue tracking this after we've - swapped the CFA register away from SP. In the case of re-alignment - this is fudged; we're interested to offsets within the local frame. */ - m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; - m->fs.sp_valid = true; - m->fs.sp_realigned = false; - allocate = frame.stack_pointer_offset - m->fs.sp_offset; - /* On SEH target with very large frame size, allocate an area to save - SSE registers (as the very large allocation won't be described). */ - if (TARGET_SEH - && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE - && !sse_registers_saved) - { - abort(); - } + /* We should have ruled out all cases of frame allocation + that are too complicated for us to handle separately. Check + them. (They mostly reflect conditions in ix86_expand_prologue). */ - /* If stack clash protection is requested, then probe the stack, unless it - is already probed on the target. */ - if (allocate >= 0 - && flag_stack_clash_protection - && !ix86_target_stack_probe ()) - { - abort(); - ix86_adjust_stack_and_probe (allocate, int_registers_saved, false); - allocate = 0; - } + gcc_assert (!(TARGET_SEH + && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE)); - /* The stack has already been decremented by the instruction calling us - so probe if the size is non-negative to preserve the protection area. */ - else if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK) - { - const HOST_WIDE_INT probe_interval = get_probe_interval (); + gcc_assert (!(allocate >= 0 + && flag_stack_clash_protection + && !ix86_target_stack_probe ())); - abort(); - if (STACK_CHECK_MOVING_SP) - { - if (crtl->is_leaf - && !cfun->calls_alloca - && allocate <= probe_interval) - ; - - else - { - ix86_adjust_stack_and_probe (allocate, int_registers_saved, true); - allocate = 0; - } - } - - else - { - HOST_WIDE_INT size = allocate; - - if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) - size = 0x80000000 - get_stack_check_protect () - 1; - - if (TARGET_STACK_PROBE) - { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > probe_interval) - ix86_emit_probe_stack_range (0, size, int_registers_saved); - } - else - ix86_emit_probe_stack_range (0, - size + get_stack_check_protect (), - int_registers_saved); - } - else - { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > probe_interval - && size > get_stack_check_protect ()) - ix86_emit_probe_stack_range (get_stack_check_protect (), - (size - - get_stack_check_protect ()), - int_registers_saved); - } - else - ix86_emit_probe_stack_range (get_stack_check_protect (), size, - int_registers_saved); - } - } - } + gcc_assert (!(allocate >= 0 + && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)); if (allocate == 0) ; @@ -11022,98 +10988,12 @@ ix86_alloc_frame (void) m->fs.cfa_reg == stack_pointer_rtx); } else - { - abort(); - rtx eax = gen_rtx_REG (Pmode, AX_REG); - rtx r10 = NULL; - const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); - bool eax_live = ix86_eax_live_at_start_p (); - bool r10_live = false; - - if (TARGET_64BIT) - r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); - - if (eax_live) - { - insn = emit_insn (gen_push (eax)); - allocate -= UNITS_PER_WORD; - /* Note that SEH directives need to continue tracking the stack - pointer even after the frame pointer has been set up. */ - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, - stack_pointer_rtx, - -UNITS_PER_WORD))); - } - } - - if (r10_live) - { - r10 = gen_rtx_REG (Pmode, R10_REG); - insn = emit_insn (gen_push (r10)); - allocate -= UNITS_PER_WORD; - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, - stack_pointer_rtx, - -UNITS_PER_WORD))); - } - } - - emit_move_insn (eax, GEN_INT (allocate)); - emit_insn (gen_allocate_stack_worker_probe (Pmode, eax, eax)); - - /* Use the fact that AX still contains ALLOCATE. */ - insn = emit_insn (gen_pro_epilogue_adjust_stack_sub - (Pmode, stack_pointer_rtx, stack_pointer_rtx, eax)); - - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += allocate; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -allocate))); - } - m->fs.sp_offset += allocate; - - /* Use stack_pointer_rtx for relative addressing so that code works for - realigned stack. But this means that we need a blockage to prevent - stores based on the frame pointer from being scheduled before. */ - if (r10_live && eax_live) - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, R10_REG), - gen_frame_mem (word_mode, t)); - t = plus_constant (Pmode, t, UNITS_PER_WORD); - emit_move_insn (gen_rtx_REG (word_mode, AX_REG), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); - } - else if (eax_live || r10_live) - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, - (eax_live ? AX_REG : R10_REG)), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); - } - } + abort(); gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); } +/* Undoes the frame allocation for easy cases. */ + static void ix86_dealloc_frame (void) { @@ -11131,7 +11011,7 @@ ix86_dealloc_frame (void) gcc_assert (!m->fs.realigned); if (m->fs.sp_offset != UNITS_PER_WORD) { - int style = -1; // XXX + int style = -1; pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), style, true); @@ -11140,50 +11020,59 @@ ix86_dealloc_frame (void) m->fs = frame_state_save; } +/* Helper for ix86_emit_prologue_components and + ix86_emit_epilogue_components. Implements the saving and restoring + of callee saved registers (it always uses moves to do that, no + push/pop). */ + static void -ix86_process_components (sbitmap components, bool prologue_p) +ix86_process_reg_components (sbitmap components, bool prologue_p) { struct machine_function *m = cfun->machine; struct ix86_frame *frame = &m->frame; HOST_WIDE_INT cfa_offset = frame->reg_save_offset; HOST_WIDE_INT sse_cfa_offset = frame->sse_reg_save_offset; - for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + for (unsigned regno = m->ssw_min_reg; regno <= m->ssw_max_reg; regno++) + if (ix86_save_reg (regno, true, true)) { - if (bitmap_bit_p (components, regno)) + HOST_WIDE_INT *poffset; + machine_mode mode; + if (GENERAL_REGNO_P (regno)) { - m->reg_wrapped_separately[regno] = true; - m->anything_separately = true; - if (prologue_p) - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); - else - ix86_emit_restore_reg_using_mov (regno, cfa_offset, true); + poffset = &cfa_offset; + mode = word_mode; } - cfa_offset -= UNITS_PER_WORD; - } - else if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - { + else if (SSE_REGNO_P (regno)) + { + poffset = &sse_cfa_offset; + mode = V4SFmode; + } + else + abort (); if (bitmap_bit_p (components, regno)) { m->reg_wrapped_separately[regno] = true; m->anything_separately = true; if (prologue_p) - ix86_emit_save_reg_using_mov (V4SFmode, regno, sse_cfa_offset); + ix86_emit_save_reg_using_mov (mode, regno, *poffset); else - ix86_emit_restore_reg_using_mov (regno, sse_cfa_offset, true); + ix86_emit_restore_reg_using_mov (regno, *poffset, true); } - sse_cfa_offset -= GET_MODE_SIZE (V4SFmode); + *poffset -= GET_MODE_SIZE (mode); } } +/* Implements TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. + Emit prologue code for everything in COMPONENTS. */ + static void ix86_emit_prologue_components (sbitmap components) { if (bitmap_bit_p (components, SW_FRAME)) ix86_init_frame_state (); - ix86_process_components (components, true); + ix86_process_reg_components (components, true); if (bitmap_bit_p (components, SW_FRAME)) { @@ -11193,22 +11082,23 @@ ix86_emit_prologue_components (sbitmap components) } } +/* Implements TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. + Emit epilogue code for everything in COMPONENTS. */ + static void ix86_emit_epilogue_components (sbitmap components) { - ix86_process_components (components, false); + ix86_process_reg_components (components, false); if (bitmap_bit_p (components, SW_FRAME)) ix86_dealloc_frame (); } +/* Implements TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ + static void -ix86_set_handled_components (sbitmap /*components*/) +ix86_set_handled_components (sbitmap) { - /*for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (bitmap_bit_p (components, regno)) - cfun->machine->reg_wrapped_separately[regno] = true;*/ - /*if (bitmap_bit_p (components, SW_FRAME)) - cfun->machine->frame_alloc_separately = true;*/ + /* We track stuff ourself. */ } #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bda3d97ab4cf..4836a47a3ce5 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2751,6 +2751,7 @@ struct GTY(()) machine_function { int varargs_gpr_size; int varargs_fpr_size; int optimize_mode_switching[MAX_386_ENTITIES]; + unsigned ssw_min_reg, ssw_max_reg; bool reg_wrapped_separately[FIRST_PSEUDO_REGISTER]; bool frame_alloc_separately; bool anything_separately;