https://gcc.gnu.org/g:4e6291b6aa5c2033a36e0ac92077a55471e64f92

commit 4e6291b6aa5c2033a36e0ac92077a55471e64f92
Author: Michael Matz <m...@suse.de>
Date:   Tue Jul 9 17:27:37 2024 +0200

    x86-ssw: tidy and commentary

Diff:
---
 gcc/config/i386/i386.cc | 310 ++++++++++++++++--------------------------------
 gcc/config/i386/i386.h  |   1 +
 2 files changed, 101 insertions(+), 210 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 20f4dcd61870..8c9505d53a75 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -6970,7 +6970,7 @@ ix86_compute_frame_layout (void)
     }
 
   frame->save_regs_using_mov
-    = (TARGET_PROLOGUE_USING_MOVE /*|| flag_shrink_wrap_separate*/) && m->use_fast_prologue_epilogue;
+    = TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue;
 
   /* Skip return address and error code in exception handler.  */
   offset = INCOMING_FRAME_SP_OFFSET;
@@ -7133,9 +7133,7 @@ ix86_compute_frame_layout (void)
       || (flag_stack_clash_protection
          && !ix86_target_stack_probe ()
          && to_allocate > get_probe_interval ()))
-    {
-      frame->cannot_use_moves = true;
-    }
+    frame->cannot_use_moves = true;
 
   if ((!to_allocate && frame->nregs <= 1)
       || frame->cannot_use_moves)
@@ -9190,6 +9188,11 @@ ix86_expand_prologue (void)
                                   m->fs.cfa_reg == stack_pointer_rtx);
       else
        {
+         /* Even when shrink-wrapping separately we call emit_prologue
+            which will reset the frame-state with the expectation that
+            we leave this routine with the state valid for the normal
+            body of the function, i.e. reflecting allocated frame.
+            So track this by hand.  */
          if (m->fs.cfa_reg == stack_pointer_rtx)
            m->fs.cfa_offset -= allocate;
          m->fs.sp_offset += allocate;
@@ -10786,9 +10789,17 @@ ix86_live_on_entry (bitmap regs)
 }
 
 /* Separate shrink-wrapping.  */
+
+/* On x86 we have one component for each hardreg (a component is handled
+   if it's a callee saved register), and one additional component for
+   the frame allocation.  */
+
 #define NCOMPONENTS (FIRST_PSEUDO_REGISTER + 1)
 #define SW_FRAME FIRST_PSEUDO_REGISTER
 
+/* Returns false when we can't allocate the frame as a separate
+   component.  Otherwise return true.  */
+
 static bool
 separate_frame_alloc_p (void)
 {
@@ -10801,12 +10812,17 @@ separate_frame_alloc_p (void)
   return true;
 }
 
+/* Implements TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.
+   Returns an sbitmap with all components that we intend to possibly
+   handle for the current function.  */
+
 static sbitmap
 ix86_get_separate_components (void)
 {
   struct machine_function *m = cfun->machine;
   struct ix86_frame *frame = &m->frame;
   sbitmap components;
+  unsigned min, max;
 
   ix86_finalize_stack_frame_flags ();
   if (frame->cannot_use_moves
@@ -10814,24 +10830,42 @@ ix86_get_separate_components (void)
       || cfun->machine->func_type != TYPE_NORMAL)
     return NULL;
 
+  min = max = INVALID_REGNUM;
+
   components = sbitmap_alloc (NCOMPONENTS);
   bitmap_clear (components);
 
   for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
     if (ix86_save_reg (regno, true, true))
       {
+       if (min == INVALID_REGNUM)
+         min = regno;
+       max = regno;
        bitmap_set_bit (components, regno);
       }
 
+  if (max >= FIRST_PSEUDO_REGISTER)
+    {
+      sbitmap_free (components);
+      return NULL;
+    }
+
+  m->ssw_min_reg = min;
+  m->ssw_max_reg = max;
+
   if (separate_frame_alloc_p ())
     bitmap_set_bit (components, SW_FRAME);
 
   return components;
 }
 
+/* Implements TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  Given a BB
+   return all components that are necessary for it.  */
+
 static sbitmap
 ix86_components_for_bb (basic_block bb)
 {
+  struct machine_function *m = cfun->machine;
   bool need_frame = false;
   sbitmap components = sbitmap_alloc (NCOMPONENTS);
   bitmap_clear (components);
@@ -10840,7 +10874,7 @@ ix86_components_for_bb (basic_block bb)
   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
 
-  for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+  for (unsigned regno = m->ssw_min_reg; regno <= m->ssw_max_reg; regno++)
     if (ix86_save_reg (regno, true, true)
        && (bitmap_bit_p (in, regno)
            || bitmap_bit_p (gen, regno)
@@ -10881,6 +10915,9 @@ ix86_components_for_bb (basic_block bb)
   return components;
 }
 
+/* Implements TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  Filter out
+   from COMPONENTS those that we can't handle on edge E.  */
+
 static void
 ix86_disqualify_components (sbitmap components, edge e, sbitmap, bool)
 {
@@ -10890,6 +10927,10 @@ ix86_disqualify_components (sbitmap components, edge e, sbitmap, bool)
     bitmap_clear_bit (components, SW_FRAME);
 }
 
+/* Helper for frame allocation.  This resets cfun->machine->fs to
+   reflect the state at the first instruction before prologue (i.e.
+   the call just happened).  */
+
 static void
 ix86_init_frame_state (void)
 {
@@ -10909,108 +10950,33 @@ ix86_init_frame_state (void)
   m->fs.sp_realigned = false;
 }
 
+/* Helper for shrink wrapping the frame allocation.  This emits the
+   stack pointer adjustment reflecting the allocation.  Only easy cases
+   are handled, more complicated ones must be ruled out by the other
+   shrink wrapping hooks.  */
+
 static void
 ix86_alloc_frame (void)
 {
   struct machine_function *m = cfun->machine;
   const struct ix86_frame &frame = m->frame;
-  rtx insn, t;
-  bool int_registers_saved = true;
-  bool sse_registers_saved = true;
   HOST_WIDE_INT allocate;
 
-  memset (&m->fs, 0, sizeof (m->fs));
-
-  /* Initialize CFA state for before the prologue.  */
-  m->fs.cfa_reg = stack_pointer_rtx;
-  m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
-
-  /* Track SP offset to the CFA.  We continue tracking this after we've
-     swapped the CFA register away from SP.  In the case of re-alignment
-     this is fudged; we're interested to offsets within the local frame.  */
-  m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
-  m->fs.sp_valid = true;
-  m->fs.sp_realigned = false;
-
   allocate = frame.stack_pointer_offset - m->fs.sp_offset;
 
-  /* On SEH target with very large frame size, allocate an area to save
-     SSE registers (as the very large allocation won't be described).  */
-  if (TARGET_SEH
-      && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
-      && !sse_registers_saved)
-    {
-      abort();
-    }
+  /* We should have ruled out all cases of frame allocation
+     that are too complicated for us to handle separately.  Check
+     them.  (They mostly reflect conditions in ix86_expand_prologue).  */
 
-  /* If stack clash protection is requested, then probe the stack, unless it
-     is already probed on the target.  */
-  if (allocate >= 0
-      && flag_stack_clash_protection
-      && !ix86_target_stack_probe ())
-    {
-      abort();
-      ix86_adjust_stack_and_probe (allocate, int_registers_saved, false);
-      allocate = 0;
-    }
+  gcc_assert (!(TARGET_SEH
+               && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE));
 
-  /* The stack has already been decremented by the instruction calling us
-     so probe if the size is non-negative to preserve the protection area.  */
-  else if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
-    {
-      const HOST_WIDE_INT probe_interval = get_probe_interval ();
+  gcc_assert (!(allocate >= 0
+               && flag_stack_clash_protection
+               && !ix86_target_stack_probe ()));
 
-      abort();
-      if (STACK_CHECK_MOVING_SP)
-       {
-         if (crtl->is_leaf
-             && !cfun->calls_alloca
-             && allocate <= probe_interval)
-           ;
-
-         else
-           {
-             ix86_adjust_stack_and_probe (allocate, int_registers_saved, true);
-             allocate = 0;
-           }
-       }
-
-      else
-       {
-         HOST_WIDE_INT size = allocate;
-
-         if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
-           size = 0x80000000 - get_stack_check_protect () - 1;
-
-         if (TARGET_STACK_PROBE)
-           {
-             if (crtl->is_leaf && !cfun->calls_alloca)
-               {
-                 if (size > probe_interval)
-                   ix86_emit_probe_stack_range (0, size, int_registers_saved);
-               }
-             else
-               ix86_emit_probe_stack_range (0,
-                                            size + get_stack_check_protect (),
-                                            int_registers_saved);
-           }
-         else
-           {
-             if (crtl->is_leaf && !cfun->calls_alloca)
-               {
-                 if (size > probe_interval
-                     && size > get_stack_check_protect ())
-                   ix86_emit_probe_stack_range (get_stack_check_protect (),
-                                                (size
-                                                 - get_stack_check_protect ()),
-                                                int_registers_saved);
-               }
-             else
-               ix86_emit_probe_stack_range (get_stack_check_protect (), size,
-                                            int_registers_saved);
-           }
-       }
-    }
+  gcc_assert (!(allocate >= 0
+               && flag_stack_check == STATIC_BUILTIN_STACK_CHECK));
 
   if (allocate == 0)
     ;
@@ -11022,98 +10988,12 @@ ix86_alloc_frame (void)
                                 m->fs.cfa_reg == stack_pointer_rtx);
     }
   else
-    {
-      abort();
-      rtx eax = gen_rtx_REG (Pmode, AX_REG);
-      rtx r10 = NULL;
-      const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
-      bool eax_live = ix86_eax_live_at_start_p ();
-      bool r10_live = false;
-
-      if (TARGET_64BIT)
-       r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
-
-      if (eax_live)
-       {
-         insn = emit_insn (gen_push (eax));
-         allocate -= UNITS_PER_WORD;
-         /* Note that SEH directives need to continue tracking the stack
-            pointer even after the frame pointer has been set up.  */
-         if (sp_is_cfa_reg || TARGET_SEH)
-           {
-             if (sp_is_cfa_reg)
-               m->fs.cfa_offset += UNITS_PER_WORD;
-             RTX_FRAME_RELATED_P (insn) = 1;
-             add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-                           gen_rtx_SET (stack_pointer_rtx,
-                                        plus_constant (Pmode,
-                                                       stack_pointer_rtx,
-                                                       -UNITS_PER_WORD)));
-           }
-       }
-
-      if (r10_live)
-       {
-         r10 = gen_rtx_REG (Pmode, R10_REG);
-         insn = emit_insn (gen_push (r10));
-         allocate -= UNITS_PER_WORD;
-         if (sp_is_cfa_reg || TARGET_SEH)
-           {
-             if (sp_is_cfa_reg)
-               m->fs.cfa_offset += UNITS_PER_WORD;
-             RTX_FRAME_RELATED_P (insn) = 1;
-             add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-                           gen_rtx_SET (stack_pointer_rtx,
-                                        plus_constant (Pmode,
-                                                       stack_pointer_rtx,
-                                                       -UNITS_PER_WORD)));
-           }
-       }
-
-      emit_move_insn (eax, GEN_INT (allocate));
-      emit_insn (gen_allocate_stack_worker_probe (Pmode, eax, eax));
-
-      /* Use the fact that AX still contains ALLOCATE.  */
-      insn = emit_insn (gen_pro_epilogue_adjust_stack_sub
-                       (Pmode, stack_pointer_rtx, stack_pointer_rtx, eax));
-
-      if (sp_is_cfa_reg || TARGET_SEH)
-       {
-         if (sp_is_cfa_reg)
-           m->fs.cfa_offset += allocate;
-         RTX_FRAME_RELATED_P (insn) = 1;
-         add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-                       gen_rtx_SET (stack_pointer_rtx,
-                                    plus_constant (Pmode, stack_pointer_rtx,
-                                                   -allocate)));
-       }
-      m->fs.sp_offset += allocate;
-
-      /* Use stack_pointer_rtx for relative addressing so that code works for
-        realigned stack.  But this means that we need a blockage to prevent
-        stores based on the frame pointer from being scheduled before.  */
-      if (r10_live && eax_live)
-       {
-         t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
-         emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
-                         gen_frame_mem (word_mode, t));
-         t = plus_constant (Pmode, t, UNITS_PER_WORD);
-         emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
-                         gen_frame_mem (word_mode, t));
-         emit_insn (gen_memory_blockage ());
-       }
-      else if (eax_live || r10_live)
-       {
-         t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
-         emit_move_insn (gen_rtx_REG (word_mode,
-                                      (eax_live ? AX_REG : R10_REG)),
-                         gen_frame_mem (word_mode, t));
-         emit_insn (gen_memory_blockage ());
-       }
-    }
+    abort();
   gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
 }
 
+/* Undoes the frame allocation for easy cases.  */
+
 static void
 ix86_dealloc_frame (void)
 {
@@ -11131,7 +11011,7 @@ ix86_dealloc_frame (void)
   gcc_assert (!m->fs.realigned);
   if (m->fs.sp_offset != UNITS_PER_WORD)
     {
-      int style = -1; // XXX
+      int style = -1;
       pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
                                 style, true);
@@ -11140,50 +11020,59 @@ ix86_dealloc_frame (void)
   m->fs = frame_state_save;
 }
 
+/* Helper for ix86_emit_prologue_components and
+   ix86_emit_epilogue_components.  Implements the saving and restoring
+   of callee saved registers (it always uses moves to do that, no
+   push/pop).  */
+
 static void
-ix86_process_components (sbitmap components, bool prologue_p)
+ix86_process_reg_components (sbitmap components, bool prologue_p)
 {
   struct machine_function *m = cfun->machine;
   struct ix86_frame *frame = &m->frame;
   HOST_WIDE_INT cfa_offset = frame->reg_save_offset;
   HOST_WIDE_INT sse_cfa_offset = frame->sse_reg_save_offset;
 
-  for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
-    if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+  for (unsigned regno = m->ssw_min_reg; regno <= m->ssw_max_reg; regno++)
+    if (ix86_save_reg (regno, true, true))
       {
-       if (bitmap_bit_p (components, regno))
+       HOST_WIDE_INT *poffset;
+       machine_mode mode;
+       if (GENERAL_REGNO_P (regno))
          {
-           m->reg_wrapped_separately[regno] = true;
-           m->anything_separately = true;
-           if (prologue_p)
-             ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
-           else
-             ix86_emit_restore_reg_using_mov (regno, cfa_offset, true);
+           poffset = &cfa_offset;
+           mode = word_mode;
          }
-       cfa_offset -= UNITS_PER_WORD;
-      }
-    else if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
-      {
+       else if (SSE_REGNO_P (regno))
+         {
+           poffset = &sse_cfa_offset;
+           mode = V4SFmode;
+         }
+       else
+         abort ();
        if (bitmap_bit_p (components, regno))
          {
            m->reg_wrapped_separately[regno] = true;
            m->anything_separately = true;
            if (prologue_p)
-             ix86_emit_save_reg_using_mov (V4SFmode, regno, sse_cfa_offset);
+             ix86_emit_save_reg_using_mov (mode, regno, *poffset);
            else
-             ix86_emit_restore_reg_using_mov (regno, sse_cfa_offset, true);
+             ix86_emit_restore_reg_using_mov (regno, *poffset, true);
          }
-       sse_cfa_offset -= GET_MODE_SIZE (V4SFmode);
+       *poffset -= GET_MODE_SIZE (mode);
       }
 }
 
+/* Implements TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.
+   Emit prologue code for everything in COMPONENTS.  */
+
 static void
 ix86_emit_prologue_components (sbitmap components)
 {
   if (bitmap_bit_p (components, SW_FRAME))
     ix86_init_frame_state ();
 
-  ix86_process_components (components, true);
+  ix86_process_reg_components (components, true);
 
   if (bitmap_bit_p (components, SW_FRAME))
     {
@@ -11193,22 +11082,23 @@ ix86_emit_prologue_components (sbitmap components)
     }
 }
 
+/* Implements TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.
+   Emit epilogue code for everything in COMPONENTS.  */
+
 static void
 ix86_emit_epilogue_components (sbitmap components)
 {
-  ix86_process_components (components, false);
+  ix86_process_reg_components (components, false);
   if (bitmap_bit_p (components, SW_FRAME))
     ix86_dealloc_frame ();
 }
 
+/* Implements TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
+
 static void
-ix86_set_handled_components (sbitmap /*components*/)
+ix86_set_handled_components (sbitmap)
 {
-  /*for (unsigned regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
-    if (bitmap_bit_p (components, regno))
-      cfun->machine->reg_wrapped_separately[regno] = true;*/
-  /*if (bitmap_bit_p (components, SW_FRAME))
-    cfun->machine->frame_alloc_separately = true;*/
+  /* We track stuff ourself.  */
 }
 
 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bda3d97ab4cf..4836a47a3ce5 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2751,6 +2751,7 @@ struct GTY(()) machine_function {
   int varargs_gpr_size;
   int varargs_fpr_size;
   int optimize_mode_switching[MAX_386_ENTITIES];
+  unsigned ssw_min_reg, ssw_max_reg;
   bool reg_wrapped_separately[FIRST_PSEUDO_REGISTER];
   bool frame_alloc_separately;
   bool anything_separately;

Reply via email to