On ARM targets, the stack is aligned to an 8-byte boundary, but when saving/restoring the VFP coprocessor registers in the function prologue/epilogue, it is possible for the 8-byte values to end up at locations that are 4-byte aligned but not 8-byte aligned. This can result in a performance penalty on micro-architectures that are optimized for well-aligned data, especially when such a misalignment may result in cache line splits within a single access. This patch detects when at least one coprocessor register value needs to be saved and adds some additional padding to the stack at that point if necessary to align it to an 8-byte boundary. I've re-used the existing logic to try pushing a 4-byte scratch register and only fall back to an explicit stack adjustment if that fails.

NVIDIA found that an earlier version of this patch (benchmarked with SPECint2k and SPECfp2k on an older version of GCC) gave measurable improvements on their Tegra K1 64-bit processor, aka "Denver". We aren't sure what other ARM processors might benefit from the extra alignment, so we've given it its own command-line option instead of tying it to -mtune.

I did some hand-testing of this patch on small test cases to verify that the expected alignment was happening, but it seemed to me that the expected assembly-language patterns were likely too fragile to be hard-wired into a test case. I also ran regression tests both with and without the switch set so it doesn't break other things. OK to commit?

-Sandra

2014-11-14  Sandra Loosemore  <san...@codesourcery.com>
	    Joshua Conner  <jcon...@nvidia.com>
	    Chris Jones  <chr...@nvidia.com>

	gcc/
	* doc/invoke.texi (Option Summary): Add -malign-saved-fp-regs.
	(ARM options): Document it.
	* config/arm/arm.h (arm_stack_offsets): Add fp_regs_padding field.
	* config/arm/arm.opt (malign-saved-fp-regs): New option.
	* config/arm/arm.c (add_dummy_register_save_p): New function,
	split from...
	(arm_get_frame_offsets): Here.  Use this logic also for aligning
	saved VFP coprocessor registers if possible.
	(arm_expand_prologue): Add explicit padding for saved VFP registers.
	(arm_expand_epilogue): Undo explicit padding for saved VFP registers.
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 217322)
+++ gcc/doc/invoke.texi	(working copy)
@@ -533,6 +533,7 @@ Objective-C and Objective-C++ Dialects}.
 -mcpu=@var{name}  -march=@var{name}  -mfpu=@var{name}  @gol
 -mstructure-size-boundary=@var{n} @gol
 -mabort-on-noreturn @gol
+-malign-saved-fp-regs @gol
 -mlong-calls  -mno-long-calls @gol
 -msingle-pic-base  -mno-single-pic-base @gol
 -mpic-register=@var{reg} @gol
@@ -12861,6 +12862,13 @@ Generate a call to the function @code{ab
 @code{noreturn} function.  It is executed if the function tries to
 return.
 
+@item -malign-saved-fp-regs
+@opindex malign-saved-fp-regs
+Add extra padding to align saved VFP coprocessor register values
+on a 64-bit boundary in function prologues.
+This can provide better performance on targets optimized for
+aligned memory accesses.
+
 @item -mlong-calls
 @itemx -mno-long-calls
 @opindex mlong-calls
Index: gcc/config/arm/arm.h
===================================================================
--- gcc/config/arm/arm.h	(revision 217322)
+++ gcc/config/arm/arm.h	(working copy)
@@ -1509,6 +1509,9 @@ typedef struct GTY(()) arm_stack_offsets
   int locals_base;	/* THUMB_HARD_FRAME_POINTER_REGNUM.  */
   int outgoing_args;	/* STACK_POINTER_REGNUM.  */
   unsigned int saved_regs_mask;
+  /* Extra padding inserted before saved VFP regs to keep them
+     doubleword-aligned.  See TARGET_ALIGN_SAVED_FP_REGS.  */
+  int fp_regs_padding;
 }
 arm_stack_offsets;
 
Index: gcc/config/arm/arm.opt
===================================================================
--- gcc/config/arm/arm.opt	(revision 217322)
+++ gcc/config/arm/arm.opt	(working copy)
@@ -58,6 +58,10 @@ mabort-on-noreturn
 Target Report Mask(ABORT_NORETURN)
 Generate a call to abort if a noreturn function returns
 
+malign-saved-fp-regs
+Target Report Mask(ALIGN_SAVED_FP_REGS)
+Save floating-point registers on stack in naturally-aligned locations
+
 mapcs
 Target RejectNegative Mask(APCS_FRAME) Undocumented
 
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 217322)
+++ gcc/config/arm/arm.c	(working copy)
@@ -20721,6 +20721,67 @@ any_sibcall_could_use_r3 (void)
   return false;
 }
 
+/* Attempt to add a dummy register push to OFFSETS for alignment purposes.
+   Returns true and adjusts OFFSETS if successful, otherwise returns
+   false if the alignment must be accomplished in some other way.  */
+static bool
+add_dummy_register_save_p (struct arm_stack_offsets *offsets)
+{
+  int reg = -1;
+  int i;
+
+  /* Register r3 is caller-saved.  Normally it does not need to be
+     saved on entry by the prologue.  However if we choose to save
+     it for padding then we may confuse the compiler into thinking
+     a prologue sequence is required when in fact it is not.  This
+     will occur when shrink-wrapping if r3 is used as a scratch
+     register and there are no other callee-saved writes.
+     
+     This situation can be avoided when other callee-saved registers
+     are available and r3 is not mandatory if we choose a callee-saved
+     register for padding.  */
+  bool prefer_callee_reg_p = false;
+  
+  /* If it is safe to use r3, then do so.  This sometimes
+     generates better code on Thumb-2 by avoiding the need to
+     use 32-bit push/pop instructions.  */
+  if (! any_sibcall_could_use_r3 ()
+      && arm_size_return_regs () <= 12
+      && (offsets->saved_regs_mask & (1 << 3)) == 0
+      && (TARGET_THUMB2
+	  || !(TARGET_LDRD && current_tune->prefer_ldrd_strd)))
+    {
+      reg = 3;
+      if (!TARGET_THUMB2)
+	prefer_callee_reg_p = true;
+    }
+  if (reg == -1
+      || prefer_callee_reg_p)
+    {
+      for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++)
+	{
+	  /* Avoid fixed registers; they may be changed at
+	     arbitrary times so it's unsafe to restore them
+	     during the epilogue.  */
+	  if (!fixed_regs[i]
+	      && (offsets->saved_regs_mask & (1 << i)) == 0)
+	    {
+	      reg = i;
+	      break;
+	    }
+	}
+    }
+  
+  if (reg != -1)
+    {
+      offsets->saved_regs += 4;
+      offsets->saved_regs_mask |= (1 << reg);
+      offsets->soft_frame += 4;
+      return true;
+    }
+  return false;
+}
+
 
 /* Compute the distance from register FROM to register TO.
    These can be the arg pointer (26), the soft frame pointer (25),
@@ -20784,7 +20845,8 @@ arm_get_frame_offsets (void)
   int saved;
   int core_saved;
   HOST_WIDE_INT frame_size;
-  int i;
+  bool need_fp_align = false;
+  bool need_fp_padding = false;
 
   offsets = &cfun->machine->stack_offsets;
 
@@ -20816,9 +20878,12 @@ arm_get_frame_offsets (void)
        + arm_compute_static_chain_stack_bytes ()
        + (frame_pointer_needed ? 4 : 0));
 
+  offsets->fp_regs_padding = 0;
+
   if (TARGET_32BIT)
     {
       unsigned int regno;
+      int coproc_regs_saved = 0;
 
       offsets->saved_regs_mask = arm_compute_save_reg_mask ();
       core_saved = bit_count (offsets->saved_regs_mask) * 4;
@@ -20835,14 +20900,28 @@ arm_get_frame_offsets (void)
 	       regno <= LAST_IWMMXT_REGNUM;
 	       regno++)
 	    if (df_regs_ever_live_p (regno) && ! call_used_regs[regno])
-	      saved += 8;
+	      coproc_regs_saved += 8;
 	}
 
       func_type = arm_current_func_type ();
       /* Space for saved VFP registers.  */
       if (! IS_VOLATILE (func_type)
 	  && TARGET_HARD_FLOAT && TARGET_VFP)
-	saved += arm_get_vfp_saved_size ();
+	coproc_regs_saved += arm_get_vfp_saved_size ();
+
+      /* Check if extra padding is required to align saved FP regs
+	 on a doubleword boundary.  */
+      if (TARGET_ALIGN_SAVED_FP_REGS && coproc_regs_saved)
+	{
+	  need_fp_align = true;
+	  if (saved % 8)
+	    {
+	      gcc_assert (! (saved % 4));
+	      need_fp_padding = true;
+	    }
+	}
+
+      saved += coproc_regs_saved;
     }
   else /* TARGET_THUMB1 */
     {
@@ -20858,6 +20937,15 @@ arm_get_frame_offsets (void)
     = offsets->saved_args + arm_compute_static_chain_stack_bytes () + saved;
   offsets->soft_frame = offsets->saved_regs + CALLER_INTERWORKING_SLOT_SIZE;
 
+  /* If we need to align the saved FP registers, see if we can do that
+     by pushing an extra reg.  Otherwise insert some padding explicitly.  */
+  if (need_fp_padding && !add_dummy_register_save_p (offsets))
+    {
+      offsets->fp_regs_padding = 4;
+      offsets->saved_regs += 4;
+      offsets->soft_frame += 4;
+    }
+
   /* A leaf function does not need any stack alignment if it has nothing
      on the stack.  */
   if (leaf && frame_size == 0
@@ -20874,62 +20962,15 @@ arm_get_frame_offsets (void)
   if (ARM_DOUBLEWORD_ALIGN
       && (offsets->soft_frame & 7))
     {
-      offsets->soft_frame += 4;
       /* Try to align stack by pushing an extra reg.  Don't bother doing this
          when there is a stack frame as the alignment will be rolled into
-	 the normal stack adjustment.  */
-      if (frame_size + crtl->outgoing_args_size == 0)
-	{
-	  int reg = -1;
-
-	  /* Register r3 is caller-saved.  Normally it does not need to be
-	     saved on entry by the prologue.  However if we choose to save
-	     it for padding then we may confuse the compiler into thinking
-	     a prologue sequence is required when in fact it is not.  This
-	     will occur when shrink-wrapping if r3 is used as a scratch
-	     register and there are no other callee-saved writes.
-
-	     This situation can be avoided when other callee-saved registers
-	     are available and r3 is not mandatory if we choose a callee-saved
-	     register for padding.  */
-	  bool prefer_callee_reg_p = false;
-
-	  /* If it is safe to use r3, then do so.  This sometimes
-	     generates better code on Thumb-2 by avoiding the need to
-	     use 32-bit push/pop instructions.  */
-          if (! any_sibcall_could_use_r3 ()
-	      && arm_size_return_regs () <= 12
-	      && (offsets->saved_regs_mask & (1 << 3)) == 0
-	      && (TARGET_THUMB2
-		  || !(TARGET_LDRD && current_tune->prefer_ldrd_strd)))
-	    {
-	      reg = 3;
-	      if (!TARGET_THUMB2)
-		prefer_callee_reg_p = true;
-	    }
-	  if (reg == -1
-	      || prefer_callee_reg_p)
-	    {
-	      for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++)
-		{
-		  /* Avoid fixed registers; they may be changed at
-		     arbitrary times so it's unsafe to restore them
-		     during the epilogue.  */
-		  if (!fixed_regs[i]
-		      && (offsets->saved_regs_mask & (1 << i)) == 0)
-		    {
-		      reg = i;
-		      break;
-		    }
-		}
-	    }
-
-	  if (reg != -1)
-	    {
-	      offsets->saved_regs += 4;
-	      offsets->saved_regs_mask |= (1 << reg);
-	    }
-	}
+         the normal stack adjustment.  We also cannot do this if we need
+         to maintain the alignment of the saved FP registers.  */
+      if (! (frame_size + crtl->outgoing_args_size == 0
+	     && !need_fp_align
+	     && add_dummy_register_save_p (offsets)))
+	/* We must accomplish the desired alignment explicitly.  */
+	offsets->soft_frame += 4;
     }
 
   offsets->locals_base = offsets->soft_frame + frame_size;
@@ -21378,6 +21419,17 @@ arm_expand_prologue (void)
         }
     }
 
+  /* Add explicit padding for alignment of saved FP regs, if necessary.  */
+  if (offsets->fp_regs_padding)
+    {
+      gcc_assert (offsets->fp_regs_padding == 4);
+      amount = GEN_INT (-offsets->fp_regs_padding);
+      insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
+				    stack_pointer_rtx, amount));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      saved_regs += offsets->fp_regs_padding;
+    }
+
   if (! IS_VOLATILE (func_type))
     saved_regs += arm_save_coproc_regs ();
 
@@ -27922,6 +27974,16 @@ arm_expand_epilogue (bool really_return)
 				       stack_pointer_rtx, stack_pointer_rtx);
         }
 
+  /* Adjust for any extra padding added to align saved FP registers.  */
+  if (offsets->fp_regs_padding)
+    {
+      rtx tmp;
+      gcc_assert (offsets->fp_regs_padding == 4);
+      tmp = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
+			           GEN_INT (offsets->fp_regs_padding)));
+      RTX_FRAME_RELATED_P (tmp) = 1;
+    }
+
   if (saved_regs_mask)
     {
       rtx insn;

Reply via email to