On ARM targets, the stack is aligned to an 8-byte boundary, but when
saving/restoring the VFP coprocessor registers in the function
prologue/epilogue, it is possible for the 8-byte values to end up at
locations that are 4-byte aligned but not 8-byte aligned. This can
result in a performance penalty on micro-architectures that are
optimized for well-aligned data, especially when such a misalignment may
result in cache line splits within a single access. This patch detects
when at least one coprocessor register value needs to be saved and adds
some additional padding to the stack at that point if necessary to align
it to an 8-byte boundary. I've re-used the existing logic to try
pushing a 4-byte scratch register and only fall back to an explicit
stack adjustment if that fails.
NVIDIA found that an earlier version of this patch (benchmarked with
SPECint2k and SPECfp2k on an older version of GCC) gave measurable
improvements on their Tegra K1 64-bit processor, aka "Denver". We
aren't sure what other ARM processors might benefit from the extra
alignment, so we've given it its own command-line option instead of
tying it to -mtune.
I did some hand-testing of this patch on small test cases to verify that
the expected alignment was happening, but it seemed to me that the
expected assembly-language patterns were likely too fragile to be
hard-wired into a test case. I also ran regression tests both with and
without the switch set so it doesn't break other things. OK to commit?
-Sandra
2014-11-14 Sandra Loosemore <san...@codesourcery.com>
Joshua Conner <jcon...@nvidia.com>
Chris Jones <chr...@nvidia.com>
gcc/
* doc/invoke.texi (Option Summary): Add -malign-saved-fp-regs.
(ARM options): Document it.
* config/arm/arm.h (arm_stack_offsets): Add fp_regs_padding field.
* config/arm/arm.opt (malign-saved-fp-regs): New option.
* config/arm/arm.c (add_dummy_register_save_p): New function,
split from...
(arm_get_frame_offsets): Here. Use this logic also for aligning
saved VFP coprocessor registers if possible.
(arm_expand_prologue): Add explicit padding for saved VFP registers.
(arm_expand_epilogue): Undo explicit padding for saved VFP registers.
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi (revision 217322)
+++ gcc/doc/invoke.texi (working copy)
@@ -533,6 +533,7 @@ Objective-C and Objective-C++ Dialects}.
-mcpu=@var{name} -march=@var{name} -mfpu=@var{name} @gol
-mstructure-size-boundary=@var{n} @gol
-mabort-on-noreturn @gol
+-malign-saved-fp-regs @gol
-mlong-calls -mno-long-calls @gol
-msingle-pic-base -mno-single-pic-base @gol
-mpic-register=@var{reg} @gol
@@ -12861,6 +12862,13 @@ Generate a call to the function @code{ab
@code{noreturn} function. It is executed if the function tries to
return.
+@item -malign-saved-fp-regs
+@opindex malign-saved-fp-regs
+Add extra padding to align saved VFP coprocessor register values
+on a 64-bit boundary in function prologues.
+This can provide better performance on targets optimized for
+aligned memory accesses.
+
@item -mlong-calls
@itemx -mno-long-calls
@opindex mlong-calls
Index: gcc/config/arm/arm.h
===================================================================
--- gcc/config/arm/arm.h (revision 217322)
+++ gcc/config/arm/arm.h (working copy)
@@ -1509,6 +1509,9 @@ typedef struct GTY(()) arm_stack_offsets
int locals_base; /* THUMB_HARD_FRAME_POINTER_REGNUM. */
int outgoing_args; /* STACK_POINTER_REGNUM. */
unsigned int saved_regs_mask;
+ /* Extra padding inserted before saved VFP regs to keep them
+ doubleword-aligned. See TARGET_ALIGN_SAVED_FP_REGS. */
+ int fp_regs_padding;
}
arm_stack_offsets;
Index: gcc/config/arm/arm.opt
===================================================================
--- gcc/config/arm/arm.opt (revision 217322)
+++ gcc/config/arm/arm.opt (working copy)
@@ -58,6 +58,10 @@ mabort-on-noreturn
Target Report Mask(ABORT_NORETURN)
Generate a call to abort if a noreturn function returns
+malign-saved-fp-regs
+Target Report Mask(ALIGN_SAVED_FP_REGS)
+Save floating-point registers on stack in naturally-aligned locations
+
mapcs
Target RejectNegative Mask(APCS_FRAME) Undocumented
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c (revision 217322)
+++ gcc/config/arm/arm.c (working copy)
@@ -20721,6 +20721,67 @@ any_sibcall_could_use_r3 (void)
return false;
}
+/* Attempt to add a dummy register push to OFFSETS for alignment purposes.
+ Returns true and adjusts OFFSETS if successful, otherwise returns
+ false if the alignment must be accomplished in some other way. */
+static bool
+add_dummy_register_save_p (struct arm_stack_offsets *offsets)
+{
+ int reg = -1;
+ int i;
+
+ /* Register r3 is caller-saved. Normally it does not need to be
+ saved on entry by the prologue. However if we choose to save
+ it for padding then we may confuse the compiler into thinking
+ a prologue sequence is required when in fact it is not. This
+ will occur when shrink-wrapping if r3 is used as a scratch
+ register and there are no other callee-saved writes.
+
+ This situation can be avoided when other callee-saved registers
+ are available and r3 is not mandatory if we choose a callee-saved
+ register for padding. */
+ bool prefer_callee_reg_p = false;
+
+ /* If it is safe to use r3, then do so. This sometimes
+ generates better code on Thumb-2 by avoiding the need to
+ use 32-bit push/pop instructions. */
+ if (! any_sibcall_could_use_r3 ()
+ && arm_size_return_regs () <= 12
+ && (offsets->saved_regs_mask & (1 << 3)) == 0
+ && (TARGET_THUMB2
+ || !(TARGET_LDRD && current_tune->prefer_ldrd_strd)))
+ {
+ reg = 3;
+ if (!TARGET_THUMB2)
+ prefer_callee_reg_p = true;
+ }
+ if (reg == -1
+ || prefer_callee_reg_p)
+ {
+ for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++)
+ {
+ /* Avoid fixed registers; they may be changed at
+ arbitrary times so it's unsafe to restore them
+ during the epilogue. */
+ if (!fixed_regs[i]
+ && (offsets->saved_regs_mask & (1 << i)) == 0)
+ {
+ reg = i;
+ break;
+ }
+ }
+ }
+
+ if (reg != -1)
+ {
+ offsets->saved_regs += 4;
+ offsets->saved_regs_mask |= (1 << reg);
+ offsets->soft_frame += 4;
+ return true;
+ }
+ return false;
+}
+
/* Compute the distance from register FROM to register TO.
These can be the arg pointer (26), the soft frame pointer (25),
@@ -20784,7 +20845,8 @@ arm_get_frame_offsets (void)
int saved;
int core_saved;
HOST_WIDE_INT frame_size;
- int i;
+ bool need_fp_align = false;
+ bool need_fp_padding = false;
offsets = &cfun->machine->stack_offsets;
@@ -20816,9 +20878,12 @@ arm_get_frame_offsets (void)
+ arm_compute_static_chain_stack_bytes ()
+ (frame_pointer_needed ? 4 : 0));
+ offsets->fp_regs_padding = 0;
+
if (TARGET_32BIT)
{
unsigned int regno;
+ int coproc_regs_saved = 0;
offsets->saved_regs_mask = arm_compute_save_reg_mask ();
core_saved = bit_count (offsets->saved_regs_mask) * 4;
@@ -20835,14 +20900,28 @@ arm_get_frame_offsets (void)
regno <= LAST_IWMMXT_REGNUM;
regno++)
if (df_regs_ever_live_p (regno) && ! call_used_regs[regno])
- saved += 8;
+ coproc_regs_saved += 8;
}
func_type = arm_current_func_type ();
/* Space for saved VFP registers. */
if (! IS_VOLATILE (func_type)
&& TARGET_HARD_FLOAT && TARGET_VFP)
- saved += arm_get_vfp_saved_size ();
+ coproc_regs_saved += arm_get_vfp_saved_size ();
+
+ /* Check if extra padding is required to align saved FP regs
+ on a doubleword boundary. */
+ if (TARGET_ALIGN_SAVED_FP_REGS && coproc_regs_saved)
+ {
+ need_fp_align = true;
+ if (saved % 8)
+ {
+ gcc_assert (! (saved % 4));
+ need_fp_padding = true;
+ }
+ }
+
+ saved += coproc_regs_saved;
}
else /* TARGET_THUMB1 */
{
@@ -20858,6 +20937,15 @@ arm_get_frame_offsets (void)
= offsets->saved_args + arm_compute_static_chain_stack_bytes () + saved;
offsets->soft_frame = offsets->saved_regs + CALLER_INTERWORKING_SLOT_SIZE;
+ /* If we need to align the saved FP registers, see if we can do that
+ by pushing an extra reg. Otherwise insert some padding explicitly. */
+ if (need_fp_padding && !add_dummy_register_save_p (offsets))
+ {
+ offsets->fp_regs_padding = 4;
+ offsets->saved_regs += 4;
+ offsets->soft_frame += 4;
+ }
+
/* A leaf function does not need any stack alignment if it has nothing
on the stack. */
if (leaf && frame_size == 0
@@ -20874,62 +20962,15 @@ arm_get_frame_offsets (void)
if (ARM_DOUBLEWORD_ALIGN
&& (offsets->soft_frame & 7))
{
- offsets->soft_frame += 4;
/* Try to align stack by pushing an extra reg. Don't bother doing this
when there is a stack frame as the alignment will be rolled into
- the normal stack adjustment. */
- if (frame_size + crtl->outgoing_args_size == 0)
- {
- int reg = -1;
-
- /* Register r3 is caller-saved. Normally it does not need to be
- saved on entry by the prologue. However if we choose to save
- it for padding then we may confuse the compiler into thinking
- a prologue sequence is required when in fact it is not. This
- will occur when shrink-wrapping if r3 is used as a scratch
- register and there are no other callee-saved writes.
-
- This situation can be avoided when other callee-saved registers
- are available and r3 is not mandatory if we choose a callee-saved
- register for padding. */
- bool prefer_callee_reg_p = false;
-
- /* If it is safe to use r3, then do so. This sometimes
- generates better code on Thumb-2 by avoiding the need to
- use 32-bit push/pop instructions. */
- if (! any_sibcall_could_use_r3 ()
- && arm_size_return_regs () <= 12
- && (offsets->saved_regs_mask & (1 << 3)) == 0
- && (TARGET_THUMB2
- || !(TARGET_LDRD && current_tune->prefer_ldrd_strd)))
- {
- reg = 3;
- if (!TARGET_THUMB2)
- prefer_callee_reg_p = true;
- }
- if (reg == -1
- || prefer_callee_reg_p)
- {
- for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++)
- {
- /* Avoid fixed registers; they may be changed at
- arbitrary times so it's unsafe to restore them
- during the epilogue. */
- if (!fixed_regs[i]
- && (offsets->saved_regs_mask & (1 << i)) == 0)
- {
- reg = i;
- break;
- }
- }
- }
-
- if (reg != -1)
- {
- offsets->saved_regs += 4;
- offsets->saved_regs_mask |= (1 << reg);
- }
- }
+ the normal stack adjustment. We also cannot do this if we need
+ to maintain the alignment of the saved FP registers. */
+ if (! (frame_size + crtl->outgoing_args_size == 0
+ && !need_fp_align
+ && add_dummy_register_save_p (offsets)))
+ /* We must accomplish the desired alignment explicitly. */
+ offsets->soft_frame += 4;
}
offsets->locals_base = offsets->soft_frame + frame_size;
@@ -21378,6 +21419,17 @@ arm_expand_prologue (void)
}
}
+ /* Add explicit padding for alignment of saved FP regs, if necessary. */
+ if (offsets->fp_regs_padding)
+ {
+ gcc_assert (offsets->fp_regs_padding == 4);
+ amount = GEN_INT (-offsets->fp_regs_padding);
+ insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
+ stack_pointer_rtx, amount));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ saved_regs += offsets->fp_regs_padding;
+ }
+
if (! IS_VOLATILE (func_type))
saved_regs += arm_save_coproc_regs ();
@@ -27922,6 +27974,16 @@ arm_expand_epilogue (bool really_return)
stack_pointer_rtx, stack_pointer_rtx);
}
+ /* Adjust for any extra padding added to align saved FP registers. */
+ if (offsets->fp_regs_padding)
+ {
+ rtx tmp;
+ gcc_assert (offsets->fp_regs_padding == 4);
+ tmp = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (offsets->fp_regs_padding)));
+ RTX_FRAME_RELATED_P (tmp) = 1;
+ }
+
if (saved_regs_mask)
{
rtx insn;