Consider this test case:

extern void bar1 (void);
extern void bar2 (void);

int a;
int b;

int foo (void)
{
  if (a < 0)
    {
      bar1 ();
      if (b < 0)
        b = 0;
    }
  else
    {
      bar2 ();
      if (b < 0)
        b = 0;
    }
}


Compiling x86_64-unknown-linux X arm-elf "xgcc (GCC) 4.5.0 20090606
(experimental) [trunk revision 148236]", with options -march=armv7-r -Os -dAP"
without patches results in this code:


        .file   "t.c"
        .text
        .align  2
        .global foo
        .type   foo, %function
foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ basic block 2
        ldr     r3, .L6 @ 67    *arm_movsi_insn/5       [length = 4]
        stmfd   sp!, {r4, lr}   @ 69    *push_multi     [length = 4]
        ldr     r3, [r3, #0]    @ 6     *arm_movsi_insn/5       [length = 4]
        ldr     r4, .L6+4       @ 62    *arm_movsi_insn/5       [length = 4]
        cmp     r3, #0  @ 7     *arm_cmpsi_insn/1       [length = 4]
        bge     .L2     @ 8     *arm_cond_branch        [length = 4]
        @ basic block 3
        bl      bar1    @ 10    *call_symbol    [length = 4]
        ldr     r3, [r4, #0]    @ 12    *arm_movsi_insn/5       [length = 4]
        cmp     r3, #0  @ 13    *arm_cmpsi_insn/1       [length = 4]
        movlt   r3, #0  @ 17    neon_vornv2di+78/2      [length = 4]
        strlt   r3, [r4, #0]    @ 18    neon_vornv2di+78/6      [length = 4]
        ldmfd   sp!, {r4, pc}   @ 71    return  [length = 12]
.L2:
        @ basic block 4
        bl      bar2    @ 23    *call_symbol    [length = 4]
        ldr     r3, [r4, #0]    @ 25    *arm_movsi_insn/5       [length = 4]
        cmp     r3, #0  @ 26    *arm_cmpsi_insn/1       [length = 4]
        ldmgefd sp!, {r4, pc}   @ 27    *cond_return    [length = 12]
        @ basic block 5
        mov     r3, #0  @ 30    *arm_movsi_insn/2       [length = 4]
        str     r3, [r4, #0]    @ 31    *arm_movsi_insn/6       [length = 4]
        ldmfd   sp!, {r4, pc}   @ 73    return  [length = 12]
.L7:
        .align  2
.L6:
        .word   a
        .word   b
        .size   foo, .-foo
        .comm   a,4,4
        .comm   b,4,4
        .ident  "GCC: (GNU) 4.5.0 20090606 (experimental) [trunk revision
148236]"



Note the conditional return at the end of basic block 3.  It voids a
crossjumping opportunity and results in bigger code compared with a direct
branch to a shared return instruction.  This comes from threading the return
instruction in prologue/epilogue threading.  With a patch to disable this
transformation when optimizing for size, I get this code:

        .file   "t.c"
        .text
        .align  2
        .global foo
        .type   foo, %function
foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ basic block 2
        ldr     r3, .L9 @ 67    *arm_movsi_insn/5       [length = 4]
        stmfd   sp!, {r4, lr}   @ 69    *push_multi     [length = 4]
        ldr     r3, [r3, #0]    @ 6     *arm_movsi_insn/5       [length = 4]
        ldr     r4, .L9+4       @ 62    *arm_movsi_insn/5       [length = 4]
        cmp     r3, #0  @ 7     *arm_cmpsi_insn/1       [length = 4]
        bge     .L2     @ 8     *arm_cond_branch        [length = 4]
        @ basic block 3
        bl      bar1    @ 10    *call_symbol    [length = 4]
        b       .L7     @ 87    *arm_jump       [length = 4]
.L2:
        @ basic block 4
        bl      bar2    @ 23    *call_symbol    [length = 4]
.L7:
        @ basic block 5
        ldr     r3, [r4, #0]    @ 25    *arm_movsi_insn/5       [length = 4]
        cmp     r3, #0  @ 26    *arm_cmpsi_insn/1       [length = 4]
        movlt   r3, #0  @ 30    neon_vornv2di+78/2      [length = 4]
        strlt   r3, [r4, #0]    @ 31    neon_vornv2di+78/6      [length = 4]
        ldmfd   sp!, {r4, pc}   @ 72    return  [length = 12]
.L10:
        .align  2
.L9:
        .word   a
        .word   b
        .size   foo, .-foo
        .comm   a,4,4
        .comm   b,4,4
        .ident  "GCC: (GNU) 4.5.0 20090606 (experimental) [trunk revision
148236]"


Basic block 3 now ends in a direct jump and the similar code is cross-jumped.  

The size of the unpatched code (sum of the reported insn lengths) is 100 for
the unpatched code, and 64 bytes for the patched code.

This is the patch:

Index: ../../trunk/gcc/function.c
===================================================================
--- ../../trunk/gcc/function.c  (revision 148236)
+++ ../../trunk/gcc/function.c  (working copy)
@@ -5022,13 +5022,18 @@

   rtl_profile_for_bb (EXIT_BLOCK_PTR);
 #ifdef HAVE_return
-  if (optimize && HAVE_return)
+  if (HAVE_return && optimize_function_for_speed_p (cfun))
     {
       /* If we're allowed to generate a simple return instruction,
         then by definition we don't need a full epilogue.  Examine
         the block that falls through to EXIT.   If it does not
-        contain any code, examine its predecessors and try to
-        emit (conditional) return instructions.  */
+        contain any code, and we are optimizing for speed, then
+        examine its predecessors and try to emit (conditional)
+        return instructions.  If optimizing for size, conditional
+        returns are usually not a win, because they can interfere
+        with crossjumping opportunities, and because there may be
+        registers to restore from the stack in the return insns
+        itself (such as the ARM return instruction).  */

       basic_block last;
       rtx label;


-- 
           Summary: Conditional return not always profitable with -Os
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: steven at gcc dot gnu dot org
GCC target triplet: arm-elf
OtherBugsDependingO 16996
             nThis:


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40361

Reply via email to