https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66010

            Bug ID: 66010
           Summary: Missed optimization after inlining va_list parameter
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vries at gcc dot gnu.org
  Target Milestone: ---

Consider this test-case (based on gcc.dg/tree-ssa/stdarg-2.c, f15):
...
#include <stdarg.h>

/* Baseline case: fetch the first variadic argument as an int with va_arg
   applied directly to the local va_list inside the variadic function
   itself, and return it.  */
int
f1 (int i, ...)
{
  int res;
  va_list ap;

  va_start (ap, i);
  res = va_arg (ap, int);
  va_end (ap);

  return res;
}

/* Helper that fetches the next int argument from AP and returns it.
   It is marked always_inline, so the va_arg is expected to be inlined
   into the caller rather than executed through a real call.  */
inline int __attribute__((always_inline))
f2_1 (va_list ap)
{
  return va_arg (ap, int);
}

/* Variant of f1: returns the first variadic argument as an int, but the
   va_arg is performed by the always_inline helper f2_1, which receives
   the va_list as a parameter.  */
int
f2 (int i, ...)
{
  int res;
  va_list ap;

  va_start (ap, i);
  /* NOTE(review): handing ap to f2_1 is presumably what makes pass_stdarg
     report the va_list as escaping, even after inlining -- confirm.  */
  res = f2_1 (ap);
  va_end (ap);

  return res;
}
...

When compiling at -O2, the optimized dumps for f1 and f2 are very similar (shown here as a diff: '-' lines are f1, '+' lines are f2):
...
 ;;   basic block 2, loop depth 0, count 0, freq 10000, maybe hot
 ;;   Invalid sum of outgoing probabilities 0.0%
 ;;    prev block 0, next block 3, flags: (NEW, REACHABLE)
 ;;    pred:       ENTRY [100.0%]  (FALLTHRU,EXECUTABLE)
   # .MEM_2 = VDEF <.MEM_1(D)>
   # USE = nonlocal escaped 
-  # CLB = nonlocal escaped { D.1836 }
-  __builtin_va_startD.1030 (&apD.1836, 0);
+  # CLB = nonlocal escaped { D.1844 }
+  __builtin_va_startD.1030 (&apD.1844, 0);
   # VUSE <.MEM_2>
-  _9 = apD.1836.gp_offsetD.2;
-  if (_9 > 47)
+  _11 = MEM[(struct  *)&apD.1844].gp_offsetD.2;
+  if (_11 > 47)
     goto <bb 4>;
   else
     goto <bb 3>;
 ;;    succ:       4 (TRUE_VALUE,EXECUTABLE)
 ;;                3 (FALSE_VALUE,EXECUTABLE)

 ;;   basic block 3, loop depth 0, count 0, freq 0
 ;;   Invalid sum of outgoing probabilities 0.0%
 ;;    prev block 2, next block 4, flags: (NEW)
 ;;    pred:       2 (FALSE_VALUE,EXECUTABLE)
   # VUSE <.MEM_2>
   # PT = nonlocal 
-  _10 = apD.1836.reg_save_areaD.5;
+  _12 = MEM[(struct  *)&apD.1844].reg_save_areaD.5;
   # RANGE [0, 47] NONZERO 63
-  _12 = (sizetype) _9;
+  _14 = (sizetype) _11;
   # PT = nonlocal 
-  addr.4_13 = _10 + _12;
+  addr.8_15 = _12 + _14;
   # RANGE [8, 55] NONZERO 63
-  _15 = _9 + 8;
+  _17 = _11 + 8;
   goto <bb 5>;
 ;;    succ:       5 (FALLTHRU,EXECUTABLE)

 ;;   basic block 4, loop depth 0, count 0, freq 0
 ;;   Invalid sum of outgoing probabilities 0.0%
 ;;    prev block 3, next block 5, flags: (NEW)
 ;;    pred:       2 (TRUE_VALUE,EXECUTABLE)
   # VUSE <.MEM_2>
   # PT = nonlocal 
-  _17 = apD.1836.overflow_arg_areaD.4;
+  _19 = MEM[(struct  *)&apD.1844].overflow_arg_areaD.4;
   # PT = nonlocal 
-  _19 = _17 + 8;
+  _21 = _19 + 8;
 ;;    succ:       5 (FALLTHRU,EXECUTABLE)

 ;;   basic block 5, loop depth 0, count 0, freq 10000, maybe hot
 ;;   Invalid sum of incoming frequencies 0, should be 10000
 ;;    prev block 4, next block 1, flags: (NEW)
 ;;    pred:       3 (FALLTHRU,EXECUTABLE)
 ;;                4 (FALLTHRU,EXECUTABLE)
-  # .MEM_7 = PHI <.MEM_2(3), .MEM_2(4)>
+  # .MEM_8 = PHI <.MEM_2(3), .MEM_2(4)>
   # PT = nonlocal 
-  # addr.4_8 = PHI <addr.4_13(3), _17(4)>
-  # VUSE <.MEM_7>
-  res_4 = MEM[(intD.6 * {ref-all})addr.4_8];
+  # addr.8_9 = PHI <addr.8_15(3), _19(4)>
+  # VUSE <.MEM_8>
+  _6 = MEM[(intD.6 * {ref-all})addr.8_9];
   GIMPLE_NOP
-  # .MEM_6 = VDEF <.MEM_7>
-  apD.1836 ={v} {CLOBBER};
-  # VUSE <.MEM_6>
-  return res_4;
+  # .MEM_5 = VDEF <.MEM_8>
+  apD.1844 ={v} {CLOBBER};
+  # VUSE <.MEM_5>
+  return _6;
 ;;    succ:       EXIT [100.0%] 
...

However, at pass_stdarg, we see on one hand:
...
f1: va_list escapes 0, needs to save 8 GPR units and 0 FPR units.
...

but on the other hand:
...
f2: va_list escapes 1, needs to save all GPR units and all FPR units.
...

So while the assembly for f1 is short:
...
f1:
        .cfi_startproc
        leaq    8(%rsp), %rax
        movq    %rsi, -40(%rsp)
        movl    $8, -72(%rsp)
        movq    %rax, -64(%rsp)
        leaq    -48(%rsp), %rax
        movq    %rax, -56(%rsp)
        movl    -40(%rsp), %eax
        ret
        .cfi_endproc
...

for f2, we need to save a lot of registers onto the stack:
...
f2:
        .cfi_startproc
        subq    $96, %rsp
        .cfi_def_cfa_offset 104
        testb   %al, %al
        movq    %rsi, -80(%rsp)
        movq    %rdx, -72(%rsp)
        movq    %rcx, -64(%rsp)
        movq    %r8, -56(%rsp)
        movq    %r9, -48(%rsp)
        je      .L8
        movaps  %xmm0, -40(%rsp)
        movaps  %xmm1, -24(%rsp)
        movaps  %xmm2, -8(%rsp)
        movaps  %xmm3, 8(%rsp)
        movaps  %xmm4, 24(%rsp)
        movaps  %xmm5, 40(%rsp)
        movaps  %xmm6, 56(%rsp)
        movaps  %xmm7, 72(%rsp)
.L8:
        leaq    104(%rsp), %rax
        movq    %rax, -104(%rsp)
        leaq    -88(%rsp), %rax
        movl    $8, -112(%rsp)
        movl    $48, -108(%rsp)
        movq    %rax, -96(%rsp)
        movl    -80(%rsp), %eax
        addq    $96, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
...

Reply via email to