https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66010
Bug ID: 66010
Summary: Missed optimization after inlining va_list parameter
Product: gcc
Version: 6.0
Status: UNCONFIRMED
Severity: minor
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vries at gcc dot gnu.org
Target Milestone: ---
Consider this test-case (based on gcc.dg/tree-ssa/stdarg-2.c, f15):
...
#include <stdarg.h>
/* Baseline case: fetch the first variadic argument after `i` as an int,
   calling va_arg directly inside the variadic function itself.
   Returns that argument.  */
int
f1 (int i, ...)
{
int res;
va_list ap;
va_start (ap, i);
/* The va_list is only ever used locally in this function.  */
res = va_arg (ap, int);
va_end (ap);
return res;
}
/* Helper that receives the va_list as a parameter and reads one int from it.
   Marked always_inline so the va_arg ends up in the caller's body.  */
inline int __attribute__((always_inline))
f2_1 (va_list ap)
{
return va_arg (ap, int);
}
/* Same as f1, except that the va_arg is performed through the always_inline
   helper f2_1, which takes the va_list as an argument.
   Returns the first variadic argument after `i`, read as an int.  */
int
f2 (int i, ...)
{
int res;
va_list ap;
va_start (ap, i);
/* The va_list is passed to the helper here rather than used directly.  */
res = f2_1 (ap);
va_end (ap);
return res;
}
...
When compiling at -O2, the optimized dumps for f1 and f2 are very similar (shown here as a diff: '-' lines are from f1, '+' lines are from f2):
...
;; basic block 2, loop depth 0, count 0, freq 10000, maybe hot
;; Invalid sum of outgoing probabilities 0.0%
;; prev block 0, next block 3, flags: (NEW, REACHABLE)
;; pred: ENTRY [100.0%] (FALLTHRU,EXECUTABLE)
# .MEM_2 = VDEF <.MEM_1(D)>
# USE = nonlocal escaped
- # CLB = nonlocal escaped { D.1836 }
- __builtin_va_startD.1030 (&apD.1836, 0);
+ # CLB = nonlocal escaped { D.1844 }
+ __builtin_va_startD.1030 (&apD.1844, 0);
# VUSE <.MEM_2>
- _9 = apD.1836.gp_offsetD.2;
- if (_9 > 47)
+ _11 = MEM[(struct *)&apD.1844].gp_offsetD.2;
+ if (_11 > 47)
goto <bb 4>;
else
goto <bb 3>;
;; succ: 4 (TRUE_VALUE,EXECUTABLE)
;; 3 (FALSE_VALUE,EXECUTABLE)
;; basic block 3, loop depth 0, count 0, freq 0
;; Invalid sum of outgoing probabilities 0.0%
;; prev block 2, next block 4, flags: (NEW)
;; pred: 2 (FALSE_VALUE,EXECUTABLE)
# VUSE <.MEM_2>
# PT = nonlocal
- _10 = apD.1836.reg_save_areaD.5;
+ _12 = MEM[(struct *)&apD.1844].reg_save_areaD.5;
# RANGE [0, 47] NONZERO 63
- _12 = (sizetype) _9;
+ _14 = (sizetype) _11;
# PT = nonlocal
- addr.4_13 = _10 + _12;
+ addr.8_15 = _12 + _14;
# RANGE [8, 55] NONZERO 63
- _15 = _9 + 8;
+ _17 = _11 + 8;
goto <bb 5>;
;; succ: 5 (FALLTHRU,EXECUTABLE)
;; basic block 4, loop depth 0, count 0, freq 0
;; Invalid sum of outgoing probabilities 0.0%
;; prev block 3, next block 5, flags: (NEW)
;; pred: 2 (TRUE_VALUE,EXECUTABLE)
# VUSE <.MEM_2>
# PT = nonlocal
- _17 = apD.1836.overflow_arg_areaD.4;
+ _19 = MEM[(struct *)&apD.1844].overflow_arg_areaD.4;
# PT = nonlocal
- _19 = _17 + 8;
+ _21 = _19 + 8;
;; succ: 5 (FALLTHRU,EXECUTABLE)
;; basic block 5, loop depth 0, count 0, freq 10000, maybe hot
;; Invalid sum of incoming frequencies 0, should be 10000
;; prev block 4, next block 1, flags: (NEW)
;; pred: 3 (FALLTHRU,EXECUTABLE)
;; 4 (FALLTHRU,EXECUTABLE)
- # .MEM_7 = PHI <.MEM_2(3), .MEM_2(4)>
+ # .MEM_8 = PHI <.MEM_2(3), .MEM_2(4)>
# PT = nonlocal
- # addr.4_8 = PHI <addr.4_13(3), _17(4)>
- # VUSE <.MEM_7>
- res_4 = MEM[(intD.6 * {ref-all})addr.4_8];
+ # addr.8_9 = PHI <addr.8_15(3), _19(4)>
+ # VUSE <.MEM_8>
+ _6 = MEM[(intD.6 * {ref-all})addr.8_9];
GIMPLE_NOP
- # .MEM_6 = VDEF <.MEM_7>
- apD.1836 ={v} {CLOBBER};
- # VUSE <.MEM_6>
- return res_4;
+ # .MEM_5 = VDEF <.MEM_8>
+ apD.1844 ={v} {CLOBBER};
+ # VUSE <.MEM_5>
+ return _6;
;; succ: EXIT [100.0%]
...
However, at pass_stdarg, we see on one hand:
...
f1: va_list escapes 0, needs to save 8 GPR units and 0 FPR units.
...
but on the other hand:
...
f2: va_list escapes 1, needs to save all GPR units and all FPR units.
...
So while the assembly for f1 is short:
...
f1:
.cfi_startproc
leaq 8(%rsp), %rax
movq %rsi, -40(%rsp)
movl $8, -72(%rsp)
movq %rax, -64(%rsp)
leaq -48(%rsp), %rax
movq %rax, -56(%rsp)
movl -40(%rsp), %eax
ret
.cfi_endproc
...
for f2, we need to save a lot of registers onto the stack:
...
f2:
.cfi_startproc
subq $96, %rsp
.cfi_def_cfa_offset 104
testb %al, %al
movq %rsi, -80(%rsp)
movq %rdx, -72(%rsp)
movq %rcx, -64(%rsp)
movq %r8, -56(%rsp)
movq %r9, -48(%rsp)
je .L8
movaps %xmm0, -40(%rsp)
movaps %xmm1, -24(%rsp)
movaps %xmm2, -8(%rsp)
movaps %xmm3, 8(%rsp)
movaps %xmm4, 24(%rsp)
movaps %xmm5, 40(%rsp)
movaps %xmm6, 56(%rsp)
movaps %xmm7, 72(%rsp)
.L8:
leaq 104(%rsp), %rax
movq %rax, -104(%rsp)
leaq -88(%rsp), %rax
movl $8, -112(%rsp)
movl $48, -108(%rsp)
movq %rax, -96(%rsp)
movl -80(%rsp), %eax
addq $96, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
...