https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66010
Bug ID: 66010 Summary: Missed optimization after inlining va_list parameter Product: gcc Version: 6.0 Status: UNCONFIRMED Severity: minor Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: vries at gcc dot gnu.org Target Milestone: --- Consider this test-case (based on gcc.dg/tree-ssa/stdarg-2.c, f15): ... #include <stdarg.h> int f1 (int i, ...) { int res; va_list ap; va_start (ap, i); res = va_arg (ap, int); va_end (ap); return res; } inline int __attribute__((always_inline)) f2_1 (va_list ap) { return va_arg (ap, int); } int f2 (int i, ...) { int res; va_list ap; va_start (ap, i); res = f2_1 (ap); va_end (ap); return res; } ... When compiling at -O2, the optimized dumps for f1 and f2 are very similar: ... ;; basic block 2, loop depth 0, count 0, freq 10000, maybe hot ;; Invalid sum of outgoing probabilities 0.0% ;; prev block 0, next block 3, flags: (NEW, REACHABLE) ;; pred: ENTRY [100.0%] (FALLTHRU,EXECUTABLE) # .MEM_2 = VDEF <.MEM_1(D)> # USE = nonlocal escaped - # CLB = nonlocal escaped { D.1836 } - __builtin_va_startD.1030 (&apD.1836, 0); + # CLB = nonlocal escaped { D.1844 } + __builtin_va_startD.1030 (&apD.1844, 0); # VUSE <.MEM_2> - _9 = apD.1836.gp_offsetD.2; - if (_9 > 47) + _11 = MEM[(struct *)&apD.1844].gp_offsetD.2; + if (_11 > 47) goto <bb 4>; else goto <bb 3>; ;; succ: 4 (TRUE_VALUE,EXECUTABLE) ;; 3 (FALSE_VALUE,EXECUTABLE) ;; basic block 3, loop depth 0, count 0, freq 0 ;; Invalid sum of outgoing probabilities 0.0% ;; prev block 2, next block 4, flags: (NEW) ;; pred: 2 (FALSE_VALUE,EXECUTABLE) # VUSE <.MEM_2> # PT = nonlocal - _10 = apD.1836.reg_save_areaD.5; + _12 = MEM[(struct *)&apD.1844].reg_save_areaD.5; # RANGE [0, 47] NONZERO 63 - _12 = (sizetype) _9; + _14 = (sizetype) _11; # PT = nonlocal - addr.4_13 = _10 + _12; + addr.8_15 = _12 + _14; # RANGE [8, 55] NONZERO 63 - _15 = _9 + 8; + _17 = _11 + 8; goto <bb 5>; ;; succ: 5 (FALLTHRU,EXECUTABLE) ;; basic block 4, loop depth 0, count 0, freq 0 ;; Invalid 
sum of outgoing probabilities 0.0% ;; prev block 3, next block 5, flags: (NEW) ;; pred: 2 (TRUE_VALUE,EXECUTABLE) # VUSE <.MEM_2> # PT = nonlocal - _17 = apD.1836.overflow_arg_areaD.4; + _19 = MEM[(struct *)&apD.1844].overflow_arg_areaD.4; # PT = nonlocal - _19 = _17 + 8; + _21 = _19 + 8; ;; succ: 5 (FALLTHRU,EXECUTABLE) ;; basic block 5, loop depth 0, count 0, freq 10000, maybe hot ;; Invalid sum of incoming frequencies 0, should be 10000 ;; prev block 4, next block 1, flags: (NEW) ;; pred: 3 (FALLTHRU,EXECUTABLE) ;; 4 (FALLTHRU,EXECUTABLE) - # .MEM_7 = PHI <.MEM_2(3), .MEM_2(4)> + # .MEM_8 = PHI <.MEM_2(3), .MEM_2(4)> # PT = nonlocal - # addr.4_8 = PHI <addr.4_13(3), _17(4)> - # VUSE <.MEM_7> - res_4 = MEM[(intD.6 * {ref-all})addr.4_8]; + # addr.8_9 = PHI <addr.8_15(3), _19(4)> + # VUSE <.MEM_8> + _6 = MEM[(intD.6 * {ref-all})addr.8_9]; GIMPLE_NOP - # .MEM_6 = VDEF <.MEM_7> - apD.1836 ={v} {CLOBBER}; - # VUSE <.MEM_6> - return res_4; + # .MEM_5 = VDEF <.MEM_8> + apD.1844 ={v} {CLOBBER}; + # VUSE <.MEM_5> + return _6; ;; succ: EXIT [100.0%] ... However, at pass_stdarg, we see on the one hand: ... f1: va_list escapes 0, needs to save 8 GPR units and 0 FPR units. ... but on the other hand: ... f2: va_list escapes 1, needs to save all GPR units and all FPR units. ... So while the assembly for f1 is short: ... f1: .cfi_startproc leaq 8(%rsp), %rax movq %rsi, -40(%rsp) movl $8, -72(%rsp) movq %rax, -64(%rsp) leaq -48(%rsp), %rax movq %rax, -56(%rsp) movl -40(%rsp), %eax ret .cfi_endproc ... for f2, we need to save a lot of registers onto the stack: ... 
f2: .cfi_startproc subq $96, %rsp .cfi_def_cfa_offset 104 testb %al, %al movq %rsi, -80(%rsp) movq %rdx, -72(%rsp) movq %rcx, -64(%rsp) movq %r8, -56(%rsp) movq %r9, -48(%rsp) je .L8 movaps %xmm0, -40(%rsp) movaps %xmm1, -24(%rsp) movaps %xmm2, -8(%rsp) movaps %xmm3, 8(%rsp) movaps %xmm4, 24(%rsp) movaps %xmm5, 40(%rsp) movaps %xmm6, 56(%rsp) movaps %xmm7, 72(%rsp) .L8: leaq 104(%rsp), %rax movq %rax, -104(%rsp) leaq -88(%rsp), %rax movl $8, -112(%rsp) movl $48, -108(%rsp) movq %rax, -96(%rsp) movl -80(%rsp), %eax addq $96, %rsp .cfi_def_cfa_offset 8 ret .cfi_endproc ...