https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117081
--- Comment #19 from Hongtao Liu <liuhongt at gcc dot gnu.org> --- (In reply to H.J. Lu from comment #18) > (In reply to Haochen Jiang from comment #17) > > > > For reproduce, not only on ADL, the fix patch showed regression on all > > Cascade Lake/Ice Lake/Sapphire Rapids with ~2-4% for 511.povray_r with > > o2_generic_v3. > > Can you extract some testcases to show more PUSH and POP? The original case was a bit more complicated, so I tried to mimic it by writing a similar one: extern int bar (double* a, double* b, double* c, double* d, double* e); extern bool foo2 (double* a, double b); int foo (double* a, double* b, double *c) { int rr = 0; double d1; double d2; if (bar (a, b, c, &d1, &d2)) --- mostly false; { if (d1 > 0.0 && d1 < 100.0) { c[0] = a[0] + d1 * b[0]; c[1] = a[1] + d1 * b[1]; c[2] = a[2] + d1 * b[2]; if (foo2 (c, d1)) rr = 1; } if (d2 > 0.0 && d2 < 100.0) { c[0] = a[0] + d2 * b[0]; c[1] = a[1] + d2 * b[1]; c[2] = a[2] + d2 * b[2]; if (foo2 (c, d2)) rr = 1; } } return rr; } Before r15-7400: foo: .LFB0: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rdi, %rbp pushq %rbx .cfi_def_cfa_offset 24 .cfi_offset 3, -24 movq %rdx, %rbx subq $40, %rsp .cfi_def_cfa_offset 64 leaq 16(%rsp), %rcx leaq 24(%rsp), %r8 movq %rsi, 8(%rsp) call bar movl %eax, %edx testl %eax, %eax je .L1 vmovsd 16(%rsp), %xmm0 vxorpd %xmm1, %xmm1, %xmm1 movq 8(%rsp), %rsi vcomisd %xmm1, %xmm0 jbe .L18 vmovsd .LC1(%rip), %xmm1 vcomisd %xmm0, %xmm1 ja .L21 .L18: xorl %edx, %edx .L3: vmovsd 24(%rsp), %xmm0 vxorpd %xmm1, %xmm1, %xmm1 vcomisd %xmm1, %xmm0 jbe .L1 vmovsd .LC1(%rip), %xmm1 vcomisd %xmm0, %xmm1 ja .L22 .L1: addq $40, %rsp .cfi_remember_state .cfi_def_cfa_offset 24 movl %edx, %eax popq %rbx .cfi_def_cfa_offset 16 popq %rbp .cfi_def_cfa_offset 8 ret .p2align 4,,10 .p2align 3 After r15-7400: foo: .LFB0: .cfi_startproc pushq %r13 .cfi_def_cfa_offset 16 .cfi_offset 13, -16 movq %rsi, %r13 pushq %r12 .cfi_def_cfa_offset 24 .cfi_offset 12, -24 movq %rdi, %r12 pushq 
%rbp .cfi_def_cfa_offset 32 .cfi_offset 6, -32 movq %rdx, %rbp pushq %rbx .cfi_def_cfa_offset 40 .cfi_offset 3, -40 subq $24, %rsp .cfi_def_cfa_offset 64 movq %rsp, %rcx leaq 8(%rsp), %r8 call bar movl %eax, %ebx testl %eax, %eax je .L1 vmovsd (%rsp), %xmm0 vxorpd %xmm1, %xmm1, %xmm1 vcomisd %xmm1, %xmm0 jbe .L18 vmovsd .LC1(%rip), %xmm1 vcomisd %xmm0, %xmm1 ja .L21 .L18: xorl %ebx, %ebx .L3: vmovsd 8(%rsp), %xmm0 vxorpd %xmm1, %xmm1, %xmm1 vcomisd %xmm1, %xmm0 jbe .L1 vmovsd .LC1(%rip), %xmm1 vcomisd %xmm0, %xmm1 ja .L22 .L1: addq $24, %rsp .cfi_remember_state .cfi_def_cfa_offset 40 movl %ebx, %eax popq %rbx .cfi_def_cfa_offset 32 popq %rbp .cfi_def_cfa_offset 24 popq %r12 .cfi_def_cfa_offset 16 popq %r13 .cfi_def_cfa_offset 8 ret With more usage of callee-saved registers, the callee needs to save them on entry and restore them before exit (the extra PUSH/POP pairs for %r12 and %r13 in this listing), which is not needed if more caller-saved registers are used.