On Mon, Jan 22, 2024 at 8:58 AM Jan Hubicka <hubi...@ucw.cz> wrote: > > > I compared GCC master branch bootstrap and test times on a slow machine > > with 6.6 Linux kernels compiled with the original GCC 13 and the GCC 13 > > with the backported patch. The performance data isn't precise since the > > measurements were done on different days with different GCC sources under > > different 6.6 kernel versions. > > > > GCC master branch build time in seconds: > > > > before after improvement > > 30043.75user 30013.16user 0% > > 1274.85system 1243.72system 2.4% > > > > GCC master branch test time in seconds (new tests added): > > > > before after improvement > > 216035.90user 216547.51user 0 > > 27365.51system 26658.54system 2.6% > > It is interesting - the system time difference comes from smaller > binary? Is the difference any significant?
I think it comes from the following: in Linux kernel 6.7.0 on x86-64, do_exit is changed from do_exit: endbr64 call <do_exit+0x9> push %r15 push %r14 push %r13 push %r12 mov %rdi,%r12 push %rbp push %rbx mov %gs:0x0,%rbx sub $0x28,%rsp mov %gs:0x28,%rax mov %rax,0x20(%rsp) xor %eax,%eax call *0x0(%rip) # <do_exit+0x39> test $0x2,%ah je <do_exit+0x8d3> to do_exit: endbr64 call <do_exit+0x9> sub $0x28,%rsp mov %rdi,%r12 mov %gs:0x28,%rax mov %rax,0x20(%rsp) xor %eax,%eax mov %gs:0x0,%rbx call *0x0(%rip) # <do_exit+0x2f> test $0x2,%ah je <do_exit+0x8c9> do_exit is called by every process when it exits. > > > > gcc/ > > > > PR target/38534 > > * config/i386/i386-options.cc (ix86_set_func_type): Don't > > save and restore callee saved registers for a noreturn function > > with nothrow or compiled with -fno-exceptions. > > In general this looks like good thing to do. I wonder if that is not > something middle-end should understand for all targets. > Also I wonder about asynchronous stack unwinding. If we want to unwind > stack from interrupt then we may need some registers to be saved (like > base pointer). It is compatible with -fasynchronous-unwind-tables. From glibc test debug/tst-longjmp_chk: Starting program: /export/build/gnu/tools-build/glibc-cet/build-x86_64-linux/debug/tst-longjmp_chk --direct warning: Unable to find libthread_db matching inferior's thread library, thread debugging will not be available. Program received signal SIGABRT, Aborted. __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44 44 return INTERNAL_SYSCALL_ERROR_P (ret) ? 
INTERNAL_SYSCALL_ERRNO (ret) : 0; (gdb) bt #0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44 #1 0x0000555555294a4b in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78 #2 0x000055555523da1a in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 #3 0x00005555552248b3 in __GI_abort () at abort.c:79 #4 0x0000555555225a7e in __libc_message_impl ( fmt=fmt@entry=0x5555553b7171 "*** %s ***: terminated\n") at ../sysdeps/posix/libc_fatal.c:132 #5 0x0000555555324517 in __GI___fortify_fail (msg=<optimized out>) at fortify_fail.c:24 #6 0x0000555555323411 in ____longjmp_chk () at ../sysdeps/x86_64/__longjmp.S:57 #7 0x0000555555324d6d in __GI___longjmp_chk ( env=env@entry=0x55555555a200 <b>, val=val@entry=1) at ../setjmp/longjmp.c:41 #8 0x0000555555556a00 in do_test () at tst-longjmp_chk.c:70 #9 0x0000555555557388 in support_test_main (argc=1431675392, argv=0x7fffffffdd30, config=0x1, config@entry=0x7fffffffdbe0) at support_test_main.c:413 #10 0x000055555555673f in main (argc=<optimized out>, argv=<optimized out>) at ../support/test-driver.c:170 (gdb) abort is a noreturn function: extern void abort (void) __THROW __attribute__ ((__noreturn__)); Callee-saved registers aren't saved: Dump of assembler code for function __GI_abort: 0x00005555552247de <+0>: endbr64 0x00005555552247e2 <+4>: sub $0xa8,%rsp 0x00005555552247e9 <+11>: lea 0x1d1540(%rip),%rbx # 0x5555553f5d30 <lock> 0x00005555552247f0 <+18>: mov %fs:0x28,%rax 0x00005555552247f9 <+27>: mov %rax,0x98(%rsp) 0x0000555555224801 <+35>: xor %eax,%eax 0x0000555555224803 <+37>: mov %fs:0x10,%rbp 0x000055555522480c <+46>: cmp %rbp,0x1d1525(%rip) # 0x5555553f5d38 <lock+8> 0x0000555555224813 <+53>: je 0x555555224833 <__GI_abort+85> 0x0000555555224815 <+55>: mov $0x1,%edx 0x000055555522481a <+60>: lock cmpxchg %edx,0x1d150e(%rip) # 0x5555553f5d30 <lock> 0x0000555555224822 <+68>: je 0x55555522482c <__GI_abort+78> 
0x0000555555224824 <+70>: mov %rbx,%rdi > Honza > > > > gcc/testsuite/ > > > > PR target/38534 > > * gcc.target/i386/pr38534-1.c: New file. > > * gcc.target/i386/pr38534-2.c: Likewise. > > * gcc.target/i386/pr38534-3.c: Likewise. > > * gcc.target/i386/pr38534-4.c: Likewise. > > * gcc.target/i386/stack-check-17.c: Updated. > > --- > > gcc/config/i386/i386-options.cc | 16 ++++++++++-- > > gcc/testsuite/gcc.target/i386/pr38534-1.c | 26 +++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr38534-2.c | 18 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr38534-3.c | 19 ++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr38534-4.c | 18 +++++++++++++ > > .../gcc.target/i386/stack-check-17.c | 19 +++++--------- > > 6 files changed, 102 insertions(+), 14 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-4.c > > > > diff --git a/gcc/config/i386/i386-options.cc > > b/gcc/config/i386/i386-options.cc > > index 0cdea30599e..f965568947c 100644 > > --- a/gcc/config/i386/i386-options.cc > > +++ b/gcc/config/i386/i386-options.cc > > @@ -3371,9 +3371,21 @@ ix86_simd_clone_adjust (struct cgraph_node *node) > > static void > > ix86_set_func_type (tree fndecl) > > { > > + /* No need to save and restore callee-saved registers for a noreturn > > + function with nothrow or compiled with -fno-exceptions. > > + > > + NB: Don't use TREE_THIS_VOLATILE to check if this is a noreturn > > + function. The local-pure-const pass turns an interrupt function > > + into a noreturn function by setting TREE_THIS_VOLATILE. Normally > > + the local-pure-const pass is run after ix86_set_func_type is called. > > + When the local-pure-const pass is enabled for LTO, the interrupt > > + function is marked as noreturn in the IR output, which leads the > > + incompatible attribute error in LTO1. 
*/ > > bool has_no_callee_saved_registers > > - = lookup_attribute ("no_callee_saved_registers", > > - TYPE_ATTRIBUTES (TREE_TYPE (fndecl))); > > + = (((TREE_NOTHROW (fndecl) || !flag_exceptions) > > + && lookup_attribute ("noreturn", DECL_ATTRIBUTES (fndecl))) > > + || lookup_attribute ("no_callee_saved_registers", > > + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))); > > > > if (cfun->machine->func_type == TYPE_UNKNOWN) > > { > > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-1.c > > b/gcc/testsuite/gcc.target/i386/pr38534-1.c > > new file mode 100644 > > index 00000000000..9297959e759 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr38534-1.c > > @@ -0,0 +1,26 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 > > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */ > > + > > +#define ARRAY_SIZE 256 > > + > > +extern int array[ARRAY_SIZE][ARRAY_SIZE][ARRAY_SIZE]; > > +extern int value (int, int, int) > > +#ifndef __x86_64__ > > +__attribute__ ((regparm(3))) > > +#endif > > +; > > + > > +void > > +__attribute__((noreturn)) > > +no_return_to_caller (void) > > +{ > > + unsigned i, j, k; > > + for (i = ARRAY_SIZE; i > 0; --i) > > + for (j = ARRAY_SIZE; j > 0; --j) > > + for (k = ARRAY_SIZE; k > 0; --k) > > + array[i - 1][j - 1][k - 1] = value (i, j, k); > > + while (1); > > +} > > + > > +/* { dg-final { scan-assembler-not "push" } } */ > > +/* { dg-final { scan-assembler-not "pop" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-2.c > > b/gcc/testsuite/gcc.target/i386/pr38534-2.c > > new file mode 100644 > > index 00000000000..1fb01363273 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr38534-2.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 > > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */ > > + > > +extern void bar (void) __attribute__ ((no_callee_saved_registers)); > > +extern void fn (void) __attribute__ ((noreturn)); > > + > > +__attribute__ ((noreturn)) > > +void > > +foo 
(void) > > +{ > > + bar (); > > + fn (); > > +} > > + > > +/* { dg-final { scan-assembler-not "push" } } */ > > +/* { dg-final { scan-assembler-not "pop" } } */ > > +/* { dg-final { scan-assembler-not "jmp\[\\t \]+_?bar" } } */ > > +/* { dg-final { scan-assembler "call\[\\t \]+_?bar" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-3.c > > b/gcc/testsuite/gcc.target/i386/pr38534-3.c > > new file mode 100644 > > index 00000000000..87fc35f3fe9 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr38534-3.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 > > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */ > > + > > +typedef void (*fn_t) (void) __attribute__ ((no_callee_saved_registers)); > > +extern fn_t bar; > > +extern void fn (void) __attribute__ ((noreturn)); > > + > > +__attribute__ ((noreturn)) > > +void > > +foo (void) > > +{ > > + bar (); > > + fn (); > > +} > > + > > +/* { dg-final { scan-assembler-not "push" } } */ > > +/* { dg-final { scan-assembler-not "pop" } } */ > > +/* { dg-final { scan-assembler-not "jmp" } } */ > > +/* { dg-final { scan-assembler "call\[\\t \]+" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-4.c > > b/gcc/testsuite/gcc.target/i386/pr38534-4.c > > new file mode 100644 > > index 00000000000..561ebeef194 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr38534-4.c > > @@ -0,0 +1,18 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 > > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */ > > + > > +typedef void (*fn_t) (void) __attribute__ ((no_callee_saved_registers)); > > +extern void fn (void) __attribute__ ((noreturn)); > > + > > +__attribute__ ((noreturn)) > > +void > > +foo (fn_t bar) > > +{ > > + bar (); > > + fn (); > > +} > > + > > +/* { dg-final { scan-assembler-not "push" } } */ > > +/* { dg-final { scan-assembler-not "pop" } } */ > > +/* { dg-final { scan-assembler-not "jmp" } } */ > > +/* { dg-final { scan-assembler "call\[\\t 
\]+" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/stack-check-17.c > > b/gcc/testsuite/gcc.target/i386/stack-check-17.c > > index b3e41cb3d25..061484e1319 100644 > > --- a/gcc/testsuite/gcc.target/i386/stack-check-17.c > > +++ b/gcc/testsuite/gcc.target/i386/stack-check-17.c > > @@ -23,19 +23,14 @@ f3 (void) > > /* Verify no explicit probes. */ > > /* { dg-final { scan-assembler-not "or\[ql\]" } } */ > > > > -/* We also want to verify we did not use a push/pop sequence > > - to probe *sp as the callee register saves are sufficient > > - to probe *sp. > > - > > - y0/y1 are live across the call and thus must be allocated > > +/* y0/y1 are live across the call and thus must be allocated > > into either a stack slot or callee saved register. The former > > would be rather dumb. So assume it does not happen. > > > > - So search for two/four pushes for the callee register saves/argument > > pushes > > - (plus one for the PIC register if needed on ia32) and no pops (since the > > - function has no reachable epilogue). */ > > -/* { dg-final { scan-assembler-times "push\[ql\]" 2 { target { ! ia32 } } > > } } */ > > -/* { dg-final { scan-assembler-times "push\[ql\]" 4 { target { ia32 && > > nonpic } } } } */ > > -/* { dg-final { scan-assembler-times "push\[ql\]" 5 { target { ia32 && { ! > > nonpic } } } } } */ > > -/* { dg-final { scan-assembler-not "pop" } } */ > > + So search for a push/pop sequence for stack probe and 2 argument > > + pushes on ia32. There is no need to save and restore the PIC > > + register on ia32 for a noreturn function. */ > > +/* { dg-final { scan-assembler-times "push\[ql\]" 1 { target { ! ia32 } } > > } } */ > > +/* { dg-final { scan-assembler-times "push\[ql\]" 3 { target ia32 } } } */ > > +/* { dg-final { scan-assembler-times "pop" 1 } } */ > > > > -- > > 2.43.0 > > -- H.J.