On Mon, Jan 22, 2024 at 8:58 AM Jan Hubicka <hubi...@ucw.cz> wrote:
>
> > I compared GCC master branch bootstrap and test times on a slow machine
> > with 6.6 Linux kernels compiled with the original GCC 13 and the GCC 13
> > with the backported patch.  The performance data isn't precise since the
> > measurements were done on different days with different GCC sources under
> > different 6.6 kernel versions.
> >
> > GCC master branch build time in seconds:
> >
> > before                after                  improvement
> > 30043.75user          30013.16user           0%
> > 1274.85system         1243.72system          2.4%
> >
> > GCC master branch test time in seconds (new tests added):
> >
> > before                after                  improvement
> > 216035.90user         216547.51user          0
> > 27365.51system        26658.54system         2.6%
>
> It is interesting - the system time difference comes from smaller
> binary?  Is the difference any significant?

I think it comes from the following change:

In Linux kernel 6.7.0 on x86-64, do_exit is changed from

do_exit:
        endbr64
        call   <do_exit+0x9>
        push   %r15
        push   %r14
        push   %r13
        push   %r12
        mov    %rdi,%r12
        push   %rbp
        push   %rbx
        mov    %gs:0x0,%rbx
        sub    $0x28,%rsp
        mov    %gs:0x28,%rax
        mov    %rax,0x20(%rsp)
        xor    %eax,%eax
        call   *0x0(%rip)        # <do_exit+0x39>
        test   $0x2,%ah
        je     <do_exit+0x8d3>

to

do_exit:
        endbr64
        call   <do_exit+0x9>
        sub    $0x28,%rsp
        mov    %rdi,%r12
        mov    %gs:0x28,%rax
        mov    %rax,0x20(%rsp)
        xor    %eax,%eax
        mov    %gs:0x0,%rbx
        call   *0x0(%rip)        # <do_exit+0x2f>
        test   $0x2,%ah
        je     <do_exit+0x8c9>

do_exit is called by every process when it exits.

> >
> > gcc/
> >
> >       PR target/38534
> >       * config/i386/i386-options.cc (ix86_set_func_type): Don't
> >       save and restore callee saved registers for a noreturn function
> >       with nothrow or compiled with -fno-exceptions.
>
> In general this looks like good thing to do.  I wonder if that is not
> something middle-end should understand for all targets.
> Also I wonder about asynchronous stack unwinding.  If we want to unwind
> stack from interrupt then we may need some registers to be saved (like
> base pointer).

It is compatible with -fasynchronous-unwind-tables.  From glibc test
debug/tst-longjmp_chk:

Starting program:
/export/build/gnu/tools-build/glibc-cet/build-x86_64-linux/debug/tst-longjmp_chk
--direct
warning: Unable to find libthread_db matching inferior's thread
library, thread debugging will not be available.

Program received signal SIGABRT, Aborted.
__pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6,
    no_tid=no_tid@entry=0) at pthread_kill.c:44
44       return INTERNAL_SYSCALL_ERROR_P (ret) ?
INTERNAL_SYSCALL_ERRNO (ret) : 0;
(gdb) bt
#0  __pthread_kill_implementation (threadid=<optimized out>,
    signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
#1  0x0000555555294a4b in __pthread_kill_internal (signo=6,
    threadid=<optimized out>) at pthread_kill.c:78
#2  0x000055555523da1a in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/posix/raise.c:26
#3  0x00005555552248b3 in __GI_abort () at abort.c:79
#4  0x0000555555225a7e in __libc_message_impl (
    fmt=fmt@entry=0x5555553b7171 "*** %s ***: terminated\n")
    at ../sysdeps/posix/libc_fatal.c:132
#5  0x0000555555324517 in __GI___fortify_fail (msg=<optimized out>)
    at fortify_fail.c:24
#6  0x0000555555323411 in ____longjmp_chk ()
    at ../sysdeps/x86_64/__longjmp.S:57
#7  0x0000555555324d6d in __GI___longjmp_chk (
    env=env@entry=0x55555555a200 <b>, val=val@entry=1)
    at ../setjmp/longjmp.c:41
#8  0x0000555555556a00 in do_test () at tst-longjmp_chk.c:70
#9  0x0000555555557388 in support_test_main (argc=1431675392,
    argv=0x7fffffffdd30, config=0x1, config@entry=0x7fffffffdbe0)
    at support_test_main.c:413
#10 0x000055555555673f in main (argc=<optimized out>, argv=<optimized out>)
    at ../support/test-driver.c:170
(gdb)

abort is a noreturn function:

extern void abort (void) __THROW __attribute__ ((__noreturn__));

Callee-saved registers aren't saved:

Dump of assembler code for function __GI_abort:
   0x00005555552247de <+0>: endbr64
   0x00005555552247e2 <+4>: sub    $0xa8,%rsp
   0x00005555552247e9 <+11>: lea    0x1d1540(%rip),%rbx        #
0x5555553f5d30 <lock>
   0x00005555552247f0 <+18>: mov    %fs:0x28,%rax
   0x00005555552247f9 <+27>: mov    %rax,0x98(%rsp)
   0x0000555555224801 <+35>: xor    %eax,%eax
   0x0000555555224803 <+37>: mov    %fs:0x10,%rbp
   0x000055555522480c <+46>: cmp    %rbp,0x1d1525(%rip)        #
0x5555553f5d38 <lock+8>
   0x0000555555224813 <+53>: je     0x555555224833 <__GI_abort+85>
   0x0000555555224815 <+55>: mov    $0x1,%edx
   0x000055555522481a <+60>: lock cmpxchg %edx,0x1d150e(%rip)        #
0x5555553f5d30 <lock>
   0x0000555555224822 <+68>: je     0x55555522482c <__GI_abort+78>
   0x0000555555224824 <+70>: mov    %rbx,%rdi


> Honza
> >
> > gcc/testsuite/
> >
> >       PR target/38534
> >       * gcc.target/i386/pr38534-1.c: New file.
> >       * gcc.target/i386/pr38534-2.c: Likewise.
> >       * gcc.target/i386/pr38534-3.c: Likewise.
> >       * gcc.target/i386/pr38534-4.c: Likewise.
> >       * gcc.target/i386/stack-check-17.c: Updated.
> > ---
> >  gcc/config/i386/i386-options.cc               | 16 ++++++++++--
> >  gcc/testsuite/gcc.target/i386/pr38534-1.c     | 26 +++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr38534-2.c     | 18 +++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr38534-3.c     | 19 ++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr38534-4.c     | 18 +++++++++++++
> >  .../gcc.target/i386/stack-check-17.c          | 19 +++++---------
> >  6 files changed, 102 insertions(+), 14 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr38534-4.c
> >
> > diff --git a/gcc/config/i386/i386-options.cc 
> > b/gcc/config/i386/i386-options.cc
> > index 0cdea30599e..f965568947c 100644
> > --- a/gcc/config/i386/i386-options.cc
> > +++ b/gcc/config/i386/i386-options.cc
> > @@ -3371,9 +3371,21 @@ ix86_simd_clone_adjust (struct cgraph_node *node)
> >  static void
> >  ix86_set_func_type (tree fndecl)
> >  {
> > +  /* No need to save and restore callee-saved registers for a noreturn
> > +     function with nothrow or compiled with -fno-exceptions.
> > +
> > +     NB: Don't use TREE_THIS_VOLATILE to check if this is a noreturn
> > +     function.  The local-pure-const pass turns an interrupt function
> > +     into a noreturn function by setting TREE_THIS_VOLATILE.  Normally
> > +     the local-pure-const pass is run after ix86_set_func_type is called.
> > +     When the local-pure-const pass is enabled for LTO, the interrupt
> > +     function is marked as noreturn in the IR output, which leads the
> > +     incompatible attribute error in LTO1.  */
> >    bool has_no_callee_saved_registers
> > -    = lookup_attribute ("no_callee_saved_registers",
> > -                     TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
> > +    = (((TREE_NOTHROW (fndecl) || !flag_exceptions)
> > +     && lookup_attribute ("noreturn", DECL_ATTRIBUTES (fndecl)))
> > +       || lookup_attribute ("no_callee_saved_registers",
> > +                         TYPE_ATTRIBUTES (TREE_TYPE (fndecl))));
> >
> >    if (cfun->machine->func_type == TYPE_UNKNOWN)
> >      {
> > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-1.c 
> > b/gcc/testsuite/gcc.target/i386/pr38534-1.c
> > new file mode 100644
> > index 00000000000..9297959e759
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr38534-1.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 
> > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */
> > +
> > +#define ARRAY_SIZE 256
> > +
> > +extern int array[ARRAY_SIZE][ARRAY_SIZE][ARRAY_SIZE];
> > +extern int value (int, int, int)
> > +#ifndef __x86_64__
> > +__attribute__ ((regparm(3)))
> > +#endif
> > +;
> > +
> > +void
> > +__attribute__((noreturn))
> > +no_return_to_caller (void)
> > +{
> > +  unsigned i, j, k;
> > +  for (i = ARRAY_SIZE; i > 0; --i)
> > +    for (j = ARRAY_SIZE; j > 0; --j)
> > +      for (k = ARRAY_SIZE; k > 0; --k)
> > +     array[i - 1][j - 1][k - 1] = value (i, j, k);
> > +  while (1);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "push" } } */
> > +/* { dg-final { scan-assembler-not "pop" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-2.c 
> > b/gcc/testsuite/gcc.target/i386/pr38534-2.c
> > new file mode 100644
> > index 00000000000..1fb01363273
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr38534-2.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 
> > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */
> > +
> > +extern void bar (void) __attribute__ ((no_callee_saved_registers));
> > +extern void fn (void) __attribute__ ((noreturn));
> > +
> > +__attribute__ ((noreturn))
> > +void
> > +foo (void)
> > +{
> > +  bar ();
> > +  fn ();
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "push" } } */
> > +/* { dg-final { scan-assembler-not "pop" } } */
> > +/* { dg-final { scan-assembler-not "jmp\[\\t \]+_?bar" } } */
> > +/* { dg-final { scan-assembler "call\[\\t \]+_?bar" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-3.c 
> > b/gcc/testsuite/gcc.target/i386/pr38534-3.c
> > new file mode 100644
> > index 00000000000..87fc35f3fe9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr38534-3.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 
> > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */
> > +
> > +typedef void (*fn_t) (void) __attribute__ ((no_callee_saved_registers));
> > +extern fn_t bar;
> > +extern void fn (void) __attribute__ ((noreturn));
> > +
> > +__attribute__ ((noreturn))
> > +void
> > +foo (void)
> > +{
> > +  bar ();
> > +  fn ();
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "push" } } */
> > +/* { dg-final { scan-assembler-not "pop" } } */
> > +/* { dg-final { scan-assembler-not "jmp" } } */
> > +/* { dg-final { scan-assembler "call\[\\t \]+" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr38534-4.c 
> > b/gcc/testsuite/gcc.target/i386/pr38534-4.c
> > new file mode 100644
> > index 00000000000..561ebeef194
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr38534-4.c
> > @@ -0,0 +1,18 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 
> > -mtune-ctrl=^prologue_using_move,^epilogue_using_move" } */
> > +
> > +typedef void (*fn_t) (void) __attribute__ ((no_callee_saved_registers));
> > +extern void fn (void) __attribute__ ((noreturn));
> > +
> > +__attribute__ ((noreturn))
> > +void
> > +foo (fn_t bar)
> > +{
> > +  bar ();
> > +  fn ();
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "push" } } */
> > +/* { dg-final { scan-assembler-not "pop" } } */
> > +/* { dg-final { scan-assembler-not "jmp" } } */
> > +/* { dg-final { scan-assembler "call\[\\t \]+" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/stack-check-17.c 
> > b/gcc/testsuite/gcc.target/i386/stack-check-17.c
> > index b3e41cb3d25..061484e1319 100644
> > --- a/gcc/testsuite/gcc.target/i386/stack-check-17.c
> > +++ b/gcc/testsuite/gcc.target/i386/stack-check-17.c
> > @@ -23,19 +23,14 @@ f3 (void)
> >  /* Verify no explicit probes.  */
> >  /* { dg-final { scan-assembler-not "or\[ql\]" } } */
> >
> > -/* We also want to verify we did not use a push/pop sequence
> > -   to probe *sp as the callee register saves are sufficient
> > -   to probe *sp.
> > -
> > -   y0/y1 are live across the call and thus must be allocated
> > +/* y0/y1 are live across the call and thus must be allocated
> >     into either a stack slot or callee saved register.  The former
> >     would be rather dumb.  So assume it does not happen.
> >
> > -   So search for two/four pushes for the callee register saves/argument 
> > pushes
> > -   (plus one for the PIC register if needed on ia32) and no pops (since the
> > -   function has no reachable epilogue).  */
> > -/* { dg-final { scan-assembler-times "push\[ql\]" 2 { target { ! ia32 } } 
> > } }  */
> > -/* { dg-final { scan-assembler-times "push\[ql\]" 4 { target { ia32 && 
> > nonpic } } } }  */
> > -/* { dg-final { scan-assembler-times "push\[ql\]" 5 { target { ia32 && { ! 
> > nonpic } } } } }  */
> > -/* { dg-final { scan-assembler-not "pop" } } */
> > +   So search for a push/pop sequence for stack probe and 2 argument
> > +   pushes on ia32.  There is no need to save and restore the PIC
> > +   register on ia32 for a noreturn function.  */
> > +/* { dg-final { scan-assembler-times "push\[ql\]" 1 { target { ! ia32 } } 
> > } }  */
> > +/* { dg-final { scan-assembler-times "push\[ql\]" 3 { target ia32 } } }  */
> > +/* { dg-final { scan-assembler-times "pop" 1 } } */
> >
> > --
> > 2.43.0
> >



-- 
H.J.

Reply via email to