On 2021-08-02 13:29:35 +0200, Vincent Lefevre wrote:
> This could mean that the kernel does not deliver a SIGCHLD to the
> SCREEN process when the shell process terminates, or this signal is
> lost. I would say that this is likely to be a bug in the kernel.

Well, probably not. I have another explanation... With more strace
output (I've also added some debug information to utmp.c, seen in
"write" lines):

[...]
write(3, "FreeWindow 1\n", 13)          = 13
write(3, "RemoveUtmp slot=0x1e2a525d\n", 27) = 27
lseek(10, 0, SEEK_SET)                  = 0
alarm(0)                                = 0
rt_sigaction(SIGALRM, {sa_handler=0x7fd37fd32e30, sa_mask=[], 
sa_flags=SA_RESTORER, sa_restorer=0x7fd37fc38840}, {sa_handler=SIG_DFL, 
sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fd37fc38840}, 8) = 0
alarm(10)                               = 0
fcntl(10, F_SETLKW, {l_type=F_RDLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
read(10, "\2\0\0\0\0\0\0\0~\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\6\0\0\0l\2\0\0tty1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) 
= 384
read(10, "\6\0\0\0n\2\0\0hvc0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) 
= 384
read(10, "\1\0\0\0005\0\0\0~\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\7\0\0\0&U\0\0pts/0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 
384
read(10, "\7\0\0\0JT\0\0pts/1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 
384
read(10, "\7\0\0\0\322A\0\0pts/2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\10\0\0\0\337U\0\0pts/3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\10\0\0\0\337U\0\0pts/4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\10\0\0\0\337U\0\0pts/5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\7\0\0\0\233:\0\0pts/6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\10\0\0\0\337U\0\0pts/7\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
384) = 384
read(10, "\7\0\0\0|C\0\0pts/8\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 
384
read(10, "\7\0\0\0$\34\0\0pts/9\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) 
= 384
read(10, "\7\0\0\0SZ\0\0pts/10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 
384
read(10, "\7\0\0\0|C\0\0pts/11\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 
384
fcntl(10, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
alarm(0)                                = 10
rt_sigaction(SIGALRM, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, 
sa_restorer=0x7fd37fc38840}, NULL, 8) = 0
write(3, "pututslot: removeLineFromUtmp\n", 30) = 30
rt_sigaction(SIGCHLD, {sa_handler=SIG_DFL, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, 
{sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, 
sa_restorer=0x7fd37fc38840}, 8) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, 
child_tidptr=0x7fd37fbc4a10) = 17771
wait4(17771, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17771
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17771, si_uid=1000, 
si_status=0, si_utime=0, si_stime=0} ---
rt_sigaction(SIGCHLD, {sa_handler=0x55b31cf71a21, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, NULL, 8) = 0
write(3, "pututslot: done\n", 16)       = 16
write(3, "RemoveUtmp successfull\n", 23) = 23
[...]

The most interesting part is between "pututslot: removeLineFromUtmp"
and "pututslot: done". I added some debug information in pututslot()
from utmp.c as follows:

#ifdef HAVE_UTEMPTER
  if (eff_uid && wi && wi->w_ptyfd != -1)
    {
      /* sigh, linux hackers made the helper functions void */
      if (SLOT_USED(u))
        {
          debug ("pututslot: addToUtmp\n");
          addToUtmp(wi->w_tty, host, wi->w_ptyfd);
        }
      else
        {
          debug ("pututslot: removeLineFromUtmp\n");
          removeLineFromUtmp(wi->w_tty, wi->w_ptyfd);
        }
      debug ("pututslot: done\n");
      return 1; /* pray for success */
    }
#endif

So, the following lines correspond to what removeLineFromUtmp does:

rt_sigaction(SIGCHLD, {sa_handler=SIG_DFL, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, 
{sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, 
sa_restorer=0x7fd37fc38840}, 8) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, 
child_tidptr=0x7fd37fbc4a10) = 17771
wait4(17771, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17771
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17771, si_uid=1000, 
si_status=0, si_utime=0, si_stime=0} ---
rt_sigaction(SIGCHLD, {sa_handler=0x55b31cf71a21, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, NULL, 8) = 0

As you can see, the clone I was wondering of last night comes from
removeLineFromUtmp (function of libutempter), which also plays with
signal handlers, probably clashing with screen if the SIGCHLD signal
from the terminated shell process arrives exactly at this time[*].
This would be the race condition.

[*] With strace, this seems to never happen: the SIGCHLD in question
arrives earlier. But the strace makes the SCREEN process slower, so
that in practice, the signal may arrive later, possibly at the same
time as the clone (since the scheduler is involved).

In libutempter, one has:

        int     status = execute_helper(master_fd, args);

which does:

static int
execute_helper(int master_fd, const char *const argv[])
{
        struct sigaction saved_action, action;

        action.sa_handler = SIG_DFL;
        action.sa_flags = SA_RESTART;
        sigemptyset(&action.sa_mask);

        if (sigaction(SIGCHLD, &action, &saved_action) < 0)
        {
#ifdef  UTEMPTER_DEBUG
                fprintf(stderr, "libutempter: sigaction: %s\n",
                        strerror(errno));
#endif
                return 0;
        } else
        {
                pid_t   child;
                int     status = 1;

                child = fork();
                if (!child)
                {
                        do_child(master_fd, argv[0], (char *const *) argv);
                } else if (child < 0)
                {
#ifdef  UTEMPTER_DEBUG
                        fprintf(stderr, "libutempter: fork: %s\n",
                                strerror(errno));
#endif
                        goto ret;
                }

                if (TEMP_FAILURE_RETRY(waitpid(child, &status, 0)) < 0)
                {
#ifdef  UTEMPTER_DEBUG
                        fprintf(stderr, "libutempter: waitpid: %s\n",
                                strerror(errno));
#endif
                        status = 1;
                }

              ret:
                (void) sigaction(SIGCHLD, &saved_action, 0);
                return !status;
        }
}

As you can see the SIGCHLD handler is modified, before being restored
at the end.

-- 
Vincent Lefèvre <vinc...@vinc17.net> - Web: <https://www.vinc17.net/>
100% accessible validated (X)HTML - Blog: <https://www.vinc17.net/blog/>
Work: CR INRIA - computer arithmetic / AriC project (LIP, ENS-Lyon)

Reply via email to