On 2021-08-02 13:29:35 +0200, Vincent Lefevre wrote: > This could mean that the kernel does not deliver a SIGCHLD to the > SCREEN process when the shell process terminates, or this signal is > lost. I would say that this is likely to be a bug in the kernel.
Well, probably not. I have another explanation... With more strace output (I've also added some debug information to utmp.c, seen in "write" lines): [...] write(3, "FreeWindow 1\n", 13) = 13 write(3, "RemoveUtmp slot=0x1e2a525d\n", 27) = 27 lseek(10, 0, SEEK_SET) = 0 alarm(0) = 0 rt_sigaction(SIGALRM, {sa_handler=0x7fd37fd32e30, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fd37fc38840}, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fd37fc38840}, 8) = 0 alarm(10) = 0 fcntl(10, F_SETLKW, {l_type=F_RDLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0 read(10, "\2\0\0\0\0\0\0\0~\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\6\0\0\0l\2\0\0tty1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\6\0\0\0n\2\0\0hvc0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\1\0\0\0005\0\0\0~\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0&U\0\0pts/0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0JT\0\0pts/1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0\322A\0\0pts/2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\10\0\0\0\337U\0\0pts/3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\10\0\0\0\337U\0\0pts/4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\10\0\0\0\337U\0\0pts/5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0\233:\0\0pts/6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\10\0\0\0\337U\0\0pts/7\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0|C\0\0pts/8\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0$\34\0\0pts/9\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0SZ\0\0pts/10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 read(10, "\7\0\0\0|C\0\0pts/11\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 384) = 384 fcntl(10, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0 alarm(0) = 10 rt_sigaction(SIGALRM, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER, sa_restorer=0x7fd37fc38840}, NULL, 8) = 0 write(3, "pututslot: removeLineFromUtmp\n", 30) = 30 rt_sigaction(SIGCHLD, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, {sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, 8) = 0 clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fd37fbc4a10) = 17771 wait4(17771, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17771 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17771, si_uid=1000, si_status=0, si_utime=0, si_stime=0} --- rt_sigaction(SIGCHLD, {sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, NULL, 8) = 0 write(3, "pututslot: done\n", 16) = 16 write(3, "RemoveUtmp successfull\n", 23) = 23 [...] The most interesting part is between "pututslot: removeLineFromUtmp" and "pututslot: done". I added some debug information in pututslot() from utmp.c as follows: #ifdef HAVE_UTEMPTER if (eff_uid && wi && wi->w_ptyfd != -1) { /* sigh, linux hackers made the helper functions void */ if (SLOT_USED(u)) { debug ("pututslot: addToUtmp\n"); addToUtmp(wi->w_tty, host, wi->w_ptyfd); } else { debug ("pututslot: removeLineFromUtmp\n"); removeLineFromUtmp(wi->w_tty, wi->w_ptyfd); } debug ("pututslot: done\n"); return 1; /* pray for success */ } #endif So, the following lines correspond to what removeLineFromUtmp does: rt_sigaction(SIGCHLD, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, {sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, 8) = 0 clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fd37fbc4a10) = 17771 wait4(17771, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 17771 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=17771, si_uid=1000, si_status=0, si_utime=0, si_stime=0} --- rt_sigaction(SIGCHLD, {sa_handler=0x55b31cf71a21, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7fd37fc38840}, NULL, 8) = 0 As you can see, the clone I was wondering of last night comes from removeLineFromUtmp (function of libutempter), which also plays with signal handlers, probably clashing with screen if the SIGCHLD signal from the terminated shell process arrives exactly at this time[*]. This would be the race condition. [*] With strace, this seems to never happen: the SIGCHLD in question arrives earlier. But the strace makes the SCREEN process slower, so that in practice, the signal may arrive later, possibly at the same time as the clone (since the scheduler is involved). In libutempter, one has: int status = execute_helper(master_fd, args); which does: static int execute_helper(int master_fd, const char *const argv[]) { struct sigaction saved_action, action; action.sa_handler = SIG_DFL; action.sa_flags = SA_RESTART; sigemptyset(&action.sa_mask); if (sigaction(SIGCHLD, &action, &saved_action) < 0) { #ifdef UTEMPTER_DEBUG fprintf(stderr, "libutempter: sigaction: %s\n", strerror(errno)); #endif return 0; } else { pid_t child; int status = 1; child = fork(); if (!child) { do_child(master_fd, argv[0], (char *const *) argv); } else if (child < 0) { #ifdef UTEMPTER_DEBUG fprintf(stderr, "libutempter: fork: %s\n", strerror(errno)); #endif goto ret; } if (TEMP_FAILURE_RETRY(waitpid(child, &status, 0)) < 0) { #ifdef UTEMPTER_DEBUG fprintf(stderr, "libutempter: waitpid: %s\n", strerror(errno)); #endif status = 1; } ret: (void) sigaction(SIGCHLD, &saved_action, 0); return !status; } } As you can see the SIGCHLD handler is modified, before being restored at the end. -- Vincent Lefèvre <vinc...@vinc17.net> - Web: <https://www.vinc17.net/> 100% accessible validated (X)HTML - Blog: <https://www.vinc17.net/blog/> Work: CR INRIA - computer arithmetic / AriC project (LIP, ENS-Lyon)