On Thu, Aug 06, 2020 at 03:16:35AM +0300, Peter Pentchev wrote: > On Thu, Aug 06, 2020 at 12:48:10AM +0200, Michał Mirosław wrote: > > On Thu, Aug 06, 2020 at 12:29:36AM +0300, Peter Pentchev wrote: > > > On Wed, Aug 05, 2020 at 10:52:31PM +0200, Michał Mirosław wrote: > > [...] > > > > Using print-debugging, I see that it stops at wait_for_child line just > > > > after printing the version. It seems that something is reaping the child > > > > before the main thread has a chance to wait for it. > > > > > > OK, so the only thing that comes to my mind now is that you may be > > > hitting a crazy, crazy race between register_child() and child_reaper(), > > > and I say "a crazy, crazy race", because the test has to (apparently > > > reproducibly) receive the CHLD signal exactly between the check and > > > the creation in register_child()'s first "$children{...} //= ...cv" > > > statement. > > > > Well, there is nothing that prevents SIGCHLD arriving between fork() and > > register_child(). You could test this with more confidence (though not > > 100%-reliably) by putting 'exit 1' just at the start of ($pid == 0) branch. > > Nah, the problem is not just "between fork() and register_child()". > It really must arrive at a very specific moment in time, because > the //= operations for setting $children{$pid}{cv} try to make sure that > a new value is not set (that is, a new condition variable is not > created) if there already is such an element in the array. So the race > is indeed between the //= in register_child() and the //= in > child_reaper() - that is, child_reaper() must be invoked (SIGCHLD must > arrive) *during* the execution of the //= in register_child(). > > Unless I'm missing something, which is not at all out of the question :)
The assignment seems not to be at fault (see last strace). I don't know perl's internals enough to say if this statement can be interrupted visibly by a signal handler (I would guess not a perl handler, though). There are two wait4() calls even before child_reaper has a chance to run. > > > Can you apply the following patch and show me the output of running > > > the test? > > > > Sure, but I got no patch. :-) > > Oof. Not my day, is it... Here it is... I hope. [cut patch] With the patch applied: $ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime Found the certificate at debian/tests/certs/certificate.pem and the private key at debian/tests/certs/key.pem Using the /tmp/user/1000/EklzlCzeRO temporary directory About to get the stunnel version information register_child: pid 14943 cv AnyEvent::CondVar=HASH(0x559e0d7572e0) RDBG child_reaper() invoked RDBG - pid -1 status -1 RDBG - done RDBG - out of the child_reaper() loop Got stunnel version 5.56 ^C And in the strace log: 14942 epoll_wait(3, [{EPOLLIN, {u32=5, u64=4294967301}}], 64, 59743) = 1 14942 epoll_wait(3, [{EPOLLIN, {u32=5, u64=4294967301}}], 64, 59743) = 1 14943 exit_group(0 <unfinished ...> 14942 read(5, <unfinished ...> 14943 <... exit_group resumed>) = ? 14942 <... read resumed> "TIMEOUTconnect = 10 seco"..., 8192) = 105 14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1 14943 +++ exited with 0 +++ 14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0 14942 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=14943, si_uid=1000, si_status=0, si_utime=0, si_stime=0} --- 14942 write(4, "\1\0\0\0\0\0\0\0", 8) = 8 14942 rt_sigreturn({mask=[PIPE]}) = 0 14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}, {EPOLLIN, {u32=4, u64=4294967300}}], 64, 59743) = 2 14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0 14942 read(4, "\1\0\0\0\0\0\0\0", 8) = 8 14942 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG|WSTOPPED|WCONTINUED, NULL) = 14943 14942 wait4(-1, 0x7fffcacfddc4, WNOHANG|WSTOPPED|WCONTINUED, NULL) = -1 ECHILD (No child processes) 14942 write(1, "RDBG child_reaper() invoked\n", 28) = 28 14942 wait4(-1, 0x7fffcacfdb74, WNOHANG, NULL) = -1 ECHILD (No child processes) 14942 write(1, "RDBG - pid -1 status -1\n", 24) = 24 14942 write(1, "RDBG - done\n", 14) = 14 14942 write(1, "RDBG - out of the child_reaper()"..., 38) = 38 14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1 14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0 14942 read(5, "", 8192) = 0 14942 write(1, "Got stunnel version 5.56\n", 25) = 25 14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1 14942 epoll_ctl(3, EPOLL_CTL_DEL, 5, 0x559e0dba9760) = 0 14942 epoll_wait(3, 0x559e0dba9760, 64, 59743) = -1 EINTR (Interrupted system call) 14942 --- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} --- With 'exit 1' in child branch: $ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime Found the certificate at debian/tests/certs/certificate.pem and the private key at debian/tests/certs/key.pem Using the /tmp/user/1000/MWxDeDDlxS temporary directory About to get the stunnel version information register_child: pid 19260 cv AnyEvent::CondVar=HASH(0x55e391bae118) __DIE__ handler invoked: Could not get the stunnel version number Oof, let us see if there are any children left Wait... do we even have any children left? Pffth, sending a SIGKILL to 19260 Some children remaining, laying low for a second... - waiting for 19260 (src/stunnel -version) RDBG child_reaper() invoked RDBG - pid -1 status -1 RDBG - done RDBG - out of the child_reaper() loop ^C 19749 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 0) = 1 19749 epoll_ctl(3, EPOLL_CTL_DEL, 5, 0x55c054b33040) = 0 19749 wait4(-1, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], WNOHANG|WSTOPPED|WCONTINUED, NULL) = 19758 19749 wait4(-1, 0x7ffc65939074, WNOHANG|WSTOPPED|WCONTINUED, NULL) = -1 ECHILD (No child processes) 19749 write(1, "RDBG child_reaper() invoked\n", 28) = 28 19749 wait4(-1, 0x7ffc65938e24, WNOHANG, NULL) = -1 ECHILD (No child processes) 19749 write(1, "RDBG - pid -1 status -1\n", 24) = 24 19749 write(1, "RDBG - done\n", 14) = 14 19749 write(1, "RDBG - out of the child_reaper()"..., 38) = 38 19749 epoll_wait(3, 0x55c054b33040, 64, 59743) = -1 EINTR (Interrupted system call) 19749 --- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} --- With a print before //= in register_child() added: $ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime Found the certificate at debian/tests/certs/certificate.pem and the private key at debian/tests/certs/key.pem Using the /tmp/user/1000/cyDdeCtQaA temporary directory About to get the stunnel version information before //= assignment for pid 21417 register_child: pid 21417 cv AnyEvent::CondVar=HASH(0x564e264d6098) __DIE__ handler invoked: Could not get the stunnel version number Oof, let us see if there are any children left Wait... do we even have any children left? Pffth, sending a SIGKILL to 21417 Some children remaining, laying low for a second... - waiting for 21417 (src/stunnel -version) RDBG child_reaper() invoked RDBG - pid -1 status -1 RDBG - done RDBG - out of the child_reaper() loop ^C 21416 write(1, "before //= assignment for pid 21"..., 36 <unfinished ...> 21417 <... set_robust_list resumed> ) = 0 21416 <... write resumed> ) = 36 21417 rt_sigprocmask(SIG_SETMASK, [PIPE], NULL, 8) = 0 21416 write(1, "register_child: pid 21417 cv Any"..., 68 <unfinished ...> 21417 close(6 <unfinished ...> 21416 <... write resumed> ) = 68 21417 <... close resumed> ) = 0 21416 close(6) = 0 21417 close(5) = 0 21416 epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=4, u64=4294967300}}) = 0 21416 epoll_ctl(3, EPOLL_CTL_ADD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0 21417 getpid( <unfinished ...> 21416 epoll_wait(3, <unfinished ...> 21417 <... getpid resumed> ) = 21417 21416 <... epoll_wait resumed> [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1 21416 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0 21416 read(5, "", 8192) = 0 21417 rt_sigaction(SIGHUP, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21417 rt_sigaction(SIGINT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21417 rt_sigaction(SIGQUIT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21417 rt_sigaction(SIGILL, NULL, <unfinished ...> 21416 write(2, "__DIE__ handler invoked: Could n"..., 66 <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21416 <... write resumed> ) = 66 21417 rt_sigaction(SIGTRAP, NULL, <unfinished ...> 21416 write(1, "Oof, let us see if there are any"..., 47 <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21416 <... write resumed> ) = 47 21417 rt_sigaction(SIGABRT, NULL, <unfinished ...> 21416 write(1, "Wait... do we even have any chil"..., 43 <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21416 <... write resumed> ) = 43 21417 rt_sigaction(SIGBUS, NULL, <unfinished ...> 21416 wait4(-1, <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21416 <... wait4 resumed> 0x7ffc12330784, WNOHANG, NULL) = 0 21417 rt_sigaction(SIGFPE, NULL, <unfinished ...> 21416 write(1, "Pffth, sending a SIGKILL to 2141"..., 34 <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_IGN, sa_mask=[FPE], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7face3f10840}, 8) = 0 21416 <... write resumed> ) = 34 21417 rt_sigaction(SIGKILL, NULL, <unfinished ...> 21416 kill(21417, SIGKILL <unfinished ...> 21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0 21416 <... kill resumed> ) = 0 21416 write(1, "Some children remaining, laying "..., 52) = 52 21416 nanosleep({tv_sec=1, tv_nsec=0}, <unfinished ...> 21417 +++ killed by SIGKILL +++ 21416 <... nanosleep resumed> {tv_sec=0, tv_nsec=999763836}) = ? ERESTART_RESTARTBLOCK (Interrupted by signal) 21416 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=21417, si_uid=1000, si_status=SIGKILL, si_utime=0, si_stime=0} --- 21416 rt_sigreturn({mask=[PIPE]}) = -1 EINTR (Interrupted system call) 21416 write(1, "- waiting for 21417 (src/stunnel"..., 43) = 43 [...] Best Regards, Michał Mirosław