On Thu, Aug 06, 2020 at 03:16:35AM +0300, Peter Pentchev wrote:
> On Thu, Aug 06, 2020 at 12:48:10AM +0200, Michał Mirosław wrote:
> > On Thu, Aug 06, 2020 at 12:29:36AM +0300, Peter Pentchev wrote:
> > > On Wed, Aug 05, 2020 at 10:52:31PM +0200, Michał Mirosław wrote:
> > [...]
> > > > Using print-debugging, I see that it stops at wait_for_child line just
> > > > after printing the version. It seems that something is reaping the child
> > > > before the main thread has a chance to wait for it.
> > > 
> > > OK, so the only thing that comes to my mind now is that you may be
> > > hitting a crazy, crazy race between register_child() and child_reaper(),
> > > and I say "a crazy, crazy race", because the test has to (apparently
> > > reproducibly) receive the CHLD signal exactly between the check and
> > > the creation in register_child()'s first "$children{...} //= ...cv"
> > > statement.
> > 
> > Well, there is nothing that prevents SIGCHLD arriving between fork() and
> > register_child(). You could test this with more confidence (though not
> > 100%-reliably) by putting 'exit 1' just at the start of ($pid == 0) branch.
> 
> Nah, the problem is not just "between fork() and register_child()".
> It really must arrive at a very specific moment in time, because
> the //= operations for setting $children{$pid}{cv} try to make sure that
> a new value is not set (that is, a new condition variable is not
> created) if there already is such an element in the array. So the race
> is indeed between the //= in register_child() and the //= in
> child_reaper() - that is, child_reaper() must be invoked (SIGCHLD must
> arrive) *during* the execution of the //= in register_child().
> 
> Unless I'm missing something, which is not at all out of the question :)

The assignment seems not to be at fault (see last strace). I don't know perl's
internals enough to say if this statement can be interrupted visibly by a signal
handler (I would guess not a perl handler, though). There are two wait4() calls
even before child_reaper has a chance to run.

> > > Can you apply the following patch and show me the output of running
> > > the test?
> > 
> > Sure, but I got no patch. :-)
> 
> Oof. Not my day, is it... Here it is... I hope.
[cut patch]

With the patch applied:

$ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime
Found the certificate at debian/tests/certs/certificate.pem and the
private key at debian/tests/certs/key.pem
Using the /tmp/user/1000/EklzlCzeRO temporary directory
About to get the stunnel version information
register_child: pid 14943 cv AnyEvent::CondVar=HASH(0x559e0d7572e0)
RDBG child_reaper() invoked
RDBG - pid -1 status -1
RDBG   - done
RDBG - out of the child_reaper() loop
Got stunnel version 5.56
^C

And in the strace log:

14942 epoll_wait(3, [{EPOLLIN, {u32=5, u64=4294967301}}], 64, 59743) = 1
14942 epoll_wait(3, [{EPOLLIN, {u32=5, u64=4294967301}}], 64, 59743) = 1
14943 exit_group(0 <unfinished ...>
14942 read(5,  <unfinished ...>
14943 <... exit_group resumed>)         = ?
14942 <... read resumed> "TIMEOUTconnect         = 10 seco"..., 8192) = 105
14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1
14943 +++ exited with 0 +++
14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0
14942 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=14943, 
si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
14942 write(4, "\1\0\0\0\0\0\0\0", 8)   = 8
14942 rt_sigreturn({mask=[PIPE]})       = 0
14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}, {EPOLLIN, {u32=4, 
u64=4294967300}}], 64, 59743) = 2
14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0
14942 read(4, "\1\0\0\0\0\0\0\0", 8)    = 8
14942 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 
WNOHANG|WSTOPPED|WCONTINUED, NULL) = 14943
14942 wait4(-1, 0x7fffcacfddc4, WNOHANG|WSTOPPED|WCONTINUED, NULL) = -1 ECHILD 
(No child processes)
14942 write(1, "RDBG child_reaper() invoked\n", 28) = 28
14942 wait4(-1, 0x7fffcacfdb74, WNOHANG, NULL) = -1 ECHILD (No child processes)
14942 write(1, "RDBG - pid -1 status -1\n", 24) = 24
14942 write(1, "RDBG   - done\n", 14)   = 14
14942 write(1, "RDBG - out of the child_reaper()"..., 38) = 38
14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1
14942 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0
14942 read(5, "", 8192)                 = 0
14942 write(1, "Got stunnel version 5.56\n", 25) = 25
14942 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 59743) = 1
14942 epoll_ctl(3, EPOLL_CTL_DEL, 5, 0x559e0dba9760) = 0
14942 epoll_wait(3, 0x559e0dba9760, 64, 59743) = -1 EINTR (Interrupted system 
call)
14942 --- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---


With 'exit 1' in child branch:

$ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime
Found the certificate at debian/tests/certs/certificate.pem and the private key 
at debian/tests/certs/key.pem
Using the /tmp/user/1000/MWxDeDDlxS temporary directory
About to get the stunnel version information
register_child: pid 19260 cv AnyEvent::CondVar=HASH(0x55e391bae118)
__DIE__ handler invoked: Could not get the stunnel version number
Oof, let us see if there are any children left
Wait... do we even have any children left?
Pffth, sending a SIGKILL to 19260
Some children remaining, laying low for a second...
- waiting for 19260 (src/stunnel -version)
RDBG child_reaper() invoked
RDBG - pid -1 status -1
RDBG   - done
RDBG - out of the child_reaper() loop
^C

19749 epoll_wait(3, [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 0) = 1
19749 epoll_ctl(3, EPOLL_CTL_DEL, 5, 0x55c054b33040) = 0
19749 wait4(-1, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], 
WNOHANG|WSTOPPED|WCONTINUED, NULL) = 19758
19749 wait4(-1, 0x7ffc65939074, WNOHANG|WSTOPPED|WCONTINUED, NULL) = -1 ECHILD 
(No child processes)
19749 write(1, "RDBG child_reaper() invoked\n", 28) = 28
19749 wait4(-1, 0x7ffc65938e24, WNOHANG, NULL) = -1 ECHILD (No child processes)
19749 write(1, "RDBG - pid -1 status -1\n", 24) = 24
19749 write(1, "RDBG   - done\n", 14)   = 14
19749 write(1, "RDBG - out of the child_reaper()"..., 38) = 38
19749 epoll_wait(3, 0x55c054b33040, 64, 59743) = -1 EINTR (Interrupted system 
call)
19749 --- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---


With a print before //= in register_child() added:

$ TEST_STUNNEL=src/stunnel strace -f -o /tmp/log debian/tests/runtime
Found the certificate at debian/tests/certs/certificate.pem and the private key 
at debian/tests/certs/key.pem
Using the /tmp/user/1000/cyDdeCtQaA temporary directory
About to get the stunnel version information
before //= assignment for pid 21417
register_child: pid 21417 cv AnyEvent::CondVar=HASH(0x564e264d6098)
__DIE__ handler invoked: Could not get the stunnel version number
Oof, let us see if there are any children left
Wait... do we even have any children left?
Pffth, sending a SIGKILL to 21417
Some children remaining, laying low for a second...
- waiting for 21417 (src/stunnel -version)
RDBG child_reaper() invoked
RDBG - pid -1 status -1
RDBG   - done
RDBG - out of the child_reaper() loop
^C

21416 write(1, "before //= assignment for pid 21"..., 36 <unfinished ...>
21417 <... set_robust_list resumed> )   = 0
21416 <... write resumed> )             = 36
21417 rt_sigprocmask(SIG_SETMASK, [PIPE], NULL, 8) = 0
21416 write(1, "register_child: pid 21417 cv Any"..., 68 <unfinished ...>

21417 close(6 <unfinished ...>
21416 <... write resumed> )             = 68
21417 <... close resumed> )             = 0
21416 close(6)                          = 0
21417 close(5)                          = 0
21416 epoll_ctl(3, EPOLL_CTL_ADD, 4, {EPOLLIN, {u32=4, u64=4294967300}}) = 0
21416 epoll_ctl(3, EPOLL_CTL_ADD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0
21417 getpid( <unfinished ...>
21416 epoll_wait(3,  <unfinished ...>
21417 <... getpid resumed> )            = 21417
21416 <... epoll_wait resumed> [{EPOLLHUP, {u32=5, u64=4294967301}}], 64, 
59743) = 1
21416 epoll_ctl(3, EPOLL_CTL_MOD, 5, {EPOLLIN, {u32=5, u64=4294967301}}) = 0
21416 read(5, "", 8192)                 = 0
21417 rt_sigaction(SIGHUP, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21417 rt_sigaction(SIGINT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21417 rt_sigaction(SIGQUIT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21417 rt_sigaction(SIGILL, NULL,  <unfinished ...>
21416 write(2, "__DIE__ handler invoked: Could n"..., 66 <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21416 <... write resumed> )             = 66
21417 rt_sigaction(SIGTRAP, NULL,  <unfinished ...>
21416 write(1, "Oof, let us see if there are any"..., 47 <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21416 <... write resumed> )             = 47
21417 rt_sigaction(SIGABRT, NULL,  <unfinished ...>
21416 write(1, "Wait... do we even have any chil"..., 43 <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21416 <... write resumed> )             = 43
21417 rt_sigaction(SIGBUS, NULL,  <unfinished ...>
21416 wait4(-1,  <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21416 <... wait4 resumed> 0x7ffc12330784, WNOHANG, NULL) = 0
21417 rt_sigaction(SIGFPE, NULL,  <unfinished ...>
21416 write(1, "Pffth, sending a SIGKILL to 2141"..., 34 <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_IGN, sa_mask=[FPE], 
sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7face3f10840}, 8) = 0
21416 <... write resumed> )             = 34
21417 rt_sigaction(SIGKILL, NULL,  <unfinished ...>
21416 kill(21417, SIGKILL <unfinished ...>
21417 <... rt_sigaction resumed> {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 
8) = 0
21416 <... kill resumed> )              = 0
21416 write(1, "Some children remaining, laying "..., 52) = 52
21416 nanosleep({tv_sec=1, tv_nsec=0},  <unfinished ...>
21417 +++ killed by SIGKILL +++
21416 <... nanosleep resumed> {tv_sec=0, tv_nsec=999763836}) = ? 
ERESTART_RESTARTBLOCK (Interrupted by signal)
21416 --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=21417, 
si_uid=1000, si_status=SIGKILL, si_utime=0, si_stime=0} ---
21416 rt_sigreturn({mask=[PIPE]})       = -1 EINTR (Interrupted system call)
21416 write(1, "- waiting for 21417 (src/stunnel"..., 43) = 43
[...]

Best Regards,
Michał Mirosław

Reply via email to