I've identified a job token leak caused by a race between func_shell and start_waiting_job.
Basically, it's not legal for anybody to call reap_children during start_waiting_job, because the child won't be on the list until after it starts, and this can cause a different final tokened child on the list, if reaped, to be taken for an untokened child by mistake (since the last child is determined by examining the list). Turns out that if a command involves func_shell, reap_children will be called in the bad place, tokens can be lost, and the world ends up less parallel. I've fixed this here with a moderately nonportable conversion of func_shell to directly calling waitpid. Patch below; you won't really want it as such, but it shows what the deal is... -- Grant Taylor - x1185 - http://pasta/~gtaylor/ Starent Networks - +1.978.851.1185 diff -ru make-3.80/function.c sn-make-3.80/function.c --- make-3.80/function.c Thu Oct 3 22:13:42 2002 +++ sn-make-3.80/function.c Fri Apr 4 18:48:30 2003 @@ -1622,8 +1622,32 @@ /* Loop until child_handler sets shell_function_completed to the status of our child shell. */ +#if 0 + /* Badness: this can be called from between start_command and + the child list insertion, in which case we leak job tokens! */ while (shell_function_completed == 0) reap_children (1, 0); +#else + { + int wrval; + int exit_sig; + int exit_code; + int status; + do { + wrval = waitpid(pid, &status, 0); + } while(wrval == 0 || (wrval == -1 && errno == EINTR)); + + exit_code = WEXITSTATUS (status); + if (exit_code == 0xff) + exit_code = -1; + exit_sig = WIFSIGNALED (status) ? WTERMSIG (status) : 0; + if (exit_sig==0 && exit_code == 127) { + shell_function_completed == -1; + } else { + shell_function_completed = 1; + } + } +#endif if (batch_filename) { DB (DB_VERBOSE, (_("Cleaning up temporary batch file %s\n"), diff -ru make-3.80/job.c sn-make-3.80/job.c --- make-3.80/job.c Fri Aug 9 21:27:17 2002 +++ sn-make-3.80/job.c Fri Apr 4 18:34:03 2003 @@ -211,6 +211,8 @@ unsigned int job_slots_used = 0; +unsigned int jobserver_tokens = 0; + /* Nonzero if the `good' standard input is in use. */ static int good_stdin_used = 0; @@ -406,7 +408,7 @@ extern int shell_function_pid, shell_function_completed; - +static int reap_lock=0; /* Reap all dead children, storing the returned status and the new command state (`cs_finished') in the `file' member of the `struct child' for the dead child, and removing the child from the chain. In addition, if BLOCK @@ -428,6 +430,10 @@ # define REAP_MORE dead_children #endif + if (reap_lock) { + abort(); + } + /* As long as: We have at least one child outstanding OR a shell function in progress, @@ -783,12 +798,24 @@ char token = '+'; /* Write a job token back to the pipe. */ + if (!jobserver_tokens) { + fprintf(stderr, + "make: error, no jobserver tokens in this proc, but writing one back!!!\n"); + exit(1); + } + if (write (job_fds[1], &token, 1) != 1) pfatal_with_name (_("write jobserver")); - DB (DB_JOBS, (_("Released token for child 0x%08lx (%s).\n"), - (unsigned long int) child, child->file->name)); + jobserver_tokens--; + + DB (DB_JOBS, (_("Released token for child 0x%08lx (%s) now have %d in pid %d.\n"), + (unsigned long int) child, child->file->name, jobserver_tokens, getpid())); + } else { + DB (DB_JOBS, (_("No token for child 0x%08lx (%s) children=%p, job_fds[0]=%d job_fds[1]=%d.\n"), + (unsigned long int) child, child->file->name, + children, job_fds[0], job_fds[1])); } if (handling_fatal_signal) /* Don't bother free'ing if about to die. */ @@ -1308,6 +1335,7 @@ } /* Start the first command; reap_children will run later command lines. */ + reap_lock=1; start_job_command (c); switch (f->command_state) @@ -1337,6 +1365,7 @@ assert (f->command_state == cs_finished); break; } + reap_lock=0; return 1; } @@ -1555,8 +1584,9 @@ /* If we got one, we're done here. */ if (got_token == 1) { - DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s).\n"), - (unsigned long int) c, c->file->name)); + jobserver_tokens++; + DB (DB_JOBS, (_("Obtained token for child 0x%08lx (%s) now have %d in pid %d.\n"), + (unsigned long int) c, c->file->name, jobserver_tokens, getpid())); break; } diff -ru make-3.80/main.c sn-make-3.80/main.c --- make-3.80/main.c Fri Aug 9 21:27:17 2002 +++ sn-make-3.80/main.c Fri Apr 4 17:35:00 2003 @@ -2736,6 +2743,15 @@ /* Wait for children to die. */ for (err = (status != 0); job_slots_used > 0; err = 0) reap_children (1, err); + + { + extern unsigned int jobserver_tokens; + if (jobserver_tokens) { + fprintf(stderr, "make gads - %d jobserver tokens leftover in pid %d\n", + jobserver_tokens, getpid()); + exit(1); + } + } /* Let the remote job module clean up its state. */ remote_cleanup (); _______________________________________________ Bug-make mailing list [EMAIL PROTECTED] http://mail.gnu.org/mailman/listinfo/bug-make