Currently we allow only one attempt to create init in a new namespace.
If the first fork() fails after alloc_pid() succeeds, free_pid() clears
PIDNS_ADDING and thus disables further PID allocations.

Nowadays this looks like an unnecessary limitation. The original reason
to handle "case PIDNS_ADDING" in free_pid() is gone, most probably after
commit 69879c01a0c3 ("proc: Remove the now unnecessary internal mount of
proc").

Change free_pid() to keep ns->pid_allocated == PIDNS_ADDING, and change
alloc_pid() to reset the cursor early, right after taking pidmap_lock.

Test-case:

        #define _GNU_SOURCE
        #include <linux/sched.h>
        #include <sys/syscall.h>
        #include <sys/wait.h>
        #include <assert.h>
        #include <sched.h>
        #include <errno.h>

        int main(void)
        {
                struct clone_args args = {
                        .exit_signal = SIGCHLD,
                        .flags  = CLONE_PIDFD,
                        .pidfd  = 0,
                };
                unsigned long pidfd;
                int pid;

                assert(unshare(CLONE_NEWPID) == 0);

                pid = syscall(__NR_clone3, &args, sizeof(args));
                assert(pid == -1 && errno == EFAULT);

                args.pidfd = (unsigned long)&pidfd;
                pid = syscall(__NR_clone3, &args, sizeof(args));
                if (pid)
                        assert(pid > 0 && wait(NULL) == pid);
                else
                        assert(getpid() == 1);

                return 0;
        }

Signed-off-by: Oleg Nesterov <[email protected]>
---
 kernel/pid.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index ebf013f35cb3..1a0d2ac1f4a9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -131,9 +131,8 @@ void free_pid(struct pid *pid)
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
-                       /* Handle a fork failure of the first process */
-                       WARN_ON(ns->child_reaper);
-                       ns->pid_allocated = 0;
+                       /* Only possible if the 1st fork fails */
+                       WARN_ON(READ_ONCE(ns->child_reaper));
                        break;
                }
 
@@ -230,6 +229,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t 
*arg_set_tid,
        retried_preload = false;
        idr_preload(GFP_KERNEL);
        spin_lock(&pidmap_lock);
+       /* For the case when the previous attempt to create init failed */
+       if (ns->pid_allocated == PIDNS_ADDING)
+               idr_set_cursor(&ns->idr, 0);
+
        for (tmp = ns, i = ns->level; i >= 0;) {
                int tid = set_tid[ns->level - i];
 
@@ -341,10 +344,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t 
*arg_set_tid,
                idr_remove(&upid->ns->idr, upid->nr);
        }
 
-       /* On failure to allocate the first pid, reset the state */
-       if (ns->pid_allocated == PIDNS_ADDING)
-               idr_set_cursor(&ns->idr, 0);
-
        spin_unlock(&pidmap_lock);
        idr_preload_end();
 
-- 
2.52.0



Reply via email to