This simple program

    #include <unistd.h>
    #include <signal.h>
    #include <sys/ptrace.h>
    #include <pthread.h>

    void *thread(void *arg)
    {
            ptrace(PTRACE_TRACEME, 0,0,0);
            return NULL;
    }

    int main(void)
    {
            int pid = fork();

            if (!pid) {
                    pthread_t pt;
                    pthread_create(&pt, NULL, thread, NULL);
                    pthread_join(pt, NULL);
                    execlp("echo", "echo", "passed", NULL);
            }

            sleep(1);
            ptrace(PTRACE_ATTACH, pid, 0,0);
            kill(pid, SIGCONT);

            return 0;
    }

hangs because de_thread(), with cred_guard_mutex held, waits for the debugger
to release the killed thread, while the debugger sleeps waiting for that same
mutex. Not really that bad — the tracer can be killed — but this is still a
bug, and people hit it in practice.

With this patch:

        - de_thread() waits until all the sub-threads pass exit_notify() and
          become zombies.

        - setup_new_exec() waits until all the sub-threads are reaped without
          cred_guard_mutex held.

        - unshare_sighand() and flush_signal_handlers() are moved from
          begin_new_exec() to setup_new_exec(), we can't call them until all
          sub-threads go away.

Signed-off-by: Oleg Nesterov <[email protected]>
---
 fs/exec.c       | 140 +++++++++++++++++++++++-------------------------
 kernel/exit.c   |   9 ++--
 kernel/signal.c |   2 +-
 3 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 136a7ab5d91c..2bac7deb9a98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -905,42 +905,56 @@ static int exec_mmap(struct mm_struct *mm)
        return 0;
 }
 
-static int de_thread(struct task_struct *tsk)
+static int kill_sub_threads(struct task_struct *tsk)
 {
        struct signal_struct *sig = tsk->signal;
-       struct sighand_struct *oldsighand = tsk->sighand;
-       spinlock_t *lock = &oldsighand->siglock;
-
-       if (thread_group_empty(tsk))
-               goto no_thread_group;
+       int err = -EINTR;
 
-       /*
-        * Kill all other threads in the thread group.
-        */
-       spin_lock_irq(lock);
-       if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
-               /*
-                * Another group action in progress, just
-                * return so that the signal is processed.
-                */
-               spin_unlock_irq(lock);
-               return -EAGAIN;
+       read_lock(&tasklist_lock);
+       spin_lock_irq(&tsk->sighand->siglock);
+       if (!((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task)) {
+               sig->group_exec_task = tsk;
+               sig->notify_count = -zap_other_threads(tsk);
+               err = 0;
        }
+       spin_unlock_irq(&tsk->sighand->siglock);
+       read_unlock(&tasklist_lock);
 
-       sig->group_exec_task = tsk;
-       sig->notify_count = zap_other_threads(tsk);
-       if (!thread_group_leader(tsk))
-               sig->notify_count--;
+       return err;
+}
 
-       while (sig->notify_count) {
-               __set_current_state(TASK_KILLABLE);
-               spin_unlock_irq(lock);
-               schedule();
+static int wait_for_notify_count(struct task_struct *tsk)
+{
+       for (;;) {
                if (__fatal_signal_pending(tsk))
-                       goto killed;
-               spin_lock_irq(lock);
+                       return -EINTR;
+               set_current_state(TASK_KILLABLE);
+               if (!tsk->signal->notify_count)
+                       break;
+               schedule();
        }
-       spin_unlock_irq(lock);
+       __set_current_state(TASK_RUNNING);
+       return 0;
+}
+
+static void clear_group_exec_task(struct task_struct *tsk)
+{
+       struct signal_struct *sig = tsk->signal;
+
+       /* protects against exit_notify() and __exit_signal() */
+       read_lock(&tasklist_lock);
+       sig->group_exec_task = NULL;
+       sig->notify_count = 0;
+       read_unlock(&tasklist_lock);
+}
+
+static int de_thread(struct task_struct *tsk)
+{
+       if (thread_group_empty(tsk))
+               goto no_thread_group;
+
+       if (kill_sub_threads(tsk) || wait_for_notify_count(tsk))
+               return -EINTR;
 
        /*
         * At this point all other threads have exited, all we have to
@@ -948,26 +962,10 @@ static int de_thread(struct task_struct *tsk)
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
-               struct task_struct *leader = tsk->group_leader;
-
-               for (;;) {
-                       cgroup_threadgroup_change_begin(tsk);
-                       write_lock_irq(&tasklist_lock);
-                       /*
-                        * Do this under tasklist_lock to ensure that
-                        * exit_notify() can't miss ->group_exec_task
-                        */
-                       sig->notify_count = -1;
-                       if (likely(leader->exit_state))
-                               break;
-                       __set_current_state(TASK_KILLABLE);
-                       write_unlock_irq(&tasklist_lock);
-                       cgroup_threadgroup_change_end(tsk);
-                       schedule();
-                       if (__fatal_signal_pending(tsk))
-                               goto killed;
-               }
+               struct task_struct *leader = tsk->group_leader, *t;
 
+               cgroup_threadgroup_change_begin(tsk);
+               write_lock_irq(&tasklist_lock);
                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
@@ -1000,8 +998,8 @@ static int de_thread(struct task_struct *tsk)
                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);
 
-               tsk->group_leader = tsk;
-               leader->group_leader = tsk;
+               for_each_thread(tsk, t)
+                       t->group_leader = tsk;
 
                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;
@@ -1021,23 +1019,11 @@ static int de_thread(struct task_struct *tsk)
                release_task(leader);
        }
 
-       sig->group_exec_task = NULL;
-       sig->notify_count = 0;
-
 no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;
-
        BUG_ON(!thread_group_leader(tsk));
        return 0;
-
-killed:
-       /* protects against exit_notify() and __exit_signal() */
-       read_lock(&tasklist_lock);
-       sig->group_exec_task = NULL;
-       sig->notify_count = 0;
-       read_unlock(&tasklist_lock);
-       return -EAGAIN;
 }
 
 
@@ -1171,13 +1157,6 @@ int begin_new_exec(struct linux_binprm * bprm)
        flush_itimer_signals();
 #endif
 
-       /*
-        * Make the signal table private.
-        */
-       retval = unshare_sighand(me);
-       if (retval)
-               goto out_unlock;
-
        me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
@@ -1249,7 +1228,6 @@ int begin_new_exec(struct linux_binprm * bprm)
        /* An exec changes our domain. We are no longer part of the thread
           group */
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
-       flush_signal_handlers(me, 0);
 
        retval = set_cred_ucounts(bprm->cred);
        if (retval < 0)
@@ -1293,8 +1271,9 @@ int begin_new_exec(struct linux_binprm * bprm)
        up_write(&me->signal->exec_update_lock);
        if (!bprm->cred)
                mutex_unlock(&me->signal->cred_guard_mutex);
-
 out:
+       if (me->signal->group_exec_task == me)
+               clear_group_exec_task(me);
        return retval;
 }
 EXPORT_SYMBOL(begin_new_exec);
@@ -1325,6 +1304,8 @@ int setup_new_exec(struct linux_binprm * bprm)
 {
        /* Setup things that can depend upon the personality */
        struct task_struct *me = current;
+       struct signal_struct *sig = me->signal;
+       int err = 0;
 
        arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
 
@@ -1335,10 +1316,23 @@ int setup_new_exec(struct linux_binprm * bprm)
         * some architectures like powerpc
         */
        me->mm->task_size = TASK_SIZE;
-       up_write(&me->signal->exec_update_lock);
-       mutex_unlock(&me->signal->cred_guard_mutex);
+       up_write(&sig->exec_update_lock);
+       mutex_unlock(&sig->cred_guard_mutex);
 
-       return 0;
+       if (sig->group_exec_task) {
+               spin_lock_irq(&me->sighand->siglock);
+               sig->notify_count = sig->nr_threads - 1;
+               spin_unlock_irq(&me->sighand->siglock);
+
+               err = wait_for_notify_count(me);
+               clear_group_exec_task(me);
+       }
+
+       if (!err)
+               err = unshare_sighand(me);
+       if (!err)
+               flush_signal_handlers(me, 0);
+       return err;
 }
 EXPORT_SYMBOL(setup_new_exec);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f041f0c05ebb..bcde78c97253 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,10 +178,7 @@ static void __exit_signal(struct release_task_post *post, struct task_struct *ts
                tty = sig->tty;
                sig->tty = NULL;
        } else {
-               /*
-                * If there is any task waiting for the group exit
-                * then notify it:
-                */
+               /* mt-exec, setup_new_exec() -> wait_for_notify_count() */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exec_task);
 
@@ -766,8 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
                list_add(&tsk->ptrace_entry, &dead);
        }
 
-       /* mt-exec, de_thread() is waiting for group leader */
-       if (unlikely(tsk->signal->notify_count < 0))
+       /* mt-exec, de_thread() -> wait_for_notify_count() */
+       if (tsk->signal->notify_count < 0 && !++tsk->signal->notify_count)
                wake_up_process(tsk->signal->group_exec_task);
        write_unlock_irq(&tasklist_lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..334212044940 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1343,13 +1343,13 @@ int zap_other_threads(struct task_struct *p)
 
        for_other_threads(p, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-               count++;
 
                /* Don't bother with already dead threads */
                if (t->exit_state)
                        continue;
                sigaddset(&t->pending.signal, SIGKILL);
                signal_wake_up(t, 1);
+               count++;
        }
 
        return count;
-- 
2.25.1.362.g51ebf55



Reply via email to