On Thu, Aug 21, 2025 at 07:34:58PM +0200, Bernd Edlinger wrote:

> The solution is to detect this situation and allow
> ptrace_attach to continue by temporarily releasing the
> cred_guard_mutex, while de_thread() is still waiting for
> traced zombies to be eventually released by the tracer.
> In the case of the thread group leader we only have to wait
> for the thread to become a zombie, which may also need
> co-operation from the tracer due to PTRACE_O_TRACEEXIT.
> 
> When a tracer wants to ptrace_attach a task that already
> is in execve, we simply retry the ptrace_may_access
> check while temporarily installing the new credentials
> and dumpability which are about to be used after execve
> completes.  If the ptrace_attach happens on a thread that
> is a sibling-thread of the thread doing execve, it is
> sufficient to check against the old credentials, as this
> thread will be waited for, before the new credentials are
> installed.
> 
> Other threads die quickly since the cred_guard_mutex is
> released, but a deadly signal is already pending.  In case
> the mutex_lock_killable misses the signal, the non-zero
> current->signal->exec_bprm makes sure they release the
> mutex immediately and return with -ERESTARTNOINTR.



> diff --git a/fs/exec.c b/fs/exec.c
> index 2a1e5e4042a1..31c6ceaa5f69 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -905,11 +905,13 @@ static int exec_mmap(struct mm_struct *mm)
>       return 0;
>  }
>  
> -static int de_thread(struct task_struct *tsk)
> +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm)
>  {
>       struct signal_struct *sig = tsk->signal;
>       struct sighand_struct *oldsighand = tsk->sighand;
>       spinlock_t *lock = &oldsighand->siglock;
> +     struct task_struct *t;
> +     bool unsafe_execve_in_progress = false;
>  
>       if (thread_group_empty(tsk))
>               goto no_thread_group;
> @@ -932,6 +934,19 @@ static int de_thread(struct task_struct *tsk)
>       if (!thread_group_leader(tsk))
>               sig->notify_count--;
>  
> +     for_other_threads(tsk, t) {
> +             if (unlikely(t->ptrace)
> +                 && (t != tsk->group_leader || !t->exit_state))

&& goes at the end of the previous line

> +                     unsafe_execve_in_progress = true;
> +     }
> +
> +     if (unlikely(unsafe_execve_in_progress)) {
> +             spin_unlock_irq(lock);
> +             sig->exec_bprm = bprm;
> +             mutex_unlock(&sig->cred_guard_mutex);
> +             spin_lock_irq(lock);

I'm not clear why we need to drop and re-acquire siglock here.

And I would like a very large comment here explaining why it is safe to
drop cred_guard_mutex here.

> +     }
> +
>       while (sig->notify_count) {
>               __set_current_state(TASK_KILLABLE);
>               spin_unlock_irq(lock);
> @@ -1021,6 +1036,11 @@ static int de_thread(struct task_struct *tsk)
>               release_task(leader);
>       }
>  
> +     if (unlikely(unsafe_execve_in_progress)) {
> +             mutex_lock(&sig->cred_guard_mutex);
> +             sig->exec_bprm = NULL;
> +     }
> +
>       sig->group_exec_task = NULL;
>       sig->notify_count = 0;
>  
> @@ -1032,6 +1052,11 @@ static int de_thread(struct task_struct *tsk)
>       return 0;
>  
>  killed:
> +     if (unlikely(unsafe_execve_in_progress)) {
> +             mutex_lock(&sig->cred_guard_mutex);
> +             sig->exec_bprm = NULL;
> +     }
> +
>       /* protects against exit_notify() and __exit_signal() */
>       read_lock(&tasklist_lock);
>       sig->group_exec_task = NULL;
> @@ -1114,13 +1139,31 @@ int begin_new_exec(struct linux_binprm * bprm)
>        */
>       trace_sched_prepare_exec(current, bprm);
>  
> +     /* If the binary is not readable then enforce mm->dumpable=0 */
> +     would_dump(bprm, bprm->file);
> +     if (bprm->have_execfd)
> +             would_dump(bprm, bprm->executable);
> +
> +     /*
> +      * Figure out dumpability. Note that this checking only of current
> +      * is wrong, but userspace depends on it. This should be testing
> +      * bprm->secureexec instead.
> +      */
> +     if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
> +         is_dumpability_changed(current_cred(), bprm->cred) ||
> +         !(uid_eq(current_euid(), current_uid()) &&
> +           gid_eq(current_egid(), current_gid())))
> +             set_dumpable(bprm->mm, suid_dumpable);
> +     else
> +             set_dumpable(bprm->mm, SUID_DUMP_USER);
> +

I feel like moving this dumpable stuff around could be a separate patch,
which can then explain how that is correct, why it is needed and all that.

>       /*
>        * Ensure all future errors are fatal.
>        */
>       bprm->point_of_no_return = true;
>  
>       /* Make this the only thread in the thread group */
> -     retval = de_thread(me);
> +     retval = de_thread(me, bprm);
>       if (retval)
>               goto out;
>       /* see the comment in check_unsafe_exec() */
> @@ -1144,11 +1187,6 @@ int begin_new_exec(struct linux_binprm * bprm)
>       if (retval)
>               goto out;
>  
> -     /* If the binary is not readable then enforce mm->dumpable=0 */
> -     would_dump(bprm, bprm->file);
> -     if (bprm->have_execfd)
> -             would_dump(bprm, bprm->executable);
> -
>       /*
>        * Release all of the old mmap stuff
>        */
> @@ -1210,18 +1248,6 @@ int begin_new_exec(struct linux_binprm * bprm)
>  
>       me->sas_ss_sp = me->sas_ss_size = 0;
>  
> -     /*
> -      * Figure out dumpability. Note that this checking only of current
> -      * is wrong, but userspace depends on it. This should be testing
> -      * bprm->secureexec instead.
> -      */
> -     if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
> -         !(uid_eq(current_euid(), current_uid()) &&
> -           gid_eq(current_egid(), current_gid())))
> -             set_dumpable(current->mm, suid_dumpable);
> -     else
> -             set_dumpable(current->mm, SUID_DUMP_USER);
> -
>       perf_event_exec();
>  
>       /*
> @@ -1361,6 +1387,11 @@ static int prepare_bprm_creds(struct linux_binprm 
> *bprm)
>       if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
>               return -ERESTARTNOINTR;
>  
> +     if (unlikely(current->signal->exec_bprm)) {
> +             mutex_unlock(&current->signal->cred_guard_mutex);
> +             return -ERESTARTNOINTR;
> +     }

#1

> +
>       bprm->cred = prepare_exec_creds();
>       if (likely(bprm->cred))
>               return 0;
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 62d35631ba8c..e5bcf812cee0 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2838,6 +2838,12 @@ static ssize_t proc_pid_attr_write(struct file * file, 
> const char __user * buf,
>       if (rv < 0)
>               goto out_free;
>  

Comment explaining why this needs checking goes here.

> +     if (unlikely(current->signal->exec_bprm)) {
> +             mutex_unlock(&current->signal->cred_guard_mutex);
> +             rv = -ERESTARTNOINTR;
> +             goto out_free;
> +     }
> +
>       rv = security_setprocattr(PROC_I(inode)->op.lsmid,
>                                 file->f_path.dentry->d_name.name, page,
>                                 count);
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index a102a10f833f..fb0361911489 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct 
> task_struct *);
>  extern struct cred *cred_alloc_blank(void);
>  extern struct cred *prepare_creds(void);
>  extern struct cred *prepare_exec_creds(void);
> +extern bool is_dumpability_changed(const struct cred *, const struct cred *);
>  extern int commit_creds(struct cred *);
>  extern void abort_creds(struct cred *);
>  extern struct cred *prepare_kernel_cred(struct task_struct *);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 1ef1edbaaf79..3c47d8b55863 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -237,9 +237,27 @@ struct signal_struct {
>       struct mm_struct *oom_mm;       /* recorded mm when the thread group got
>                                        * killed by the oom killer */
>  
> +     struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access
> +                                      * against new credentials while
> +                                      * de_thread is waiting for other
> +                                      * traced threads to terminate.
> +                                      * Set while de_thread is executing.
> +                                      * The cred_guard_mutex is released
> +                                      * after de_thread() has called
> +                                      * zap_other_threads(), therefore
> +                                      * a fatal signal is guaranteed to be
> +                                      * already pending in the unlikely
> +                                      * event, that
> +                                      * current->signal->exec_bprm happens
> +                                      * to be non-zero after the
> +                                      * cred_guard_mutex was acquired.
> +                                      */
> +
>       struct mutex cred_guard_mutex;  /* guard against foreign influences on
>                                        * credential calculations
>                                        * (notably. ptrace)
> +                                      * Held while execve runs, except when
> +                                      * a sibling thread is being traced.
>                                        * Deprecated do not use in new code.
>                                        * Use exec_update_lock instead.
>                                        */
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 9676965c0981..0b2822c762df 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -375,6 +375,30 @@ static bool cred_cap_issubset(const struct cred *set, 
> const struct cred *subset)
>       return false;
>  }
>  
> +/**
> + * is_dumpability_changed - Will changing creds affect dumpability?
> + * @old: The old credentials.
> + * @new: The new credentials.
> + *
> + * If the @new credentials have no elevated privileges compared to the
> + * @old credentials, the task may remain dumpable.  Otherwise we have
> + * to mark the task as undumpable to avoid information leaks from higher
> + * to lower privilege domains.
> + *
> + * Return: True if the task will become undumpable.
> + */
> +bool is_dumpability_changed(const struct cred *old, const struct cred *new)
> +{
> +     if (!uid_eq(old->euid, new->euid) ||
> +         !gid_eq(old->egid, new->egid) ||
> +         !uid_eq(old->fsuid, new->fsuid) ||
> +         !gid_eq(old->fsgid, new->fsgid) ||
> +         !cred_cap_issubset(old, new))
> +             return true;
> +
> +     return false;
> +}
> +
>  /**
>   * commit_creds - Install new credentials upon the current task
>   * @new: The credentials to be assigned
> @@ -403,11 +427,7 @@ int commit_creds(struct cred *new)
>       get_cred(new); /* we will require a ref for the subj creds too */
>  
>       /* dumpability changes */
> -     if (!uid_eq(old->euid, new->euid) ||
> -         !gid_eq(old->egid, new->egid) ||
> -         !uid_eq(old->fsuid, new->fsuid) ||
> -         !gid_eq(old->fsgid, new->fsgid) ||
> -         !cred_cap_issubset(old, new)) {
> +     if (is_dumpability_changed(old, new)) {
>               if (task->mm)
>                       set_dumpable(task->mm, suid_dumpable);
>               task->pdeath_signal = 0;
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 75a84efad40f..230298817dbf 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -20,6 +20,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/ptrace.h>
>  #include <linux/security.h>
> +#include <linux/binfmts.h>
>  #include <linux/signal.h>
>  #include <linux/uio.h>
>  #include <linux/audit.h>
> @@ -453,6 +454,28 @@ static int ptrace_attach(struct task_struct *task, long 
> request,
>                               return retval;
>               }
>  
> +             if (unlikely(task == task->signal->group_exec_task)) {
> +                     retval = down_write_killable(&task->signal->exec_update_lock);
> +                     if (retval)
> +                             return retval;

This could be written like:

                        ACQUIRE(rwsem_write_kill, guard)(&task->signal->exec_update_lock);
                        retval = ACQUIRE_ERR(rwsem_write_kill, guard);
                        if (retval)
                                return retval;

> +
> +                     scoped_guard (task_lock, task) {
> +                             struct linux_binprm *bprm = task->signal->exec_bprm;
> +                             const struct cred __rcu *old_cred = task->real_cred;
> +                             struct mm_struct *old_mm = task->mm;
> +
> +                             rcu_assign_pointer(task->real_cred, bprm->cred);
> +                             task->mm = bprm->mm;
> +                             retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> +                             rcu_assign_pointer(task->real_cred, old_cred);
> +                             task->mm = old_mm;
> +                     }
> +
> +                     up_write(&task->signal->exec_update_lock);

And then this goes away ^

> +                     if (retval)
> +                             return retval;
> +             }
> +
>               scoped_guard (write_lock_irq, &tasklist_lock) {
>                       if (unlikely(task->exit_state))
>                               return -EPERM;
> @@ -488,6 +511,14 @@ static int ptrace_traceme(void)
>  {
>       int ret = -EPERM;
>  

This needs comments.

> +     if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> +             return -ERESTARTNOINTR;
> +
> +     if (unlikely(current->signal->exec_bprm)) {
> +             mutex_unlock(&current->signal->cred_guard_mutex);
> +             return -ERESTARTNOINTR;
> +     }

#2

> +
>       write_lock_irq(&tasklist_lock);
>       /* Are we already being traced? */
>       if (!current->ptrace) {
> @@ -503,6 +534,7 @@ static int ptrace_traceme(void)
>               }
>       }
>       write_unlock_irq(&tasklist_lock);
> +     mutex_unlock(&current->signal->cred_guard_mutex);
>  
>       return ret;
>  }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 41aa761c7738..d61fc275235a 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1994,9 +1994,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
>        * Make sure we cannot change seccomp or nnp state via TSYNC
>        * while another thread is in the middle of calling exec.
>        */
> -     if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
> -         mutex_lock_killable(&current->signal->cred_guard_mutex))
> -             goto out_put_fd;
> +     if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
> +             if (mutex_lock_killable(&current->signal->cred_guard_mutex))
> +                     goto out_put_fd;
> +
> +             if (unlikely(current->signal->exec_bprm)) {
> +                     mutex_unlock(&current->signal->cred_guard_mutex);
> +                     goto out_put_fd;
> +             }

#3, and after typing this same pattern 3 times, you didn't think it
needed a helper function?

> +     }
>  
>       spin_lock_irq(&current->sighand->siglock);
>  

Reply via email to