This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the tracer process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
The solution is to detect this situation and allow ptrace_attach to continue by temporarily releasing the cred_guard_mutex, while de_thread() is still waiting for traced zombies to be eventually released by the tracer. In the case of the thread group leader we only have to wait for the thread to become a zombie, which may also need co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already is in execve, we simply retry the ptrace_may_access check while temporarily installing the new credentials and dumpability which are about to be used after execve completes. If the ptrace_attach happens on a thread that is a sibling-thread of the thread doing execve, it is sufficient to check against the old credentials, as this thread will be waited for, before the new credentials are installed.
Other threads die quickly since the cred_guard_mutex is released, but a deadly signal is already pending. In case the mutex_lock_killable misses the signal, the non-zero current->signal->exec_bprm makes sure they release the mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de...
See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change.
Note that since the test case was originally designed to test the ptrace_attach returning an error in this situation, the test expectation needed to be adjusted, to allow the API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger bernd.edlinger@hotmail.de --- fs/exec.c | 69 ++++++++++++++++------- fs/proc/base.c | 6 ++ include/linux/cred.h | 1 + include/linux/sched/signal.h | 18 ++++++ kernel/cred.c | 28 +++++++-- kernel/ptrace.c | 32 +++++++++++ kernel/seccomp.c | 12 +++- tools/testing/selftests/ptrace/vmaccess.c | 23 +++++--- 8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version: make PTRACE_ATTACH return -EAGAIN, instead of execve returning -ERESTARTSYS. Added some lessons learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without changing the API.
Note: I actually got one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm; 418 const struct cred *old_cred; 419 struct mm_struct *old_mm; 420 421 retval = down_write_killable(&task->signal->exec_update_lock); 422 if (retval) 423 goto unlock_creds; 424 task_lock(task);
425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change. But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned before de_thread starts; I hope you like this version. If not, v10 is of course also acceptable.
Thanks Bernd.
diff --git a/fs/exec.c b/fs/exec.c index 2f2b0acec4f0..902d3b230485 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm) return 0; }
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t = tsk; + bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group; @@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
+ while_each_thread(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + sig->exec_bprm = bprm; + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk) release_task(leader); }
+ if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + sig->group_exec_task = NULL; sig->notify_count = 0;
@@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk) return 0;
killed: + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + is_dumpability_changed(current_cred(), bprm->cred) || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(bprm->mm, suid_dumpable); + else + set_dumpable(bprm->mm, SUID_DUMP_USER); + /* * Ensure all future errors are fatal. */ @@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Make this the only thread in the thread group. */ - retval = de_thread(me); + retval = de_thread(me, bprm); if (retval) goto out;
@@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */ - would_dump(bprm, bprm->file); - if (bprm->have_execfd) - would_dump(bprm, bprm->executable); - /* * Release all of the old mmap stuff */ @@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /* - * Figure out dumpability. Note that this checking only of current - * is wrong, but userspace depends on it. This should be testing - * bprm->secureexec instead. - */ - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - !(uid_eq(current_euid(), current_uid()) && - gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); - else - set_dumpable(current->mm, SUID_DUMP_USER); - perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true);
@@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..0da9adfadb48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); diff --git a/include/linux/cred.h b/include/linux/cred.h index f923528d5cc4..b01e309f5686 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..14df7073a0a8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -234,9 +234,27 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access + * against new credentials while + * de_thread is waiting for other + * traced threads to terminate. + * Set while de_thread is executing. + * The cred_guard_mutex is released + * after de_thread() has called + * zap_other_threads(), therefore + * a fatal signal is guaranteed to be + * already pending in the unlikely + * event, that + * current->signal->exec_bprm happens + * to be non-zero after the + * cred_guard_mutex was acquired. + */ + struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) + * Held while execve runs, except when + * a sibling thread is being traced. * Deprecated do not use in new code. * Use exec_update_lock instead. */ diff --git a/kernel/cred.c b/kernel/cred.c index 98cb4eca23fb..586cb6c7cf6b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; }
+/** + * is_dumpability_changed - Will changing creds from old to new + * affect the dumpability in commit_creds? + * + * Return: false - dumpability will not be changed in commit_creds. + * Return: true - dumpability will be changed to non-dumpable. + * + * @old: The old credentials + * @new: The new credentials + */ +bool is_dumpability_changed(const struct cred *old, const struct cred *new) +{ + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || + !cred_cap_issubset(old, new)) + return true; + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -467,11 +489,7 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */ - if (!uid_eq(old->euid, new->euid) || - !gid_eq(old->egid, new->egid) || - !uid_eq(old->fsuid, new->fsuid) || - !gid_eq(old->fsgid, new->fsgid) || - !cred_cap_issubset(old, new)) { + if (is_dumpability_changed(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..eb1c450bb7d7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> @@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request, if (retval) goto unlock_creds;
+ if (unlikely(task->in_execve)) { + struct linux_binprm *bprm = task->signal->exec_bprm; + const struct cred __rcu *old_cred; + struct mm_struct *old_mm; + + retval = down_write_killable(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + task_lock(task); + old_cred = task->real_cred; + old_mm = task->mm; + rcu_assign_pointer(task->real_cred, bprm->cred); + task->mm = bprm->mm; + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + rcu_assign_pointer(task->real_cred, old_cred); + task->mm = old_mm; + task_unlock(task); + up_write(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + } + write_lock_irq(&tasklist_lock); retval = -EPERM; if (unlikely(task->exit_state)) @@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
+ if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -523,6 +554,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(&current->signal->cred_guard_mutex);
return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..b29bbfa0b044 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(&current->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + goto out_put_fd; + } + }
spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..3b7d81fb99bb 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -39,8 +39,15 @@ TEST(vmaccess) f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, NULL, 0); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); }
TEST(attach) @@ -57,22 +64,24 @@ TEST(attach)
sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); - ASSERT_EQ(k, -1); + ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); - sleep(1); - k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid);
Hi Bernd,
kernel test robot noticed the following build warnings:
[auto build test WARNING on kees/for-next/execve] [also build test WARNING on kees/for-next/seccomp shuah-kselftest/next shuah-kselftest/fixes linus/master v6.6] [cannot apply to next-20231030] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Bernd-Edlinger/exec-Fix-dead-... base: https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve patch link: https://lore.kernel.org/r/AS8P193MB1285DF698D7524EDE22ABFA1E4A1A%40AS8P193MB... patch subject: [PATCH v12] exec: Fix dead-lock in de_thread with ptrace_attach config: loongarch-randconfig-002-20231030 (https://download.01.org/0day-ci/archive/20231030/202310301604.K866zRJ8-lkp@i...) compiler: loongarch64-linux-gcc (GCC) 13.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231030/202310301604.K866zRJ8-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/oe-kbuild-all/202310301604.K866zRJ8-lkp@intel.com/
All warnings (new ones prefixed by >>):
kernel/cred.c:443: warning: duplicate section name 'Return'
vim +/Return +443 kernel/cred.c
435 436 /** 437 * is_dumpability_changed - Will changing creds from old to new 438 * affect the dumpability in commit_creds? 439 * 440 * Return: false - dumpability will not be changed in commit_creds. 441 * Return: true - dumpability will be changed to non-dumpable. 442 *
443 * @old: The old credentials
444 * @new: The new credentials 445 */ 446 bool is_dumpability_changed(const struct cred *old, const struct cred *new) 447 { 448 if (!uid_eq(old->euid, new->euid) || 449 !gid_eq(old->egid, new->egid) || 450 !uid_eq(old->fsuid, new->fsuid) || 451 !gid_eq(old->fsgid, new->fsgid) || 452 !cred_cap_issubset(old, new)) 453 return true; 454 455 return false; 456 } 457
This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the tracer process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
The solution is to detect this situation and allow ptrace_attach to continue by temporarily releasing the cred_guard_mutex, while de_thread() is still waiting for traced zombies to be eventually released by the tracer. In the case of the thread group leader we only have to wait for the thread to become a zombie, which may also need co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already is in execve, we simply retry the ptrace_may_access check while temporarily installing the new credentials and dumpability which are about to be used after execve completes. If the ptrace_attach happens on a thread that is a sibling-thread of the thread doing execve, it is sufficient to check against the old credentials, as this thread will be waited for, before the new credentials are installed.
Other threads die quickly since the cred_guard_mutex is released, but a deadly signal is already pending. In case the mutex_lock_killable misses the signal, the non-zero current->signal->exec_bprm makes sure they release the mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de...
See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change.
Note that since the test case was originally designed to test the ptrace_attach returning an error in this situation, the test expectation needed to be adjusted, to allow the API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger bernd.edlinger@hotmail.de --- fs/exec.c | 69 ++++++++++++++++------- fs/proc/base.c | 6 ++ include/linux/cred.h | 1 + include/linux/sched/signal.h | 18 ++++++ kernel/cred.c | 28 +++++++-- kernel/ptrace.c | 32 +++++++++++ kernel/seccomp.c | 12 +++- tools/testing/selftests/ptrace/vmaccess.c | 23 +++++--- 8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version: make PTRACE_ATTACH return -EAGAIN, instead of execve returning -ERESTARTSYS. Added some lessons learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without changing the API.
Note: I actually got one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm; 418 const struct cred *old_cred; 419 struct mm_struct *old_mm; 420 421 retval = down_write_killable(&task->signal->exec_update_lock); 422 if (retval) 423 goto unlock_creds; 424 task_lock(task);
425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change. But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned before de_thread starts; I hope you like this version. If not, v10 is of course also acceptable.
v13: Fixed the duplicated Return section in the kernel-doc header of is_dumpability_changed, which was reported by the kernel test robot.
Thanks Bernd.
diff --git a/fs/exec.c b/fs/exec.c index 2f2b0acec4f0..902d3b230485 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm) return 0; }
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t = tsk; + bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group; @@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
+ while_each_thread(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + sig->exec_bprm = bprm; + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk) release_task(leader); }
+ if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + sig->group_exec_task = NULL; sig->notify_count = 0;
@@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk) return 0;
killed: + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + is_dumpability_changed(current_cred(), bprm->cred) || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(bprm->mm, suid_dumpable); + else + set_dumpable(bprm->mm, SUID_DUMP_USER); + /* * Ensure all future errors are fatal. */ @@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Make this the only thread in the thread group. */ - retval = de_thread(me); + retval = de_thread(me, bprm); if (retval) goto out;
@@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */ - would_dump(bprm, bprm->file); - if (bprm->have_execfd) - would_dump(bprm, bprm->executable); - /* * Release all of the old mmap stuff */ @@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /* - * Figure out dumpability. Note that this checking only of current - * is wrong, but userspace depends on it. This should be testing - * bprm->secureexec instead. - */ - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - !(uid_eq(current_euid(), current_uid()) && - gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); - else - set_dumpable(current->mm, SUID_DUMP_USER); - perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true);
@@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..0da9adfadb48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); diff --git a/include/linux/cred.h b/include/linux/cred.h index f923528d5cc4..b01e309f5686 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..14df7073a0a8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -234,9 +234,27 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access + * against new credentials while + * de_thread is waiting for other + * traced threads to terminate. + * Set while de_thread is executing. + * The cred_guard_mutex is released + * after de_thread() has called + * zap_other_threads(), therefore + * a fatal signal is guaranteed to be + * already pending in the unlikely + * event, that + * current->signal->exec_bprm happens + * to be non-zero after the + * cred_guard_mutex was acquired. + */ + struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) + * Held while execve runs, except when + * a sibling thread is being traced. * Deprecated do not use in new code. * Use exec_update_lock instead. */ diff --git a/kernel/cred.c b/kernel/cred.c index 98cb4eca23fb..586cb6c7cf6b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; }
+/** + * is_dumpability_changed - Will changing creds from old to new + * affect the dumpability in commit_creds? + * + * Return: false - dumpability will not be changed in commit_creds. + * true - dumpability will be changed to non-dumpable. + * + * @old: The old credentials + * @new: The new credentials + */ +bool is_dumpability_changed(const struct cred *old, const struct cred *new) +{ + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || + !cred_cap_issubset(old, new)) + return true; + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -467,11 +489,7 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */ - if (!uid_eq(old->euid, new->euid) || - !gid_eq(old->egid, new->egid) || - !uid_eq(old->fsuid, new->fsuid) || - !gid_eq(old->fsgid, new->fsgid) || - !cred_cap_issubset(old, new)) { + if (is_dumpability_changed(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..eb1c450bb7d7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> @@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request, if (retval) goto unlock_creds;
+ if (unlikely(task->in_execve)) { + struct linux_binprm *bprm = task->signal->exec_bprm; + const struct cred __rcu *old_cred; + struct mm_struct *old_mm; + + retval = down_write_killable(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + task_lock(task); + old_cred = task->real_cred; + old_mm = task->mm; + rcu_assign_pointer(task->real_cred, bprm->cred); + task->mm = bprm->mm; + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + rcu_assign_pointer(task->real_cred, old_cred); + task->mm = old_mm; + task_unlock(task); + up_write(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + } + write_lock_irq(&tasklist_lock); retval = -EPERM; if (unlikely(task->exit_state)) @@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -523,6 +554,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(¤t->signal->cred_guard_mutex);
return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..b29bbfa0b044 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(¤t->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(¤t->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + goto out_put_fd; + } + }
spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..3b7d81fb99bb 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -39,8 +39,15 @@ TEST(vmaccess) f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, NULL, 0); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); }
TEST(attach) @@ -57,22 +64,24 @@ TEST(attach)
sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); - ASSERT_EQ(k, -1); + ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); - sleep(1); - k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid);
This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
The problem happens when a tracer tries to ptrace_attach to a multi-threaded process that does an execve in one of the threads at the same time, without doing that in a forked sub-process. That means: There is a race condition, when one or more of the threads are already ptraced, but the thread that invoked the execve is not yet traced. Now in this case the execve locks the cred_guard_mutex and waits for de_thread to complete. But that waits for the traced sibling threads to exit, and those have to wait for the tracer to receive the exit signal, but the tracer cannot call wait right now, because it is waiting for the ptrace call to complete, and this never happens. The traced process and the tracer are now in a deadlock situation, and can only be killed by a fatal signal.
The solution is to detect this situation and allow ptrace_attach to continue by temporarily releasing the cred_guard_mutex, while de_thread() is still waiting for traced zombies to be eventually released by the tracer. In the case of the thread group leader we only have to wait for the thread to become a zombie, which may also need co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already is in execve, we simply retry the ptrace_may_access check while temporarily installing the new credentials and dumpability which are about to be used after execve completes. If the ptrace_attach happens on a thread that is a sibling-thread of the thread doing execve, it is sufficient to check against the old credentials, as this thread will be waited for, before the new credentials are installed.
Other threads die quickly since the cred_guard_mutex is released, but a deadly signal is already pending. In case the mutex_lock_killable misses the signal, the non-zero current->signal->exec_bprm makes sure they release the mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de...
See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change.
Note that since the test case was originally designed to test the ptrace_attach returning an error in this situation, the test expectation needed to be adjusted, to allow the API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger bernd.edlinger@hotmail.de --- fs/exec.c | 69 ++++++++++++++++------- fs/proc/base.c | 6 ++ include/linux/cred.h | 1 + include/linux/sched/signal.h | 18 ++++++ kernel/cred.c | 28 +++++++-- kernel/ptrace.c | 32 +++++++++++ kernel/seccomp.c | 12 +++- tools/testing/selftests/ptrace/vmaccess.c | 23 +++++--- 8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version: make PTRACE_ATTACH return -EAGAIN, instead of having execve return -ERESTARTSYS. Added some lessons learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without changing the API.
Note: I got actually one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm; 418 const struct cred *old_cred; 419 struct mm_struct *old_mm; 420 421 retval = down_write_killable(&task->signal->exec_update_lock); 422 if (retval) 423 goto unlock_creds; 424 task_lock(task);
425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change. But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned before de_thread starts, hope you like this version. If not, the v10 is of course also acceptable.
v13: Fixed duplicated Return section in function header of is_dumpability_changed which was reported by the kernel test robot
v14: rebased to v6.7, refreshed and retested. And added a more detailed description of the actual bug.
Thanks Bernd.
diff --git a/fs/exec.c b/fs/exec.c index 6d9ed2d765ef..f2cf7c58fe16 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1039,11 +1039,13 @@ static int exec_mmap(struct mm_struct *mm) return 0; }
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t = tsk; + bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group; @@ -1066,6 +1068,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
+ while_each_thread(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + sig->exec_bprm = bprm; + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1156,6 +1171,11 @@ static int de_thread(struct task_struct *tsk) release_task(leader); }
+ if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + sig->group_exec_task = NULL; sig->notify_count = 0;
@@ -1167,6 +1187,11 @@ static int de_thread(struct task_struct *tsk) return 0;
killed: + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1251,6 +1276,24 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + is_dumpability_changed(current_cred(), bprm->cred) || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(bprm->mm, suid_dumpable); + else + set_dumpable(bprm->mm, SUID_DUMP_USER); + /* * Ensure all future errors are fatal. */ @@ -1259,7 +1302,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Make this the only thread in the thread group. */ - retval = de_thread(me); + retval = de_thread(me, bprm); if (retval) goto out;
@@ -1282,11 +1325,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */ - would_dump(bprm, bprm->file); - if (bprm->have_execfd) - would_dump(bprm, bprm->executable); - /* * Release all of the old mmap stuff */ @@ -1348,18 +1386,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /* - * Figure out dumpability. Note that this checking only of current - * is wrong, but userspace depends on it. This should be testing - * bprm->secureexec instead. - */ - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - !(uid_eq(current_euid(), current_uid()) && - gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); - else - set_dumpable(current->mm, SUID_DUMP_USER); - perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true);
@@ -1478,6 +1504,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index dd31e3b6bf77..99ff3420138b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2784,6 +2784,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); diff --git a/include/linux/cred.h b/include/linux/cred.h index 2976f534a7a3..a1a1ac38f749 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b929..85d8f8f2f44f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -234,9 +234,27 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access + * against new credentials while + * de_thread is waiting for other + * traced threads to terminate. + * Set while de_thread is executing. + * The cred_guard_mutex is released + * after de_thread() has called + * zap_other_threads(), therefore + * a fatal signal is guaranteed to be + * already pending in the unlikely + * event, that + * current->signal->exec_bprm happens + * to be non-zero after the + * cred_guard_mutex was acquired. + */ + struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) + * Held while execve runs, except when + * a sibling thread is being traced. * Deprecated do not use in new code. * Use exec_update_lock instead. */ diff --git a/kernel/cred.c b/kernel/cred.c index c033a201c808..72aadde3f10c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -375,6 +375,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; }
+/** + * is_dumpability_changed - Will changing creds from old to new + * affect the dumpability in commit_creds? + * + * Return: false - dumpability will not be changed in commit_creds. + * true - dumpability will be changed to non-dumpable. + * + * @old: The old credentials + * @new: The new credentials + */ +bool is_dumpability_changed(const struct cred *old, const struct cred *new) +{ + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || + !cred_cap_issubset(old, new)) + return true; + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -403,11 +425,7 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */ - if (!uid_eq(old->euid, new->euid) || - !gid_eq(old->egid, new->egid) || - !uid_eq(old->fsuid, new->fsuid) || - !gid_eq(old->fsgid, new->fsgid) || - !cred_cap_issubset(old, new)) { + if (is_dumpability_changed(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..578bc02eea27 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> @@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request, if (retval) goto unlock_creds;
+ if (unlikely(task->in_execve)) { + struct linux_binprm *bprm = task->signal->exec_bprm; + const struct cred __rcu *old_cred; + struct mm_struct *old_mm; + + retval = down_write_killable(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + task_lock(task); + old_cred = task->real_cred; + old_mm = task->mm; + rcu_assign_pointer(task->real_cred, bprm->cred); + task->mm = bprm->mm; + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + rcu_assign_pointer(task->real_cred, old_cred); + task->mm = old_mm; + task_unlock(task); + up_write(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + } + write_lock_irq(&tasklist_lock); retval = -EPERM; if (unlikely(task->exit_state)) @@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -523,6 +554,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(¤t->signal->cred_guard_mutex);
return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..b29bbfa0b044 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(¤t->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(¤t->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(¤t->signal->cred_guard_mutex); + goto out_put_fd; + } + }
spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..3b7d81fb99bb 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -39,8 +39,15 @@ TEST(vmaccess) f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, NULL, 0); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); }
TEST(attach) @@ -57,22 +64,24 @@ TEST(attach)
sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); - ASSERT_EQ(k, -1); + ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); - sleep(1); - k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid);
On Mon, Jan 15, 2024 at 08:22:19PM +0100, Bernd Edlinger wrote:
This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
Not entirely sure why I've been added to the cc; this doesn't seem like it's even remotely within my realm of expertise.
+++ b/include/linux/cred.h @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *);
Using 'extern' for function declarations is deprecated. More importantly, you have two arguments of the same type, and how do I know which one is which if you don't name them?
+++ b/kernel/cred.c @@ -375,6 +375,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; } +/**
- is_dumpability_changed - Will changing creds from old to new
- affect the dumpability in commit_creds?
- Return: false - dumpability will not be changed in commit_creds.
true - dumpability will be changed to non-dumpable.
- @old: The old credentials
- @new: The new credentials
- */
Does kernel-doc really parse this correctly? Normal style would be:
/** * is_dumpability_changed - Will changing creds affect dumpability? * @old: The old credentials. * @new: The new credentials. * * If the @new credentials have no elevated privileges compared to the * @old credentials, the task may remain dumpable. Otherwise we have * to mark the task as undumpable to avoid information leaks from higher * to lower privilege domains. * * Return: True if the task will become undumpable. */
@@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
- if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
Do you really want this to be interruptible by a timer signal or a window resize event?
On 1/15/24 20:37, Matthew Wilcox wrote:
On Mon, Jan 15, 2024 at 08:22:19PM +0100, Bernd Edlinger wrote:
This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
Not entirely sure why I've been added to the cc; this doesn't seem like it's even remotely within my realm of expertise.
Ah, okay, never mind. A couple new email addresses were found this time when I used ./scripts/get_maintainer.pl
+++ b/include/linux/cred.h @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *);
Using 'extern' for function declarations is deprecated. More importantly, you have two arguments of the same type, and how do I know which one is which if you don't name them?
+++ b/kernel/cred.c @@ -375,6 +375,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; } +/**
- is_dumpability_changed - Will changing creds from old to new
- affect the dumpability in commit_creds?
- Return: false - dumpability will not be changed in commit_creds.
true - dumpability will be changed to non-dumpable.
- @old: The old credentials
- @new: The new credentials
- */
Does kernel-doc really parse this correctly? Normal style would be:
Apparently yes, but I think I only added those lines to silence some automatic checking bots.
/**
- is_dumpability_changed - Will changing creds affect dumpability?
- @old: The old credentials.
- @new: The new credentials.
- If the @new credentials have no elevated privileges compared to the
- @old credentials, the task may remain dumpable. Otherwise we have
- to mark the task as undumpable to avoid information leaks from higher
- to lower privilege domains.
- Return: True if the task will become undumpable.
*/
Thanks a lot, that looks much better. I will use your suggestion as is, when I re-send the patch next time.
@@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
- if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
Do you really want this to be interruptible by a timer signal or a window resize event?
I think that is kind of okay, as most of the existing users lock the mutex also interruptible, so I just wanted to follow those examples.
Thanks Bernd.
I'll try to recall this problem and actually read the patch tomorrow...
Hmm. but it doesn't apply to Linus's tree, you need to rebase it. In particular, please note the recent commit 5431fdd2c181dd2eac2 ("ptrace: Convert ptrace_attach() to use lock guards")
On 01/15, Bernd Edlinger wrote:
The problem happens when a tracer tries to ptrace_attach to a multi-threaded process, that does an execve in one of the threads at the same time, without doing that in a forked sub-process. That means: There is a race condition, when one or more of the threads are already ptraced, but the thread that invoked the execve is not yet traced. Now in this case the execve locks the cred_guard_mutex and waits for de_thread to complete. But that waits for the traced sibling threads to exit, and those have to wait for the tracer to receive the exit signal, but the tracer cannot call wait right now, because it is waiting for the ptrace call to complete, and this never does not happen. The traced process and the tracer are now in a deadlock situation, and can only be killed by a fatal signal.
This looks very confusing to me. And even misleading.
So IIRC the problem is "simple".
de_thread() sleeps with cred_guard_mutex waiting for other threads to exit and pass release_task/__exit_signal.
If one of the sub-threads is traced, debugger should do ptrace_detach() or wait() to release this tracee, the killed tracee won't autoreap.
Now. If debugger tries to take the same cred_guard_mutex before detach/wait we have a deadlock. This is not specific to ptrace_attach(), proc_pid_attr_write() takes this lock too.
Right? Or are there other issues?
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock;
struct task_struct *t = tsk;
bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group;
@@ -1066,6 +1068,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
- while_each_thread(tsk, t) {
for_other_threads()
if (unlikely(t->ptrace)
&& (t != tsk->group_leader || !t->exit_state))
unsafe_execve_in_progress = true;
The !t->exit_state is not right... This sub-thread can already be a zombie with ->exit_state != 0 but see above, it won't be reaped until the debugger does wait().
- if (unlikely(unsafe_execve_in_progress)) {
spin_unlock_irq(lock);
sig->exec_bprm = bprm;
mutex_unlock(&sig->cred_guard_mutex);
spin_lock_irq(lock);
I don't understand why do we need to unlock and lock siglock here...
But my main question is why we need the unsafe_execve_in_progress boolean. If this patch is correct and de_thread() can drop and re-acquire cred_guard_mutex when one of the threads is traced, then why can't we do this unconditionally?
Oleg.
On 1/16/24 16:22, Oleg Nesterov wrote:
I'll try to recall this problem and actually read the patch tommorrow...
Hmm. but it doesn't apply to Linus's tree, you need to rebase it. In particular, please note the recent commit 5431fdd2c181dd2eac2 ("ptrace: Convert ptrace_attach() to use lock guards")
Oh, how ugly... Will this new C++-like "feature" ever make it into a stable branch?
On 01/15, Bernd Edlinger wrote:
The problem happens when a tracer tries to ptrace_attach to a multi-threaded process, that does an execve in one of the threads at the same time, without doing that in a forked sub-process. That means: There is a race condition, when one or more of the threads are already ptraced, but the thread that invoked the execve is not yet traced. Now in this case the execve locks the cred_guard_mutex and waits for de_thread to complete. But that waits for the traced sibling threads to exit, and those have to wait for the tracer to receive the exit signal, but the tracer cannot call wait right now, because it is waiting for the ptrace call to complete, and this never does not happen. The traced process and the tracer are now in a deadlock situation, and can only be killed by a fatal signal.
This looks very confusing to me. And even misleading.
So IIRC the problem is "simple".
de_thread() sleeps with cred_guard_mutex waiting for other threads to exit and pass release_task/__exit_signal.
If one of the sub-threads is traced, debugger should do ptrace_detach() or wait() to release this tracee, the killed tracee won't autoreap.
Yes, but the tracer has to do its job, and that is to ptrace_attach the remaining threads. It does not know that it would avoid a dead-lock if it called wait() instead of ptrace_attach. It does not know that the tracee has just called execve in one of the not yet traced threads.
Now. If debugger tries to take the same cred_guard_mutex before detach/wait we have a deadlock. This is not specific to ptrace_attach(), proc_pid_attr_write() takes this lock too.
Right? Or are there other issues?
No, proc_pid_attr_write has no problem if it waits for cred_guard_mutex, because it is only called from one of the sibling threads, and zap_other_threads sends a SIGKILL to each of them, thus the mutex_lock_interruptible will stop waiting, and the thread will exit normally. It is only problematic when another process wants to lock the cred_guard_mutex, because it is not receiving a signal, when de_thread is waiting. The only other place where I am aware of this happening is ptrace_attach.
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock;
struct task_struct *t = tsk;
bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group;
@@ -1066,6 +1068,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
- while_each_thread(tsk, t) {
for_other_threads()
Ah, okay.
if (unlikely(t->ptrace)
&& (t != tsk->group_leader || !t->exit_state))
unsafe_execve_in_progress = true;
The !t->exit_state is not right... This sub-thread can already be a zombie with ->exit_state != 0 but see above, it won't be reaped until the debugger does wait().
I don't think so. de_thread() handles the group_leader differently than normal threads. That means normal threads have to wait for being released from the zombie state by the tracer: sig->notify_count > 0, and de_thread is woken up by __exit_signal. Once those are gone, de_thread waits for the group leader to reach exit_state = ZOMBIE, but again only if the group_leader is not the current thread: signal->notify_count < 0, and de_thread is woken up by exit_notify. So this reflects exactly what condition has to be met, see:
sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed;
so when the group_leader's exit_state is already != 0 then the second wait state will not be entered.
- if (unlikely(unsafe_execve_in_progress)) {
spin_unlock_irq(lock);
sig->exec_bprm = bprm;
mutex_unlock(&sig->cred_guard_mutex);
spin_lock_irq(lock);
I don't understand why do we need to unlock and lock siglock here...
That is just a precaution because I did want to release the mutexes exactly in the reverse order as they were acquired.
But my main question is why do we need the unsafe_execve_in_progress boolean. If this patch is correct and de_thread() can drop and re-acquire cred_guard_mutex when one of the threads is traced, then why can't we do this unconditionally?
I just wanted to keep the impact of the change as small as possible, including possible performance degradation due to double checking of credentials. The worst thing that could happen with this approach is that a situation where a dead-lock is imminent today still does not work correctly, but when no tracer is attached, nothing will change or be less performant than before.
Bernd.
On 01/17, Bernd Edlinger wrote:
The problem happens when a tracer tries to ptrace_attach to a multi-threaded process, that does an execve in one of the threads at the same time, without doing that in a forked sub-process. That means: There is a race condition, when one or more of the threads are already ptraced, but the thread that invoked the execve is not yet traced. Now in this case the execve locks the cred_guard_mutex and waits for de_thread to complete. But that waits for the traced sibling threads to exit, and those have to wait for the tracer to receive the exit signal, but the tracer cannot call wait right now, because it is waiting for the ptrace call to complete, and this never happens. The traced process and the tracer are now in a deadlock situation, and can only be killed by a fatal signal.
This looks very confusing to me. And even misleading.
So IIRC the problem is "simple".
de_thread() sleeps with cred_guard_mutex waiting for other threads to exit and pass release_task/__exit_signal.
If one of the sub-threads is traced, debugger should do ptrace_detach() or wait() to release this tracee, the killed tracee won't autoreap.
Yes, but the tracer has to do its job, and that is to ptrace_attach the remaining threads; it does not know that it would avoid a dead-lock when it calls wait(), instead of ptrace_attach. It does not know that the tracee has just called execve in one of the not yet traced threads.
Hmm. I don't understand you.
I agree we have a problem which should be fixed. Just the changelog looks confusing to me, imo it doesn't explain the race/problem clearly.
Now. If debugger tries to take the same cred_guard_mutex before detach/wait we have a deadlock. This is not specific to ptrace_attach(), proc_pid_attr_write() takes this lock too.
Right? Or are there other issues?
No, proc_pid_attr_write has no problem if it waits for cred_guard_mutex, because it is only called from one of the sibling threads,
OK, thanks, I was wrong. I forgot about "A task may only write its own attributes". So yes, ptrace_attach() is the only source of problematic mutex_lock() today. There were more in the past.
if (unlikely(t->ptrace)
&& (t != tsk->group_leader || !t->exit_state))
unsafe_execve_in_progress = true;
The !t->exit_state is not right... This sub-thread can already be a zombie with ->exit_state != 0 but see above, it won't be reaped until the debugger does wait().
I dont think so. de_thread() handles the group_leader different than normal threads.
I don't follow...
I didn't say that t is a group leader. I said it can be a zombie sub-thread with ->exit_state != 0.
That means normal threads have to wait for being released from the zombie state by the tracer: sig->notify_count > 0, and de_thread is woken up by __exit_signal
That is what I said before. Debugger should release a zombie sub-thread, it won't do __exit_signal() on its own.
- if (unlikely(unsafe_execve_in_progress)) {
spin_unlock_irq(lock);
sig->exec_bprm = bprm;
mutex_unlock(&sig->cred_guard_mutex);
spin_lock_irq(lock);
I don't understand why do we need to unlock and lock siglock here...
That is just a precaution because I did want to release the mutexes exactly in the reverse order as they were acquired.
To me this adds the unnecessary complication.
But my main question is why do we need the unsafe_execve_in_progress boolean. If this patch is correct and de_thread() can drop and re-acquire cred_guard_mutex when one of the threads is traced, then why can't we do this unconditionally?
I just wanted to keep the impact of the change as small as possible,
But the unsafe_execve_in_progress logic increases the impact and complicates the patch.
I think the fix should be as simple as possible. (to be honest, right now I don't think this is a right approach).
including possible performance degradation due to double checking of credentials.
Not sure I understand, but you can add the performance improvements later. Not to mention that this should be justified, and the for_other_threads() loop added by this patch into de_thread() is not nice performance-wise.
Oleg.
On 1/17/24 17:38, Oleg Nesterov wrote:
On 01/17, Bernd Edlinger wrote:
Yes, but the tracer has to do its job, and that is to ptrace_attach the remaining threads; it does not know that it would avoid a dead-lock when it calls wait(), instead of ptrace_attach. It does not know that the tracee has just called execve in one of the not yet traced threads.
Hmm. I don't understand you.
Certainly I am willing to rephrase this until it is understandable. Probably I have just not yet found the proper way to describe the issue here, and your help in resolving that documentation issue is very important to me.
I agree we have a problem which should be fixed. Just the changelog looks confusing to me, imo it doesn't explain the race/problem clearly.
I am trying here to summarize what the test case "attach" in ./tools/testing/selftests/ptrace/vmaccess.c does.
I think it models the use case of a tracer that is trying to attach to a multi-threaded process that is executing execve in a not-yet traced thread while a different sub-thread is already traced. It is not relevant that the test case uses PTRACE_TRACEME to make the sub-thread traced; the same would happen if the tracer uses some out-of-band mechanism like /proc/pid/task to learn the thread_id of the sub-threads and uses ptrace_attach to each of them.
The test case hits the dead-lock because there is a race condition before the PTRACE_ATTACH, and it cannot know that the exit event from the sub-thread is already pending before the PTRACE_ATTACH. Of course a real tracer will not sleep a whole second before a PTRACE_ATTACH, but even if it does a waitpid(-1, &s, WNOHANG) immediately before the PTRACE_ATTACH there is a tiny chance that the execve is entered immediately after waitpid has indicated that there is currently no event pending.
if (unlikely(t->ptrace)
&& (t != tsk->group_leader || !t->exit_state))
unsafe_execve_in_progress = true;
The !t->exit_state is not right... This sub-thread can already be a zombie with ->exit_state != 0 but see above, it won't be reaped until the debugger does wait().
I dont think so. de_thread() handles the group_leader different than normal threads.
I don't follow...
I didn't say that t is a group leader. I said it can be a zombie sub-thread with ->exit_state != 0.
the condition here is
(t != tsk->group_leader || !t->exit_state)
so in other words, if t is a sub-thread, i.e. t != tsk->group_leader then the t->exit_state does not count, and the deadlock is possible.
But if t is the group leader, then t == tsk->group_leader, but a deadlock is only possible when t->exit_state == 0 at this time. The most likely reason for this is PTRACE_O_TRACEEXIT.
I will add a new test case that demonstrates this in the next iteration of this patch. Here is a preview of what I have right now:
/* * Same test as previous, except that * the group leader is ptraced first, * but this time with PTRACE_O_TRACEEXIT, * and the thread that does execve is * not yet ptraced. This exercises the * code block in de_thread where the * if (!thread_group_leader(tsk)) { * is executed and enters a wait state. */ static long thread2_tid; static void *thread2(void *arg) { thread2_tid = syscall(__NR_gettid); sleep(2); execlp("false", "false", NULL); return NULL; }
TEST(attach2) { int s, k, pid = fork();
if (!pid) { pthread_t pt;
pthread_create(&pt, NULL, thread2, NULL); pthread_join(pt, NULL); return; }
sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); k = ptrace(PTRACE_SETOPTIONS, pid, 0L, PTRACE_O_TRACEEXIT); ASSERT_EQ(k, 0); thread2_tid = ptrace(PTRACE_PEEKDATA, pid, &thread2_tid, 0L); ASSERT_NE(thread2_tid, -1); ASSERT_NE(thread2_tid, 0); ASSERT_NE(thread2_tid, pid); k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, 0); sleep(2); /* deadlock may happen here */ k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGTRAP); k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, 0); k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGTRAP); k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, 0); k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, 0); k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 1); k = waitpid(-1, NULL, 0); ASSERT_EQ(k, -1); ASSERT_EQ(errno, ECHILD); }
So the traced process does the execve in the sub-thread, and the tracer attaches the thread leader first, and when the next PTRACE_ATTACH happens, the thread leader is stopped because of the PTRACE_O_TRACEEXIT. So at that time, the t->exit_state == 0, and we receive the following:
k = waitpid(-1, &s, WNOHANG); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
yet the de_thread is not finished now, but only when
k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0);
happens.
The only thing, that is admittedly pretty confusing here, is the fact that the thread2_tid morphs into group leader's pid, at this time, and is therefore never heard of again.
So pid refers to the former thread2_tid from now on, and the former group leader does not enter the usual zombie state here, because the sub-thread takes over its role.
- if (unlikely(unsafe_execve_in_progress)) {
spin_unlock_irq(lock);
sig->exec_bprm = bprm;
mutex_unlock(&sig->cred_guard_mutex);
spin_lock_irq(lock);
I don't understand why do we need to unlock and lock siglock here...
That is just a precaution because I did want to release the mutexes exactly in the reverse order as they were acquired.
To me this adds the unnecessary complication.
Well, the proposed change creates this sequence
mutex_lock(&sig_cred_guard_mutex); spin_lock_irq(lock); mutex_unlock(&sig_cred_guard_mutex); spin_unlock_irq(lock);
I wanted to avoid that, because in a usual real-time os, I'd expect the mutex_unlock to schedule another waiting task, regardless of the spin lock state.
Are you saying that doing this is safe to do in linux, because the scheduling does not happen except when explicitly asked for e.g. by calling schedule() ? And would that also be safe for real time linux port ?
But my main question is why do we need the unsafe_execve_in_progress boolean. If this patch is correct and de_thread() can drop and re-acquire cred_guard_mutex when one of the threads is traced, then why can't we do this unconditionally?
I just wanted to keep the impact of the change as small as possible,
But the unsafe_execve_in_progress logic increases the impact and complicates the patch.
I think the fix should be as simple as possible. (to be honest, right now I don't think this is a right approach).
The main concern was when a set-suid program is executed by execve. Then it makes a difference if the current thread is traced before the execve or not. That means if the current thread is already traced, the decision, which credentials will be used is different than otherwise.
So currently there are two possibilities, either the trace happens before the execve, and the suid-bit will be ignored, or the trace happens after the execve, but it is checked that the now potentially more privileged credentials allow the tracer to proceed.
With this patch we will have a third possibility, that is in order to avoid the possible dead-lock we allow the suid-bit to take effect, but only if the tracer's privileges allow both to attach the current credentials and the new credentials. But I would only do that as a last resort, to avoid the possible dead-lock, and not unless a dead-lock is really expected to happen.
Thanks Bernd.
I'll try to read your email later, just one note for now...
On 01/22, Bernd Edlinger wrote:
I didn't say that t is a group leader. I said it can be a zombie sub-thread with ->exit_state != 0.
the condition here is
(t != tsk->group_leader || !t->exit_state)
so in other words, if t is a sub-thread, i.e. t != tsk->group_leader then the t->exit_state does not count,
Ah indeed, somehow I misread this check as if you skip the sub-threads with ->exit_state != 0.
Sorry for noise.
Oleg.
On Mon, Jan 22, 2024 at 02:24:37PM +0100, Bernd Edlinger wrote:
The main concern was when a set-suid program is executed by execve. Then it makes a difference if the current thread is traced before the execve or not. That means if the current thread is already traced, the decision, which credentials will be used is different than otherwise.
So currently there are two possibilities, either the trace happens before the execve, and the suid-bit will be ignored, or the trace happens after the execve, but it is checked that the now potentially more privileged credentials allow the tracer to proceed.
With this patch we will have a third possibility, that is in order to avoid the possible dead-lock we allow the suid-bit to take effect, but only if the tracer's privileges allow both to attach the current credentials and the new credentials. But I would only do that as a last resort, to avoid the possible dead-lock, and not unless a dead-lock is really expected to happen.
Instead of doing this special cred check (which I am worried could become fragile -- I'd prefer all privilege checks happen in the same place and in the same way...), could we just fail the ptrace_attach of the execve?
On 1/22/24 22:30, Kees Cook wrote:
On Mon, Jan 22, 2024 at 02:24:37PM +0100, Bernd Edlinger wrote:
The main concern was when a set-suid program is executed by execve. Then it makes a difference if the current thread is traced before the execve or not. That means if the current thread is already traced, the decision, which credentials will be used is different than otherwise.
So currently there are two possibilities, either the trace happens before the execve, and the suid-bit will be ignored, or the trace happens after the execve, but it is checked that the now potentially more privileged credentials allow the tracer to proceed.
With this patch we will have a third possibility, that is in order to avoid the possible dead-lock we allow the suid-bit to take effect, but only if the tracer's privileges allow both to attach the current credentials and the new credentials. But I would only do that as a last resort, to avoid the possible dead-lock, and not unless a dead-lock is really expected to happen.
Instead of doing this special cred check (which I am worried could become fragile -- I'd prefer all privilege checks happen in the same place and in the same way...), could we just fail the ptrace_attach of the execve?
Hmm, yes. That is also possible, and that was actually my first approach, but I think the current patch is superior. I have nevertheless tried it again, to get a better picture of the differences between those two approaches.
See below for how that alternative approach would look like: + the advantage of that would be simplicity. + it avoids the dead-lock in the tracer. - it is an API change, which we normally try to avoid. - the adjusted test case(s) show that the tracer cannot successfully attach to the resulting process before the execve'd process starts up. So although there is no suid process involved in my test cases, the traced program simply escapes out of the tracer's control.
The advantage of the current approach would be: + it avoids the dead-lock in the tracer + it avoids a potentially breaking API change. + the tracer is normally able to successfully attach to the resulting process after the execve completes, before it starts to execute. + the debug experience is just better. - worst case that can happen, is that the security policy denies the tracer the access to the new process after the execve. In that case the PTRACE_ATTACH will fail each time it is attempted, in a similar way as the alternate approach. But the overall result is still correct. The privileged process escapes, and that is okay in that case. - it is theoretically possible that the security engine gets confused by the additional call to security_ptrace_access_check, but that will be something that can be fixed, when it happens.
However my main motivation, why I started this work was the security implication.
I assume the tracer is a privileged process, like an anti virus program, that supervises all processes and if it detects some anomaly it can ptrace attach to the target, check what it does and prevent it from doing bad things.
- Currently a non-privileged program can potentially send such a privileged tracer into a deadlock. - With the alternative patch below that non-privileged can no longer send the tracer into a deadlock, but it can still quickly escape out of the tracer's control. - But with my latest patch a sufficiently privileged tracer can neither be sent into a deadlock nor can the attached process escape. Mission completed.
Thanks Bernd.
Here is the alternative patch for reference: diff --git a/fs/exec.c b/fs/exec.c index e88249a1ce07..0a948f5821b7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,6 +1045,8 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t; + bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group; @@ -1067,6 +1069,18 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
+ for_other_threads(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1157,6 +1171,9 @@ static int de_thread(struct task_struct *tsk) release_task(leader); }
+ if (unlikely(unsafe_execve_in_progress)) + mutex_lock(&sig->cred_guard_mutex); + sig->group_exec_task = NULL; sig->notify_count = 0;
@@ -1168,6 +1185,9 @@ static int de_thread(struct task_struct *tsk) return 0;
killed: + if (unlikely(unsafe_execve_in_progress)) + mutex_lock(&sig->cred_guard_mutex); + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1479,6 +1499,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
+ if (unlikely(current->signal->group_exec_task)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..55816320c103 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2785,6 +2785,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free;
+ if (unlikely(current->signal->group_exec_task)) { + mutex_unlock(&current->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d659..162e4c8f7b08 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -444,6 +444,9 @@ static int ptrace_attach(struct task_struct *task, long request, scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &task->signal->cred_guard_mutex) {
+ if (unlikely(task->signal->group_exec_task)) + return -EAGAIN; + scoped_guard (task_lock, task) { retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (retval) @@ -491,6 +494,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
+ if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->group_exec_task)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -506,6 +517,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(&current->signal->cred_guard_mutex);
return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index aca7b437882e..6a136d6ddf7c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(&current->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->group_exec_task)) { + mutex_unlock(&current->signal->cred_guard_mutex); + goto out_put_fd; + } + }
spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..7a51a350a068 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -14,6 +14,7 @@ #include <signal.h> #include <unistd.h> #include <sys/ptrace.h> +#include <sys/syscall.h>
static void *thread(void *arg) { @@ -23,7 +24,7 @@ static void *thread(void *arg)
TEST(vmaccess) { - int f, pid = fork(); + int s, f, pid = fork(); char mm[64];
if (!pid) { @@ -31,19 +32,42 @@ TEST(vmaccess)
pthread_create(&pt, NULL, thread, NULL); pthread_join(pt, NULL); - execlp("true", "true", NULL); + execlp("false", "false", NULL); + return; }
sleep(1); sprintf(mm, "/proc/%d/mem", pid); + /* deadlock did happen here */ f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, &s, WNOHANG); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + f = waitpid(-1, &s, 0); + ASSERT_EQ(f, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 1); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); }
-TEST(attach) +/* + * Same test as previous, except that + * we try to ptrace the group leader, + * which is about to call execve, + * when the other thread is already ptraced. + * This exercises the code in de_thread + * where it is waiting inside the + * while (sig->notify_count) { + * loop. + */ +TEST(attach1) { int s, k, pid = fork();
@@ -52,19 +76,67 @@ TEST(attach)
pthread_create(&pt, NULL, thread, NULL); pthread_join(pt, NULL); - execlp("sleep", "sleep", "2", NULL); + execlp("false", "false", NULL); + return; }
sleep(1); + /* deadlock may happen here */ k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); ASSERT_EQ(k, -1); + ASSERT_EQ(errno, EAGAIN); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, EAGAIN); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 1); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ESRCH); + k = waitpid(-1, NULL, 0); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ECHILD); +} + +/* + * Same test as previous, except that + * the group leader is ptraced first, + * but this time with PTRACE_O_TRACEEXIT, + * and the thread that does execve is + * not yet ptraced. This exercises the + * code block in de_thread where the + * if (!thread_group_leader(tsk)) { + * is executed and enters a wait state. + */ +static long thread2_tid; +static void *thread2(void *arg) +{ + thread2_tid = syscall(__NR_gettid); + sleep(2); + execlp("false", "false", NULL); + return NULL; +} + +TEST(attach2) +{ + int s, k, pid = fork(); + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread2, NULL); + pthread_join(pt, NULL); + return; + } + sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); ASSERT_EQ(k, 0); @@ -72,12 +144,40 @@ TEST(attach) ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_SETOPTIONS, pid, 0L, PTRACE_O_TRACEEXIT); + ASSERT_EQ(k, 0); + thread2_tid = ptrace(PTRACE_PEEKDATA, pid, &thread2_tid, 0L); + ASSERT_NE(thread2_tid, -1); + ASSERT_NE(thread2_tid, 0); + ASSERT_NE(thread2_tid, pid); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + sleep(2); + /* deadlock may happen here */ + k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, EAGAIN); + k = waitpid(-1, &s, 
WNOHANG); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, EAGAIN); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); + k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, EAGAIN); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFEXITED(s), 1); - ASSERT_EQ(WEXITSTATUS(s), 0); + ASSERT_EQ(WEXITSTATUS(s), 1); + k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ESRCH); k = waitpid(-1, NULL, 0); ASSERT_EQ(k, -1); ASSERT_EQ(errno, ECHILD);
On Tue, Jan 23, 2024 at 07:30:52PM +0100, Bernd Edlinger wrote:
- Currently a non-privileged program can potentially send such a privileged
tracer into a deadlock.
- With the alternative patch below that non-privileged can no longer send the
tracer into a deadlock, but it can still quickly escape out of the tracer's control.
- But with my latest patch a sufficiently privileged tracer can neither be
sent into a deadlock nor can the attached process escape. Mission completed.
Thanks for the details. And it would be pretty unfriendly to fail the execve() too (or, rather, it makes the execve failure unpredictable). I'll keep reading your patch...
This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution.
The problem happens when a tracer tries to ptrace_attach to a multi-threaded process, that does an execve in one of the threads at the same time, without doing that in a forked sub-process. That means: There is a race condition, when one or more of the threads are already ptraced, but the thread that invoked the execve is not yet traced. Now in this case the execve locks the cred_guard_mutex and waits for de_thread to complete. But that waits for the traced sibling threads to exit, and those have to wait for the tracer to receive the exit signal, but the tracer cannot call wait right now, because it is waiting for the ptrace call to complete, and this never happens. The traced process and the tracer are now in a deadlock situation, and can only be killed by a fatal signal.
The solution is to detect this situation and allow ptrace_attach to continue by temporarily releasing the cred_guard_mutex, while de_thread() is still waiting for traced zombies to be eventually released by the tracer. In the case of the thread group leader we only have to wait for the thread to become a zombie, which may also need co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already is in execve, we simply retry the ptrace_may_access check while temporarily installing the new credentials and dumpability which are about to be used after execve completes. If the ptrace_attach happens on a thread that is a sibling-thread of the thread doing execve, it is sufficient to check against the old credentials, as this thread will be waited for, before the new credentials are installed.
Other threads die quickly since the cred_guard_mutex is released, but a deadly signal is already pending. In case the mutex_lock_killable misses the signal, the non-zero current->signal->exec_bprm makes sure they release the mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de...
See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change.
Note that since the test case was originally designed to test the ptrace_attach returning an error in this situation, the test expectation needed to be adjusted, to allow the API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger bernd.edlinger@hotmail.de --- fs/exec.c | 69 ++++++++--- fs/proc/base.c | 6 + include/linux/cred.h | 1 + include/linux/sched/signal.h | 18 +++ kernel/cred.c | 30 ++++- kernel/ptrace.c | 31 +++++ kernel/seccomp.c | 12 +- tools/testing/selftests/ptrace/vmaccess.c | 135 ++++++++++++++++++++-- 8 files changed, 265 insertions(+), 37 deletions(-)
v10: Changes to previous version, make the PTRACE_ATTACH return -EAGAIN, instead of execve return -ERESTARTSYS. Added some lessons learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without changing the API.
Note: I got actually one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm; 418 const struct cred *old_cred; 419 struct mm_struct *old_mm; 420 421 retval = down_write_killable(&task->signal->exec_update_lock); 422 if (retval) 423 goto unlock_creds; 424 task_lock(task);
425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change. But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned before de_thread starts, hope you like this version. If not, the v10 is of course also acceptable.
v13: Fixed duplicated Return section in function header of is_dumpability_changed which was reported by the kernel test robot
v14: rebased to v6.7, refreshed and retested. And added a more detailed description of the actual bug.
v15: rebased to v6.8-rc1, addressed some review comments. Split the test case vmaccess into vmaccess1 and vmaccess2 to improve overall test coverage.
Thanks Bernd.
diff --git a/fs/exec.c b/fs/exec.c index e88249a1ce07..499380d74899 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1040,11 +1040,13 @@ static int exec_mmap(struct mm_struct *mm) return 0; }
-static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t; + bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk)) goto no_thread_group; @@ -1067,6 +1069,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--;
+ for_other_threads(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + sig->exec_bprm = bprm; + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1157,6 +1172,11 @@ static int de_thread(struct task_struct *tsk) release_task(leader); }
+ if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + sig->group_exec_task = NULL; sig->notify_count = 0;
@@ -1168,6 +1188,11 @@ static int de_thread(struct task_struct *tsk) return 0;
killed: + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1252,6 +1277,24 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + is_dumpability_changed(current_cred(), bprm->cred) || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(bprm->mm, suid_dumpable); + else + set_dumpable(bprm->mm, SUID_DUMP_USER); + /* * Ensure all future errors are fatal. */ @@ -1260,7 +1303,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Make this the only thread in the thread group. */ - retval = de_thread(me); + retval = de_thread(me, bprm); if (retval) goto out;
@@ -1283,11 +1326,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */ - would_dump(bprm, bprm->file); - if (bprm->have_execfd) - would_dump(bprm, bprm->executable); - /* * Release all of the old mmap stuff */ @@ -1349,18 +1387,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /* - * Figure out dumpability. Note that this checking only of current - * is wrong, but userspace depends on it. This should be testing - * bprm->secureexec instead. - */ - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - !(uid_eq(current_euid(), current_uid()) && - gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); - else - set_dumpable(current->mm, SUID_DUMP_USER); - perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true);
@@ -1479,6 +1505,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 98a031ac2648..eab3461e4da7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2785,6 +2785,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free;
+ if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); diff --git a/include/linux/cred.h b/include/linux/cred.h index 2976f534a7a3..a1a1ac38f749 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 4b7664c56208..6364e115e9e9 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -235,9 +235,27 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access + * against new credentials while + * de_thread is waiting for other + * traced threads to terminate. + * Set while de_thread is executing. + * The cred_guard_mutex is released + * after de_thread() has called + * zap_other_threads(), therefore + * a fatal signal is guaranteed to be + * already pending in the unlikely + * event, that + * current->signal->exec_bprm happens + * to be non-zero after the + * cred_guard_mutex was acquired. + */ + struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) + * Held while execve runs, except when + * a sibling thread is being traced. * Deprecated do not use in new code. * Use exec_update_lock instead. */ diff --git a/kernel/cred.c b/kernel/cred.c index c033a201c808..0066b5b0f052 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -375,6 +375,30 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; }
+/** + * is_dumpability_changed - Will changing creds affect dumpability? + * @old: The old credentials. + * @new: The new credentials. + * + * If the @new credentials have no elevated privileges compared to the + * @old credentials, the task may remain dumpable. Otherwise we have + * to mark the task as undumpable to avoid information leaks from higher + * to lower privilege domains. + * + * Return: True if the task will become undumpable. + */ +bool is_dumpability_changed(const struct cred *old, const struct cred *new) +{ + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || + !cred_cap_issubset(old, new)) + return true; + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -403,11 +427,7 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */ - if (!uid_eq(old->euid, new->euid) || - !gid_eq(old->egid, new->egid) || - !uid_eq(old->fsuid, new->fsuid) || - !gid_eq(old->fsgid, new->fsgid) || - !cred_cap_issubset(old, new)) { + if (is_dumpability_changed(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2fabd497d659..4b9a951b38f1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> @@ -450,6 +451,27 @@ static int ptrace_attach(struct task_struct *task, long request, return retval; }
+ if (unlikely(task->in_execve)) { + retval = down_write_killable(&task->signal->exec_update_lock); + if (retval) + return retval; + + scoped_guard (task_lock, task) { + struct linux_binprm *bprm = task->signal->exec_bprm; + const struct cred __rcu *old_cred = task->real_cred; + struct mm_struct *old_mm = task->mm; + rcu_assign_pointer(task->real_cred, bprm->cred); + task->mm = bprm->mm; + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + rcu_assign_pointer(task->real_cred, old_cred); + task->mm = old_mm; + } + + up_write(&task->signal->exec_update_lock); + if (retval) + return retval; + } + scoped_guard (write_lock_irq, &tasklist_lock) { if (unlikely(task->exit_state)) return -EPERM; @@ -491,6 +513,14 @@ static int ptrace_traceme(void) { int ret = -EPERM;
+ if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -506,6 +536,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(&current->signal->cred_guard_mutex);
return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index aca7b437882e..32ed0da5939a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(&current->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + goto out_put_fd; + } + }
spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..5d4a65eb5a8d 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -14,6 +14,7 @@ #include <signal.h> #include <unistd.h> #include <sys/ptrace.h> +#include <sys/syscall.h>
static void *thread(void *arg) { @@ -23,7 +24,7 @@ static void *thread(void *arg)
TEST(vmaccess) { - int f, pid = fork(); + int s, f, pid = fork(); char mm[64];
if (!pid) { @@ -31,19 +32,42 @@ TEST(vmaccess)
pthread_create(&pt, NULL, thread, NULL); pthread_join(pt, NULL); - execlp("true", "true", NULL); + execlp("false", "false", NULL); + return; }
sleep(1); sprintf(mm, "/proc/%d/mem", pid); + /* deadlock did happen here */ f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, &s, WNOHANG); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + f = waitpid(-1, &s, 0); + ASSERT_EQ(f, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 1); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); }
-TEST(attach) +/* + * Same test as previous, except that + * we try to ptrace the group leader, + * which is about to call execve, + * when the other thread is already ptraced. + * This exercises the code in de_thread + * where it is waiting inside the + * while (sig->notify_count) { + * loop. + */ +TEST(attach1) { int s, k, pid = fork();
@@ -52,19 +76,76 @@ TEST(attach)
pthread_create(&pt, NULL, thread, NULL); pthread_join(pt, NULL); - execlp("sleep", "sleep", "2", NULL); + execlp("false", "false", NULL); + return; }
sleep(1); + /* deadlock may happen here */ k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); - ASSERT_EQ(k, -1); + ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGSTOP); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 1); + k = waitpid(-1, NULL, 0); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ECHILD); +} + +/* + * Same test as previous, except that + * the group leader is ptraced first, + * but this time with PTRACE_O_TRACEEXIT, + * and the thread that does execve is + * not yet ptraced. This exercises the + * code block in de_thread where the + * if (!thread_group_leader(tsk)) { + * is executed and enters a wait state. 
+ */ +static long thread2_tid; +static void *thread2(void *arg) +{ + thread2_tid = syscall(__NR_gettid); + sleep(2); + execlp("false", "false", NULL); + return NULL; +} + +TEST(attach2) +{ + int s, k, pid = fork(); + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread2, NULL); + pthread_join(pt, NULL); + return; + } + sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); ASSERT_EQ(k, 0); @@ -72,12 +153,46 @@ TEST(attach) ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_SETOPTIONS, pid, 0L, PTRACE_O_TRACEEXIT); + ASSERT_EQ(k, 0); + thread2_tid = ptrace(PTRACE_PEEKDATA, pid, &thread2_tid, 0L); + ASSERT_NE(thread2_tid, -1); + ASSERT_NE(thread2_tid, 0); + ASSERT_NE(thread2_tid, pid); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + sleep(2); + /* deadlock may happen here */ + k = ptrace(PTRACE_ATTACH, thread2_tid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGSTOP); + k = waitpid(-1, &s, WNOHANG); + ASSERT_EQ(k, 0); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFEXITED(s), 1); - ASSERT_EQ(WEXITSTATUS(s), 0); + ASSERT_EQ(WEXITSTATUS(s), 1); k = waitpid(-1, NULL, 0); ASSERT_EQ(k, -1); ASSERT_EQ(errno, ECHILD);
linux-kselftest-mirror@lists.linaro.org