Since the introduction of pid namespaces, their interaction with procfs has been entirely implicit in ways that require a lot of dancing around by programs that need to construct sandboxes with different PID namespaces.
Being able to explicitly specify the pid namespace to use when constructing a procfs super block will allow programs to no longer need to fork off a process which does then does unshare(2) / setns(2) and forks again in order to construct a procfs in a pidns.
So, provide a "pidns" mount option which allows such users to just explicitly state which pid namespace they want that procfs instance to use. This interface can be used with fsconfig(2) either with a file descriptor or a path:
fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd); fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
or with classic mount(2) / mount(8):
// mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
As this new API is effectively shorthand for setns(2) followed by mount(2), the permission model for this mirrors pidns_install() to avoid opening up new attack surfaces by loosening the existing permission model.
In order to avoid having to RCU-protect all users of proc_pid_ns() (to avoid UAFs), attempting to reconfigure an existing procfs instance's pid namespace will error out with -EBUSY. Creating new procfs instances is quite cheap, so this should not be an impediment to most users, and lets us avoid a lot of churn in fs/proc/* for a feature that it seems unlikely userspace would use.
Signed-off-by: Aleksa Sarai cyphar@cyphar.com --- Documentation/filesystems/proc.rst | 8 +++ fs/proc/root.c | 102 ++++++++++++++++++++++++++++++++++--- 2 files changed, 104 insertions(+), 6 deletions(-)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 5236cb52e357..5a157dadea0b 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2360,6 +2360,7 @@ The following mount options are supported: hidepid= Set /proc/<pid>/ access mode. gid= Set the group authorized to learn processes information. subset= Show only the specified subset of procfs. + pidns= Specify a the namespace used by this procfs. ========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all @@ -2392,6 +2393,13 @@ information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that are not related to tasks.
+pidns= specifies a pid namespace (either as a string path to something like +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that +will be used by the procfs instance when translating pids. By default, procfs +will use the calling process's active pid namespace. Note that the pid +namespace of an existing procfs instance cannot be modified (attempting to do +so will give an `-EBUSY` error). + Chapter 5: Filesystem behavior ==============================
diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac710384..22f8b10f6265 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,18 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, +#ifdef CONFIG_PID_NS + Opt_pidns, +#endif };
static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), +#ifdef CONFIG_PID_NS + fsparam_file_or_string("pidns", Opt_pidns), +#endif {} };
@@ -109,11 +115,67 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; }
+#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came throug fsconfig, steal the file reference */ + ns_filp = param->file; + param->file = NULL; + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ops->type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,15 +187,36 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break;
case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break;
case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; break;
+#ifdef CONFIG_PID_NS + case Opt_pidns: + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; + break; +#endif + default: return -EINVAL; } @@ -154,6 +237,13 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; +#ifdef CONFIG_PID_NS + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } +#endif }
static int proc_fill_super(struct super_block *s, struct fs_context *fc)