From: Peter Zijlstra peterz@infradead.org
commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream.
Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash.
Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit.
Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Tested-by: Tadeusz Struk tadeusz.struk@linaro.org Tested-by: Zhang Qiao zhangqiao22@huawei.com Tested-by: Dietmar Eggemann dietmar.eggemann@arm.com Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-)
--- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(str extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void); --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2294,6 +2294,17 @@ static __latent_entropy struct task_stru goto bad_fork_put_pidfd;
/* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by @@ -2402,7 +2413,7 @@ static __latent_entropy struct task_stru fd_install(pidfd, pidfile);
proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p);
--- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1203,9 +1203,8 @@ int tg_nop(struct task_group *tg, void * } #endif
-static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load;
@@ -4393,7 +4392,7 @@ int sched_fork(unsigned long clone_flags p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false);
/* * We don't need the reset flag anymore after the fork. It has @@ -4411,6 +4410,7 @@ int sched_fork(unsigned long clone_flags
init_entity_runnable_average(&p->se);
+ #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4426,18 +4426,23 @@ int sched_fork(unsigned long clone_flags return 0; }
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif
+ /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4448,7 +4453,10 @@ void sched_post_fork(struct task_struct if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +}
+void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); }
@@ -6880,7 +6888,7 @@ void set_user_nice(struct task_struct *p put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p);
@@ -7171,7 +7179,7 @@ static void __setscheduler_params(struct */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); }
/* @@ -9410,7 +9418,7 @@ void __init sched_init(void) #endif }
- set_load_weight(&init_task); + set_load_weight(&init_task, false);
/* * The boot idle thread does lazy MMU switching as well: