As discussed in the LKML thread [1], the asynchronous nature of cpuset hotplug handling code is causing problem with RCU testing. With recent changes in the way locking is being handled in the cpuset code, it is now possible to make the cpuset hotplug code synchronous again without major changes.
This series enables the hotplug code to call directly into cpuset hotplug core without indirection with the exception of the special case of v1 cpuset becoming empty still being handled indirectly with workqueue.
A new simple test case was also written to test this special v1 cpuset case. The test_cpuset_prs.sh script was also run with LOCKDEP on to verify that there is no regression.
[1] https://lore.kernel.org/lkml/ZgYikMb5kZ7rxPp6@slm.duckdns.org/
Waiman Long (2): cgroup/cpuset: Make cpuset hotplug processing synchronous cgroup/cpuset: Add test_cpuset_v1_hp.sh
include/linux/cpuset.h | 3 - kernel/cgroup/cpuset.c | 131 +++++++----------- kernel/cpu.c | 48 ------- kernel/power/process.c | 2 - tools/testing/selftests/cgroup/Makefile | 2 +- .../selftests/cgroup/test_cpuset_v1_hp.sh | 40 ++++++ 6 files changed, 88 insertions(+), 138 deletions(-) create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh
Since commit 3a5a6d0c2b03("cpuset: don't nest cgroup_mutex inside get_online_cpus()"), cpuset hotplug was done asynchronously via a work function. This is to avoid recursive locking of cgroup_mutex.
Since then, the cgroup locking scheme has changed quite a bit. A cpuset_mutex was introduced to protect cpuset specific operations. The cpuset_mutex is then replaced by a cpuset_rwsem. With commit d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order"), cpu_hotplug_lock is acquired before cpuset_rwsem. Later on, cpuset_rwsem is reverted back to cpuset_mutex. All these locking changes allow the hotplug code to call into cpuset core directly.
The following commits were also merged due to the asynchronous nature of cpuset hotplug processing.
- commit b22afcdf04c9 ("cpu/hotplug: Cure the cpusets trainwreck") - commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") - commit 28b89b9e6f7b ("cpuset: handle race between CPU hotplug and cpuset_hotplug_work")
Clean up all these bandages by making cpuset hotplug processing synchronous again with the exception that the call to cgroup_transfer_tasks() to transfer tasks out of an empty cgroup v1 cpuset, if necessary, will still be done via a work function due to the existing cgroup_mutex -> cpu_hotplug_lock dependency. It is possible to reverse that dependency, but that will require updating a number of different cgroup controllers. This special hotplug code path should be rarely taken anyway.
As all the cpuset states will be updated by the end of the hotplug operation, we can revert most the above commits except commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs") which is partially reverted. Also removing some cpus_read_lock trylock attempts in the cpuset partition code as they are no longer necessary since the cpu_hotplug_lock is now held for the whole duration of the cpuset hotplug code path.
Signed-off-by: Waiman Long longman@redhat.com --- include/linux/cpuset.h | 3 - kernel/cgroup/cpuset.c | 131 +++++++++++++++-------------------------- kernel/cpu.c | 48 --------------- kernel/power/process.c | 2 - 4 files changed, 47 insertions(+), 137 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 0ce6ff0d9c9a..de4cf0ee96f7 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -70,7 +70,6 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); @@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void) partition_sched_domains(1, NULL, NULL); }
-static inline void cpuset_wait_for_hotplug(void) { } - static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..13d27b17c889 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -201,6 +201,14 @@ struct cpuset { struct list_head remote_sibling; };
+/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + /* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ @@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
-/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
static inline void check_insane_mems_config(nodemask_t *nodes) @@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk);
- while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus);
-out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); }
@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void) /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1262,11 +1252,9 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); }
/** @@ -2079,14 +2067,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
/* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - }
notify_partition_change(cs, old_prs); return 0; @@ -3599,8 +3584,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * cpuset_handle_hotplug may call back into cgroup core asynchronously + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after @@ -3609,7 +3594,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, */ css_get(&cs->css); kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work);
cpus_read_lock(); mutex_lock(&cpuset_mutex); @@ -4354,6 +4338,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } }
+static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, @@ -4383,12 +4377,20 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. + * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + if (is_empty && css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); } }
@@ -4421,30 +4423,6 @@ void cpuset_force_rebuild(void) force_rebuild = true; }
-/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; -} - /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4493,13 +4471,11 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); }
/* @@ -4519,18 +4495,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update;
- /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4558,8 +4524,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) }
/** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +4538,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +4552,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp;
+ lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */ @@ -4679,12 +4647,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); }
/* @@ -4695,7 +4658,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; }
diff --git a/kernel/cpu.c b/kernel/cpu.c index 8f6affd051f7..49ce3f309688 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); }
-/* - * - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock - * protected region. - * - * The operation is still serialized against concurrent CPU hotplug via - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ - * serialized against other hotplug related activity like adding or - * removing of state callbacks and state instances, which invoke either the - * startup or the teardown callback of the affected state. - * - * This is required for subsystems which are unfixable vs. CPU hotplug and - * evade lock inversion problems by scheduling work which has to be - * completed _before_ cpu_up()/_cpu_down() returns. - * - * Don't even think about adding anything to this for any new code or even - * drivers. It's only purpose is to keep existing lock order trainwrecks - * working. - * - * For cpu_down() there might be valid reasons to finish cleanups which are - * not required to be done under cpu_hotplug_lock, but that's a different - * story and would be not invoked via this. - */ -static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) -{ - /* - * cpusets delegate hotplug operations to a worker to "solve" the - * lock order problems. Wait for the worker, but only if tasks are - * _not_ frozen (suspend, hibernate) as that would wait forever. - * - * The wait is required because otherwise the hotplug operation - * returns with inconsistent state, which could even be observed in - * user space when a new CPU is brought up. The CPU plug uevent - * would be delivered and user space reacting on it would fail to - * move tasks to the newly plugged CPU up to the point where the - * work has finished because up to that point the newly plugged CPU - * is not assignable in cpusets/cgroups. On unplug that's not - * necessarily a visible issue, but it is still inconsistent state, - * which is the real problem which needs to be "fixed". This can't - * prevent the transient state between scheduling the work and - * returning from waiting for it. - */ - if (!tasks_frozen) - cpuset_wait_for_hotplug(); -} - #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) @@ -1494,7 +1448,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ lockup_detector_cleanup(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; }
@@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) out: cpus_write_unlock(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; }
diff --git a/kernel/power/process.c b/kernel/power/process.c index cae81a87cc91..66ac067d9ae6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -194,8 +194,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues();
- cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */
Hello Waiman.
(I have no opinion on the overall locking reworks, only the bits about v1 migrations caught my attention.)
On Mon, Apr 01, 2024 at 10:58:57AM -0400, Waiman Long longman@redhat.com wrote: ...
@@ -4383,12 +4377,20 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
* cpuset. Execute it asynchronously using workqueue.
...to avoid deadlock on cpus_read_lock
Is this the reason? Also, what happens with the tasks in the window till the migration happens? Is it handled gracefully that their cpu is gone?
- if (is_empty) {
mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
mutex_lock(&cpuset_mutex);
- if (is_empty && css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
Is there a benefit of having a work for each cpuset? Instead of traversing whole top_cpuset once in the async work.
Thanks, Michal
On 4/2/24 10:13, Michal Koutný wrote:
Hello Waiman.
(I have no opinion on the overall locking reworks, only the bits about v1 migrations caught my attention.)
On Mon, Apr 01, 2024 at 10:58:57AM -0400, Waiman Long longman@redhat.com wrote: ...
@@ -4383,12 +4377,20 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
* cpuset. Execute it asynchronously using workqueue.
...to avoid deadlock on cpus_read_lock
Is this the reason? Also, what happens with the tasks in the window till the migration happens? Is it handled gracefully that their cpu is gone?
Yes, there is a potential that a cpus_read_lock() may be called leading to deadlock. So unless we reverse the current cgroup_mutex --> cpu_hotplug_lock ordering, it is not safe to call cgroup_transfer_tasks() directly.
- if (is_empty) {
mutex_unlock(&cpuset_mutex);
remove_tasks_in_empty_cpuset(cs);
mutex_lock(&cpuset_mutex);
- if (is_empty && css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
Is there a benefit of having a work for each cpuset? Instead of traversing whole top_cpuset once in the async work.
We could do that too. It's just that we have the repeat the iteration process once the workfn is invoked, but that has the advantage of not needing to do memory allocation. I am OK with either way. Let's see what other folks think about that.
Cheers, Longman
On Tue, Apr 02, 2024 at 11:30:11AM -0400, Waiman Long longman@redhat.com wrote:
Yes, there is a potential that a cpus_read_lock() may be called leading to deadlock. So unless we reverse the current cgroup_mutex --> cpu_hotplug_lock ordering, it is not safe to call cgroup_transfer_tasks() directly.
I see that cgroup_transfer_tasks() has the only user -- cpuset. What about bending it for the specific use like:
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def..64deb7212c5c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -109,7 +109,7 @@ struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..f97025858c7a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -91,7 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); * * Return: %0 on success or a negative errno code on failure */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; @@ -106,9 +106,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (ret) return ret;
- cgroup_lock(); - - cgroup_attach_lock(true); + /* The locking rules serve specific purpose of v1 cpuset hotplug + * migration, see hotplug_update_tasks_legacy() and + * cgroup_attach_lock() */ + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_cpus_held(); + percpu_down_write(&cgroup_threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -144,8 +146,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); - cgroup_unlock(); + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13d27b17c889..94fb8b26f038 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4331,7 +4331,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent);
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + if (cgroup_transfer_tasks_locked(parent->css.cgroup, cs->css.cgroup)) { pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n"); @@ -4376,21 +4376,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/* * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty && css_tryget_online(&cs->css)) { - struct cpuset_remove_tasks_struct *s; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (WARN_ON_ONCE(!s)) { - css_put(&cs->css); - return; - } - - s->cs = cs; - INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); - schedule_work(&s->work); + if (is_empty) + remove_tasks_in_empty_cpuset(cs); } }
On 4/3/24 08:02, Michal Koutný wrote:
On Tue, Apr 02, 2024 at 11:30:11AM -0400, Waiman Long longman@redhat.com wrote:
Yes, there is a potential that a cpus_read_lock() may be called leading to deadlock. So unless we reverse the current cgroup_mutex --> cpu_hotplug_lock ordering, it is not safe to call cgroup_transfer_tasks() directly.
I see that cgroup_transfer_tasks() has the only user -- cpuset. What about bending it for the specific use like:
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def..64deb7212c5c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -109,7 +109,7 @@ struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..f97025858c7a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -91,7 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
- Return: %0 on success or a negative errno code on failure
*/ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; @@ -106,9 +106,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (ret) return ret;
- cgroup_lock();
- cgroup_attach_lock(true);
- /* The locking rules serve specific purpose of v1 cpuset hotplug
* migration, see hotplug_update_tasks_legacy() and
* cgroup_attach_lock() */
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_cpus_held();
- percpu_down_write(&cgroup_threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -144,8 +146,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
- cgroup_unlock();
- percpu_up_write(&cgroup_threadgroup_rwsem); return ret; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13d27b17c889..94fb8b26f038 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4331,7 +4331,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent);
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- if (cgroup_transfer_tasks_locked(parent->css.cgroup, cs->css.cgroup)) { pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n");
@@ -4376,21 +4376,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
*/* cpuset. Execute it asynchronously using workqueue.
- if (is_empty && css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (WARN_ON_ONCE(!s)) {
css_put(&cs->css);
return;
}
s->cs = cs;
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
schedule_work(&s->work);
- if (is_empty)
} }remove_tasks_in_empty_cpuset(cs);
It still won't work because of the possibility of mutiple tasks involving in a circular locking dependency. The hotplug thread always acquire the cpu_hotplug_lock first before acquiring cpuset_mutex or cgroup_mtuex in this case (cpu_hotplug_lock --> cgroup_mutex). Other tasks calling into cgroup code will acquire the pair in the order cgroup_mutex --> cpu_hotplug_lock. This may lead to a deadlock if these 2 locking sequences happen at the same time. Lockdep will certainly spill out a splat because of this. So unless we change all the relevant cgroup code to the new cpu_hotplug_lock --> cgroup_mutex locking order, the hotplug code can't call cgroup_transfer_tasks() directly.
Cheers, Longman
On 03/04/24 09:38, Waiman Long wrote:
On 4/3/24 08:02, Michal Koutný wrote:
On Tue, Apr 02, 2024 at 11:30:11AM -0400, Waiman Long longman@redhat.com wrote:
Yes, there is a potential that a cpus_read_lock() may be called leading to deadlock. So unless we reverse the current cgroup_mutex --> cpu_hotplug_lock ordering, it is not safe to call cgroup_transfer_tasks() directly.
I see that cgroup_transfer_tasks() has the only user -- cpuset. What about bending it for the specific use like:
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def..64deb7212c5c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -109,7 +109,7 @@ struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..f97025858c7a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -91,7 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
- Return: %0 on success or a negative errno code on failure
*/ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; @@ -106,9 +106,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (ret) return ret;
- cgroup_lock();
- cgroup_attach_lock(true);
/* The locking rules serve specific purpose of v1 cpuset hotplug
* migration, see hotplug_update_tasks_legacy() and
* cgroup_attach_lock() */
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_cpus_held();
percpu_down_write(&cgroup_threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock);
@@ -144,8 +146,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
- cgroup_unlock();
- percpu_up_write(&cgroup_threadgroup_rwsem); return ret; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13d27b17c889..94fb8b26f038 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4331,7 +4331,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent);
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- if (cgroup_transfer_tasks_locked(parent->css.cgroup, cs->css.cgroup)) { pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n");
@@ -4376,21 +4376,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/* * Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Execute it asynchronously using workqueue. */
- if (is_empty && css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (WARN_ON_ONCE(!s)) {
css_put(&cs->css);
return;
}
s->cs = cs;
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
schedule_work(&s->work);
- if (is_empty)
} }remove_tasks_in_empty_cpuset(cs);
It still won't work because of the possibility of mutiple tasks involving in a circular locking dependency. The hotplug thread always acquire the cpu_hotplug_lock first before acquiring cpuset_mutex or cgroup_mtuex in this case (cpu_hotplug_lock --> cgroup_mutex). Other tasks calling into cgroup code will acquire the pair in the order cgroup_mutex --> cpu_hotplug_lock. This may lead to a deadlock if these 2 locking sequences happen at the same time. Lockdep will certainly spill out a splat because of this.
So unless we change all the relevant cgroup code to the new cpu_hotplug_lock --> cgroup_mutex locking order, the hotplug code can't call cgroup_transfer_tasks() directly.
IIUC that was Thomas' suggestion [1], but I can't tell yet how bad it would be to change cgroup_lock() to also do a cpus_read_lock().
Also, I gave Michal's patch a try and it looks like it's introducing a cgroup_threadgroup_rwsem -> cpuset_mutex ordering from cgroup_transfer_tasks_locked() `\ percpu_down_write(&cgroup_threadgroup_rwsem); cgroup_migrate() `\ cgroup_migrate_execute() `\ ss->can_attach() // cpuset_can_attach() `\ mutex_lock(&cpuset_mutex);
which is invalid, see below.
[1]: https://lore.kernel.org/lkml/87cyrfe7a3.ffs@tglx/
[ 77.627915] WARNING: possible circular locking dependency detected [ 77.628419] 6.9.0-rc1-00042-g844178b616c7-dirty #23 Tainted: G W [ 77.629035] ------------------------------------------------------ [ 77.629548] cpuhp/2/24 is trying to acquire lock: [ 77.629946] ffffffff82d680b0 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_transfer_tasks_locked+0x123/0x450 [ 77.630851] [ 77.630851] but task is already holding lock: [ 77.631397] ffffffff82d6c088 (cpuset_mutex){+.+.}-{3:3}, at: cpuset_update_active_cpus+0x352/0xca0 [ 77.632169] [ 77.632169] which lock already depends on the new lock. [ 77.632169] [ 77.632891] [ 77.632891] the existing dependency chain (in reverse order) is: [ 77.633521] [ 77.633521] -> #1 (cpuset_mutex){+.+.}-{3:3}: [ 77.634028] lock_acquire+0xc0/0x2d0 [ 77.634393] __mutex_lock+0xaa/0x710 [ 77.634751] cpuset_can_attach+0x6d/0x2c0 [ 77.635146] cgroup_migrate_execute+0x6f/0x520 [ 77.635565] cgroup_attach_task+0x2e2/0x450 [ 77.635989] __cgroup1_procs_write.isra.0+0xfd/0x150 [ 77.636440] kernfs_fop_write_iter+0x127/0x1c0 [ 77.636917] vfs_write+0x2b0/0x540 [ 77.637330] ksys_write+0x70/0xf0 [ 77.637707] int80_emulation+0xf8/0x1b0 [ 77.638183] asm_int80_emulation+0x1a/0x20 [ 77.638636] [ 77.638636] -> #0 (cgroup_threadgroup_rwsem){++++}-{0:0}: [ 77.639321] check_prev_add+0xeb/0xb20 [ 77.639751] __lock_acquire+0x12ac/0x16d0 [ 77.640345] lock_acquire+0xc0/0x2d0 [ 77.640903] percpu_down_write+0x33/0x260 [ 77.641347] cgroup_transfer_tasks_locked+0x123/0x450 [ 77.641826] cpuset_update_active_cpus+0x782/0xca0 [ 77.642268] sched_cpu_deactivate+0x1ad/0x1d0 [ 77.642677] cpuhp_invoke_callback+0x16b/0x6b0 [ 77.643098] cpuhp_thread_fun+0x1ba/0x240 [ 77.643488] smpboot_thread_fn+0xd8/0x1d0 [ 77.643873] kthread+0xce/0x100 [ 77.644209] ret_from_fork+0x2f/0x50 [ 77.644626] ret_from_fork_asm+0x1a/0x30 [ 77.645084] [ 77.645084] other info that might help us debug this: [ 77.645084] [ 77.645829] Possible unsafe locking scenario: [ 77.645829] [ 77.646356] CPU0 CPU1 [ 77.646748] ---- ---- [ 77.647143] lock(cpuset_mutex); [ 77.647529] lock(cgroup_threadgroup_rwsem); [ 77.648193] lock(cpuset_mutex); [ 77.648767] lock(cgroup_threadgroup_rwsem); [ 77.649216] [ 77.649216] *** DEADLOCK ***
On 4/3/24 10:26, Valentin Schneider wrote:
On 03/04/24 09:38, Waiman Long wrote:
On 4/3/24 08:02, Michal Koutný wrote:
On Tue, Apr 02, 2024 at 11:30:11AM -0400, Waiman Long longman@redhat.com wrote:
Yes, there is a potential that a cpus_read_lock() may be called leading to deadlock. So unless we reverse the current cgroup_mutex --> cpu_hotplug_lock ordering, it is not safe to call cgroup_transfer_tasks() directly.
I see that cgroup_transfer_tasks() has the only user -- cpuset. What about bending it for the specific use like:
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def..64deb7212c5c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -109,7 +109,7 @@ struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..f97025858c7a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -91,7 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); * * Return: %0 on success or a negative errno code on failure */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +int cgroup_transfer_tasks_locked(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; @@ -106,9 +106,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (ret) return ret;
- cgroup_lock();
- cgroup_attach_lock(true);
/* The locking rules serve specific purpose of v1 cpuset hotplug
* migration, see hotplug_update_tasks_legacy() and
* cgroup_attach_lock() */
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_cpus_held();
percpu_down_write(&cgroup_threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock);
@@ -144,8 +146,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
- cgroup_unlock();
- percpu_up_write(&cgroup_threadgroup_rwsem); return ret; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13d27b17c889..94fb8b26f038 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4331,7 +4331,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent);
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- if (cgroup_transfer_tasks_locked(parent->css.cgroup, cs->css.cgroup)) { pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n");
@@ -4376,21 +4376,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/* * Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Execute it asynchronously using workqueue. */
- if (is_empty && css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (WARN_ON_ONCE(!s)) {
css_put(&cs->css);
return;
}
s->cs = cs;
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
schedule_work(&s->work);
- if (is_empty)
} }remove_tasks_in_empty_cpuset(cs);
It still won't work because of the possibility of mutiple tasks involving in a circular locking dependency. The hotplug thread always acquire the cpu_hotplug_lock first before acquiring cpuset_mutex or cgroup_mtuex in this case (cpu_hotplug_lock --> cgroup_mutex). Other tasks calling into cgroup code will acquire the pair in the order cgroup_mutex --> cpu_hotplug_lock. This may lead to a deadlock if these 2 locking sequences happen at the same time. Lockdep will certainly spill out a splat because of this. So unless we change all the relevant cgroup code to the new cpu_hotplug_lock --> cgroup_mutex locking order, the hotplug code can't call cgroup_transfer_tasks() directly.
IIUC that was Thomas' suggestion [1], but I can't tell yet how bad it would be to change cgroup_lock() to also do a cpus_read_lock().
Changing the locking order is certainly doable. I have taken a cursory look at it and at least the following files need to be changed:
kernel/bpf/cgroup.c kernel/cgroup/cgroup.c kernel/cgroup/legacy_freezer.c mm/memcontrol.c
That requires a lot more testing to make sure that there won't be a regression. Given that the call to cgroup_transfer_tasks() should be rare these days as it will only apply in the case of cgroup v1 under certain condtion, I am not sure this requirement justifies making such extensive changes. So I kind of defer it until we reach a consensus that it is the right thing to do.
Cheers, Longman
Also, I gave Michal's patch a try and it looks like it's introducing a cgroup_threadgroup_rwsem -> cpuset_mutex ordering from cgroup_transfer_tasks_locked() `\ percpu_down_write(&cgroup_threadgroup_rwsem); cgroup_migrate() `\ cgroup_migrate_execute() `\ ss->can_attach() // cpuset_can_attach() `\ mutex_lock(&cpuset_mutex);
which is invalid, see below.
[ 77.627915] WARNING: possible circular locking dependency detected [ 77.628419] 6.9.0-rc1-00042-g844178b616c7-dirty #23 Tainted: G W [ 77.629035] ------------------------------------------------------ [ 77.629548] cpuhp/2/24 is trying to acquire lock: [ 77.629946] ffffffff82d680b0 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_transfer_tasks_locked+0x123/0x450 [ 77.630851] [ 77.630851] but task is already holding lock: [ 77.631397] ffffffff82d6c088 (cpuset_mutex){+.+.}-{3:3}, at: cpuset_update_active_cpus+0x352/0xca0 [ 77.632169] [ 77.632169] which lock already depends on the new lock. [ 77.632169] [ 77.632891] [ 77.632891] the existing dependency chain (in reverse order) is: [ 77.633521] [ 77.633521] -> #1 (cpuset_mutex){+.+.}-{3:3}: [ 77.634028] lock_acquire+0xc0/0x2d0 [ 77.634393] __mutex_lock+0xaa/0x710 [ 77.634751] cpuset_can_attach+0x6d/0x2c0 [ 77.635146] cgroup_migrate_execute+0x6f/0x520 [ 77.635565] cgroup_attach_task+0x2e2/0x450 [ 77.635989] __cgroup1_procs_write.isra.0+0xfd/0x150 [ 77.636440] kernfs_fop_write_iter+0x127/0x1c0 [ 77.636917] vfs_write+0x2b0/0x540 [ 77.637330] ksys_write+0x70/0xf0 [ 77.637707] int80_emulation+0xf8/0x1b0 [ 77.638183] asm_int80_emulation+0x1a/0x20 [ 77.638636] [ 77.638636] -> #0 (cgroup_threadgroup_rwsem){++++}-{0:0}: [ 77.639321] check_prev_add+0xeb/0xb20 [ 77.639751] __lock_acquire+0x12ac/0x16d0 [ 77.640345] lock_acquire+0xc0/0x2d0 [ 77.640903] percpu_down_write+0x33/0x260 [ 77.641347] cgroup_transfer_tasks_locked+0x123/0x450 [ 77.641826] cpuset_update_active_cpus+0x782/0xca0 [ 77.642268] sched_cpu_deactivate+0x1ad/0x1d0 [ 77.642677] cpuhp_invoke_callback+0x16b/0x6b0 [ 77.643098] cpuhp_thread_fun+0x1ba/0x240 [ 77.643488] smpboot_thread_fn+0xd8/0x1d0 [ 77.643873] kthread+0xce/0x100 [ 77.644209] ret_from_fork+0x2f/0x50 [ 77.644626] ret_from_fork_asm+0x1a/0x30 [ 77.645084] [ 77.645084] other info that might help us debug this: [ 77.645084] [ 77.645829] Possible unsafe locking scenario: [ 77.645829] [ 77.646356] CPU0 CPU1 [ 77.646748] ---- ---- [ 77.647143] lock(cpuset_mutex); [ 77.647529] lock(cgroup_threadgroup_rwsem); [ 77.648193] lock(cpuset_mutex); [ 77.648767] lock(cgroup_threadgroup_rwsem); [ 77.649216] [ 77.649216] *** DEADLOCK ***
On Wed, Apr 03, 2024 at 10:47:33AM -0400, Waiman Long longman@redhat.com wrote:
should be rare these days as it will only apply in the case of cgroup v1 under certain condtion,
Could the migration be simply omitted it those special cases?
(Tasks remain in cpusets with empty cpusets -- that already happens in with the current patch before workqueue is dispatched.)
Michal
On 4/3/24 10:56, Michal Koutný wrote:
On Wed, Apr 03, 2024 at 10:47:33AM -0400, Waiman Long longman@redhat.com wrote:
should be rare these days as it will only apply in the case of cgroup v1 under certain condtion,
Could the migration be simply omitted it those special cases?
(Tasks remain in cpusets with empty cpusets -- that already happens in with the current patch before workqueue is dispatched.)
The tasks should not be runnable if there is no CPUs left in their v1 cpuset. Migrating them to a parent with runnable CPUs is the current way which I don't want to break. Alternatively, we could force it to fall back to cgroup v2 behavior using the CPUs in their parent cpuset.
Cheers, Longman
Michal
On 03/04/24 10:47, Waiman Long wrote:
On 4/3/24 10:26, Valentin Schneider wrote:
IIUC that was Thomas' suggestion [1], but I can't tell yet how bad it would be to change cgroup_lock() to also do a cpus_read_lock().
Changing the locking order is certainly doable. I have taken a cursory look at it and at least the following files need to be changed:
kernel/bpf/cgroup.c kernel/cgroup/cgroup.c kernel/cgroup/legacy_freezer.c mm/memcontrol.c
That requires a lot more testing to make sure that there won't be a regression. Given that the call to cgroup_transfer_tasks() should be rare these days as it will only apply in the case of cgroup v1 under certain condtion, I am not sure this requirement justifies making such extensive changes. So I kind of defer it until we reach a consensus that it is the right thing to do.
Yeah if we can avoid it initially I'd be up for it.
Just one thing that came to mind - there's no flushing of the cpuset_migrate_tasks_workfn() work, so the scheduler might move tasks itself before the cpuset does via:
balance_push() ->__balance_push_cpu_stop() -> select_fallback_rq()
But, given the current placement of cpuset_wait_for_hotplug(), I believe that's something we can already have, so we should be good.
Cheers, Longman
On Wed, Apr 03, 2024 at 04:26:38PM +0200, Valentin Schneider vschneid@redhat.com wrote:
Also, I gave Michal's patch a try and it looks like it's introducing a
Thank you.
cgroup_threadgroup_rwsem -> cpuset_mutex ordering from cgroup_transfer_tasks_locked() `\ percpu_down_write(&cgroup_threadgroup_rwsem); cgroup_migrate() `\ cgroup_migrate_execute() `\ ss->can_attach() // cpuset_can_attach() `\ mutex_lock(&cpuset_mutex);
which is invalid, see below.
_This_ should be the right order (cpuset_mutex inside cgroup_threadgroup_rwsem), at least in my mental model. Thus I missed that cpuset_mutex must have been taken somewhere higher up in the hotplug code (CPU 0 in the lockdep dump, I can't easily see where from) :-/.
Michal
On 03/04/24 16:54, Michal Koutný wrote:
On Wed, Apr 03, 2024 at 04:26:38PM +0200, Valentin Schneider vschneid@redhat.com wrote:
Also, I gave Michal's patch a try and it looks like it's introducing a
Thank you.
cgroup_threadgroup_rwsem -> cpuset_mutex ordering from cgroup_transfer_tasks_locked() `\ percpu_down_write(&cgroup_threadgroup_rwsem); cgroup_migrate() `\ cgroup_migrate_execute() `\ ss->can_attach() // cpuset_can_attach() `\ mutex_lock(&cpuset_mutex);
which is invalid, see below.
_This_ should be the right order (cpuset_mutex inside cgroup_threadgroup_rwsem), at least in my mental model. Thus I missed that cpuset_mutex must have been taken somewhere higher up in the hotplug code (CPU 0 in the lockdep dump, I can't easily see where from) :-/.
If I got this right...
cpuset_hotplug_update_tasks() `\ mutex_lock(&cpuset_mutex); hotplug_update_tasks_legacy() `\ remove_tasks_in_empty_cpuset() `\ cgroup_transfer_tasks_locked() `\ percpu_down_write(&cgroup_threadgroup_rwsem);
But then that is also followed by:
cgroup_migrate() `\ cgroup_migrate_execute() `\ ss->can_attach() // cpuset_can_attach() `\ mutex_lock(&cpuset_mutex);
which doesn't look good...
Also, I missed that earlier, but this triggers:
cgroup_transfer_tasks_locked() ~> lockdep_assert_held(&cgroup_mutex);
[ 30.773092] WARNING: CPU: 2 PID: 24 at kernel/cgroup/cgroup-v1.c:112 cgroup_transfer_tasks_locked+0x39f/0x450 [ 30.773807] Modules linked in: [ 30.774063] CPU: 2 PID: 24 Comm: cpuhp/2 Not tainted 6.9.0-rc1-00042-g844178b616c7-dirty #25 [ 30.774672] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 30.775457] RIP: 0010:cgroup_transfer_tasks_locked+0x39f/0x450 [ 30.775891] Code: 0f 85 70 ff ff ff 0f 1f 44 00 00 e9 6d ff ff ff be ff ff ff ff 48 c7 c7 48 82 d6 82 e8 5a 6a ec 00 85 c0 0f 85 6d fd ff ff 90 <0f> 0b 90 e9 64 fd ff ff 48 8b bd e8 fe ff ff be 01 00 00 00 e8 78 [ 30.777270] RSP: 0000:ffffc900000e7c20 EFLAGS: 00010246 [ 30.777813] RAX: 0000000000000000 RBX: ffffc900000e7cb0 RCX: 0000000000000000 [ 30.778443] RDX: 0000000000000000 RSI: ffffffff82d68248 RDI: ffff888004a9a300 [ 30.779142] RBP: ffffc900000e7d50 R08: 0000000000000001 R09: 0000000000000000 [ 30.779889] R10: ffffc900000e7d70 R11: 0000000000000001 R12: ffff8880057c6040 [ 30.780420] R13: ffff88800539f800 R14: 0000000000000001 R15: 0000000000000004 [ 30.780951] FS: 0000000000000000(0000) GS:ffff88801f500000(0000) knlGS:0000000000000000 [ 30.781561] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 30.781989] CR2: 00000000f7e6fe85 CR3: 00000000064ac000 CR4: 00000000000006f0 [ 30.782558] Call Trace: [ 30.782783] <TASK> [ 30.782982] ? __warn+0x87/0x180 [ 30.783250] ? cgroup_transfer_tasks_locked+0x39f/0x450 [ 30.783644] ? report_bug+0x164/0x190 [ 30.783970] ? handle_bug+0x3b/0x70 [ 30.784288] ? exc_invalid_op+0x17/0x70 [ 30.784641] ? asm_exc_invalid_op+0x1a/0x20 [ 30.784992] ? cgroup_transfer_tasks_locked+0x39f/0x450 [ 30.785375] ? __lock_acquire+0xe9d/0x16d0 [ 30.785707] ? cpuset_update_active_cpus+0x15a/0xca0 [ 30.786074] ? cpuset_update_active_cpus+0x782/0xca0 [ 30.786443] cpuset_update_active_cpus+0x782/0xca0 [ 30.786816] sched_cpu_deactivate+0x1ad/0x1d0 [ 30.787148] ? __pfx_sched_cpu_deactivate+0x10/0x10 [ 30.787509] cpuhp_invoke_callback+0x16b/0x6b0 [ 30.787859] ? cpuhp_thread_fun+0x56/0x240 [ 30.788175] cpuhp_thread_fun+0x1ba/0x240 [ 30.788485] smpboot_thread_fn+0xd8/0x1d0 [ 30.788823] ? __pfx_smpboot_thread_fn+0x10/0x10 [ 30.789229] kthread+0xce/0x100 [ 30.789526] ? __pfx_kthread+0x10/0x10 [ 30.789876] ret_from_fork+0x2f/0x50 [ 30.790200] ? __pfx_kthread+0x10/0x10 [ 30.792341] ret_from_fork_asm+0x1a/0x30 [ 30.792716] </TASK>
Michal
Add a simple test to verify that an empty v1 cpuset will force its tasks to be moved to an ancestor node. It is based on the test case documented in commit 76bb5ab8f6e3 ("cpuset: break kernfs active protection in cpuset_write_resmask()").
Signed-off-by: Waiman Long longman@redhat.com --- tools/testing/selftests/cgroup/Makefile | 2 +- .../selftests/cgroup/test_cpuset_v1_hp.sh | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 00b441928909..16461dc0ffdf 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wall -pthread all: ${HELPER_PROGS}
TEST_FILES := with_stress.sh -TEST_PROGS := test_stress.sh test_cpuset_prs.sh +TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh new file mode 100755 index 000000000000..0d0a1923d8ec --- /dev/null +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Test the special cpuset v1 hotplug case where a cpuset become empty of +# CPUs will force migration of tasks out to an ancestor. +# + +skip_test() { + echo "$1" + echo "Test SKIPPED" + exit 4 # ksft_skip +} + +[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!" + +# Find cpuset v1 mount point +CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}') +[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!" + +# +# Create a test cpuset, put a CPU and a task there and offline that CPU +# +TDIR=test$$ +[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR +echo 1 > $CPUSET/$TDIR/cpuset.cpus +echo 0 > $CPUSET/$TDIR/cpuset.mems +sleep 10& +TASK=$! +echo $TASK > $CPUSET/$TDIR/tasks +echo 0 > /sys/devices/system/cpu/cpu1/online +sleep 0.5 +echo 1 > /sys/devices/system/cpu/cpu1/online +NEWCS=$(cat /proc/$TASK/cpuset) +rmdir $CPUSET/$TDIR +[[ $NEWCS != "/" ]] && { + echo "cpuset $NEWCS, test FAILED!" + exit 1 +} +echo "Test PASSED" +exit 0
linux-kselftest-mirror@lists.linaro.org