Turns out hotplugging CPUs that are in exclusive cpusets can lead to the cpuset code feeding empty cpumasks to the sched domain rebuild machinery. This leads to the following splat:
[ 30.618174] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[ 30.623697] Modules linked in:
[ 30.626731] CPU: 0 PID: 235 Comm: kworker/5:2 Not tainted 5.4.0-rc1-00005-g8d495477d62e #23
[ 30.635003] Hardware name: ARM Juno development board (r0) (DT)
[ 30.640877] Workqueue: events cpuset_hotplug_workfn
[ 30.645713] pstate: 60000005 (nZCv daif -PAN -UAO)
[ 30.650464] pc : build_sched_domains (./include/linux/arch_topology.h:23 kernel/sched/topology.c:1898 kernel/sched/topology.c:1969)
[ 30.655126] lr : build_sched_domains (kernel/sched/topology.c:1966)
[...]
[ 30.742047] Call trace:
[ 30.744474]  build_sched_domains (./include/linux/arch_topology.h:23 kernel/sched/topology.c:1898 kernel/sched/topology.c:1969)
[ 30.748793]  partition_sched_domains_locked (kernel/sched/topology.c:2250)
[ 30.753971]  rebuild_sched_domains_locked (./include/linux/bitmap.h:370 ./include/linux/cpumask.h:538 kernel/cgroup/cpuset.c:955 kernel/cgroup/cpuset.c:978 kernel/cgroup/cpuset.c:1019)
[ 30.758977]  rebuild_sched_domains (kernel/cgroup/cpuset.c:1032)
[ 30.763209]  cpuset_hotplug_workfn (kernel/cgroup/cpuset.c:3205 (discriminator 2))
[ 30.767613]  process_one_work (./arch/arm64/include/asm/jump_label.h:21 ./include/linux/jump_label.h:200 ./include/trace/events/workqueue.h:114 kernel/workqueue.c:2274)
[ 30.771586]  worker_thread (./include/linux/compiler.h:199 ./include/linux/list.h:268 kernel/workqueue.c:2416)
[ 30.775217]  kthread (kernel/kthread.c:255)
[ 30.778418]  ret_from_fork (arch/arm64/kernel/entry.S:1167)
[ 30.781965] Code: f860dae2 912802d6 aa1603e1 12800000 (f8616853)
The faulty line in question is
cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
and we're not checking the return value against nr_cpu_ids (we shouldn't have to!). With an empty cpu_map, cpumask_first() returns >= nr_cpu_ids, so the per-CPU capacity lookup goes out of bounds, which leads to the above.
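To illustrate the failure mode, here is a minimal userspace sketch (not kernel code; the helper and the capacity table are made-up stand-ins for cpumask_first() and the per-CPU capacity data):

#include <stdio.h>

#define NR_CPU_IDS 6	/* pretend we're on a 6-CPU Juno r0 */

/* Mirrors cpumask_first()'s contract: lowest set bit,
 * or NR_CPU_IDS when the mask is empty. */
static unsigned int mask_first(unsigned long mask)
{
	for (unsigned int cpu = 0; cpu < NR_CPU_IDS; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return NR_CPU_IDS;
}

int main(void)
{
	/* Stand-in for per-CPU capacity data; values are illustrative. */
	unsigned long capacity[NR_CPU_IDS] = { 446, 446, 446, 446, 1024, 1024 };
	unsigned long full_map = 0x3f, empty_map = 0x00;

	/* Normal case: a valid CPU id indexes the per-CPU data. */
	printf("first CPU capacity: %lu\n", capacity[mask_first(full_map)]);

	/* Buggy case: cpu == NR_CPU_IDS, and capacity[cpu], like the
	 * per-CPU access behind arch_scale_cpu_capacity(), would read
	 * past the end of the array. That's the oops above. */
	unsigned int cpu = mask_first(empty_map);
	printf("cpumask_first(empty) -> %u (nr_cpu_ids == %d)\n",
	       cpu, NR_CPU_IDS);
	return 0;
}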
Prevent generate_sched_domains() from returning empty cpumasks, and add an assertion in build_sched_domains() to scream bloody murder if it happens again.
The above splat was obtained on my Juno r0 with:
cgcreate -g cpuset:asym
cgset -r cpuset.cpus=0-3 asym
cgset -r cpuset.mems=0 asym
cgset -r cpuset.cpu_exclusive=1 asym
cgcreate -g cpuset:smp
cgset -r cpuset.cpus=4-5 smp
cgset -r cpuset.mems=0 smp
cgset -r cpuset.cpu_exclusive=1 smp
cgset -r cpuset.sched_load_balance=0 .
echo 0 > /sys/devices/system/cpu/cpu4/online
echo 0 > /sys/devices/system/cpu/cpu5/online
Cc: stable@vger.kernel.org
Fixes: 05484e098448 ("sched/topology: Add SD_ASYM_CPUCAPACITY flag detection")
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/cgroup/cpuset.c  | 8 ++++++++
 kernel/sched/topology.c | 5 ++++-
 2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c52bc91f882b..a859e5539440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -817,6 +817,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		struct cpuset *a = csa[i];
 		int apn = a->pn;
 
+		if (cpumask_empty(a->effective_cpus)) {
+			ndoms--;
+			continue;
+		}
+
 		for (j = 0; j < csn; j++) {
 			struct cpuset *b = csa[j];
 			int bpn = b->pn;
@@ -859,6 +864,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			continue;
 		}
 
+		if (cpumask_empty(a->effective_cpus))
+			continue;
+
 		dp = doms[nslot];
 
 		if (nslot == ndoms) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b5667a273bf6..9318acf1d1fe 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1948,7 +1948,7 @@ static struct sched_domain_topology_level
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
 	struct rq *rq = NULL;
@@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
 
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;
+
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
 		goto error;
On 15/10/2019 17:42, Valentin Schneider wrote:
> Turns out hotplugging CPUs that are in exclusive cpusets can lead to the cpuset code feeding empty cpumasks to the sched domain rebuild machinery. This leads to the following splat:
>
> [...]
>
> The faulty line in question is
>
>   cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
>
> and we're not checking the return value against nr_cpu_ids (we shouldn't have to!). With an empty cpu_map, cpumask_first() returns >= nr_cpu_ids, so the per-CPU capacity lookup goes out of bounds, which leads to the above.
>
> Prevent generate_sched_domains() from returning empty cpumasks, and add an assertion in build_sched_domains() to scream bloody murder if it happens again.
At first I thought we could do with a little less drama by only preventing arch_scale_cpu_capacity() from consuming a CPU id >= nr_cpu_ids:
@@ -1894,6 +1894,9 @@ static struct sched_domain_topology_level
 	struct sched_domain_topology_level *tl, *asym_tl = NULL;
 	unsigned long cap;
 
+	if (cpumask_empty(cpu_map))
+		return NULL;
+
Until I tried to hotplug CPU4 back in after CPU4/5 had been hotplugged out (your example further below), and I got another:
[ 68.014564] Unable to handle kernel paging request at virtual address fffe8009903d8ee0
...
[ 68.191293] Call trace:
[ 68.193712]  partition_sched_domains_locked+0x1a4/0x4a0
[ 68.198882]  rebuild_sched_domains_locked+0x4d0/0x7b0
[ 68.203880]  rebuild_sched_domains+0x24/0x40
[ 68.208104]  cpuset_hotplug_workfn+0xe0/0x5f8
...
@@ -2213,6 +2216,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
 			 * will be recomputed in function
 			 * update_tasks_root_domain().
 			 */
+			if (cpumask_empty(doms_cur[i]))
+				printk("doms_cur[%d] empty\n", i);
+
 			rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
doms_cur[i] is empty when hotplugging CPU4 back in.
Your patch fixes this as well.
Might be worth noting that this is not only about the asym CPU capacity handling but about missing checks after cpumask operations in case the cpuset is empty.
> The above splat was obtained on my Juno r0 with:
>
> cgcreate -g cpuset:asym
> cgset -r cpuset.cpus=0-3 asym
> cgset -r cpuset.mems=0 asym
> cgset -r cpuset.cpu_exclusive=1 asym
>
> cgcreate -g cpuset:smp
> cgset -r cpuset.cpus=4-5 smp
> cgset -r cpuset.mems=0 smp
> cgset -r cpuset.cpu_exclusive=1 smp
>
> cgset -r cpuset.sched_load_balance=0 .
>
> echo 0 > /sys/devices/system/cpu/cpu4/online
> echo 0 > /sys/devices/system/cpu/cpu5/online
> [...]
On 22/10/2019 12:43, Dietmar Eggemann wrote:
> [...]
> doms_cur[i] is empty when hotplugging CPU4 back in.
>
> Your patch fixes this as well.
Thanks for giving it a spin!
> Might be worth noting that this is not only about the asym CPU capacity handling but about missing checks after cpumask operations in case the cpuset is empty.
Aye, we end up saving whatever we're given (doms_cur = doms_new at the end of the rebuild). As you pointed out this is also an issue for the operation done by
f9a25f776d78 ("cpusets: Rebuild root domain deadline accounting information")
but that one was introduced after the asymmetry check, which is why I'm tagging the latter for stable.
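To make that failure mode concrete, here is a userspace sketch of the doms_cur[] walk (the mask encoding, the rq struct, and the helpers are made-up stand-ins, not kernel API; only the rd lookup mirrors the kernel line quoted above):

#include <stdio.h>

#define NR_CPU_IDS 6

struct rq { int rd; };			/* stand-in runqueue */
static struct rq runqueues[NR_CPU_IDS];

/* Mirrors cpumask_any(): some set bit, or NR_CPU_IDS if none. */
static unsigned int mask_any(unsigned long mask)
{
	for (unsigned int cpu = 0; cpu < NR_CPU_IDS; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return NR_CPU_IDS;
}

int main(void)
{
	/* doms_cur as saved by the last rebuild: the "smp" slot went
	 * empty once CPUs 4-5 were hotplugged out. */
	unsigned long doms_cur[2] = { 0x0f /* CPUs 0-3 */, 0x00 /* empty */ };

	for (int i = 0; i < 2; i++) {
		unsigned int cpu = mask_any(doms_cur[i]);

		if (cpu >= NR_CPU_IDS) {
			/* The kernel's rd = cpu_rq(cpumask_any(...))->rd
			 * has no such check and dereferences a bogus
			 * per-CPU address here. */
			printf("doms_cur[%d] empty: rd lookup would be bogus\n", i);
			continue;
		}
		printf("doms_cur[%d]: rd = %d (via CPU%u)\n",
		       i, runqueues[cpu].rd, cpu);
	}
	return 0;
}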
On 15/10/2019 17:42, Valentin Schneider wrote:
[...]
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c52bc91f882b..a859e5539440 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -817,6 +817,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
>  		struct cpuset *a = csa[i];
>  		int apn = a->pn;
>  
> +		if (cpumask_empty(a->effective_cpus)) {
> +			ndoms--;
> +			continue;
> +		}
> +
>  		for (j = 0; j < csn; j++) {
>  			struct cpuset *b = csa[j];
>  			int bpn = b->pn;
> @@ -859,6 +864,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
>  			continue;
>  		}
>  
> +		if (cpumask_empty(a->effective_cpus))
> +			continue;
> +
Can you not just prevent a cpuset pointer (cp) from being added to the cpuset array (csa[]) when cpumask_empty(cp->effective_cpus)?
@@ -798,9 +800,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
 			continue;
 
-		if (is_sched_load_balance(cp))
+		if (is_sched_load_balance(cp) &&
+		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
>  		dp = doms[nslot];
>  
>  		if (nslot == ndoms) {
[...]
On 23/10/2019 12:46, Dietmar Eggemann wrote:
> Can you not just prevent a cpuset pointer (cp) from being added to the cpuset array (csa[]) when cpumask_empty(cp->effective_cpus)?
> @@ -798,9 +800,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
>  		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
>  			continue;
>  
> -		if (is_sched_load_balance(cp))
> +		if (is_sched_load_balance(cp) &&
> +		    !cpumask_empty(cp->effective_cpus))
>  			csa[csn++] = cp;
I think you're right. Let me give it a shot, and I'll spin a v4 with this plus a better changelog.
[...]