One relatively simple way to allow runtime modification of the nohz_full and rcu_nocbs CPUs is to use CPU hotplug to bring the affected CPUs offline first, make changes to the housekeeping cpumasks, and then bring them back online. However, doing this will be rather costly in terms of the number of CPU cycles needed. Still, it is the easiest way to achieve the desired result, and hopefully we can gradually reduce the overhead over time.
Use the newly introduced cpuhp_offline_cb() API to bring the affected CPUs offline, make the necessary housekeeping cpumask changes and then bring those CPUs back online again.
As the HK_TYPE_DOMAIN cpumask is going to be updated at run time, we will reset any boot-time isolcpus domain setting when an isolated partition or a conflicting non-isolated partition is about to be created.
Since rebuild_sched_domains() will be called at the end of update_isolation_cpumasks(), earlier rebuild_sched_domains_locked() calls will be suppressed to avoid unneeded work.
Signed-off-by: Waiman Long longman@redhat.com --- kernel/cgroup/cpuset.c | 95 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 87e9ee7922cd..60f336e50b05 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1355,11 +1355,57 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, return; }
+/* + * We are only updating HK_TYPE_DOMAIN and HK_TYPE_KERNEL_NOISE housekeeping + * cpumask for now. HK_TYPE_MANAGED_IRQ will be handled later. + */ +static int do_housekeeping_exclude_cpumask(void *arg __maybe_unused) +{ + int ret; + struct cpumask *icpus = isolated_cpus; + unsigned long flags = BIT(HK_TYPE_DOMAIN) | BIT(HK_TYPE_KERNEL_NOISE); + + /* + * The boot time isolcpus setting will be overwritten if set. + */ + have_boot_isolcpus = false; + + if (have_boot_nohz_full) { + /* + * Need to separate the handling of HK_TYPE_KERNEL_NOISE and + * HK_TYPE_DOMAIN as different cpumasks will be used for each. + */ + ret = housekeeping_exclude_cpumask(icpus, BIT(HK_TYPE_DOMAIN)); + WARN_ON_ONCE((ret < 0) && (ret != -EOPNOTSUPP)); + + if (cpumask_empty(isolcpus_update_state.cpus)) + return ret; + flags = BIT(HK_TYPE_KERNEL_NOISE); + icpus = kmalloc(cpumask_size(), GFP_KERNEL); + if (WARN_ON_ONCE(!icpus)) + return -ENOMEM; + + /* + * Add boot time nohz_full CPUs into the isolated CPUs list + * for exclusion from HK_TYPE_KERNEL_NOISE CPUs. + */ + cpumask_andnot(icpus, cpu_possible_mask, boot_nohz_full_hk_cpus); + cpumask_or(icpus, icpus, isolated_cpus); + } + ret = housekeeping_exclude_cpumask(icpus, flags); + WARN_ON_ONCE((ret < 0) && (ret != -EOPNOTSUPP)); + + if (icpus != isolated_cpus) + kfree(icpus); + return ret; +} + /** * update_isolation_cpumasks - Update external isolation CPU masks * * The following external CPU masks will be updated if necessary: * - workqueue unbound cpumask + * - housekeeping cpumasks */ static void update_isolation_cpumasks(void) { @@ -1371,7 +1417,41 @@ static void update_isolation_cpumasks(void) ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0);
+ /* + * Mask out offline and boot-time nohz_full non-housekeeping + * CPUs from isolcpus_update_state.cpus to compute the set + * of CPUs that need to be brought offline before calling + * do_housekeeping_exclude_cpumask(). + */ + cpumask_and(isolcpus_update_state.cpus, + isolcpus_update_state.cpus, cpu_active_mask); + if (have_boot_nohz_full) + cpumask_and(isolcpus_update_state.cpus, + isolcpus_update_state.cpus, boot_nohz_full_hk_cpus); + + /* + * Without any change in the set of nohz_full CPUs, we don't really + * need to use CPU hotplug for making change in HK cpumasks. + */ + if (cpumask_empty(isolcpus_update_state.cpus)) + ret = do_housekeeping_exclude_cpumask(NULL); + else + ret = cpuhp_offline_cb(isolcpus_update_state.cpus, + do_housekeeping_exclude_cpumask, NULL); + /* + * A errno value of -EPERM may be returned from cpuhp_offline_cb() if + * any one of the CPUs in isolcpus_update_state.cpus can't be brought + * offline. This can happen for the boot CPU (normally CPU 0) which + * cannot be shut down. This CPU should not be used for creating + * isolated partition. + */ + if (ret == -EPERM) + pr_warn_once("cpuset: The boot CPU shouldn't be used for isolated partition\n"); + else + WARN_ON_ONCE(ret < 0); + cpumask_clear(isolcpus_update_state.cpus); + rebuild_sched_domains(); isolcpus_update_state.updating = false; }
@@ -2961,7 +3041,16 @@ static int update_prstate(struct cpuset *cs, int new_prs) update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs); - if (force_sd_rebuild) + + /* + * If boot time domain isolcpus exists and it conflicts with the CPUs + * in the new partition, we will have to reset HK_TYPE_DOMAIN cpumask. + */ + if (have_boot_isolcpus && (new_prs > PRS_MEMBER) && + !cpumask_subset(cs->effective_xcpus, housekeeping_cpumask(HK_TYPE_DOMAIN))) + isolcpus_update_state.updating = true; + + if (force_sd_rebuild && !isolcpus_update_state.updating) rebuild_sched_domains_locked(); free_cpumasks(NULL, &tmpmask); return 0; @@ -3232,7 +3321,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, }
free_cpuset(trialcs); - if (force_sd_rebuild) + if (force_sd_rebuild && !isolcpus_update_state.updating) rebuild_sched_domains_locked(); out_unlock: mutex_unlock(&cpuset_mutex); @@ -3999,7 +4088,7 @@ static void cpuset_handle_hotplug(void) }
/* rebuild sched domains if necessary */ - if (force_sd_rebuild) + if (force_sd_rebuild && !isolcpus_update_state.updating) rebuild_sched_domains_cpuslocked();
free_cpumasks(NULL, ptmp);