Hi Tixy,
On 14/03/14 13:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
@@ -1430,10 +1434,22 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
+#ifdef CONFIG_SCHED_HMP
+	{
+		bool nohz = got_nohz_idle_kick();
+		if (unlikely(this_rq()->wake_for_idle_pull) ||
+				unlikely(nohz)) {
+			if (unlikely(nohz))
+				this_rq()->idle_balance = 1;
+			raise_softirq_irqoff(SCHED_SOFTIRQ);
+		}
+	}
+#else
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#endif
Wouldn't the above change have been much simpler as:
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#ifdef CONFIG_SCHED_HMP
+	else if (unlikely(this_rq()->wake_for_idle_pull))
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+#endif
That would also avoid having the comment about checking for 'nohz idle load balance' associated with the HMP code which also does the idle pull check.
I like this change, I was originally intending to do something different here and ended up removing it but I didn't go back and clean this up properly.
I will issue a respin of this one early next week because of the stuff below, so I'll incorporate this.
 	irq_exit();
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a5cae4a..d59e0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3541,6 +3541,43 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+struct hmp_keepalive {
+	bool init;
+	ktime_t delay;
+	struct hrtimer timer;
+};
+DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive);
+
+/* setup per-cpu keepalive timers */
+static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
+{
+	return HRTIMER_NORESTART;
+}
+static inline void hmp_cpu_keepalive_init(struct hmp_keepalive *keepalive, int cpu)
+{
+	struct hrtimer *hrtimer;
+	unsigned int ns_delay = 1000000;
Is there any significance to a 1ms delay? Should the value be related to the scheduler tick, or idle wakeup latency? Or anything which needs to be different on different platforms?
If it's idle related, I notice this figure is the same as the target_residency for the TC2 driver, and in the past, when people have had timers set for this period, things have sat on a knife edge as to whether a given version of the kernel decides it has enough time to go idle or not. That can give dramatically different behaviour from build to build and week to week.
It is related to idle in that, if there are no pending events on the big CPU, it will go back to power-down state again in the small gap (~100usec on TC2) between the big CPU signalling it's now awake by pulling a task and the task actually being pushed off the little CPU.
I was aiming for a value which would keep the system alive but be definitely longer than the time it takes to do the push. In an ideal world I'd be able to specify a zero latency requirement on the target CPUs and clear it when the timer expires, which would guarantee the effect, but there isn't a way to express that in the kernel at the moment, hence the crude timer mechanism.
Do you think it would be worth fetching the cpuidle target residency of state 1 and setting the timer for MAX(150usec, state[1]->exit_latency/2)?
I already provided storage for the timer duration, which I initialize only once, in case this might be needed, but I wasn't too keen on integrating cpuidle here unless I really need to. It sounds like you think I do :)
I'll prototype something for the respin.
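For reference, the rough shape of what I'd prototype is below (sketch only, untested; it assumes CONFIG_CPU_IDLE is available, that states[1] is the first power-down state on the platform, and the helper name is made up):

/* illustration: derive the keepalive delay from cpuidle state 1
 * instead of hard-coding 1ms, falling back to 150usec when no
 * cpuidle driver is registered.
 */
#include <linux/cpuidle.h>

static unsigned int hmp_keepalive_delay_ns(void)
{
	struct cpuidle_driver *drv = cpuidle_get_driver();
	unsigned int us = 150;		/* floor: 150usec */

	if (drv && drv->state_count > 1)
		us = max(us, drv->states[1].exit_latency / 2);

	return us * NSEC_PER_USEC;
}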
+	if (keepalive->init)
+		return;
+	/* make sure we only init for the right CPU */
+	WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id()));
To be nit-picky, this WARN seems a little redundant given that the only caller (below) passes in exactly the parameter the function checks for; and I can't help wondering whether this init function couldn't just be part of hmp_cpu_keepalive_trigger(). None of this is important enough to warrant changing the code though.
This is another leftover from an earlier version where the initialization was not done as-needed but on CPU hotplug so I could avoid any checking in the timer arming path. I'll remove it in the respin - probably just roll it into the trigger as you suggest.
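Roughly what I have in mind for the rolled-together version (sketch only, untested; it keeps the 1ms delay from this posting until the cpuidle-derived value is sorted out):

static void hmp_cpu_keepalive_trigger(void)
{
	int cpu = smp_processor_id();
	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);

	if (!keepalive->init) {
		/* lazy per-cpu init rolled into the trigger path */
		hrtimer_init(&keepalive->timer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL_PINNED);
		keepalive->timer.function = hmp_cpu_keepalive_notify;
		keepalive->delay = ns_to_ktime(1000000);
		keepalive->init = true;
	}
	hrtimer_start(&keepalive->timer, keepalive->delay,
		      HRTIMER_MODE_REL_PINNED);
}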
+	hrtimer = &keepalive->timer;
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hrtimer->function = hmp_cpu_keepalive_notify;
+	keepalive->delay = ns_to_ktime(ns_delay);
+	keepalive->init = true;
+}
+static void hmp_cpu_keepalive_trigger(void)
+{
+	int cpu = smp_processor_id();
+	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
+	hmp_cpu_keepalive_init(keepalive, cpu);
+	hrtimer_start(&keepalive->timer, keepalive->delay, HRTIMER_MODE_REL_PINNED);
+}
 /* Setup hmp_domains */
 static int __init hmp_cpu_mask_setup(void)
 {
@@ -3598,9 +3635,13 @@ static void hmp_online_cpu(int cpu)
 static void hmp_offline_cpu(int cpu)
 {
 	struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
 
 	if(domain)
 		cpumask_clear_cpu(cpu, &domain->cpus);
+
+	if (keepalive->init)
+		hrtimer_cancel(&keepalive->timer);
 }
 /*
  * Needed to determine heaviest tasks etc.
@@ -7021,7 +7062,7 @@ static void hmp_force_up_migration(int this_cpu)
 		target = cpu_rq(cpu);
 		raw_spin_lock_irqsave(&target->lock, flags);
 		curr = target->cfs.curr;
-		if (!curr) {
+		if (!curr || target->active_balance) {
 			raw_spin_unlock_irqrestore(&target->lock, flags);
 			continue;
 		}
@@ -7038,16 +7079,13 @@ static void hmp_force_up_migration(int this_cpu)
 		curr = hmp_get_heaviest_task(curr, 1);
 		p = task_of(curr);
 		if (hmp_up_migration(cpu, &target_cpu, curr)) {
-			if (!target->active_balance) {
-				get_task_struct(p);
-				target->push_cpu = target_cpu;
-				target->migrate_task = p;
-				got_target = 1;
-				trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
-				hmp_next_up_delay(&p->se, target->push_cpu);
-			}
+			cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+			raw_spin_unlock_irqrestore(&target->lock, flags);
+			spin_unlock(&hmp_force_migration);
+			smp_send_reschedule(target_cpu);
+			return;
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it has found this one, whereas before it would continue to look for others as well? Do we still want to do that? I guess it might be too complicated if we're now letting the other CPU pull things in its own time. Or, if we were in the situation before where two tasks needed up-migrating, would the new pull method actually find both of those to pull anyway, so the behaviour is the same as before?
I intended this to work exactly as it is now, and rely upon the frequent calls to force_up_migration to cover all the CPUs because it was easier than handling the fight over runqueue access which happens when we wake multiple big CPUs and ask them each to find the biggest task to pull.
After having thought about your comment perhaps it's less optimal than it could be and I can probably do better.
As a first solution, I could instead count the number of eligible tasks and wake MIN(num_of_big_cpus, num_of_eligible_tasks) big CPUs. I can change idle pull to wait for the migration spinlock rather than exit if it can't take it. That will allow the big CPUs to sync themselves and each task to be pulled.
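To illustrate the idea (sketch only; the helper and where it would be called from are hypothetical, not part of this patch): after counting eligible tasks on the little CPUs, something like this could kick at most that many idle big CPUs:

static void hmp_wake_idle_big_cpus(struct hmp_domain *big, int nr_eligible)
{
	int cpu, kicked = 0;

	/* wake at most one idle big CPU per eligible task */
	for_each_cpu_and(cpu, &big->cpus, cpu_online_mask) {
		if (kicked >= nr_eligible)
			break;
		if (!idle_cpu(cpu))
			continue;
		cpu_rq(cpu)->wake_for_idle_pull = 1;
		smp_send_reschedule(cpu);
		kicked++;
	}
}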
}
-		if (!got_target && !target->active_balance) {
+		if (!got_target) {
 			/*
 			 * For now we just check the currently running task.
 			 * Selecting the lightest task for offloading will
@@ -7069,7 +7107,7 @@ static void hmp_force_up_migration(int this_cpu)
 		 * is not currently running move it, otherwise let the
 		 * CPU stopper take care of it.
 		 */
-		if (got_target && !target->active_balance) {
+		if (got_target) {
 			if (!task_running(target, p)) {
 				trace_sched_hmp_migrate_force_running(p, 0);
 				hmp_migrate_runnable_task(target);
@@ -7175,6 +7213,8 @@ static unsigned int hmp_idle_pull(int this_cpu)
 	raw_spin_unlock_irqrestore(&target->lock, flags);
 
 	if (force) {
+		/* start timer to keep us awake */
+		hmp_cpu_keepalive_trigger();
 		stop_one_cpu_nowait(cpu_of(target),
 			hmp_active_task_migration_cpu_stop,
 			target, &target->active_balance_work);
@@ -7198,6 +7238,18 @@ static void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
+#ifdef CONFIG_SCHED_HMP
+	/* shortcut for hmp idle pull wakeups */
+	if (unlikely(this_rq->wake_for_idle_pull)) {
+		this_rq->wake_for_idle_pull = 0;
+		if (hmp_idle_pull(this_cpu)) {
+			/* break out unless running nohz idle as well */
+			if (idle != CPU_IDLE)
+				return;
+		}
+	}
+#endif
+
 	hmp_force_up_migration(this_cpu);
 	rebalance_domains(this_cpu, idle);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 989c5ae..0d19ede 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -466,6 +466,7 @@ struct rq {
 	struct cpu_stop_work active_balance_work;
 #ifdef CONFIG_SCHED_HMP
 	struct task_struct *migrate_task;
+	int wake_for_idle_pull;
 #endif
 	/* cpu of this runqueue: */
 	int cpu;