Hi Tixy,
On 14/03/14 13:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
@@ -1430,10 +1434,22 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
+#ifdef CONFIG_SCHED_HMP
+	{
+		bool nohz = got_nohz_idle_kick();
+		if (unlikely(this_rq()->wake_for_idle_pull) ||
+				unlikely(nohz)) {
+			if (unlikely(nohz))
+				this_rq()->idle_balance = 1;
+			raise_softirq_irqoff(SCHED_SOFTIRQ);
+		}
+	}
+#else
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#endif
Wouldn't the above change have been much simpler as:
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#ifdef CONFIG_SCHED_HMP
+	else if (unlikely(this_rq()->wake_for_idle_pull))
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+#endif
That would also avoid having the comment about checking for 'nohz idle load balance' associated with the HMP code which also does the idle pull check.
I like this change, I was originally intending to do something different here and ended up removing it but I didn't go back and clean this up properly.
I will issue a respin of this one early next week because of the stuff below, so I'll incorporate this.
 	irq_exit();
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a5cae4a..d59e0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3541,6 +3541,43 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+struct hmp_keepalive {
+	bool init;
+	ktime_t delay;
+	struct hrtimer timer;
+};
+DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive);
+
+/* setup per-cpu keepalive timers */
+static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
+{
+	return HRTIMER_NORESTART;
+}
+static inline void hmp_cpu_keepalive_init(struct hmp_keepalive *keepalive, int cpu)
+{
+	struct hrtimer *hrtimer;
+	unsigned int ns_delay = 1000000;
Is there any significance to a 1ms delay? Should the value be related to the scheduler tick, or idle wakeup latency? Or anything which needs to be different on different platforms?
If it's idle related, I notice this figure is the same as the target_residency for the TC2 driver, and in the past, when people have had timers set for this period, things have sat on a knife edge as to whether a given version of the kernel decides it has enough time to go idle or not. That can give dramatically different behaviour from build to build and week to week.
It is related to idle in that, if there are no pending events on the big CPU, it will go back to power-down state again in the small gap (~100usec on TC2) between the big CPU signalling it's now awake by pulling a task and the task actually being pushed off the little CPU.
I was aiming for a value which would keep the system alive but be definitely longer than the time it takes to do the push. In an ideal world I'd be able to specify a zero latency requirement on the target CPUs and clear it when the timer expires, which would guarantee the effect, but there isn't a way to express that in the kernel at the moment, hence the crude timer mechanism.
Do you think it would be worth fetching the cpuidle target residency of state 1 and setting the timer for MAX(150usec, state[1]->exit_latency/2)?
I already provided storage for the timer duration, which I initialize only once, in case this might be needed, but I wasn't too keen on integrating cpuidle here unless I really need to. It sounds like you think I do :)
I'll prototype something for the respin.
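For reference, the rough shape of what I'd prototype is below (sketch only, untested; it assumes CONFIG_CPU_IDLE is available, that states[1] is the first power-down state on the platform, and the helper name is made up):

/* illustration: derive the keepalive delay from cpuidle state 1
 * instead of hard-coding 1ms, falling back to 150usec when no
 * cpuidle driver is registered.
 */
#include <linux/cpuidle.h>

static unsigned int hmp_keepalive_delay_ns(void)
{
	struct cpuidle_driver *drv = cpuidle_get_driver();
	unsigned int us = 150;		/* floor: 150usec */

	if (drv && drv->state_count > 1)
		us = max(us, drv->states[1].exit_latency / 2);

	return us * NSEC_PER_USEC;
}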
+	if (keepalive->init)
+		return;
+	/* make sure we only init for the right CPU */
+	WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id()));
To be nit-picky, this WARN seems a little redundant given that the only caller (below) passes in exactly the parameter the function checks for; and I can't help wondering whether this init function couldn't just be part of hmp_cpu_keepalive_trigger(). None of this is important enough to warrant changing the code though.
This is another leftover from an earlier version where the initialization was not done as-needed but on CPU hotplug so I could avoid any checking in the timer arming path. I'll remove it in the respin - probably just roll it into the trigger as you suggest.
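Roughly what I have in mind for the rolled-together version (sketch only, untested; it keeps the 1ms delay from this posting until the cpuidle-derived value is sorted out):

static void hmp_cpu_keepalive_trigger(void)
{
	int cpu = smp_processor_id();
	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);

	if (!keepalive->init) {
		/* lazy per-cpu init rolled into the trigger path */
		hrtimer_init(&keepalive->timer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL_PINNED);
		keepalive->timer.function = hmp_cpu_keepalive_notify;
		keepalive->delay = ns_to_ktime(1000000);
		keepalive->init = true;
	}
	hrtimer_start(&keepalive->timer, keepalive->delay,
		      HRTIMER_MODE_REL_PINNED);
}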
+	hrtimer = &keepalive->timer;
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hrtimer->function = hmp_cpu_keepalive_notify;
+	keepalive->delay = ns_to_ktime(ns_delay);
+	keepalive->init = true;
+}
+static void hmp_cpu_keepalive_trigger(void)
+{
+	int cpu = smp_processor_id();
+	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
+	hmp_cpu_keepalive_init(keepalive, cpu);
+	hrtimer_start(&keepalive->timer, keepalive->delay, HRTIMER_MODE_REL_PINNED);
+}
 /* Setup hmp_domains */
 static int __init hmp_cpu_mask_setup(void)
 {
@@ -3598,9 +3635,13 @@ static void hmp_online_cpu(int cpu)
 static void hmp_offline_cpu(int cpu)
 {
 	struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+	struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
 
 	if(domain)
 		cpumask_clear_cpu(cpu, &domain->cpus);
+
+	if (keepalive->init)
+		hrtimer_cancel(&keepalive->timer);
 }
 /*
  * Needed to determine heaviest tasks etc.
@@ -7021,7 +7062,7 @@ static void hmp_force_up_migration(int this_cpu)
 		target = cpu_rq(cpu);
 		raw_spin_lock_irqsave(&target->lock, flags);
 		curr = target->cfs.curr;
-		if (!curr) {
+		if (!curr || target->active_balance) {
 			raw_spin_unlock_irqrestore(&target->lock, flags);
 			continue;
 		}
@@ -7038,16 +7079,13 @@ static void hmp_force_up_migration(int this_cpu)
 		curr = hmp_get_heaviest_task(curr, 1);
 		p = task_of(curr);
 		if (hmp_up_migration(cpu, &target_cpu, curr)) {
-			if (!target->active_balance) {
-				get_task_struct(p);
-				target->push_cpu = target_cpu;
-				target->migrate_task = p;
-				got_target = 1;
-				trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
-				hmp_next_up_delay(&p->se, target->push_cpu);
-			}
+			cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+			raw_spin_unlock_irqrestore(&target->lock, flags);
+			spin_unlock(&hmp_force_migration);
+			smp_send_reschedule(target_cpu);
+			return;
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it has found this one, whereas before it would continue to look for others as well? Do we still want to do that? I guess it might be too complicated if we're now letting the other CPU pull things in its own time. Or, if we were in the situation before where two tasks needed up-migrating, would the new pull method actually find both of those to pull anyway, so the behaviour is the same as before?
I intended this to work exactly as it is now, and rely upon the frequent calls to force_up_migration to cover all the CPUs because it was easier than handling the fight over runqueue access which happens when we wake multiple big CPUs and ask them each to find the biggest task to pull.
After having thought about your comment perhaps it's less optimal than it could be and I can probably do better.
As a first solution, I could instead count the number of eligible tasks and wake MIN(num_of_big_cpus, num_of_eligible_tasks) big CPUs. I can change idle pull to wait for the migration spinlock rather than exit if it can't take it. That will allow the big CPUs to sync themselves and each task to be pulled.
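To illustrate the idea (sketch only; the helper and where it would be called from are hypothetical, not part of this patch): after counting eligible tasks on the little CPUs, something like this could kick at most that many idle big CPUs:

static void hmp_wake_idle_big_cpus(struct hmp_domain *big, int nr_eligible)
{
	int cpu, kicked = 0;

	/* wake at most one idle big CPU per eligible task */
	for_each_cpu_and(cpu, &big->cpus, cpu_online_mask) {
		if (kicked >= nr_eligible)
			break;
		if (!idle_cpu(cpu))
			continue;
		cpu_rq(cpu)->wake_for_idle_pull = 1;
		smp_send_reschedule(cpu);
		kicked++;
	}
}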
}
-		if (!got_target && !target->active_balance) {
+		if (!got_target) {
 			/*
 			 * For now we just check the currently running task.
 			 * Selecting the lightest task for offloading will
@@ -7069,7 +7107,7 @@ static void hmp_force_up_migration(int this_cpu)
 		 * is not currently running move it, otherwise let the
 		 * CPU stopper take care of it.
 		 */
-		if (got_target && !target->active_balance) {
+		if (got_target) {
 			if (!task_running(target, p)) {
 				trace_sched_hmp_migrate_force_running(p, 0);
 				hmp_migrate_runnable_task(target);
@@ -7175,6 +7213,8 @@ static unsigned int hmp_idle_pull(int this_cpu)
 	raw_spin_unlock_irqrestore(&target->lock, flags);
 
 	if (force) {
+		/* start timer to keep us awake */
+		hmp_cpu_keepalive_trigger();
 		stop_one_cpu_nowait(cpu_of(target),
 			hmp_active_task_migration_cpu_stop,
 			target, &target->active_balance_work);
@@ -7198,6 +7238,18 @@ static void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
+#ifdef CONFIG_SCHED_HMP
+	/* shortcut for hmp idle pull wakeups */
+	if (unlikely(this_rq->wake_for_idle_pull)) {
+		this_rq->wake_for_idle_pull = 0;
+		if (hmp_idle_pull(this_cpu)) {
+			/* break out unless running nohz idle as well */
+			if (idle != CPU_IDLE)
+				return;
+		}
+	}
+#endif
+
 	hmp_force_up_migration(this_cpu);
 	rebalance_domains(this_cpu, idle);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 989c5ae..0d19ede 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -466,6 +466,7 @@ struct rq {
 	struct cpu_stop_work active_balance_work;
 #ifdef CONFIG_SCHED_HMP
 	struct task_struct *migrate_task;
+	int wake_for_idle_pull;
 #endif
 	/* cpu of this runqueue: */
 	int cpu;