On my SMP platform, which is made of 5 cores in 2 clusters, the nr_busy_cpus
field of the sched_group_power struct is not null when the platform is fully
idle. The root cause is:
During the boot sequence, some CPUs reach the idle loop and set their
NOHZ_IDLE flag while waiting for other CPUs to boot. But the nr_busy_cpus
field is initialized later with the assumption that all CPUs are in the busy
state, whereas some CPUs have already set their NOHZ_IDLE flag.
More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.
This condition can be ensured by adding a synchronize_rcu() between the
destruction of the old sched_domains and the creation of the new ones, so the
NOHZ_IDLE flag will not be updated against an old sched_domain once it has
been initialized. But this solution introduces additional latency in the
rebuild sequence that is called during CPU hotplug.
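For reference, a minimal sketch of that rejected ordering in the rebuild path
(detach_destroy_domains() and build_sched_domains() exist in kernel/sched/core.c;
init_nohz_idle_flags() is a hypothetical placeholder for the flag initialization):

	/* Sketch only: serialize the domain rebuild with RCU. */
	detach_destroy_domains(cpu_map);	/* tear down the old sched_domains */
	synchronize_rcu();			/* wait out readers of the old domains */
	init_nohz_idle_flags(cpu_map);		/* hypothetical: reset NOHZ_IDLE safely */
	build_sched_domains(cpu_map, attr);	/* build and attach the new domains */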
As suggested by Frederic Weisbecker, another solution is to have the same RCU
lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
introduced a new sched_domain_rq struct that is the entry point for both the
sched_domains and the objects that must follow the same lifecycle, like the
NOHZ_IDLE flag. They share the same RCU lifecycle and are always synchronized.
The synchronization is done at the cost of:
- an additional indirection for accessing the first sched_domain level
- an additional indirection and an rcu_dereference before accessing the
NOHZ_IDLE flag (see the sketch below)
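As an illustration of that cost, reading the flag now goes through the
sched_domain_rq; a minimal sketch mirroring the new set_cpu_sd_state_busy/idle
code below (the helper name is hypothetical):

	/* Hypothetical helper, for illustration only. */
	static bool cpu_is_nohz_idle(int cpu)
	{
		struct sched_domain_rq *sd_rq;
		bool idle = false;

		rcu_read_lock();
		/* 1st indirection: the per-rq sched_domain_rq pointer */
		sd_rq = rcu_dereference(cpu_rq(cpu)->sd_rq);
		if (sd_rq)
			/* the flag shares the RCU lifecycle of sd_rq->sd */
			idle = test_bit(NOHZ_IDLE, &sd_rq->flags);
		rcu_read_unlock();

		return idle;
	}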
Changes since V4:
- link both sched_domain and NOHZ_IDLE flag in one RCU object so
their states are always synchronized.
Changes since V3:
- NOHZ flag is not cleared if a NULL domain is attached to the CPU
- Remove patch 2/2 which becomes useless with latest modifications
Changes since V2:
- change the initialization to idle state instead of busy state so a CPU that
enters idle during the build of the sched_domain will not corrupt the
initialization state
Changes since V1:
- remove the patch for SCHED softirq on an idle core use case as it was
a side effect of the other use cases.
Signed-off-by: Vincent Guittot <vincent.guittot(a)linaro.org>
---
include/linux/sched.h | 6 +++
kernel/sched/core.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/fair.c | 35 +++++++++++------
kernel/sched/sched.h | 24 +++++++++--
4 files changed, 145 insertions(+), 25 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6..2a52188 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -959,6 +959,12 @@ struct sched_domain {
unsigned long span[0];
};
+struct sched_domain_rq {
+ struct sched_domain *sd;
+ unsigned long flags;
+ struct rcu_head rcu; /* used during destruction */
+};
+
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
{
return to_cpumask(sd->span);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624..69e2313 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
destroy_sched_domain(sd, cpu);
}
+static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
+{
+ if (!sd_rq)
+ return;
+
+ destroy_sched_domains(sd_rq->sd, cpu);
+ kfree_rcu(sd_rq, rcu);
+}
+
/*
* Keep a special pointer to the highest sched_domain that has
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
@@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
* hold the hotplug lock.
*/
static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
+ int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
+ struct sched_domain_rq *tmp_rq;
+ struct sched_domain *tmp, *sd = NULL;
+
+ /*
+ * If we don't have any sched_domain and associated object, we can
+ * directly jump to the attach sequence otherwise we try to degenerate
+ * the sched_domain
+ */
+ if (!sd_rq)
+ goto attach;
+
+ /* Get a pointer to the 1st sched_domain */
+ sd = sd_rq->sd;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
+ /* update sched_domain_rq */
+ sd_rq->sd = sd;
}
+attach:
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
+ tmp_rq = rq->sd_rq;
+ rcu_assign_pointer(rq->sd_rq, sd_rq);
+ destroy_sched_domain_rq(tmp_rq, cpu);
update_top_cache_domain(cpu);
}
@@ -5695,12 +5720,14 @@ struct sd_data {
};
struct s_data {
+ struct sched_domain_rq ** __percpu sd_rq;
struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
sa_rootdomain,
+ sa_sd_rq,
sa_sd,
sa_sd_storage,
sa_none,
@@ -5935,7 +5962,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
return;
update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ atomic_set(&sg->sgp->nr_busy_cpus, 0);
}
int __weak arch_sd_sibling_asym_packing(void)
@@ -6011,6 +6038,8 @@ static void set_domain_attribute(struct sched_domain *sd,
static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d);
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d);
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
@@ -6019,6 +6048,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd_rq:
+ __sdrq_free(cpu_map, d); /* fall through */
+ free_percpu(d->sd_rq); /* fall through */
case sa_sd:
free_percpu(d->sd); /* fall through */
case sa_sd_storage:
@@ -6038,9 +6070,14 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
return sa_sd_storage;
+ d->sd_rq = alloc_percpu(struct sched_domain_rq *);
+ if (!d->sd_rq)
+ return sa_sd;
+ if (__sdrq_alloc(cpu_map, d))
+ return sa_sd_rq;
d->rd = alloc_rootdomain();
if (!d->rd)
- return sa_sd;
+ return sa_sd_rq;
return sa_rootdomain;
}
@@ -6466,6 +6503,46 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain_rq *sd_rq;
+
+ sd_rq = kzalloc_node(sizeof(struct sched_domain_rq),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd_rq)
+ return -ENOMEM;
+
+ *per_cpu_ptr(d->sd_rq, j) = sd_rq;
+ }
+
+ return 0;
+}
+
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map)
+ if (*per_cpu_ptr(d->sd_rq, j))
+ kfree(*per_cpu_ptr(d->sd_rq, j));
+}
+
+static void build_sched_domain_rq(struct s_data *d, int cpu)
+{
+ struct sched_domain_rq *sd_rq;
+ struct sched_domain *sd;
+
+ /* Attach sched_domain to sched_domain_rq */
+ sd = *per_cpu_ptr(d->sd, cpu);
+ sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
+ sd_rq->sd = sd;
+ /* Init flags */
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+}
+
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
struct s_data *d, const struct cpumask *cpu_map,
struct sched_domain_attr *attr, struct sched_domain *child,
@@ -6495,6 +6572,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6547,11 +6625,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
+ /* Init objects that must follow the sched_domain lifecycle */
+ for_each_cpu(i, cpu_map) {
+ build_sched_domain_rq(&d, i);
+ }
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
+ sd_rq = *per_cpu_ptr(d.sd_rq, i);
+ cpu_attach_domain(sd_rq, d.rd, i);
+ /* claim allocation of sched_domain_rq object */
+ *per_cpu_ptr(d.sd_rq, i) = NULL;
}
rcu_read_unlock();
@@ -6982,7 +7067,7 @@ void __init sched_init(void)
rq->last_load_update_tick = jiffies;
#ifdef CONFIG_SMP
- rq->sd = NULL;
+ rq->sd_rq = NULL;
rq->rd = NULL;
rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..1c7447e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,31 +5392,39 @@ static inline void nohz_balance_exit_idle(int cpu)
static inline void set_cpu_sd_state_busy(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || !test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ clear_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
void set_cpu_sd_state_idle(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5673,7 +5681,12 @@ static void run_rebalance_domains(struct softirq_action *h)
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+ struct sched_domain_rq *sd_rq =
+ rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
+ struct sched_domain *sd = NULL;
+ if (sd_rq)
+ sd = sd_rq->sd;
+ return !sd;
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..f589306 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -417,7 +417,7 @@ struct rq {
#ifdef CONFIG_SMP
struct root_domain *rd;
- struct sched_domain *sd;
+ struct sched_domain_rq *sd_rq;
unsigned long cpu_power;
@@ -505,21 +505,37 @@ DECLARE_PER_CPU(struct rq, runqueues);
#ifdef CONFIG_SMP
-#define rcu_dereference_check_sched_domain(p) \
+#define rcu_dereference_check_sched_domain_rq(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
+#define get_sched_domain_rq(cpu) \
+ rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)
+
+#define rcu_dereference_check_sched_domain(cpu) ({ \
+ struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
+ struct sched_domain *__sd = NULL; \
+ if (__sd_rq) \
+ __sd = __sd_rq->sd; \
+ __sd; \
+})
+
+#define sched_rq_flags(sd_rq) (&sd_rq->flags)
+
/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * The domain tree (rq->sd_rq) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_check_sched_domain(cpu); \
__sd; __sd = __sd->parent)
+#define for_each_domain_from_rq(sd_rq, __sd) \
+ for (__sd = sd_rq->sd; __sd; __sd = __sd->parent)
+
#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
/**
--
1.7.9.5
This patch series provides some code consolidation across the different
cpuidle drivers. It contains two parts: the first is the removal of the
timekeeping flag, and the second is a common initialization routine.
All the drivers use the en_core_tk_irqen flag, which means it is not necessary
to make the time computation optional. We can remove this flag and assume the
cpuidle framework always manages this operation.
The cpuidle initialization code is duplicated across the different drivers in
the same manner.
The repeating pattern is:
SMP:

	cpuidle_register_driver(drv);
	for_each_possible_cpu(cpu) {
		dev = per_cpu(cpuidle_device, cpu);
		cpuidle_register_device(dev);
	}

UP:

	cpuidle_register_driver(drv);
	cpuidle_register_device(dev);
As the 'for_each_possible_cpu' macro is a single-iteration loop on a UP
machine, the SMP initialization loop works for UP as well.
The patchset does some cleanup for different drivers in order to make the init
code the same. Then it introduces a generic function:
cpuidle_register(struct cpuidle_driver *drv, struct cpumask *cpumask)
The cpumask is for the coupled idle states.
The drivers are then modified to take into account this new function and
to remove the duplicated code.
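To illustrate the intent, here is a minimal sketch of what a driver's init
then looks like (using the cpuidle_register() prototype quoted above; the
driver name and its single ARM WFI state are hypothetical placeholders):

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/cpuidle.h>
	#include <asm/cpuidle.h>		/* ARM_CPUIDLE_WFI_STATE */

	static struct cpuidle_driver foo_idle_driver = {
		.name		= "foo_idle",
		.owner		= THIS_MODULE,
		.states[0]	= ARM_CPUIDLE_WFI_STATE,
		.state_count	= 1,
	};

	static int __init foo_cpuidle_init(void)
	{
		/*
		 * One call replaces cpuidle_register_driver() plus the
		 * per-CPU cpuidle_register_device() loop; the cpumask is
		 * only needed for coupled idle states, so pass NULL here.
		 */
		return cpuidle_register(&foo_idle_driver, NULL);
	}
	device_initcall(foo_cpuidle_init);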
The benefit is observable in the diffstat: 332 lines of code removed.
Tested-on: u8500
Tested-on: at91
Tested-on: intel i5
Tested-on: OMAP4
Compiled with and without CPU_IDLE for:
u8500, at91, davinci, exynos, imx5, imx6, kirkwood, multi_v7 (for calxeda),
omap2plus, s3c64, tegra1, tegra2, tegra3
Daniel Lezcano (19):
ARM: shmobile: cpuidle: remove shmobile_enter_wfi function
ARM: OMAP3: remove cpuidle_wrap_enter
cpuidle: remove en_core_tk_irqen flag
ARM: ux500: cpuidle: replace for_each_online_cpu by
for_each_possible_cpu
ARM: imx: cpuidle: create separate drivers for imx5/imx6
cpuidle: make a single register function for all
ARM: ux500: cpuidle: use init/exit common routine
ARM: at91: cpuidle: use init/exit common routine
ARM: OMAP3: cpuidle: use init/exit common routine
ARM: s3c64xx: cpuidle: use init/exit common routine
ARM: tegra: cpuidle: use init/exit common routine
ARM: shmobile: cpuidle: use init/exit common routine
ARM: OMAP4: cpuidle: use init/exit common routine
ARM: tegra: cpuidle: use init/exit common routine for tegra2
ARM: tegra: cpuidle: use init/exit common routine for tegra3
ARM: calxeda: cpuidle: use init/exit common routine
ARM: kirkwood: cpuidle: use init/exit common routine
ARM: davinci: cpuidle: use init/exit common routine
ARM: imx: cpuidle: use init/exit common routine
Documentation/cpuidle/driver.txt | 6 +
arch/arm/mach-at91/cpuidle.c | 18 +--
arch/arm/mach-davinci/cpuidle.c | 21 +---
arch/arm/mach-exynos/cpuidle.c | 1 -
arch/arm/mach-imx/Makefile | 1 +
arch/arm/mach-imx/cpuidle-imx5.c | 40 +++++++
arch/arm/mach-imx/cpuidle-imx6q.c | 3 +-
arch/arm/mach-imx/cpuidle.c | 80 -------------
arch/arm/mach-imx/cpuidle.h | 10 +-
arch/arm/mach-imx/pm-imx5.c | 30 +----
arch/arm/mach-omap2/cpuidle34xx.c | 49 ++------
arch/arm/mach-omap2/cpuidle44xx.c | 23 +---
arch/arm/mach-s3c64xx/cpuidle.c | 15 +--
arch/arm/mach-shmobile/cpuidle.c | 11 +-
arch/arm/mach-shmobile/include/mach/common.h | 3 -
arch/arm/mach-shmobile/pm-sh7372.c | 2 -
arch/arm/mach-tegra/cpuidle-tegra114.c | 27 +----
arch/arm/mach-tegra/cpuidle-tegra20.c | 31 +----
arch/arm/mach-tegra/cpuidle-tegra30.c | 28 +----
arch/arm/mach-ux500/cpuidle.c | 33 +-----
arch/powerpc/platforms/pseries/processor_idle.c | 1 -
arch/sh/kernel/cpu/shmobile/cpuidle.c | 1 -
arch/x86/kernel/apm_32.c | 1 -
drivers/acpi/processor_idle.c | 1 -
drivers/cpuidle/cpuidle-calxeda.c | 53 +--------
drivers/cpuidle/cpuidle-kirkwood.c | 18 +--
drivers/cpuidle/cpuidle.c | 144 ++++++++++++++---------
drivers/idle/intel_idle.c | 1 -
include/linux/cpuidle.h | 20 ++--
29 files changed, 175 insertions(+), 497 deletions(-)
create mode 100644 arch/arm/mach-imx/cpuidle-imx5.c
delete mode 100644 arch/arm/mach-imx/cpuidle.c
--
1.7.9.5
__cpufreq_governor() must be called with a correct policy->cpus mask. In
__cpufreq_remove_dev() we initially clear policy->cpus with cpumask_clear_cpu()
and then call __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT). If a
governor does some per-CPU work in its EXIT callback, this can create uncertain
behavior.
The generic governors in drivers/cpufreq/ don't do any per-CPU work in their
EXIT callbacks, so we don't face any issues currently. But it is better to keep
the code clean, so we don't face any issues in the future.
Now we call cpumask_clear_cpu() only when multiple CPUs are managed by the
policy.
Signed-off-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
drivers/cpufreq/cpufreq.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index fd97a62..3564947 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1105,7 +1105,9 @@ static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif
WARN_ON(lock_policy_rwsem_write(cpu));
cpus = cpumask_weight(data->cpus);
- cpumask_clear_cpu(cpu, data->cpus);
+
+ if (cpus > 1)
+ cpumask_clear_cpu(cpu, data->cpus);
unlock_policy_rwsem_write(cpu);
if (cpu != data->cpu) {
--
1.7.12.rc2.18.g61b472e
On 3 April 2013 16:00, Benjamin Herrenschmidt <benh(a)kernel.crashing.org> wrote:
> On Wed, 2013-04-03 at 15:00 +0530, Viresh Kumar wrote:
>> On 31 March 2013 09:33, Viresh Kumar <viresh.kumar(a)linaro.org> wrote:
>> > Benjamin/Paul/Olof,
>> >
>> > Any comments on this?
>>
>> Ping!!
>
> I'm on vacation until end of April. No objection to the patch but
> somebody needs to test it.
Hi,
Can somebody else from the powerpc world give it a try?
Or, @Rafael: can we get this pushed into linux-next as is? Then people would be
forced to test it, and if there are any complaints, I will fix them or you can
revert it.
The current update of the rq's load can be erroneous when RT tasks are
involved.
The update of the load of an rq that becomes idle is done only if the avg_idle
is less than sysctl_sched_migration_cost. If RT tasks and short idle durations
alternate, the runnable_avg will not be updated correctly and the time will be
accounted as idle time when a CFS task wakes up.
A new idle_enter function is called when the next task is the idle function,
so the elapsed time will be accounted as run time in the load of the rq,
whatever the average idle time is. The function update_rq_runnable_avg is
removed from idle_balance.
When an RT task is scheduled on an idle CPU, the update of the rq's load is
not done when the rq exits the idle state because CFS's functions are not
called. Then idle_balance, which is called just before entering the idle
function, updates the rq's load and makes the assumption that the elapsed
time since the last update was only running time.
As a consequence, the rq's load of a CPU that only runs a periodic RT task
is close to LOAD_AVG_MAX, whatever the running duration of the RT task is.
A new idle_exit function is called when the prev task is the idle function,
so the elapsed time will be accounted as idle time in the rq's load.
Changes since V5:
- Rename idle_enter/exit function to idle_enter/exit_fair
Changes since V4:
- Rebase on v3.9-rc6 instead of Steven Rostedt's patches
- Create the post_schedule_idle function that was previously created by Steven's patches
Changes since V3:
- Remove the dependency on CONFIG_FAIR_GROUP_SCHED
- Add a new idle_enter function and create a post_schedule callback for
idle class
- Remove the update_runnable_avg from idle_balance
Changes since V2:
- remove useless definition for UP platform
- rebased on top of Steven Rostedt's patches :
https://lkml.org/lkml/2013/2/12/558
Changes since V1:
- move code out of schedule function and create a pre_schedule callback for
idle class instead.
Signed-off-by: Vincent Guittot <vincent.guittot(a)linaro.org>
---
kernel/sched/fair.c | 23 +++++++++++++++++++++--
kernel/sched/idle_task.c | 16 ++++++++++++++++
kernel/sched/sched.h | 12 ++++++++++++
3 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..1de3df0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1562,6 +1562,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -5219,8 +5240,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
- update_rq_runnable_avg(this_rq, 1);
-
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf37..b8ce773 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+ idle_enter_fair(rq);
+}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+ /* Trigger the post schedule to do an idle_enter for CFS */
+ rq->post_schedule = 1;
+#endif
return rq->idle;
}
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
+ .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..8f1d80e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -880,6 +880,18 @@ extern const struct sched_class idle_sched_class;
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)
--
1.7.9.5
The current update of the rq's load can be erroneous when RT tasks are
involved.
The update of the load of an rq that becomes idle is done only if the avg_idle
is less than sysctl_sched_migration_cost. If RT tasks and short idle durations
alternate, the runnable_avg will not be updated correctly and the time will be
accounted as idle time when a CFS task wakes up.
A new idle_enter function is called when the next task is the idle function,
so the elapsed time will be accounted as run time in the load of the rq,
whatever the average idle time is. The function update_rq_runnable_avg is
removed from idle_balance.
When an RT task is scheduled on an idle CPU, the update of the rq's load is
not done when the rq exits the idle state because CFS's functions are not
called. Then idle_balance, which is called just before entering the idle
function, updates the rq's load and makes the assumption that the elapsed
time since the last update was only running time.
As a consequence, the rq's load of a CPU that only runs a periodic RT task
is close to LOAD_AVG_MAX, whatever the running duration of the RT task is.
A new idle_exit function is called when the prev task is the idle function,
so the elapsed time will be accounted as idle time in the rq's load.
Changes since V3:
- Remove the dependency on CONFIG_FAIR_GROUP_SCHED
- Add a new idle_enter function and create a post_schedule callback for
idle class
- Remove the update_runnable_avg from idle_balance
Changes since V2:
- remove useless definition for UP platform
- rebased on top of Steven Rostedt's patches :
https://lkml.org/lkml/2013/2/12/558
Changes since V1:
- move code out of schedule function and create a pre_schedule callback for
idle class instead.
Signed-off-by: Vincent Guittot <vincent.guittot(a)linaro.org>
---
kernel/sched/fair.c | 23 +++++++++++++++++++++--
kernel/sched/idle_task.c | 10 ++++++++++
kernel/sched/sched.h | 12 ++++++++++++
3 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fcdbff..1851ca8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1562,6 +1562,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -5219,8 +5240,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
- update_rq_runnable_avg(this_rq, 1);
-
/*
* Drop the rq->lock, but keep preempt disabled.
*/
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 66b5220..0775261 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -14,8 +14,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
return task_cpu(p); /* IDLE tasks as never migrated */
}
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ /* Update rq's load with elapsed idle time */
+ idle_exit(rq);
+}
+
static void post_schedule_idle(struct rq *rq)
{
+ /* Update rq's load with elapsed running time */
+ idle_enter(rq);
+
idle_balance(smp_processor_id(), rq);
}
#endif /* CONFIG_SMP */
@@ -95,6 +104,7 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
.post_schedule = post_schedule_idle,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc88644..ff4b029 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -878,6 +878,18 @@ extern const struct sched_class idle_sched_class;
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter(struct rq *this_rq);
+extern void idle_exit(struct rq *this_rq);
+#else
+static inline void idle_enter(struct rq *this_rq) {}
+static inline void idle_exit(struct rq *this_rq) {}
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)
--
1.7.9.5
The current update of the rq's load can be erroneous when RT tasks are
involved.
The update of the load of an rq that becomes idle is done only if the avg_idle
is less than sysctl_sched_migration_cost. If RT tasks and short idle durations
alternate, the runnable_avg will not be updated correctly and the time will be
accounted as idle time when a CFS task wakes up.
A new idle_enter function is called when the next task is the idle function,
so the elapsed time will be accounted as run time in the load of the rq,
whatever the average idle time is. The function update_rq_runnable_avg is
removed from idle_balance.
When an RT task is scheduled on an idle CPU, the update of the rq's load is
not done when the rq exits the idle state because CFS's functions are not
called. Then idle_balance, which is called just before entering the idle
function, updates the rq's load and makes the assumption that the elapsed
time since the last update was only running time.
As a consequence, the rq's load of a CPU that only runs a periodic RT task
is close to LOAD_AVG_MAX, whatever the running duration of the RT task is.
A new idle_exit function is called when the prev task is the idle function,
so the elapsed time will be accounted as idle time in the rq's load.
Changes since V4:
- Rebase on v3.9-rc6 instead of Steven Rostedt's patches
- Create the post_schedule_idle function that was previously created by Steven's patches
Changes since V3:
- Remove the dependency on CONFIG_FAIR_GROUP_SCHED
- Add a new idle_enter function and create a post_schedule callback for
idle class
- Remove the update_runnable_avg from idle_balance
Changes since V2:
- remove useless definition for UP platform
- rebased on top of Steven Rostedt's patches :
https://lkml.org/lkml/2013/2/12/558
Changes since V1:
- move code out of schedule function and create a pre_schedule callback for
idle class instead.
Signed-off-by: Vincent Guittot <vincent.guittot(a)linaro.org>
---
kernel/sched/fair.c | 23 +++++++++++++++++++++--
kernel/sched/idle_task.c | 16 ++++++++++++++++
kernel/sched/sched.h | 12 ++++++++++++
3 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..653edd8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1562,6 +1562,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -5219,8 +5240,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
- update_rq_runnable_avg(this_rq, 1);
-
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf37..cef61fa 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ idle_exit(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+ idle_enter(rq);
+}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+ /* Trigger the post schedule to do an idle_enter for CFS */
+ rq->post_schedule = 1;
+#endif
return rq->idle;
}
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
+ .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..2b826f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -880,6 +880,18 @@ extern const struct sched_class idle_sched_class;
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter(struct rq *this_rq);
+extern void idle_exit(struct rq *this_rq);
+#else
+static inline void idle_enter(struct rq *this_rq) {}
+static inline void idle_exit(struct rq *this_rq) {}
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)
--
1.7.9.5
Hi,
I was going through the b.L switcher code. I found a call to
enter_nonsecure_world() with the parameter "bl_image"; obviously it must be
the address of a function that initializes the switcher functionality. But I
couldn't find any other reference to this symbol in the switcher code. Can
somebody please explain this?
Thanks.
-Prashant
=== David Long ===
=== Highlights ===
* Responded to QA requests for input on testing requirements for uprobes
and kprobes.
* Spent some time coming up to speed on systemtap.
* Still working on a clean way to disentangle uprobe and kprobe code
without unnecessary duplication.
=== Plans ===
* Restructure code
* Start building systemtap
=== Issues ===
* Apparently we have a complaint from a TSC member that kprobes does not
work, yet v3.8 passes the kernel built-in tests, and when exercised
manually kprobes seems to work. We need more specific information about
the problems seen.
=== Travel/Time Off ===
-dl
This patch series adds support for DRM FIMD DT for Exynos4 DT machines,
specifically for the Exynos4412 SoC.
changes since v10:
- addressed comments from Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com>
changes since v9:
- dropped the patch "ARM: dts: Add lcd pinctrl node entries for EXYNOS4412 SoC",
as the gpios in the newly added nodes "lcd_en" and "lcd_sync" in this patch
were already pulled high by the existing "lcd_clk" node.
- addressed comments from Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com>
and Thomas Abraham <thomas.abraham(a)linaro.org>
changes since v8:
- addressed comments to add missing documentation for clock and clock-names
properties as pointed out by Sachin Kamat <sachin.kamat(a)linaro.org>
changes since v7:
- rebased to kgene's "for-next"
- Migrated to Common Clock Framework
- removed the patch "ARM: dts: Add FIMD AUXDATA node entry for exynos4 DT",
as migration to Common Clock Framework will NOT need this.
- addressed the comments raised by Sachin Kamat <sachin.kamat(a)linaro.org>
changes since v6:
- addressed comments and added interrupt-names = "fifo", "vsync", "lcd_sys"
in exynos4.dtsi and re-ordered the interrupt numbering to match the order in
the interrupt combiner IP, as suggested by Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com>.
changes since v5:
- renamed the fimd binding documentation file to "samsung-fimd.txt",
since it talks not only about the exynos display controller but also about
previous samsung display controllers.
- rephrased an ambiguous statement about the interrupt combiner in the
fimd binding documentation, as pointed out by
Sachin Kamat <sachin.kamat(a)linaro.org>
changes since v4:
- moved the fimd binding documentation to Documentation/devicetree/bindings/video/
as suggested by Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com>
- added more fimd compatibility strings in the fimd documentation as
discussed at https://patchwork.kernel.org/patch/2144861/ with
Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com> and
Tomasz Figa <tomasz.figa(a)gmail.com>
- modified the compatible string for exynos4 fimd to "exynos4210-fimd" and for
exynos5 fimd to "exynos5250-fimd", to stick to the rule that the compatible
value should be named after the first specific SoC model in which this
particular IP version was included, as discussed at
https://patchwork.kernel.org/patch/2144861/
- documented more about the interrupt combiner and their order as
suggested by Sylwester Nawrocki <sylvester.nawrocki(a)gmail.com>
changes since v3:
- rebased on
http://git.kernel.org/?p=linux/kernel/git/kgene/linux-samsung.git;a=shortlo…
changes since v2:
- added alias to 'fimd@11c00000' node
(reported by: Rahul Sharma <r.sh.open(a)gmail.com>)
- removed the 'lcd0_data' node as there was already a similar node, lcd_data24
(reported by: Jingoo Han <jg1.han(a)samsung.com>)
- replaced spaces with tabs in display-timing node
changes since v1:
- added new patch to add FIMD DT binding Documentation
- removed the patch enabling SAMSUNG_DEV_BACKLIGHT and SAMSUNG_DEV_PWM
for mach-exynos4 DT
- added 'status' property to fimd DT node
This series is based on kgene's "for-next" branch:
https://git.kernel.org/cgit/linux/kernel/git/kgene/linux-samsung.git/log/?h…
Vikas Sajjan (3):
ARM: dts: Add FIMD node to exynos4
ARM: dts: Add FIMD node and display timing node to
exynos4412-origen.dts
ARM: dts: Add FIMD DT binding Documentation
.../devicetree/bindings/video/samsung-fimd.txt | 65 ++++++++++++++++++++
arch/arm/boot/dts/exynos4.dtsi | 12 ++++
arch/arm/boot/dts/exynos4412-origen.dts | 21 +++++++
3 files changed, 98 insertions(+)
create mode 100644 Documentation/devicetree/bindings/video/samsung-fimd.txt
--
1.7.9.5