The sched_mc feature was originally designed to improve the power
consumption of multi-package systems, and several architecture
functions are available to tune the topology and the scheduler's
parameters when the scheduler rebuilds the sched_domain hierarchy
(i.e. when the sched_mc_power_savings level changes). These patches
improve the power consumption of dual and quad Cortex-A9 systems when
sched_mc_power_savings is set to 2. The policy of the following
patches is to accept up to 4 threads (configurable) in the run queue
of a core before starting to load balance while the cpu runs at low
frequencies, but to accept only 1 thread at high frequencies, which
is the normal behaviour. The goal is to use only one core in
light-load situations and all cores in heavy-load situations.
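
To make the intent concrete, here is a small standalone sketch of the
packing decision described above. All names in it (PACKING_THRESHOLD,
freq_is_low, should_load_balance) and the 600 MHz cut-off are
illustrative assumptions; the patches below obtain the same effect
indirectly, by scaling cpu_power with the current frequency rather
than by testing a task count.

/*
 * Standalone sketch (not kernel code) of the packing policy.
 */
#include <stdbool.h>
#include <stdio.h>

#define PACKING_THRESHOLD 4	/* accepted tasks per core at low freq */

struct core {
	unsigned int nr_running;	/* tasks in this core's run queue */
	unsigned int freq_khz;		/* current cpu frequency */
};

/* "low" frequency: below an assumed 600 MHz threshold */
static bool freq_is_low(const struct core *c)
{
	return c->freq_khz < 600000;
}

/* should the load balancer start spreading tasks off this core? */
static bool should_load_balance(const struct core *c)
{
	unsigned int limit = freq_is_low(c) ? PACKING_THRESHOLD : 1;

	return c->nr_running > limit;
}

int main(void)
{
	struct core slow = { .nr_running = 3, .freq_khz = 200000 };
	struct core fast = { .nr_running = 2, .freq_khz = 1000000 };

	printf("slow core balances: %d\n", should_load_balance(&slow)); /* 0 */
	printf("fast core balances: %d\n", should_load_balance(&fast)); /* 1 */
	return 0;
}
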
Patches [1-2] modify the ARM cpu topology according to the
sched_mc_power_savings value and the Cortex ID.
Patch [3] enables the ARCH_POWER feature of the scheduler.
Patch [4] adds the arch_scale_freq_power function for the ARM
platform.
Patches [5-6] modify the cpu_power of CA-9 according to the
sched_mc_power_savings level and the core frequency. The main goal is
to increase the capacity of a core when it runs at a low cpu
frequency in order to pull tasks onto this core (see the capacity
sketch after this list). Note that this behaviour is not really
advisable, but it can be seen as an intermediate step between the use
of cpu hotplug (which is not a power saving feature) and a new load
balancer which will take low-load situations on dual cores into
account.
Patch [7] ensures that cpu0 is used preferentially when only one CPU
is running.
Patch [8] adds a debugfs interface for test purposes.
Patch [9] ensures that the cpu_power is updated periodically.
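
The capacity sketch referenced above for patches [5-6]: the fair
scheduler derives a group capacity (roughly, the number of tasks a
group can hold before it is considered overloaded) from cpu_power,
approximately DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE). The
helpers below are illustrative only; the values mirror
table_ca9_power from patch [5], where 4096 is used below 600 MHz and
1024 above.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024

/* cpu_power as set by the CA-9 table: 4096 below 600 MHz, else 1024 */
static unsigned int ca9_cpu_power(unsigned int freq_khz)
{
	return freq_khz < 600000 ? 4096 : 1024;
}

/* tasks a core may hold before the group is seen as overloaded */
static unsigned int core_capacity(unsigned int freq_khz)
{
	return (ca9_cpu_power(freq_khz) + SCHED_POWER_SCALE / 2)
		/ SCHED_POWER_SCALE;
}

int main(void)
{
	printf("capacity at 200MHz: %u\n", core_capacity(200000));  /* 4 */
	printf("capacity at 1GHz:   %u\n", core_capacity(1000000)); /* 1 */
	return 0;
}
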
TODO list:
-remove the useless start of ilb when the core still has capacity.
-add a method (DT, sysfs, ...) to set the threshold for using 1 or all cpus on CA-9
v2:
*Modify the method used to update cpu_power
*There are fewer patches than in v1 because some issues have been
fixed by patches already pushed for 3.2.
*These patches have been tested on Snowball and vexpress boards.
*Performance results are similar to v1
v1: http://permalink.gmane.org/gmane.linux.linaro.devel/8087
Vincent
With a lot of small tasks, the softirq sched is nearly never called
when no_hz is enabled. The load_balance is then mainly called with
the newly_idle mode, which doesn't update the cpu_power.
Add a next_update field which ensures a maximum update period even
when there is only short activity: the period is the sched_domain's
balance_interval clamped to max_load_balance_interval (HZ/10 jiffies,
i.e. 100ms).
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
include/linux/sched.h | 1 +
kernel/sched_fair.c | 24 ++++++++++++++++--------
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237..8610921 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -901,6 +901,7 @@ struct sched_group_power {
* single CPU.
*/
unsigned int power, power_orig;
+ unsigned long next_update;
};
struct sched_group {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee99..320b7a0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -91,6 +91,8 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
static const struct sched_class fair_sched_class;
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
/**************************************************************
* CFS operations on generic schedulable entities:
*/
@@ -2667,6 +2669,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long power;
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+ sdg->sgp->next_update = jiffies + interval;
if (!child) {
update_cpu_power(sd, cpu);
@@ -2774,12 +2781,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
- if (idle != CPU_NEWLY_IDLE && local_group) {
- if (balance_cpu != this_cpu) {
- *balance = 0;
- return;
- }
- update_group_power(sd, this_cpu);
+ if (local_group) {
+ if (idle != CPU_NEWLY_IDLE) {
+ if (balance_cpu != this_cpu) {
+ *balance = 0;
+ return;
+ }
+ update_group_power(sd, this_cpu);
+ } else if (time_after_eq(jiffies, group->sgp->next_update))
+ update_group_power(sd, this_cpu);
}
/* Adjust by relative CPU power of the group */
@@ -3879,8 +3889,6 @@ void select_nohz_load_balancer(int stop_tick)
static DEFINE_SPINLOCK(balancing);
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
--
1.7.4.1
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
arch/arm/kernel/topology.c | 134 +++++++++++++++++++++++++++++++++++++++++---
1 files changed, 126 insertions(+), 8 deletions(-)
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 2774c5d..a1b1f7f 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -21,6 +21,10 @@
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#ifdef CONFIG_CPU_FREQ
+#include <linux/cpufreq.h>
+#endif
+
#include <asm/cputype.h>
#include <asm/topology.h>
@@ -54,6 +58,7 @@ struct cputopo_arm cpu_topology[NR_CPUS];
 * uses its own cpu_power, even if that is not always true because
 * of no_hz_idle_balance
*/
+
static DEFINE_PER_CPU(unsigned int, cpu_scale);
/*
@@ -65,17 +70,127 @@ unsigned int advanced_topology = 1;
static void normal_cpu_topology_mask(void);
static void (*set_cpu_topology_mask)(void) = normal_cpu_topology_mask;
-/* This table sets the cpu_power scale of a cpu according to the sched_mc mode.
- * The content of this table could be SoC specific so we should add a method to
- * overwrite this default table.
+#ifdef CONFIG_CPU_FREQ
+/*
+ * This struct describes parameters to compute cpu_power
+ */
+struct cputopo_power {
+ int id;
+ int max; /* max idx in the table */
+ unsigned int step; /* frequency step for the table */
+ unsigned int *table; /* table of cpu_power */
+};
+
+/* default table with one default cpu_power value */
+unsigned int table_default_power[1] = {
+ 1024
+};
+
+static struct cputopo_power default_cpu_power = {
+ .max = 1,
+ .step = 1,
+ .table = table_default_power,
+};
+
+/* CA-9 table with cpufreq modifying cpu_power */
+#define CPU_MAX_FREQ 10
+/* we use a 200MHz step for scaling cpu power */
+#define CPU_TOPO_FREQ_STEP 200000
+/* This table sets the cpu_power scale of a cpu according to two inputs: the
+ * frequency and the sched_mc mode. The content of this table could be SoC
+ * specific, so we should add a method to override this default table.
* TODO: Study how to use DT for setting this table
*/
+unsigned int table_ca9_power[CPU_MAX_FREQ] = {
+/* freq< 200 400 600 800 1000 1200 1400 1600 1800 other*/
+ 4096, 4096, 4096, 1024, 1024, 1024, 1024, 1024, 1024, 1024, /* Power save mode CA9 MP */
+};
+
+static struct cputopo_power CA9_cpu_power = {
+ .max = CPU_MAX_FREQ,
+ .step = CPU_TOPO_FREQ_STEP,
+ .table = table_ca9_power,
+};
+
#define ARM_CORTEX_A9_DEFAULT_SCALE 0
#define ARM_CORTEX_A9_POWER_SCALE 1
/* This table lists all possible cpu power configurations */
-unsigned int table_config[2] = {
+struct cputopo_power *table_config[2] = {
+ &default_cpu_power,
+ &CA9_cpu_power,
+};
+
+struct cputopo_scale {
+ int id;
+ int freq;
+ struct cputopo_power *power;
+};
+
+/*
+ * The table will mostly be used by one cpu, which will update the
+ * configuration for all cpus on a cpufreq notification
+ * or a sched_mc level change
+ */
+static struct cputopo_scale cpu_power[NR_CPUS];
+
+static void set_cpufreq_scale(unsigned int cpuid, unsigned int freq)
+{
+ unsigned int idx;
+
+ cpu_power[cpuid].freq = freq;
+
+ idx = freq / cpu_power[cpuid].power->step;
+ if (idx >= cpu_power[cpuid].power->max)
+ idx = cpu_power[cpuid].power->max - 1;
+
+ per_cpu(cpu_scale, cpuid) = cpu_power[cpuid].power->table[idx];
+ smp_wmb();
+}
+
+static void set_power_scale(unsigned int cpu, unsigned int idx)
+{
+ cpu_power[cpu].id = idx;
+ cpu_power[cpu].power = table_config[idx];
+
+ set_cpufreq_scale(cpu, cpu_power[cpu].freq);
+}
+
+static int topo_cpufreq_transition(struct notifier_block *nb,
+ unsigned long state, void *data)
+{
+ struct cpufreq_freqs *freqs = data;
+
+ if (state == CPUFREQ_POSTCHANGE || state == CPUFREQ_RESUMECHANGE)
+ set_cpufreq_scale(freqs->cpu, freqs->new);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block topo_cpufreq_nb = {
+ .notifier_call = topo_cpufreq_transition,
+};
+
+static int topo_cpufreq_init(void)
+{
+ unsigned int cpu;
+
+ /* TODO set initial value according to current freq */
+
+ /* init core mask */
+ for_each_possible_cpu(cpu) {
+ cpu_power[cpu].freq = 0;
+ cpu_power[cpu].power = &default_cpu_power;
+ }
+
+ return cpufreq_register_notifier(&topo_cpufreq_nb,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+#else
+#define ARM_CORTEX_A9_DEFAULT_SCALE 0
+#define ARM_CORTEX_A9_POWER_SCALE 0
+/* This table lists all possible cpu power configurations */
+unsigned int table_config[1] = {
1024,
- 4096
};
static void set_power_scale(unsigned int cpu, unsigned int idx)
@@ -83,14 +198,17 @@ static void set_power_scale(unsigned int cpu, unsigned int idx)
per_cpu(cpu_scale, cpu) = table_config[idx];
}
+static inline int topo_cpufreq_init(void) { return 0; }
+#endif
+
static int init_cpu_power_scale(void)
{
+ /* register cpufreq notifier */
+ topo_cpufreq_init();
+
/* Do we need to change default config */
advanced_topology = 1;
- /* force topology update */
- arch_update_cpu_topology();
-
/* Force a cpu topology update */
rebuild_sched_domains();
--
1.7.4.1
Add an architecture-specific function for setting cpu_power.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
arch/arm/kernel/topology.c | 22 ++++++++++++++++++++++
1 files changed, 22 insertions(+), 0 deletions(-)
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index af1c3e6..9d80e22 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -45,12 +45,32 @@
struct cputopo_arm cpu_topology[NR_CPUS];
/*
+ * cpu power scale management
+ */
+
+/*
+ * a per-cpu data structure is preferable because each cpu mainly
+ * uses its own cpu_power, even if that is not always true because
+ * of no_hz_idle_balance
+ */
+static DEFINE_PER_CPU(unsigned int, cpu_scale);
+
+/*
* cpu topology mask management
*/
unsigned int advanced_topology = 1;
/*
+ * Update the cpu power
+ */
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(cpu_scale, cpu);
+}
+
+/*
* default topology function
*/
@@ -281,6 +301,8 @@ void init_cpu_topology(void)
cpu_topo->socket_id = -1;
cpumask_clear(&cpu_topo->core_sibling);
cpumask_clear(&cpu_topo->thread_sibling);
+
+ per_cpu(cpu_scale, cpu) = SCHED_POWER_SCALE;
}
smp_wmb();
}
--
1.7.4.1
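
For reference, a standalone simulation of how the 3.1-era
update_cpu_power() in kernel/sched_fair.c combines the hook above
with the base scale once the ARCH_POWER feature from patch [3] is
enabled. The kernel version also folds in the time consumed by RT
tasks via scale_rt_power(), omitted here; names loosely mirror the
kernel, but this is a sketch, not the upstream code.

#include <stdio.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

/* stand-in for the per-cpu cpu_scale set by the ARM hook above */
static unsigned long cpu_scale[2] = { 4096, 1024 };

static unsigned long arch_scale_freq_power(int cpu)
{
	return cpu_scale[cpu];
}

/* simplified update_cpu_power(): base scale times the arch hook */
static unsigned long update_cpu_power(int cpu)
{
	unsigned long power = SCHED_POWER_SCALE;

	power *= arch_scale_freq_power(cpu);	/* ARCH_POWER enabled */
	power >>= SCHED_POWER_SHIFT;

	if (!power)
		power = 1;
	return power;
}

int main(void)
{
	printf("cpu0 power: %lu\n", update_cpu_power(0)); /* 4096 */
	printf("cpu1 power: %lu\n", update_cpu_power(1)); /* 1024 */
	return 0;
}
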