The set value of `fast_switch_enabled` indicates that fast_switch callback is set. For some drivers such as amd_pstate and intel_pstate, the adjust_perf callback is used but it still sets `fast_switch_possible` flag. This is because this flag also decides whether schedutil governor selects adjust_perf function for frequency update. This condition in the schedutil governor forces the scaling driver to set the `fast_switch_possible` flag.
Remove `fast_switch_enabled` check when schedutil decides to select adjust_perf function for frequency update. Thus removing this drivers are now free to remove `fast_switch_possible` flag if they don't use fast_switch callback.
This issue becomes apparent when aperf/mperf overflow occurs. When this happens, kernel disables frequency invariance calculation which causes schedutil to fallback to sugov_update_single_freq which currently relies on the fast_switch callback.
Normal flow: sugov_update_single_perf cpufreq_driver_adjust_perf cpufreq_driver->adjust_perf
Error case flow: sugov_update_single_perf sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow cpufreq_driver_fast_switch cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
This change fixes this NULL pointer dereference issue.
Fixes: a61dec744745 ("cpufreq: schedutil: Avoid missing updates for one-CPU policies") Signed-off-by: Wyes Karny wyes.karny@amd.com
Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: stable@vger.kernel.org --- drivers/cpufreq/amd-pstate.c | 10 +++++++--- drivers/cpufreq/cpufreq.c | 20 +++++++++++++++++++- drivers/cpufreq/intel_pstate.c | 3 +-- include/linux/cpufreq.h | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5a3d4aa0f45a..007bfe724a6a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -671,8 +671,14 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq;
+ /** + * For shared memory system frequency update takes time that's why + * do this in deferred kthread context. + */ if (boot_cpu_has(X86_FEATURE_CPPC)) - policy->fast_switch_possible = true; + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + else + current_pstate_driver->adjust_perf = NULL;
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, policy->cpuinfo.min_freq); @@ -697,8 +703,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata;
amd_pstate_boost_init(cpudata); - if (!current_pstate_driver->adjust_perf) - current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
return 0;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6b52ebe5a890..366747012104 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy) if (!policy->fast_switch_possible) return;
+ /** + * It's not expected driver's fast_switch callback is not set + * even fast_switch_possible is true. + */ + if (WARN_ON(!cpufreq_driver_has_fast_switch())) + return; + mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count >= 0) { cpufreq_fast_switch_count++; @@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
+/** + * cpufreq_driver_has_fast_switch - Check "fast switch" callback. + * + * Return 'true' if the ->fast_switch callback is present for the + * current driver or 'false' otherwise. + */ +bool cpufreq_driver_has_fast_switch(void) +{ + return !!cpufreq_driver->fast_switch; +} + /** * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. * @cpu: Target CPU. @@ -2157,7 +2175,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); * and it is expected to select a suitable performance level equal to or above * @min_perf and preferably equal to or below @target_perf. * - * This function must not be called if policy->fast_switch_enabled is unset. + * By default this function takes the fast frequency update path. * * Governors calling this function must guarantee that it will never be invoked * twice in parallel for the same CPU and that it will never be called in diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
- policy->fast_switch_possible = true; - return 0; }
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
+ policy->fast_switch_possible = true; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 26e2eb399484..7a32cfca26c9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -604,6 +604,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +bool cpufreq_driver_has_fast_switch(void); void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..f993ecf731a9 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -776,7 +776,7 @@ static int sugov_start(struct cpufreq_policy *policy)
if (policy_is_shared(policy)) uu = sugov_update_shared; - else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + else if (cpufreq_driver_has_adjust_perf()) uu = sugov_update_single_perf; else uu = sugov_update_single_freq;
On Tue, May 9, 2023 at 8:06 PM Wyes Karny wyes.karny@amd.com wrote:
The set value of `fast_switch_enabled` indicates that fast_switch callback is set. For some drivers such as amd_pstate and intel_pstate, the adjust_perf callback is used but it still sets `fast_switch_possible` flag. This is because this flag also decides whether schedutil governor selects adjust_perf function for frequency update. This condition in the schedutil governor forces the scaling driver to set the `fast_switch_possible` flag.
Remove `fast_switch_enabled` check when schedutil decides to select adjust_perf function for frequency update. Thus removing this drivers are now free to remove `fast_switch_possible` flag if they don't use fast_switch callback.
This issue becomes apparent when aperf/mperf overflow occurs. When this happens, kernel disables frequency invariance calculation which causes schedutil to fallback to sugov_update_single_freq which currently relies on the fast_switch callback.
Normal flow: sugov_update_single_perf cpufreq_driver_adjust_perf cpufreq_driver->adjust_perf
Error case flow: sugov_update_single_perf sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow cpufreq_driver_fast_switch cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
So you need to set fast_switch.
Please read the comment in sugov_update_single_perf(). It explains why adjust_perf is not used when scale invariance is not enabled: the mapping between the performance levels and frequency are not generally defined in that case and it is up to the driver to figure out what perf level to use to get the given frequency. And this is exactly why fast_switch is not optional: because scale invariance may be disabled.
Please feel free to update the documentation to clarify this, but the way to fix the issue is to implement fast_switch in the driver.
Fixes: a61dec744745 ("cpufreq: schedutil: Avoid missing updates for one-CPU policies") Signed-off-by: Wyes Karny wyes.karny@amd.com
Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: stable@vger.kernel.org
drivers/cpufreq/amd-pstate.c | 10 +++++++--- drivers/cpufreq/cpufreq.c | 20 +++++++++++++++++++- drivers/cpufreq/intel_pstate.c | 3 +-- include/linux/cpufreq.h | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5a3d4aa0f45a..007bfe724a6a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -671,8 +671,14 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq;
/**
* For shared memory system frequency update takes time that's why
* do this in deferred kthread context.
*/ if (boot_cpu_has(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
else
current_pstate_driver->adjust_perf = NULL; ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, policy->cpuinfo.min_freq);
@@ -697,8 +703,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata;
amd_pstate_boost_init(cpudata);
if (!current_pstate_driver->adjust_perf)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6b52ebe5a890..366747012104 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy) if (!policy->fast_switch_possible) return;
/**
* It's not expected driver's fast_switch callback is not set
* even fast_switch_possible is true.
*/
if (WARN_ON(!cpufreq_driver_has_fast_switch()))
return;
mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count >= 0) { cpufreq_fast_switch_count++;
@@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
+/**
- cpufreq_driver_has_fast_switch - Check "fast switch" callback.
- Return 'true' if the ->fast_switch callback is present for the
- current driver or 'false' otherwise.
- */
+bool cpufreq_driver_has_fast_switch(void) +{
return !!cpufreq_driver->fast_switch;
+}
/**
- cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- @cpu: Target CPU.
@@ -2157,7 +2175,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
- and it is expected to select a suitable performance level equal to or above
- @min_perf and preferably equal to or below @target_perf.
- This function must not be called if policy->fast_switch_enabled is unset.
- By default this function takes the fast frequency update path.
- Governors calling this function must guarantee that it will never be invoked
- twice in parallel for the same CPU and that it will never be called in
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
policy->fast_switch_possible = true;
return 0;
}
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
policy->fast_switch_possible = true; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 26e2eb399484..7a32cfca26c9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -604,6 +604,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +bool cpufreq_driver_has_fast_switch(void); void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..f993ecf731a9 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -776,7 +776,7 @@ static int sugov_start(struct cpufreq_policy *policy)
if (policy_is_shared(policy)) uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
else if (cpufreq_driver_has_adjust_perf()) uu = sugov_update_single_perf; else uu = sugov_update_single_freq;
-- 2.34.1
On Tue, May 9, 2023 at 8:18 PM Rafael J. Wysocki rafael@kernel.org wrote:
On Tue, May 9, 2023 at 8:06 PM Wyes Karny wyes.karny@amd.com wrote:
The set value of `fast_switch_enabled` indicates that fast_switch callback is set.
BTW, this is not true. The ACPI cpufreq driver has fast_switch, but it doesn't always use it.
For some drivers such as amd_pstate and intel_pstate, the adjust_perf callback is used but it still sets `fast_switch_possible` flag. This is because this flag also decides whether schedutil governor selects adjust_perf function for frequency update. This condition in the schedutil governor forces the scaling driver to set the `fast_switch_possible` flag.
Yes, it does, but setting this flag is not sufficient.
Remove `fast_switch_enabled` check when schedutil decides to select adjust_perf function for frequency update. Thus removing this drivers are now free to remove `fast_switch_possible` flag if they don't use fast_switch callback.
This issue becomes apparent when aperf/mperf overflow occurs. When this happens, kernel disables frequency invariance calculation which causes schedutil to fallback to sugov_update_single_freq which currently relies on the fast_switch callback.
Normal flow: sugov_update_single_perf cpufreq_driver_adjust_perf cpufreq_driver->adjust_perf
Error case flow: sugov_update_single_perf sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow cpufreq_driver_fast_switch cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
So you need to set fast_switch.
Please read the comment in sugov_update_single_perf(). It explains why adjust_perf is not used when scale invariance is not enabled: the mapping between the performance levels and frequency are not generally defined in that case and it is up to the driver to figure out what perf level to use to get the given frequency. And this is exactly why fast_switch is not optional: because scale invariance may be disabled.
Please feel free to update the documentation to clarify this, but the way to fix the issue is to implement fast_switch in the driver.
Fixes: a61dec744745 ("cpufreq: schedutil: Avoid missing updates for one-CPU policies") Signed-off-by: Wyes Karny wyes.karny@amd.com
Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: stable@vger.kernel.org
drivers/cpufreq/amd-pstate.c | 10 +++++++--- drivers/cpufreq/cpufreq.c | 20 +++++++++++++++++++- drivers/cpufreq/intel_pstate.c | 3 +-- include/linux/cpufreq.h | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5a3d4aa0f45a..007bfe724a6a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -671,8 +671,14 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq;
/**
* For shared memory system frequency update takes time that's why
* do this in deferred kthread context.
*/ if (boot_cpu_has(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
else
current_pstate_driver->adjust_perf = NULL; ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, policy->cpuinfo.min_freq);
@@ -697,8 +703,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata;
amd_pstate_boost_init(cpudata);
if (!current_pstate_driver->adjust_perf)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6b52ebe5a890..366747012104 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy) if (!policy->fast_switch_possible) return;
/**
* It's not expected driver's fast_switch callback is not set
* even fast_switch_possible is true.
*/
if (WARN_ON(!cpufreq_driver_has_fast_switch()))
return;
mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count >= 0) { cpufreq_fast_switch_count++;
@@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
+/**
- cpufreq_driver_has_fast_switch - Check "fast switch" callback.
- Return 'true' if the ->fast_switch callback is present for the
- current driver or 'false' otherwise.
- */
+bool cpufreq_driver_has_fast_switch(void) +{
return !!cpufreq_driver->fast_switch;
+}
/**
- cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- @cpu: Target CPU.
@@ -2157,7 +2175,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
- and it is expected to select a suitable performance level equal to or above
- @min_perf and preferably equal to or below @target_perf.
- This function must not be called if policy->fast_switch_enabled is unset.
- By default this function takes the fast frequency update path.
- Governors calling this function must guarantee that it will never be invoked
- twice in parallel for the same CPU and that it will never be called in
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
policy->fast_switch_possible = true;
return 0;
}
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
policy->fast_switch_possible = true;
I'm not sure what this is about. Is it a cleanup of intel_pstate?
policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 26e2eb399484..7a32cfca26c9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -604,6 +604,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +bool cpufreq_driver_has_fast_switch(void); void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..f993ecf731a9 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -776,7 +776,7 @@ static int sugov_start(struct cpufreq_policy *policy)
if (policy_is_shared(policy)) uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
else if (cpufreq_driver_has_adjust_perf()) uu = sugov_update_single_perf; else uu = sugov_update_single_freq;
--
Hi Rafael,
Thanks for reviewing the patch.
On 09 May 20:39, Rafael J. Wysocki wrote: ------------------------------------------>8--------------------------------------
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
policy->fast_switch_possible = true;
return 0;
}
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
policy->fast_switch_possible = true;
I'm not sure what this is about. Is it a cleanup of intel_pstate?
This patch intends to remove fast_switch_possible flag dependency from drivers which only use adjust_perf as frequency/pref update callback. As intel_pstate and amd_pstate driver has only adjust_perf and not fast_switch, therefore I'm removing that flag from these drivers. But intel_cpufreq has fast_switch therefore, only adding that flag for intel_cpufreq driver.
Thanks & Regards, Wyes
policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 26e2eb399484..7a32cfca26c9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -604,6 +604,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +bool cpufreq_driver_has_fast_switch(void); void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..f993ecf731a9 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -776,7 +776,7 @@ static int sugov_start(struct cpufreq_policy *policy)
if (policy_is_shared(policy)) uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
else if (cpufreq_driver_has_adjust_perf()) uu = sugov_update_single_perf; else uu = sugov_update_single_freq;
--
On Wed, May 10, 2023 at 7:43 AM Wyes Karny wyes.karny@amd.com wrote:
Hi Rafael,
Thanks for reviewing the patch.
On 09 May 20:39, Rafael J. Wysocki wrote: ------------------------------------------>8--------------------------------------
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
policy->fast_switch_possible = true;
return 0;
}
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
policy->fast_switch_possible = true;
I'm not sure what this is about. Is it a cleanup of intel_pstate?
This patch intends to remove fast_switch_possible flag dependency from drivers which only use adjust_perf as frequency/pref update callback. As intel_pstate and amd_pstate driver has only adjust_perf and not fast_switch, therefore I'm removing that flag from these drivers. But intel_cpufreq has fast_switch therefore, only adding that flag for intel_cpufreq driver.
But is it really better to change it? It works correctly as-is AFAICS.
In any case, the intel_pstate change should be a separate patch, because it is not directly related to the other changes in the $subject one IMV.
Hi Rafael,
On 09 May 20:18, Rafael J. Wysocki wrote:
On Tue, May 9, 2023 at 8:06 PM Wyes Karny wyes.karny@amd.com wrote:
The set value of `fast_switch_enabled` indicates that fast_switch callback is set. For some drivers such as amd_pstate and intel_pstate, the adjust_perf callback is used but it still sets `fast_switch_possible` flag. This is because this flag also decides whether schedutil governor selects adjust_perf function for frequency update. This condition in the schedutil governor forces the scaling driver to set the `fast_switch_possible` flag.
Remove `fast_switch_enabled` check when schedutil decides to select adjust_perf function for frequency update. Thus removing this drivers are now free to remove `fast_switch_possible` flag if they don't use fast_switch callback.
This issue becomes apparent when aperf/mperf overflow occurs. When this happens, kernel disables frequency invariance calculation which causes schedutil to fallback to sugov_update_single_freq which currently relies on the fast_switch callback.
Normal flow: sugov_update_single_perf cpufreq_driver_adjust_perf cpufreq_driver->adjust_perf
Error case flow: sugov_update_single_perf sugov_update_single_freq <-- This is chosen because the freq invariant is disabled due to aperf/mperf overflow cpufreq_driver_fast_switch cpufreq_driver->fast_switch <-- Here NULL pointer dereference is happening, because fast_switch is not set
So you need to set fast_switch.
Please read the comment in sugov_update_single_perf(). It explains why adjust_perf is not used when scale invariance is not enabled: the mapping between the performance levels and frequency are not generally defined in that case and it is up to the driver to figure out what perf level to use to get the given frequency. And this is exactly why fast_switch is not optional: because scale invariance may be disabled.
Please feel free to update the documentation to clarify this, but the way to fix the issue is to implement fast_switch in the driver.
Thanks for clarifying that fast_swich is not optional as frequency invariance could be disabled in runtime and this would cause to select fast_switch. I'll make those changes.
Thanks, Wyes
Fixes: a61dec744745 ("cpufreq: schedutil: Avoid missing updates for one-CPU policies") Signed-off-by: Wyes Karny wyes.karny@amd.com
Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: stable@vger.kernel.org
drivers/cpufreq/amd-pstate.c | 10 +++++++--- drivers/cpufreq/cpufreq.c | 20 +++++++++++++++++++- drivers/cpufreq/intel_pstate.c | 3 +-- include/linux/cpufreq.h | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- 5 files changed, 29 insertions(+), 7 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5a3d4aa0f45a..007bfe724a6a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -671,8 +671,14 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq;
/**
* For shared memory system frequency update takes time that's why
* do this in deferred kthread context.
*/ if (boot_cpu_has(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
else
current_pstate_driver->adjust_perf = NULL; ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], FREQ_QOS_MIN, policy->cpuinfo.min_freq);
@@ -697,8 +703,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata;
amd_pstate_boost_init(cpudata);
if (!current_pstate_driver->adjust_perf)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; return 0;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6b52ebe5a890..366747012104 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -501,6 +501,13 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy) if (!policy->fast_switch_possible) return;
/**
* It's not expected driver's fast_switch callback is not set
* even fast_switch_possible is true.
*/
if (WARN_ON(!cpufreq_driver_has_fast_switch()))
return;
mutex_lock(&cpufreq_fast_switch_lock); if (cpufreq_fast_switch_count >= 0) { cpufreq_fast_switch_count++;
@@ -2143,6 +2150,17 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
+/**
- cpufreq_driver_has_fast_switch - Check "fast switch" callback.
- Return 'true' if the ->fast_switch callback is present for the
- current driver or 'false' otherwise.
- */
+bool cpufreq_driver_has_fast_switch(void) +{
return !!cpufreq_driver->fast_switch;
+}
/**
- cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- @cpu: Target CPU.
@@ -2157,7 +2175,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
- and it is expected to select a suitable performance level equal to or above
- @min_perf and preferably equal to or below @target_perf.
- This function must not be called if policy->fast_switch_enabled is unset.
- By default this function takes the fast frequency update path.
- Governors calling this function must guarantee that it will never be invoked
- twice in parallel for the same CPU and that it will never be called in
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2548ec92faa2..007893514c87 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2698,8 +2698,6 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
intel_pstate_init_acpi_perf_limits(policy);
policy->fast_switch_possible = true;
return 0;
}
@@ -2955,6 +2953,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (ret) return ret;
policy->fast_switch_possible = true; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 26e2eb399484..7a32cfca26c9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -604,6 +604,7 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +bool cpufreq_driver_has_fast_switch(void); void cpufreq_driver_adjust_perf(unsigned int cpu, unsigned long min_perf, unsigned long target_perf, diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..f993ecf731a9 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -776,7 +776,7 @@ static int sugov_start(struct cpufreq_policy *policy)
if (policy_is_shared(policy)) uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
else if (cpufreq_driver_has_adjust_perf()) uu = sugov_update_single_perf; else uu = sugov_update_single_freq;
-- 2.34.1
linux-stable-mirror@lists.linaro.org