From: "Gautham R. Shenoy" <ego(a)linux.vnet.ibm.com>
On POWERNV platform, Pstates are 8-bit values. On POWER8 they are
negatively numbered while on POWER9 they are positively
numbered. Thus, on POWER9, the maximum number of pstates could be as
high as 256.
The current code interprets pstates as a signed 8-bit value. This
causes a problem on POWER9 platforms which have more than 128 pstates.
On such systems, on a CPU that is in a lower pstate whose number is
greater than 128, querying the current pstate returns a "pstate X is
out of bound" error message and the current pstate is reported as the
nominal pstate.
This patch fixes the aforementioned issue by correctly differentiating
the sign whenever a pstate value read, depending on whether the
pstates are positively numbered or negatively numbered.
Fixes: commit 09ca4c9b5958 ("cpufreq: powernv: Replacing pstate_id with frequency table index")
Cc: <stable(a)vger.kernel.org> #v4.8
Signed-off-by: Gautham R. Shenoy <ego(a)linux.vnet.ibm.com>
Tested-and-reviewed-by: Shilpasri G Bhat <shilpa.bhat(a)linux.vnet.ibm.com>
Acked-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
drivers/cpufreq/powernv-cpufreq.c | 43 ++++++++++++++++++++++++++++++---------
1 file changed, 33 insertions(+), 10 deletions(-)
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index b6d7c4c..bb7586e 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -41,11 +41,14 @@
#define POWERNV_MAX_PSTATES 256
#define PMSR_PSAFE_ENABLE (1UL << 30)
#define PMSR_SPR_EM_DISABLE (1UL << 31)
-#define PMSR_MAX(x) ((x >> 32) & 0xFF)
+#define EXTRACT_BYTE(x, shift) (((x) >> shift) & 0xFF)
+#define MAX_SHIFT 32
#define LPSTATE_SHIFT 48
#define GPSTATE_SHIFT 56
-#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF)
-#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF)
+#define GET_PMSR_MAX(x) EXTRACT_BYTE(x, MAX_SHIFT)
+#define GET_LPSTATE(x) EXTRACT_BYTE(x, LPSTATE_SHIFT)
+#define GET_GPSTATE(x) EXTRACT_BYTE(x, GPSTATE_SHIFT)
+
#define MAX_RAMP_DOWN_TIME 5120
/*
@@ -64,6 +67,12 @@
/* Interval after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL 2000
+/*
+ * On POWER8 the pstates are negatively numbered. On POWER9, they are
+ * positively numbered. Use this flag to track whether we have
+ * positive or negative numbered pstates.
+ */
+static bool pos_pstates;
/**
* struct global_pstate_info - Per policy data structure to maintain history of
@@ -164,7 +173,7 @@ static inline unsigned int pstate_to_idx(int pstate)
int min = powernv_freqs[powernv_pstate_info.min].driver_data;
int max = powernv_freqs[powernv_pstate_info.max].driver_data;
- if (min > 0) {
+ if (pos_pstates) {
if (unlikely((pstate < max) || (pstate > min))) {
pr_warn_once("pstate %d is out of bound\n", pstate);
return powernv_pstate_info.nominal;
@@ -301,6 +310,9 @@ static int init_powernv_pstates(void)
}
}
+ if ((int)pstate_min > 0)
+ pos_pstates = true;
+
/* End of list marker entry */
powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
return 0;
@@ -438,7 +450,6 @@ struct powernv_smp_call_data {
static void powernv_read_cpu_freq(void *arg)
{
unsigned long pmspr_val;
- s8 local_pstate_id;
struct powernv_smp_call_data *freq_data = arg;
pmspr_val = get_pmspr(SPRN_PMSR);
@@ -447,8 +458,11 @@ static void powernv_read_cpu_freq(void *arg)
* The local pstate id corresponds bits 48..55 in the PMSR.
* Note: Watch out for the sign!
*/
- local_pstate_id = (pmspr_val >> 48) & 0xFF;
- freq_data->pstate_id = local_pstate_id;
+ if (pos_pstates)
+ freq_data->pstate_id = (u8)GET_LPSTATE(pmspr_val);
+ else
+ freq_data->pstate_id = (s8)GET_LPSTATE(pmspr_val);
+
freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
@@ -522,7 +536,10 @@ static void powernv_cpufreq_throttle_check(void *data)
chip = this_cpu_read(chip_info);
/* Check for Pmax Capping */
- pmsr_pmax = (s8)PMSR_MAX(pmsr);
+ if (pos_pstates)
+ pmsr_pmax = (u8)GET_PMSR_MAX(pmsr);
+ else
+ pmsr_pmax = (s8)GET_PMSR_MAX(pmsr);
pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
if (pmsr_pmax_idx != powernv_pstate_info.max) {
if (chip->throttled)
@@ -645,8 +662,14 @@ void gpstate_timer_handler(struct timer_list *t)
* value. Hence, read from PMCR to get correct data.
*/
val = get_pmspr(SPRN_PMCR);
- freq_data.gpstate_id = (s8)GET_GPSTATE(val);
- freq_data.pstate_id = (s8)GET_LPSTATE(val);
+ if (pos_pstates) {
+ freq_data.gpstate_id = (u8)GET_GPSTATE(val);
+ freq_data.pstate_id = (u8)GET_LPSTATE(val);
+ } else {
+ freq_data.gpstate_id = (s8)GET_GPSTATE(val);
+ freq_data.pstate_id = (s8)GET_LPSTATE(val);
+ }
+
if (freq_data.gpstate_id == freq_data.pstate_id) {
reset_gpstates(policy);
spin_unlock(&gpstates->gpstate_lock);
--
1.8.3.1
From: Wanpeng Li <wanpeng.li(a)hotmail.com>
------------[ cut here ]------------
Bad FPU state detected at kvm_put_guest_fpu+0xd8/0x2d0 [kvm], reinitializing FPU registers.
WARNING: CPU: 1 PID: 4594 at arch/x86/mm/extable.c:103 ex_handler_fprestore+0x88/0x90
CPU: 1 PID: 4594 Comm: qemu-system-x86 Tainted: G B OE 4.15.0-rc2+ #10
RIP: 0010:ex_handler_fprestore+0x88/0x90
Call Trace:
fixup_exception+0x4e/0x60
do_general_protection+0xff/0x270
general_protection+0x22/0x30
RIP: 0010:kvm_put_guest_fpu+0xd8/0x2d0 [kvm]
RSP: 0018:ffff8803d5627810 EFLAGS: 00010246
kvm_vcpu_reset+0x3b4/0x3c0 [kvm]
kvm_apic_accept_events+0x1c0/0x240 [kvm]
kvm_arch_vcpu_ioctl_run+0x1658/0x2fb0 [kvm]
kvm_vcpu_ioctl+0x479/0x880 [kvm]
do_vfs_ioctl+0x142/0x9a0
SyS_ioctl+0x74/0x80
do_syscall_64+0x15f/0x600
This can be reproduced by running any testcase in kvm-unit-tests since
the qemu userspace FPU context is not initialized, which results in the
init path from kvm_apic_accept_events() will load/put qemu userspace
FPU context w/o initialized. In addition, w/o this splatting we still
should initialize vcpu->arch.user_fpu instead of current->thread.fpu.
This patch fixes it by initializing qemu user FPU context if it is
uninitialized before KVM_RUN.
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Radim Krčmář <rkrcmar(a)redhat.com>
Cc: Rik van Riel <riel(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: f775b13eedee (x86,kvm: move qemu/guest FPU switching out to vcpu_run)
Signed-off-by: Wanpeng Li <wanpeng.li(a)hotmail.com>
---
arch/x86/kvm/x86.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a92b22f..063a643 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7273,10 +7273,13 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- struct fpu *fpu = ¤t->thread.fpu;
+ struct fpu *fpu = &vcpu->arch.user_fpu;
int r;
- fpu__initialize(fpu);
+ if (!fpu->initialized) {
+ fpstate_init(&vcpu->arch.user_fpu.state);
+ fpu->initialized = 1;
+ }
kvm_sigset_activate(vcpu);
--
2.7.4