For compatibility with userspace the MSR_AMD64_TSC_RATIO can be set even when the feature is not exposed to the guest, which breaks assumptions that it has the default value in this case.
Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: stable@vger.kernel.org
Reported-by: Dāvis Mosāns davispuh@gmail.com Signed-off-by: Maxim Levitsky mlevitsk@redhat.com --- arch/x86/kvm/svm/nested.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc..a2e2436057dc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -574,14 +574,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( vcpu->arch.l1_tsc_offset, svm->nested.ctl.tsc_offset, - svm->tsc_ratio_msr); + svm_get_l2_tsc_multiplier(vcpu));
svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); - }
svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -867,8 +865,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); }
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -1264,6 +1261,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 975be872cd1a..5cf6bb5bfd3e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -911,11 +911,12 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) return svm->nested.ctl.tsc_offset; }
-static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
- return svm->tsc_ratio_msr; + return svm->tsc_scaling_enabled ? svm->tsc_ratio_msr : + kvm_default_tsc_scaling_ratio; }
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 852b12aee03d..006571dd30df 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -542,6 +542,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control);
On 2/21/22 11:33, Maxim Levitsky wrote:
For compatibility with userspace the MSR_AMD64_TSC_RATIO can be set even when the feature is not exposed to the guest, which breaks assumptions that it has the default value in this case.
Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: stable@vger.kernel.org
Reported-by: Dāvis Mosāns davispuh@gmail.com Signed-off-by: Maxim Levitsky mlevitsk@redhat.com
It's not clear how QEMU ends up writing MSR_AMD64_TSC_RATIO_DEFAULT rather than 0, but we clearly have a bug in KVM. It should not allow writing 0 in the first place if tsc-ratio is not available in the VM.
If QEMU really can get itself in this situation, we cannot fix this except with KVM_CAP_DISABLE_QUIRKS (a quirk that would accept and ignore host-initiated writes if the CPUID bit is not available) or perhaps with a pr_warn_ratelimited and a quick deprecation cycle, until some time after 6.2.1 is released.
Hmmm... maybe the issue is actually that KVM *returns* 0 from KVM_GET_MSRS? And in this case, fixing KVM would also prevent QEMU from sending the bogus KVM_SET_MSRS?
Thanks,
Paolo
arch/x86/kvm/svm/nested.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc..a2e2436057dc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -574,14 +574,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( vcpu->arch.l1_tsc_offset, svm->nested.ctl.tsc_offset,
svm->tsc_ratio_msr);
svm_get_l2_tsc_multiplier(vcpu));
svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
- if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu);
- }
svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -867,8 +865,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); }
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
- if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); }
@@ -1264,6 +1261,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON_ONCE(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 975be872cd1a..5cf6bb5bfd3e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -911,11 +911,12 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) return svm->nested.ctl.tsc_offset; } -static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
- return svm->tsc_ratio_msr;
- return svm->tsc_scaling_enabled ? svm->tsc_ratio_msr :
}kvm_default_tsc_scaling_ratio;
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 852b12aee03d..006571dd30df 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -542,6 +542,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control);
On Mon, 2022-02-21 at 13:33 +0100, Paolo Bonzini wrote:
On 2/21/22 11:33, Maxim Levitsky wrote:
For compatibility with userspace the MSR_AMD64_TSC_RATIO can be set even when the feature is not exposed to the guest, which breaks assumptions that it has the default value in this case.
Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: stable@vger.kernel.org
Reported-by: Dāvis Mosāns davispuh@gmail.com Signed-off-by: Maxim Levitsky mlevitsk@redhat.com
It's not clear how QEMU ends up writing MSR_AMD64_TSC_RATIO_DEFAULT rather than 0, but we clearly have a bug in KVM. It should not allow writing 0 in the first place if tsc-ratio is not available in the VM.
Qemu currently (the code is very new so it can be changed) writes the initial value of 0 to this msr if tsc scaling is disabled in the guest, or MSR_AMD64_TSC_RATIO_DEFAULT if the tsc scaling is enabled.
The guest can change it only when TSC scaling is enabled for it. If tsc scaling is not enabled, both guest reads or writes of this MSR get #GP.
I only allowed qemu writes of this msr because I thought that qemu might first set the MSR and then set guest CPUID.
Also, for example the MSR_IA32_XSS uses the same logic in KVM.
As for why qemu sets this msr regardless of guest CPUID bit, it seemed to be cleaner this way - kvm_put_msrs in qemu seems not to check guest CPUID but rather only check that KVM supports this msr, which will be true regardless of guest's CPUID bit.
What do you think?
Best regards, Maxim Levitsky
If QEMU really can get itself in this situation, we cannot fix this except with KVM_CAP_DISABLE_QUIRKS (a quirk that would accept and ignore host-initiated writes if the CPUID bit is not available) or perhaps with a pr_warn_ratelimited and a quick deprecation cycle, until some time after 6.2.1 is released.
Hmmm... maybe the issue is actually that KVM *returns* 0 from KVM_GET_MSRS? And in this case, fixing KVM would also prevent QEMU from sending the bogus KVM_SET_MSRS?
Thanks,
Paolo
arch/x86/kvm/svm/nested.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 5 +++-- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc..a2e2436057dc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -574,14 +574,12 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( vcpu->arch.l1_tsc_offset, svm->nested.ctl.tsc_offset,
svm->tsc_ratio_msr);
svm_get_l2_tsc_multiplier(vcpu));
svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
- if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu);
- }
svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -867,8 +865,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); }
- if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
- if (svm_get_l2_tsc_multiplier(vcpu) != kvm_default_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); }
@@ -1264,6 +1261,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON_ONCE(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 975be872cd1a..5cf6bb5bfd3e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -911,11 +911,12 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) return svm->nested.ctl.tsc_offset; } -static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu);
- return svm->tsc_ratio_msr;
- return svm->tsc_scaling_enabled ? svm->tsc_ratio_msr :
}kvm_default_tsc_scaling_ratio;
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 852b12aee03d..006571dd30df 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -542,6 +542,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control);
On 2/21/22 14:17, Maxim Levitsky wrote:
It's not clear how QEMU ends up writing MSR_AMD64_TSC_RATIO_DEFAULT rather than 0, but we clearly have a bug in KVM. It should not allow writing 0 in the first place if tsc-ratio is not available in the VM.
Qemu currently (the code is very new so it can be changed) writes the initial value of 0 to this msr if tsc scaling is disabled in the guest, or MSR_AMD64_TSC_RATIO_DEFAULT if the tsc scaling is enabled.
It's released in 6.2, though, right?
The guest can change it only when TSC scaling is enabled for it. If tsc scaling is not enabled, both guest reads or writes of this MSR get #GP. I only allowed qemu writes of this msr because I thought that qemu might first set the MSR and then set guest CPUID.
Also, for example the MSR_IA32_XSS uses the same logic in KVM.
As for why qemu sets this msr regardless of guest CPUID bit, it seemed to be cleaner this way - kvm_put_msrs in qemu seems not to check guest CPUID but rather only check that KVM supports this msr, which will be true regardless of guest's CPUID bit.
Yes, I agree it's cleaner in QEMU and consistent with others in KVM. However the value that is written should be the default one (which is usually zero, but not always and not in this case).
Paolo
linux-stable-mirror@lists.linaro.org