There are currently two possible causes of VMRUN failures:
1) Consistency checks failures. In this case, KVM updates the exit code in the mapped VMCB12 and exits early in nested_svm_vmrun(). This causes a few problems:
A) KVM does not clear the GIF if the early consistency checks fail (because nested_svm_vmexit() is not called). Nothing requires GIF=0 before a VMRUN, from the APM:
It is assumed that VMM software cleared GIF some time before executing the VMRUN instruction, to ensure an atomic state switch.
So an early #VMEXIT from early consistency checks could leave the GIF set.
B) svm_leave_smm() is missing consistency checks on the newly loaded guest state, because the checks aren't performed by enter_svm_guest_mode().
2) Faiure to load L2's CR3 or merge the MSR bitmaps. In this case, a fully-fledged #VMEXIT injection is performed as VMCB02 is already prepared.
Arguably all VMRUN failures should be handled before the VMCB02 is prepared, but with proper cleanup (e.g. clear the GIF). Move all the potential failure checks inside enter_svm_guest_mode() before switching to VMCB02. On failure of any of these checks, nested_svm_vmrun() synthesizes a minimal #VMEXIT through the new nested_svm_failed_vmrun() helper.
__nested_svm_vmexit() already performs the necessary cleanup for a failed VMRUN, including uninitializing the nested MMU and reloading L1's CR3. This ensures that consistency check failures do proper necessary cleanup, while other failures do not doo too much cleanup. It also leaves a unified path for handling VMRUN failures.
Cc: stable@vger.kernel.org Fixes: 52c65a30a5c6 ("KVM: SVM: Check for nested vmrun intercept before emulating vmrun") Suggested-by: Sean Christopherson seanjc@google.com Signed-off-by: Yosry Ahmed yosry.ahmed@linux.dev --- arch/x86/kvm/svm/nested.c | 58 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 26 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1356bd6383ca..632e941febaf 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -889,22 +889,19 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, vmcb12->control.intercepts[INTERCEPT_WORD4], vmcb12->control.intercepts[INTERCEPT_WORD5]);
- svm->nested.vmcb12_gpa = vmcb12_gpa;
WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
enter_guest_mode(vcpu);
+ if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) + return -EINVAL; + if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu);
- nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); - - svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); - nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) @@ -916,6 +913,17 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, return ret; }
+ /* + * Any VMRUN failure needs to happen before this point, such that the + * nested #VMEXIT is injected properly by nested_svm_failed_vmrun(). + */ + + nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); + nested_vmcb02_prepare_save(svm, vmcb12); + if (!from_vmrun) kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
@@ -957,6 +965,17 @@ static void __nested_svm_vmexit(struct vcpu_svm *svm) kvm_queue_exception(vcpu, DB_VECTOR); }
+static void nested_svm_failed_vmrun(struct vcpu_svm *svm, struct vmcb *vmcb12) +{ + WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr); + + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = -1u; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; + __nested_svm_vmexit(svm); +} + int nested_svm_vmrun(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -999,15 +1018,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { - vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; - vmcb12->control.exit_info_1 = 0; - vmcb12->control.exit_info_2 = 0; - goto out; - } - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. @@ -1028,15 +1038,9 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) svm->nmi_l1_to_l2 = false; svm->soft_int_injected = false;
- svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); + nested_svm_failed_vmrun(svm, vmcb12); }
-out: kvm_vcpu_unmap(vcpu, &map);
return ret; @@ -1172,6 +1176,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm)
kvm_nested_vmexit_handle_ibrs(vcpu);
+ /* VMRUN failures before switching to VMCB02 are handled by nested_svm_failed_vmrun() */ + WARN_ON_ONCE(svm->vmcb != svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->vmcb01);
/* @@ -1859,9 +1865,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu);
- svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); - /* * While the nested guest CR3 is already checked and set by * KVM_SET_SREGS, it was set when nested state was yet loaded, @@ -1874,6 +1877,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(ret)) goto out_free;
+ svm_switch_vmcb(svm, &svm->nested.vmcb02); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + svm->nested.force_msr_bitmap_recalc = true;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);