Nikolas Wipper nikwip@amazon.de writes:
Introduce a suspension state for Hyper-V enlightened vCPUs. Microsoft's "Hypervisor Top Level Functional Specification" (TLFS) introduces this state as a "vCPU that is stopped on a instruction guest boundary, either explicitly or implicitly due to an intercept". The only documented explicit suspension is caused in response to a TLB Flush hypercall, which will use the state switching API in subsequent patches.
Each Hyper-V vCPU now has a 'suspended' boolean field, which is checked before entering the guest. When set, it forces the vCPU to block. Once in that state, the vCPU ignores any events. The vCPU is unsuspended by clearing 'suspend' and issuing a request to force the vCPU out of sleep.
Suspensions are issued as a mechanism to halt execution until state change is observed on a remote vCPU. Hyper-V vCPUs track this with 'waiting_on', which holds the 'vcpu_id' of the remote vCPU that forced the vCPU to enter the suspended state. It's the remote vCPU's responsibility to wake up the suspended vCPUs when ready. 'waiting_on' ensures the remote vCPU can selectively unsuspend vCPUs that blocked on its behalf while leaving other suspended vCPUs undisturbed. One vCPU can only be suspended due to a single remote vCPU, but different vCPUs can be suspended on behalf of different or the same remote vCPU(s). The guest is responsible for avoiding circular dependencies between suspended vCPUs.
Callers of the suspend API are responsible for ensuring that suspend and unsuspend aren't called in parallel while targeting the same pair of vCPUs. Otherwise kvm_hv_vcpu_{un}suspend_tlb_flush() ensure 'waiting_on' and 'suspended' are updated and accessed in the correct order. This, for example, avoids races where the unsuspended vCPU re-suspends before kvm_hv_vcpu_unsuspend_tlb_flush() is done updating 'waiting_on'.
Signed-off-by: Nikolas Wipper nikwip@amazon.de
arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/hyperv.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 17 +++++++++++++++++ arch/x86/kvm/x86.c | 4 +++- 4 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 46e0a466d7fb..7571ac578884 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -695,6 +695,9 @@ struct kvm_vcpu_hv { u64 vm_id; u32 vp_id; } nested;
- bool suspended;
- int waiting_on;
I don't quite understand why we need 'suspended' at all. Isn't it always suspended when 'waiting_on != -1'? I can see we always update these two in pair.
Also, I would suggest we use a more descriptive name. 'waiting_on_vcpu_id', for example.
}; struct kvm_hypervisor_cpuid { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4f0a94346d00..6e7941ed25ae 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -971,6 +971,7 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.hyperv = hv_vcpu; hv_vcpu->vcpu = vcpu;
- hv_vcpu->waiting_on = -1;
synic_init(&hv_vcpu->synic); @@ -2915,3 +2916,32 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, return 0; }
+void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)
Can we make parameter's name 'waiting_on_vcpu_id' as well? Because as-is I'm getting confused which CPU of these two is actually getting suspended)
Also, why do we need '_tlb_flush' in the name? The mechanism seems to be fairly generic, it's just that we use it for TLB flushes.
+{
- /* waiting_on's store should happen before suspended's */
- WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
- WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
+}
+void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu)
And here someone may expect this means 'unsuspend vcpu' but in reality this means 'unsuspend all vCPUs which are waiting on 'vcpu'). I guess we need a rename. How about
void kvm_hv_unsuspend_vcpus(struct kvm_vcpu *waiting_on_vcpu)
?
+{
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
- struct kvm_vcpu_hv *vcpu_hv;
- struct kvm_vcpu *v;
- unsigned long i;
- kvm_for_each_vcpu(i, v, vcpu->kvm) {
vcpu_hv = to_hv_vcpu(v);
if (kvm_hv_vcpu_suspended(v) &&
READ_ONCE(vcpu_hv->waiting_on) == vcpu->vcpu_id) {
/* waiting_on's store should happen before suspended's */
WRITE_ONCE(v->arch.hyperv->waiting_on, -1);
WRITE_ONCE(v->arch.hyperv->suspended, false);
__set_bit(i, vcpu_mask);
}
- }
- kvm_make_vcpus_request_mask(vcpu->kvm, KVM_REQ_EVENT, vcpu_mask);
+} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 913bfc96959c..a55832cea221 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -265,6 +265,15 @@ static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu) +{
- return vcpu->arch.hyperv_enabled &&
READ_ONCE(vcpu->arch.hyperv->suspended);
I don't think READ_ONCE() means anything here, does it?
+}
+void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id); +void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu); #else /* CONFIG_KVM_HYPERV */ static inline void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) {} @@ -321,6 +330,14 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu) return vcpu->vcpu_idx; } static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {}
+static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu) +{
- return false;
+}
+static inline void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id) {} +static inline void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu) {} #endif /* CONFIG_KVM_HYPERV */ #endif /* __ARCH_X86_KVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15080385b8fe..18d0a300e79a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -135,6 +135,8 @@ static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); @@ -11107,7 +11109,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
!vcpu->arch.apf.halted && !kvm_hv_vcpu_suspended(vcpu));
} static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)