Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") changed kvmclock to use tkr_raw instead of tkr_mono. However, the default kvmclock_offset for the VM was still based on the monotonic clock and, if the raw clock drifted enough from the monotonic clock, this could cause a negative system_time to be written to the guest's struct pvclock. RHEL5 does not like it and (if it boots fast enough to observe a negative time value) it hangs.
There is another thing to be careful about: getboottime64 returns the host boot time in tkr_mono units, and subtracting tkr_raw units will cause the wallclock to be off if tkr_raw drifts from tkr_mono. To avoid this, compute the wallclock delta from the current time instead of being clever and using getboottime64.
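To illustrate the first problem with a minimal sketch (the numbers below are hypothetical, chosen only to show the sign of the result): kvmclock_offset is computed from the boot-time clock (tkr_mono based) when the VM is created, while kvmclock reads now use the raw clock, so any accumulated tkr_raw/tkr_mono drift goes straight into system_time:

	/* Illustration only -- hypothetical numbers, not from a real trace. */
	static s64 example_negative_system_time(void)
	{
		s64 kvmclock_offset = -1000000000;	/* -ktime_get_boottime_ns() at VM creation */
		s64 kernel_ns = 999999000;		/* raw-clock read: tkr_raw lags tkr_mono by 1 us */

		/* What kvm_guest_time_update() would put in the guest's pvclock. */
		return kernel_ns + kvmclock_offset;	/* -1000, i.e. negative system_time */
	}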
Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock")
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b4273cce63c..b5e0648580e1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1577,6 +1577,18 @@ static void update_pvclock_gtod(struct timekeeper *tk)
 
 	write_seqcount_end(&vdata->seq);
 }
+
+static s64 get_kvmclock_base_ns(void)
+{
+	/* Count up from boot time, but with the frequency of the raw clock.  */
+	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
+	return ktime_get_boottime_ns();
+}
 #endif
 
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
@@ -1590,7 +1602,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 	int version;
 	int r;
 	struct pvclock_wall_clock wc;
-	struct timespec64 boot;
+	u64 wall_nsec;
 
 	if (!wall_clock)
 		return;
@@ -1610,17 +1622,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 	/*
 	 * The guest calculates current wall clock time by adding
 	 * system time (updated by kvm_guest_time_update below) to the
-	 * wall clock specified here.  guest system time equals host
-	 * system time for us, thus we must fill in host boot time here.
+	 * wall clock specified here.  We do the reverse here.
 	 */
-	getboottime64(&boot);
+	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
 
-	if (kvm->arch.kvmclock_offset) {
-		struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
-		boot = timespec64_sub(boot, ts);
-	}
-	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
-	wc.nsec = boot.tv_nsec;
+	wc.nsec = do_div(wall_nsec, 1000000000);
+	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
 	wc.version = version;
 
 	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -1868,7 +1875,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
-	ns = ktime_get_boottime_ns();
+	ns = get_kvmclock_base_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
@@ -2206,7 +2213,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	spin_lock(&ka->pvclock_gtod_sync_lock);
 	if (!ka->use_master_clock) {
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
-		return ktime_get_boottime_ns() + ka->kvmclock_offset;
+		return get_kvmclock_base_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2222,7 +2229,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 				   &hv_clock.tsc_to_system_mul);
 		ret = __pvclock_read_cycles(&hv_clock, rdtsc());
 	} else
-		ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
+		ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
 
 	put_cpu();
 
@@ -2321,7 +2328,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
-		kernel_ns = ktime_get_boottime_ns();
+		kernel_ns = get_kvmclock_base_ns();
 	}
 
 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -2361,6 +2368,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
+	WARN_ON(vcpu->hv_clock.system_time < 0);
 
 	/* If the host uses TSC clocksource, then it is stable */
 	pvclock_flags = 0;
@@ -9473,7 +9481,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
-	kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
+	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
 	pvclock_update_vm_gtod_copy(kvm);
 
 	kvm->arch.guest_can_read_msr_platform_info = true;
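One detail worth calling out for readers who don't know do_div(): the kernel macro divides its u64 first argument in place, leaving the quotient there and returning the remainder, which is why wc.nsec takes the return value and wc.sec the truncated quotient. A user-space sketch of the same split (the input value is hypothetical):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t wall_nsec = 1579787000123456789ull;	/* hypothetical wall time in ns */

		/* Equivalent of: wc.nsec = do_div(wall_nsec, 1000000000); */
		uint32_t nsec = (uint32_t)(wall_nsec % 1000000000ull);
		wall_nsec /= 1000000000ull;			/* do_div leaves the quotient behind */

		/* Equivalent of: wc.sec = (u32)wall_nsec; -- overflows in 2106 */
		uint32_t sec = (uint32_t)wall_nsec;

		printf("sec=%u nsec=%u\n", sec, nsec);		/* sec=1579787000 nsec=123456789 */
		return 0;
	}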
Paolo Bonzini <pbonzini@redhat.com> writes:
> @@ -1577,6 +1577,18 @@ static void update_pvclock_gtod(struct timekeeper *tk)
>  	write_seqcount_end(&vdata->seq);
>  }
> +
> +static s64 get_kvmclock_base_ns(void)
> +{
> +	/* Count up from boot time, but with the frequency of the raw clock.  */
> +	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
> +}
> +#else
> +static s64 get_kvmclock_base_ns(void)
> +{
> +	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
> +	return ktime_get_boottime_ns();
> +}
>  #endif
But we could've still used the RAW+offs_boot version, right? And this is just to basically preserve the existing behavior on !x86_64.
> @@ -1610,17 +1622,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
>  	/*
>  	 * The guest calculates current wall clock time by adding
>  	 * system time (updated by kvm_guest_time_update below) to the
> -	 * wall clock specified here.  guest system time equals host
> -	 * system time for us, thus we must fill in host boot time here.
> +	 * wall clock specified here.  We do the reverse here.
>  	 */
> -	getboottime64(&boot);
> +	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
There are not that many hosts with more than 50 years of uptime, and likely none of them running Linux with live kernel patching support, so I bet no one will ever see this overflow. Still, since wall_nsec is u64 and we're dealing with kvmclock here, I'd suggest adding a WARN_ON().
This looks correct to me, but kvmclock is a glorious beast, so take this with a grain of salt :)
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
On 23/01/20 14:43, Vitaly Kuznetsov wrote:
>> +static s64 get_kvmclock_base_ns(void)
>> +{
>> +	/* Count up from boot time, but with the frequency of the raw clock.  */
>> +	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
>> +}
>> +#else
>> +static s64 get_kvmclock_base_ns(void)
>> +{
>> +	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
>> +	return ktime_get_boottime_ns();
>> +}
>>  #endif
> 
> But we could've still used the RAW+offs_boot version, right? And this
> is just to basically preserve the existing behavior on !x86_64.
Yes, there's no reason to restrict the pvclock_gtod notifier to x86_64. But this is stable material so I kept it easy.
>> -	getboottime64(&boot);
>> +	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
> 
> There are not that many hosts with more than 50 years of uptime, and
> likely none of them running Linux with live kernel patching support,
> so I bet no one will ever see this overflow.  Still, since wall_nsec
> is u64 and we're dealing with kvmclock here, I'd suggest adding a
> WARN_ON().
You're off by a factor of 10: 2^64 nanoseconds is about 584 years (584*365*10^9*86400 ≈ 2^64). :)
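Spelled out (rounded):

	2^64 ns ≈ 1.8447e19 ns
	        ≈ 1.8447e10 s		(÷ 10^9)
	        ≈ 2.135e5 days		(÷ 86400)
	        ≈ 584.9 years		(÷ 365)

so a u64 nanosecond count only wraps after roughly 585 years of host uptime.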
Paolo