BZ: 1768622 Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=28587593 commit 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5
Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") changed kvmclock to use tkr_raw instead of tkr_mono. However, the default kvmclock_offset for the VM was still based on the monotonic clock and, if the raw clock drifted enough from the monotonic clock, this could cause a negative system_time to be written to the guest's struct pvclock. RHEL5 does not like it and (if it boots fast enough to observe a negative time value) it hangs.
There is another thing to be careful about: getboottime64 returns the host boot time with tkr_mono frequency, and subtracting the tkr_raw-based kvmclock value will cause the wallclock to be off if tkr_raw drifts from tkr_mono. To avoid this, compute the wallclock delta from the current time instead of being clever and using getboottime64.
Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov vkuznets@redhat.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Index: kernel-rhel/arch/x86/kvm/x86.c =================================================================== --- kernel-rhel.orig/arch/x86/kvm/x86.c +++ kernel-rhel/arch/x86/kvm/x86.c @@ -1595,6 +1595,18 @@ static void update_pvclock_gtod(struct t
write_seqcount_end(&vdata->seq); } + +static s64 get_kvmclock_base_ns(void) +{ + /* Count up from boot time, but with the frequency of the raw clock. */ + return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); +} +#else +static s64 get_kvmclock_base_ns(void) +{ + /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ + return ktime_get_boottime_ns(); +} #endif
void kvm_set_pending_timer(struct kvm_vcpu *vcpu) @@ -1608,7 +1620,7 @@ static void kvm_write_wall_clock(struct int version; int r; struct pvclock_wall_clock wc; - struct timespec64 boot; + u64 wall_nsec;
if (!wall_clock) return; @@ -1628,17 +1640,12 @@ static void kvm_write_wall_clock(struct /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. + * wall clock specified here. We do the reverse here. */ - getboottime64(&boot); + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; + wc.nsec = do_div(wall_nsec, 1000000000); + wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -1886,7 +1893,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) { @@ -2224,7 +2231,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return get_kvmclock_base_ns() + ka->kvmclock_offset; }
hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2240,7 +2247,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2339,7 +2346,7 @@ static int kvm_guest_time_update(struct } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = get_kvmclock_base_ns(); }
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2379,6 +2386,7 @@ static int kvm_guest_time_update(struct vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; + WARN_ON(vcpu->hv_clock.system_time < 0);
/* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -9486,7 +9494,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
Hi Marcelo,
I'm marking this as superseded by pwid 303977
http://patchwork.lab.bos.redhat.com/patch/303977/
which was included in kernel-4.18.0-196.el8
commit 9751522d92195bc64883c71e2bee8ed0fcbc5007 Author: Vitaly Kuznetsov vkuznets@redhat.com Date: Mon Apr 20 15:20:04 2020 -0400
[x86] kvm: x86: use raw clock values consistently
Message-id: 20200420152004.933168-1-vkuznets@redhat.com Patchwork-id: 303977 Patchwork-instance: patchwork O-Subject: [RHEL8.3 virt PATCH v2 312/614] KVM: x86: use raw clock values consistently Bugzilla: 1813987 RH-Acked-by: Andrew Jones drjones@redhat.com RH-Acked-by: Paolo Bonzini pbonzini@redhat.com RH-Acked-by: Tony Camuso tcamuso@redhat.com
Note there is a difference between yours and Vitaly's patch in the following hunk
+@@ -1656,6 +1656,18 @@ static void update_pvclock_gtod(struct timekeeper *tk)
write_seqcount_end(&vdata->seq); } @@ -14,12 +16,12 @@ +static s64 get_kvmclock_base_ns(void) +{ + /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ -+ return ktime_get_boottime_ns(); ++ return ktime_get_boot_ns(); +} #endif
This is in !CONFIG_X86_64 path, so I'm not sure how much we care about this. Anyway Vitaly's patch corrects this, because rhel does not have upstream commit 9285ec4c8b61d4930a575081abeba2cd4f449a74
Thank you
On Mon, May 18, 2020 at 04:15:19PM -0300, Marcelo Tosatti wrote:
BZ: 1768622 Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=28587593 commit 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5
Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") changed kvmclock to use tkr_raw instead of tkr_mono. However, the default kvmclock_offset for the VM was still based on the monotonic clock and, if the raw clock drifted enough from the monotonic clock, this could cause a negative system_time to be written to the guest's struct pvclock. RHEL5 does not like it and (if it boots fast enough to observe a negative time value) it hangs.
There is another thing to be careful about: getboottime64 returns the host boot time with tkr_mono frequency, and subtracting the tkr_raw-based kvmclock value will cause the wallclock to be off if tkr_raw drifts from tkr_mono. To avoid this, compute the wallclock delta from the current time instead of being clever and using getboottime64.
Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov vkuznets@redhat.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Index: kernel-rhel/arch/x86/kvm/x86.c
--- kernel-rhel.orig/arch/x86/kvm/x86.c +++ kernel-rhel/arch/x86/kvm/x86.c @@ -1595,6 +1595,18 @@ static void update_pvclock_gtod(struct t write_seqcount_end(&vdata->seq); }
+static s64 get_kvmclock_base_ns(void) +{
- /* Count up from boot time, but with the frequency of the raw clock. */
- return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+} +#else +static s64 get_kvmclock_base_ns(void) +{
- /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
- return ktime_get_boottime_ns();
+} #endif void kvm_set_pending_timer(struct kvm_vcpu *vcpu) @@ -1608,7 +1620,7 @@ static void kvm_write_wall_clock(struct int version; int r; struct pvclock_wall_clock wc;
- struct timespec64 boot;
- u64 wall_nsec;
if (!wall_clock) return; @@ -1628,17 +1640,12 @@ static void kvm_write_wall_clock(struct /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/* wall clock specified here. We do the reverse here.
- getboottime64(&boot);
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) {
struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
boot = timespec64_sub(boot, ts);
- }
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
- wc.nsec = boot.tv_nsec;
- wc.nsec = do_div(wall_nsec, 1000000000);
- wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -1886,7 +1893,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boot_ns();
- ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) { @@ -2224,7 +2231,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock);
return ktime_get_boot_ns() + ka->kvmclock_offset;
}return get_kvmclock_base_ns() + ka->kvmclock_offset;
hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2240,7 +2247,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else
ret = ktime_get_boot_ns() + ka->kvmclock_offset;
ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu(); @@ -2339,7 +2346,7 @@ static int kvm_guest_time_update(struct } if (!use_master_clock) { host_tsc = rdtsc();
kernel_ns = ktime_get_boot_ns();
}kernel_ns = get_kvmclock_base_ns();
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2379,6 +2386,7 @@ static int kvm_guest_time_update(struct vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp;
- WARN_ON(vcpu->hv_clock.system_time < 0);
/* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -9486,7 +9494,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
- kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
On Tue, May 19, 2020 at 10:58:52AM +0200, Frantisek Hrbata wrote:
Hi Marcelo,
I'm marking this as superseded by pwid 303977
http://patchwork.lab.bos.redhat.com/patch/303977/
which was included in kernel-4.18.0-196.el8
commit 9751522d92195bc64883c71e2bee8ed0fcbc5007 Author: Vitaly Kuznetsov <vkuznets@redhat.com> Date: Mon Apr 20 15:20:04 2020 -0400 [x86] kvm: x86: use raw clock values consistently Message-id: <20200420152004.933168-1-vkuznets@redhat.com> Patchwork-id: 303977 Patchwork-instance: patchwork O-Subject: [RHEL8.3 virt PATCH v2 312/614] KVM: x86: use raw clock values consistently Bugzilla: 1813987 RH-Acked-by: Andrew Jones <drjones@redhat.com> RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com> RH-Acked-by: Tony Camuso <tcamuso@redhat.com>
Note there is a difference between yours and Vitaly's patch in the following hunk
+@@ -1656,6 +1656,18 @@ static void update_pvclock_gtod(struct timekeeper *tk)
write_seqcount_end(&vdata->seq);
} @@ -14,12 +16,12 @@ +static s64 get_kvmclock_base_ns(void) +{
/* Master clock not used, so we can just use CLOCK_BOOTTIME. */
-+ return ktime_get_boottime_ns(); ++ return ktime_get_boot_ns(); +} #endif
This is in !CONFIG_X86_64 path, so I'm not sure how much we care about this. Anyway Vitaly's patch corrects this, because rhel does not have upstream commit 9285ec4c8b61d4930a575081abeba2cd4f449a74
Gotta love private git repo discussions on a public mailing list :)
greg k-h
linux-stable-mirror@lists.linaro.org