Commit f94c8d11699759 ("sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface") broke Xen guest time handling across migration:
[ 187.249951] Freezing user space processes ... (elapsed 0.001 seconds) done. [ 187.251137] OOM killer disabled. [ 187.251137] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done. [ 187.252299] suspending xenstore... [ 187.266987] xen:grant_table: Grant tables using version 1 layout [18446743811.706476] OOM killer enabled. [18446743811.706478] Restarting tasks ... done. [18446743811.720505] Setting capacity to 16777216
Fix that by setting xen_sched_clock_offset at resume time to ensure a monotonic clock value.
Fixes: f94c8d11699759 ("sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface") Cc: stable@vger.kernel.org # 4.11 Reported-by: Hans van Kranenburg hans@knorrie.org Signed-off-by: Juergen Gross jgross@suse.com --- arch/x86/xen/suspend.c | 4 ++++ arch/x86/xen/time.c | 11 +++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 17 insertions(+)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761b..45fc9caf3880 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -67,6 +67,8 @@ void xen_arch_resume(void) { int cpu;
+ xen_clocksource_resume(); + on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
for_each_online_cpu(cpu) @@ -81,4 +83,6 @@ void xen_arch_suspend(void) xen_pmu_finish(cpu);
on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); + + xen_clocksource_suspend(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 72bf446c3fee..117ce958ffe6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -32,6 +32,7 @@ #define TIMER_SLOP 100000
static u64 xen_sched_clock_offset __read_mostly; +static u64 xen_clock_value_saved;
/* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) @@ -54,6 +55,16 @@ static u64 xen_clocksource_read(void) return ret; }
+void xen_clocksource_suspend(void) +{ + xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset; +} + +void xen_clocksource_resume(void) +{ + xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved; +} + static u64 xen_clocksource_get_cycles(struct clocksource *cs) { return xen_clocksource_read(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0e60bd918695..a17d3bdab6b8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -63,6 +63,8 @@ void __init xen_build_dynamic_phys_to_machine(void); void __init xen_vmalloc_p2m_tree(void);
void xen_init_irq_ops(void); +void xen_clocksource_suspend(void); +void xen_clocksource_resume(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu);
On 10/01/2019 16:34, Boris Ostrovsky wrote:
On 1/10/19 5:07 AM, Juergen Gross wrote:
+void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
xen_clock_value_saved = xen_sched_clock() maybe?
I wanted xen_clocksource_suspend() and xen_clocksource_resume() to be symmetrical to each other.
In case you are feeling strong about that, I'm not. :-) So in case you insist on it I can change it. Or you can do so while committing.
Juergen
On 1/10/19 11:14 AM, Juergen Gross wrote:
On 10/01/2019 16:34, Boris Ostrovsky wrote:
On 1/10/19 5:07 AM, Juergen Gross wrote:
+void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
xen_clock_value_saved = xen_sched_clock() maybe?
I wanted xen_clocksource_suspend() and xen_clocksource_resume() to be symmetrical to each other.
OK.
Reviewed-by: Boris Ostrovsky boris.ostrovsky@oracle.com
In case you are feeling strong about that, I'm not. :-) So in case you insist on it I can change it. Or you can do so while committing.
On 1/10/19 12:17 PM, Boris Ostrovsky wrote:
On 1/10/19 11:14 AM, Juergen Gross wrote:
On 10/01/2019 16:34, Boris Ostrovsky wrote:
On 1/10/19 5:07 AM, Juergen Gross wrote:
+void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
xen_clock_value_saved = xen_sched_clock() maybe?
I wanted xen_clocksource_suspend() and xen_clocksource_resume() to be symmetrical to each other.
OK.
Reviewed-by: Boris Ostrovsky boris.ostrovsky@oracle.com
In case you are feeling strong about that, I'm not. :-) So in case you insist on it I can change it. Or you can do so while committing.
I did some basic testing and noticed this (at loglevel=8):
[ 64.336488] Freezing user space processes ... (elapsed 0.001 seconds) done. [ 64.337805] OOM killer disabled. [ 64.337814] Freezing remaining freezable tasks ... (elapsed 0.000 seconds) done. [ 64.339066] suspending xenstore... [ 85.888340] xen:grant_table: Grant tables using version 1 layout [ 64.359729] OOM killer enabled. [ 64.359736] Restarting tasks ... done.
Which made me think that perhaps we should do suspend/restore of the clocksource as close as possible to HYPERVISOR_suspend() call, e.g. in xen_arch_pre_suspend()/xen_arch_post_suspend():
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45fc9caf3880..80ecba3fcc8c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -22,6 +22,7 @@ static DEFINE_PER_CPU(u64, spec_ctrl); void xen_arch_pre_suspend(void) { + xen_clocksource_suspend(); xen_save_time_memory_area(); if (xen_pv_domain()) @@ -36,6 +37,7 @@ void xen_arch_post_suspend(int cancelled) xen_hvm_post_suspend(cancelled); xen_restore_time_memory_area(); + xen_clocksource_resume(); } static void xen_vcpu_notify_restore(void *data)
This still has a window of incorrect clock value (you can see it, for example, when xen_hvm_post_suspend() does pr_info("Xen HVM callback vector for event delivery is enabled\n")), but it's smaller than before. In particular, we will make time right before dpm_resume_start() call.
-boris
On 11/01/2019 00:10, Boris Ostrovsky wrote:
On 1/10/19 12:17 PM, Boris Ostrovsky wrote:
On 1/10/19 11:14 AM, Juergen Gross wrote:
On 10/01/2019 16:34, Boris Ostrovsky wrote:
On 1/10/19 5:07 AM, Juergen Gross wrote:
+void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
xen_clock_value_saved = xen_sched_clock() maybe?
I wanted xen_clocksource_suspend() and xen_clocksource_resume() to be symmetrical to each other.
OK.
Reviewed-by: Boris Ostrovsky boris.ostrovsky@oracle.com
In case you are feeling strong about that, I'm not. :-) So in case you insist on it I can change it. Or you can do so while committing.
I did some basic testing and noticed this (at loglevel=8):
[ 64.336488] Freezing user space processes ... (elapsed 0.001 seconds) done. [ 64.337805] OOM killer disabled. [ 64.337814] Freezing remaining freezable tasks ... (elapsed 0.000 seconds) done. [ 64.339066] suspending xenstore... [ 85.888340] xen:grant_table: Grant tables using version 1 layout [ 64.359729] OOM killer enabled. [ 64.359736] Restarting tasks ... done.
Which made me think that perhaps we should do suspend/restore of the clocksource as close as possible to HYPERVISOR_suspend() call, e.g. in xen_arch_pre_suspend()/xen_arch_post_suspend():
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45fc9caf3880..80ecba3fcc8c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -22,6 +22,7 @@ static DEFINE_PER_CPU(u64, spec_ctrl); void xen_arch_pre_suspend(void) { + xen_clocksource_suspend(); xen_save_time_memory_area(); if (xen_pv_domain()) @@ -36,6 +37,7 @@ void xen_arch_post_suspend(int cancelled) xen_hvm_post_suspend(cancelled); xen_restore_time_memory_area(); + xen_clocksource_resume(); } static void xen_vcpu_notify_restore(void *data)
This still has a window of incorrect clock value (you can see it, for example, when xen_hvm_post_suspend() does pr_info("Xen HVM callback vector for event delivery is enabled\n")), but it's smaller than before. In particular, we will make time right before dpm_resume_start() call.
You are right, this is better.
Sending out V2 soon.
Juergen
On 11/01/2019 08:15, Juergen Gross wrote:
On 11/01/2019 00:10, Boris Ostrovsky wrote:
On 1/10/19 12:17 PM, Boris Ostrovsky wrote:
On 1/10/19 11:14 AM, Juergen Gross wrote:
On 10/01/2019 16:34, Boris Ostrovsky wrote:
On 1/10/19 5:07 AM, Juergen Gross wrote:
+void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
xen_clock_value_saved = xen_sched_clock() maybe?
I wanted xen_clocksource_suspend() and xen_clocksource_resume() to be symmetrical to each other.
OK.
Reviewed-by: Boris Ostrovsky boris.ostrovsky@oracle.com
In case you are feeling strong about that, I'm not. :-) So in case you insist on it I can change it. Or you can do so while committing.
I did some basic testing and noticed this (at loglevel=8):
[ 64.336488] Freezing user space processes ... (elapsed 0.001 seconds) done. [ 64.337805] OOM killer disabled. [ 64.337814] Freezing remaining freezable tasks ... (elapsed 0.000 seconds) done. [ 64.339066] suspending xenstore... [ 85.888340] xen:grant_table: Grant tables using version 1 layout [ 64.359729] OOM killer enabled. [ 64.359736] Restarting tasks ... done.
Which made me think that perhaps we should do suspend/restore of the clocksource as close as possible to HYPERVISOR_suspend() call, e.g. in xen_arch_pre_suspend()/xen_arch_post_suspend():
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45fc9caf3880..80ecba3fcc8c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -22,6 +22,7 @@ static DEFINE_PER_CPU(u64, spec_ctrl); void xen_arch_pre_suspend(void) { + xen_clocksource_suspend(); xen_save_time_memory_area(); if (xen_pv_domain()) @@ -36,6 +37,7 @@ void xen_arch_post_suspend(int cancelled) xen_hvm_post_suspend(cancelled); xen_restore_time_memory_area(); + xen_clocksource_resume(); } static void xen_vcpu_notify_restore(void *data)
This still has a window of incorrect clock value (you can see it, for example, when xen_hvm_post_suspend() does pr_info("Xen HVM callback vector for event delivery is enabled\n")), but it's smaller than before. In particular, we will make time right before dpm_resume_start() call.
You are right, this is better.
In fact, I can just handle this entirely inside arch/x86/xen/time.c by moving the required statements into xen_save_time_memory_area() and xen_restore_time_memory_area().
Juergen
On 1/10/19 11:07 AM, Juergen Gross wrote:
Commit f94c8d11699759 ("sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface") broke Xen guest time handling across migration:
[ 187.249951] Freezing user space processes ... (elapsed 0.001 seconds) done. [ 187.251137] OOM killer disabled. [ 187.251137] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done. [ 187.252299] suspending xenstore... [ 187.266987] xen:grant_table: Grant tables using version 1 layout [18446743811.706476] OOM killer enabled. [18446743811.706478] Restarting tasks ... done. [18446743811.720505] Setting capacity to 16777216
I tried this on top of 4.19.14 (together with "x86/mm: Fix guard hole handling", which still doesn't seem to be in 4.19) and I can confirm the fix:
Using a PV domU,
4.19 without patch:
[ 646.199018] Freezing user space processes ... (elapsed 0.002 seconds) done. [ 646.201305] OOM killer disabled. [ 646.201311] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done. [ 646.202699] suspending xenstore... [ 646.203005] xen:grant_table: Grant tables using version 1 layout [18446200797.089367] OOM killer enabled. [18446200797.089382] Restarting tasks ... done.
4.19 with patch, doing live migration movement between exactly the same set of physical servers:
[ 74.878062] Freezing user space processes ... (elapsed 0.002 seconds) done. [ 74.880308] OOM killer disabled. [ 74.880314] Freezing remaining freezable tasks ... (elapsed 0.000 seconds) done. [ 74.881681] suspending xenstore... [ 74.887497] xen:grant_table: Grant tables using version 1 layout [ 74.942011] OOM killer enabled. [ 74.942025] Restarting tasks ... done. [ 74.947688] Setting capacity to 6291456 [ 74.950833] Setting capacity to 10485760
Tested-by: Hans van Kranenburg hans@knorrie.org
Thanks!
Fix that by setting xen_sched_clock_offset at resume time to ensure a monotonic clock value.
Fixes: f94c8d11699759 ("sched/clock, x86/tsc: Rework the x86 'unstable' sched_clock() interface") Cc: stable@vger.kernel.org # 4.11 Reported-by: Hans van Kranenburg hans@knorrie.org Signed-off-by: Juergen Gross jgross@suse.com
arch/x86/xen/suspend.c | 4 ++++ arch/x86/xen/time.c | 11 +++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 17 insertions(+)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761b..45fc9caf3880 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -67,6 +67,8 @@ void xen_arch_resume(void) { int cpu;
+ xen_clocksource_resume();
+
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
for_each_online_cpu(cpu) @@ -81,4 +83,6 @@ void xen_arch_suspend(void) xen_pmu_finish(cpu); on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
+
+ xen_clocksource_suspend();
} diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 72bf446c3fee..117ce958ffe6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -32,6 +32,7 @@ #define TIMER_SLOP 100000 static u64 xen_sched_clock_offset __read_mostly; +static u64 xen_clock_value_saved; /* Get the TSC speed from Xen */ static unsigned long xen_tsc_khz(void) @@ -54,6 +55,16 @@ static u64 xen_clocksource_read(void) return ret; } +void xen_clocksource_suspend(void) +{
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
+}
+void xen_clocksource_resume(void) +{
+ xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
+}
static u64 xen_clocksource_get_cycles(struct clocksource *cs) { return xen_clocksource_read(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0e60bd918695..a17d3bdab6b8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -63,6 +63,8 @@ void __init xen_build_dynamic_phys_to_machine(void); void __init xen_vmalloc_p2m_tree(void); void xen_init_irq_ops(void); +void xen_clocksource_suspend(void); +void xen_clocksource_resume(void); void xen_setup_timer(int cpu); void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu);
linux-stable-mirror@lists.linaro.org