From: Frederic Weisbecker frederic@kernel.org
[ Upstream commit a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd ]
tick: Detect and fix jiffies update stall
In some rare cases, the timekeeper CPU may be delaying its jiffies update duty for a while. Known causes include:
* The timekeeper is waiting on stop_machine in a MULTI_STOP_DISABLE_IRQ or MULTI_STOP_RUN state. Disabled interrupts prevent timekeeping updates while waiting for the target CPU to complete its stop_machine() callback.
* The timekeeper vcpu has VMEXIT'ed for a long while due to some overload on the host.
Detect and fix these situations with emergency timekeeping catchups.
Original-patch-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Frederic Weisbecker frederic@kernel.org Cc: Thomas Gleixner tglx@linutronix.de Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org --- kernel/time/tick-sched.c | 17 +++++++++++++++++ kernel/time/tick-sched.h | 4 ++++ 2 files changed, 21 insertions(+)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f42d0776bc84..7701c720dc1f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -180,6 +180,8 @@ static ktime_t tick_init_jiffy_update(void) return period; }
+#define MAX_STALLED_JIFFIES 5 + static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -207,6 +209,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now);
+ /* + * If jiffies update stalled for too long (timekeeper in stop_machine() + * or VMEXIT'ed for several msecs), force an update. + */ + if (ts->last_tick_jiffies != jiffies) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } else { + if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { + tick_do_update_jiffies64(now); + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } + } + if (ts->inidle) ts->got_idle_tick = 1; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index d952ae393423..504649513399 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -49,6 +49,8 @@ enum tick_nohz_mode { * @timer_expires_base: Base time clock monotonic for @timer_expires * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick + * @last_tick_jiffies: Value of jiffies seen on last tick + * @stalled_jiffies: Number of stalled jiffies detected across ticks */ struct tick_sched { struct hrtimer sched_timer; @@ -77,6 +79,8 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; atomic_t tick_dep_mask; + unsigned long last_tick_jiffies; + unsigned int stalled_jiffies; };
extern struct tick_sched *tick_get_tick_sched(int cpu);
From: Nicholas Piggin npiggin@gmail.com
[ Upstream commit 5417ddc1cf1f5c8cba31ab217cf57ada7ab6ea88 ]
When tick_nohz_stop_tick() stops the tick and high resolution timers are disabled, then the clock event device is not put into ONESHOT_STOPPED mode. This can lead to spurious timer interrupts with some clock event device drivers that don't shut down entirely after firing.
Eliminate these by putting the device into ONESHOT_STOPPED mode at points where it is not being reprogrammed. When there are no timers active, then tick_program_event() with KTIME_MAX can be used to stop the device. When there is a timer active, the device can be stopped at the next tick (any new timer added by timers will reprogram the tick).
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Link: https://lore.kernel.org/r/20220422141446.915024-1-npiggin@gmail.com Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org --- kernel/time/tick-sched.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7701c720dc1f..5786e2794ae1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -950,6 +950,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; }
@@ -1356,9 +1358,15 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs);
- /* No need to reprogram if we are running tickless */ - if (unlikely(ts->tick_stopped)) + if (unlikely(ts->tick_stopped)) { + /* + * The clockevent device is not reprogrammed, so change the + * clock event device to ONESHOT_STOPPED to avoid spurious + * interrupts on devices which might not be truly one shot. + */ + tick_program_event(KTIME_MAX, 1); return; + }
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
From: Frederic Weisbecker frederic@kernel.org
[ Upstream commit 53e87e3cdc155f20c3417b689df8d2ac88d79576 ]
When at least one CPU runs in nohz_full mode, a dedicated timekeeper CPU is guaranteed to stay online and to never stop its tick.
Meanwhile on some rare case, the dedicated timekeeper may be running with interrupts disabled for a while, such as in stop_machine.
If jiffies stop being updated, a nohz_full CPU may end up endlessly programming the next tick in the past, taking the last jiffies update monotonic timestamp as a stale base, resulting in a tick storm.
Here is a scenario where it matters:
0) CPU 0 is the timekeeper and CPU 1 a nohz_full CPU.
1) A stop machine callback is queued to execute somewhere.
2) CPU 0 reaches MULTI_STOP_DISABLE_IRQ while CPU 1 is still in MULTI_STOP_PREPARE. Hence CPU 0 can't do its timekeeping duty. CPU 1 can still take IRQs.
3) CPU 1 receives an IRQ which queues a timer callback one jiffy forward.
4) On IRQ exit, CPU 1 schedules the tick one jiffy forward, taking last_jiffies_update as a base. But last_jiffies_update hasn't been updated for 2 jiffies since the timekeeper has interrupts disabled.
5) clockevents_program_event(), which relies on ktime_get(), observes that the expiration is in the past and therefore programs the min delta event on the clock.
6) The tick fires immediately, goto 3)
7) Tick storm, the nohz_full CPU is drowned and takes ages to reach MULTI_STOP_DISABLE_IRQ, which is the only way out of this situation.
Solve this by unconditionally updating jiffies if the value is stale on nohz_full IRQ entry. IRQs and other disturbances are expected to be rare enough on nohz_full for the unconditional call to ktime_get() to actually matter.
Reported-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Frederic Weisbecker frederic@kernel.org Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Paul E. McKenney paulmck@kernel.org Link: https://lore.kernel.org/r/20211026141055.57358-2-frederic@kernel.org Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org --- kernel/softirq.c | 3 ++- kernel/time/tick-sched.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/kernel/softirq.c b/kernel/softirq.c index 322b65d45676..41f470929e99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -595,7 +595,8 @@ void irq_enter_rcu(void) { __irq_enter_raw();
- if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) tick_irq_enter();
account_hardirq_enter(current); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5786e2794ae1..7f5310d1a4d6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1420,6 +1420,13 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); }
Hi, Unfortunately, one of my tests showed the following after 10 minutes of running the TREE01 scenario (even though TREE04 got fixed). Let us hold off on these 3 patches. I need to be sure there's absolutely no new issue introduced. So more work to do.
Thank you!
[ 667.823113] CPU: 2 PID: 27 Comm: migration/2 Not tainted 5.15.126-rc1+ #26 [ 667.904999] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 668.063061] Stopper: multi_cpu_stop+0x0/0x170 <- stop_machine_from_inactive_cpu+0x118/0x170 [ 668.179708] RIP: 0010:multi_cpu_stop+0x150/0x170 [ 668.231838] Code: 35 49 dd a1 01 48 c7 c7 60 51 95 bc 49 c7 c6 60 51 95 bc e8 62 c0 31 00 39 44 24 04 41 0f 94 c7 e9 ed fe ff ff e8 10 82 fc ff <48> 8b 0d 89 69 0e 45 8b 74 24 04 48 c7 c7 6d c4 35 bc 48 29 c8 48 [ 668.563146] RSP: 0000:ffffb28f0023be68 EFLAGS: 00010216 [ 668.646854] RAX: 0000009b1b04be30 RBX: 0000000000000001 RCX: 0000000000000017 [ 668.733241] RDX: 00000ef798000000 RSI: 00000000000e4546 RDI: 0001d5f268800000 [ 668.851846] RBP: ffffb28f000c7e90 R08: 0000009c17aaf41a R09: 7fffffffffffffff [ 668.966445] R10: 0000009aa37f6a00 R11: 00000000012679f7 R12: 000000000001e7c0 [ 669.056865] R13: 0000000000000002 R14: ffff8ec49ffb4f00 R15: ffffb28f000c7e01 [ 669.172537] FS: 0000000000000000(0000) GS:ffff8ec49ea80000(0000) knlGS:0000000000000000 [ 669.318440] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 669.395012] CR2: 000000000001e7c0 CR3: 0000000019e0c000 CR4: 00000000000006e0 [ 669.505733] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 669.588303] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 669.702863] Call Trace: [ 669.742668] <TASK> [ 669.789840] ? __die_body.cold+0x1a/0x1f [ 669.836098] ? page_fault_oops+0xcd/0x210 [ 669.922700] ? exc_page_fault+0x60/0x140 [ 669.968411] ? asm_exc_page_fault+0x22/0x30 [ 670.012394] ? multi_cpu_stop+0x150/0x170 [ 670.115211] ? multi_cpu_stop+0x150/0x170 [ 670.153099] ? stop_machine_yield+0x10/0x10 [ 670.236328] cpu_stopper_thread+0x85/0x130 [ 670.288173] smpboot_thread_fn+0x183/0x220 [ 670.398452] ? smpboot_register_percpu_thread+0xd0/0xd0 [ 670.482244] kthread+0x12d/0x160 [ 670.527783] ? 
set_kthread_struct+0x40/0x40 [ 670.606296] ret_from_fork+0x22/0x30 [ 670.651658] </TASK> [ 670.695138] Modules linked in: [ 670.740198] CR2: 000000000001e7c0
On Thu, Aug 10, 2023 at 6:31 PM Joel Fernandes (Google) joel@joelfernandes.org wrote:
From: Frederic Weisbecker frederic@kernel.org
[ Upstream commit a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd ]
tick: Detect and fix jiffies update stall
On some rare cases, the timekeeper CPU may be delaying its jiffies update duty for a while. Known causes include:
The timekeeper is waiting on stop_machine in a MULTI_STOP_DISABLE_IRQ or MULTI_STOP_RUN state. Disabled interrupts prevent from timekeeping updates while waiting for the target CPU to complete its stop_machine() callback.
The timekeeper vcpu has VMEXIT'ed for a long while due to some overload on the host.
Detect and fix these situations with emergency timekeeping catchups.
Original-patch-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Frederic Weisbecker frederic@kernel.org Cc: Thomas Gleixner tglx@linutronix.de Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
kernel/time/tick-sched.c | 17 +++++++++++++++++ kernel/time/tick-sched.h | 4 ++++ 2 files changed, 21 insertions(+)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f42d0776bc84..7701c720dc1f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -180,6 +180,8 @@ static ktime_t tick_init_jiffy_update(void) return period; }
+#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -207,6 +209,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now);
/*
* If jiffies update stalled for too long (timekeeper in stop_machine()
* or VMEXIT'ed for several msecs), force an update.
*/
if (ts->last_tick_jiffies != jiffies) {
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
tick_do_update_jiffies64(now);
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
}
}
if (ts->inidle) ts->got_idle_tick = 1;
} diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index d952ae393423..504649513399 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -49,6 +49,8 @@ enum tick_nohz_mode {
- @timer_expires_base: Base time clock monotonic for @timer_expires
- @next_timer: Expiry time of next expiring timer for debugging purpose only
- @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- @last_tick_jiffies: Value of jiffies seen on last tick
- @stalled_jiffies: Number of stalled jiffies detected across ticks
*/
struct tick_sched { struct hrtimer sched_timer; @@ -77,6 +79,8 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; atomic_t tick_dep_mask;
unsigned long last_tick_jiffies;
unsigned int stalled_jiffies;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
2.41.0.640.ga95def55d0-goog
On Thu, Aug 10, 2023 at 11:14 PM Joel Fernandes joel@joelfernandes.org wrote:
Hi, Unfortunately, one of my tests showed the following after 10 minutes of running the TREE01 scenario (even though TREE04 got fixed). Let us hold off on these 3 patches. I need to be sure there's absolutely no new issue introduced. So more work to do.
Thank you!
And heh, this could well be one of my "debug patches" in stop machine code that is misbehaving. In any case, I'll do more long running tests with the fixes before posting them again.
Thank you all,
- Joel
[ 667.823113] CPU: 2 PID: 27 Comm: migration/2 Not tainted 5.15.126-rc1+ #26 [ 667.904999] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 668.063061] Stopper: multi_cpu_stop+0x0/0x170 <- stop_machine_from_inactive_cpu+0x118/0x170 [ 668.179708] RIP: 0010:multi_cpu_stop+0x150/0x170 [ 668.231838] Code: 35 49 dd a1 01 48 c7 c7 60 51 95 bc 49 c7 c6 60 51 95 bc e8 62 c0 31 00 39 44 24 04 41 0f 94 c7 e9 ed fe ff ff e8 10 82 fc ff <48> 8b 0d 89 69 0e 45 8b 74 24 04 48 c7 c7 6d c4 35 bc 48 29 c8 48 [ 668.563146] RSP: 0000:ffffb28f0023be68 EFLAGS: 00010216 [ 668.646854] RAX: 0000009b1b04be30 RBX: 0000000000000001 RCX: 0000000000000017 [ 668.733241] RDX: 00000ef798000000 RSI: 00000000000e4546 RDI: 0001d5f268800000 [ 668.851846] RBP: ffffb28f000c7e90 R08: 0000009c17aaf41a R09: 7fffffffffffffff [ 668.966445] R10: 0000009aa37f6a00 R11: 00000000012679f7 R12: 000000000001e7c0 [ 669.056865] R13: 0000000000000002 R14: ffff8ec49ffb4f00 R15: ffffb28f000c7e01 [ 669.172537] FS: 0000000000000000(0000) GS:ffff8ec49ea80000(0000) knlGS:0000000000000000 [ 669.318440] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 669.395012] CR2: 000000000001e7c0 CR3: 0000000019e0c000 CR4: 00000000000006e0 [ 669.505733] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 669.588303] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 669.702863] Call Trace: [ 669.742668] <TASK> [ 669.789840] ? __die_body.cold+0x1a/0x1f [ 669.836098] ? page_fault_oops+0xcd/0x210 [ 669.922700] ? exc_page_fault+0x60/0x140 [ 669.968411] ? asm_exc_page_fault+0x22/0x30 [ 670.012394] ? multi_cpu_stop+0x150/0x170 [ 670.115211] ? multi_cpu_stop+0x150/0x170 [ 670.153099] ? stop_machine_yield+0x10/0x10 [ 670.236328] cpu_stopper_thread+0x85/0x130 [ 670.288173] smpboot_thread_fn+0x183/0x220 [ 670.398452] ? smpboot_register_percpu_thread+0xd0/0xd0 [ 670.482244] kthread+0x12d/0x160 [ 670.527783] ? 
set_kthread_struct+0x40/0x40 [ 670.606296] ret_from_fork+0x22/0x30 [ 670.651658] </TASK> [ 670.695138] Modules linked in: [ 670.740198] CR2: 000000000001e7c0
On Thu, Aug 10, 2023 at 6:31 PM Joel Fernandes (Google) joel@joelfernandes.org wrote:
From: Frederic Weisbecker frederic@kernel.org
[ Upstream commit a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd ]
tick: Detect and fix jiffies update stall
On some rare cases, the timekeeper CPU may be delaying its jiffies update duty for a while. Known causes include:
The timekeeper is waiting on stop_machine in a MULTI_STOP_DISABLE_IRQ or MULTI_STOP_RUN state. Disabled interrupts prevent from timekeeping updates while waiting for the target CPU to complete its stop_machine() callback.
The timekeeper vcpu has VMEXIT'ed for a long while due to some overload on the host.
Detect and fix these situations with emergency timekeeping catchups.
Original-patch-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Frederic Weisbecker frederic@kernel.org Cc: Thomas Gleixner tglx@linutronix.de Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org
kernel/time/tick-sched.c | 17 +++++++++++++++++ kernel/time/tick-sched.h | 4 ++++ 2 files changed, 21 insertions(+)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f42d0776bc84..7701c720dc1f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -180,6 +180,8 @@ static ktime_t tick_init_jiffy_update(void) return period; }
+#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -207,6 +209,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now);
/*
* If jiffies update stalled for too long (timekeeper in stop_machine()
* or VMEXIT'ed for several msecs), force an update.
*/
if (ts->last_tick_jiffies != jiffies) {
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
tick_do_update_jiffies64(now);
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
}
}
if (ts->inidle) ts->got_idle_tick = 1;
} diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index d952ae393423..504649513399 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -49,6 +49,8 @@ enum tick_nohz_mode {
- @timer_expires_base: Base time clock monotonic for @timer_expires
- @next_timer: Expiry time of next expiring timer for debugging purpose only
- @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- @last_tick_jiffies: Value of jiffies seen on last tick
- @stalled_jiffies: Number of stalled jiffies detected across ticks
*/
struct tick_sched { struct hrtimer sched_timer; @@ -77,6 +79,8 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; atomic_t tick_dep_mask;
unsigned long last_tick_jiffies;
unsigned int stalled_jiffies;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
2.41.0.640.ga95def55d0-goog
linux-stable-mirror@lists.linaro.org