The WakeOvfIsDeferred code path in __call_rcu_nocb_wake() attempts to wake the rcuog kthread when the callback count exceeds qhimark and the callbacks are not yet done with their grace period (newly queued or still awaiting a GP). However, testing shows that this wakeup is always redundant or useless.
In the flooding case, rcuog is already waiting for a GP to finish, so waking it is pointless: the timer wakeup only adds overhead, and rcuog simply wakes up and goes back to sleep, achieving nothing.
This path also incurs a full memory barrier and extra timer-expiry manipulation, both unnecessarily.
The root cause is that WakeOvfIsDeferred fires when !rcu_segcblist_ready_cbs() (GP not complete), but waking rcuog cannot accelerate GP completion.
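For reference, the pre-patch overload check in question reads as follows (quoted from the tree_nocb.h hunk below; rdp_gp points at this CPU's rcuog leader):

	smp_mb(); /* Enqueue before timer_pending(). */
	if ((rdp->nocb_cb_sleep ||
	     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
	    !timer_pending(&rdp_gp->nocb_timer)) {
		rcu_nocb_unlock(rdp);
		wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
				   TPS("WakeOvfIsDeferred"));
	}

Nothing in this block can hasten GP completion: wake_nocb_gp_defer() only (re)arms the rcuog deferred-wakeup timer.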
Therefore, remove this code path.
Tested with the rcutorture scenarios TREE01, TREE05, and TREE08 (all NOCB configurations); all pass. Also stress-tested with a kernel module that floods call_rcu() to trigger the overload condition, with observations confirming the above findings.
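The flooding module itself is not included in this patch; a minimal sketch of the idea (module name, loop count, and item layout are illustrative only) could look like:

	// Hypothetical call_rcu() flooder: queue far more callbacks than
	// qhimark so that __call_rcu_nocb_wake() hits its overload branch.
	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct flood_item {
		struct rcu_head rh;
	};

	static void flood_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct flood_item, rh));
	}

	static int __init rcu_flood_init(void)
	{
		int i;

		/* qhimark defaults to 10000; overshoot it comfortably. */
		for (i = 0; i < 100000; i++) {
			struct flood_item *item =
				kmalloc(sizeof(*item), GFP_KERNEL);

			if (!item)
				break;
			call_rcu(&item->rh, flood_cb);
		}
		return 0;
	}

	static void __exit rcu_flood_exit(void)
	{
		rcu_barrier(); /* Flush callbacks before unload. */
	}

	module_init(rcu_flood_init);
	module_exit(rcu_flood_exit);
	MODULE_LICENSE("GPL");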
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/rcu/tree.h      |  1 -
 kernel/rcu/tree_nocb.h | 35 +++++++++--------------------------
 2 files changed, 9 insertions(+), 27 deletions(-)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2265b9c2906e..653fb4ba5852 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -301,7 +301,6 @@ struct rcu_data {
 #define RCU_NOCB_WAKE_BYPASS	1
 #define RCU_NOCB_WAKE_LAZY	2
 #define RCU_NOCB_WAKE		3
-#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e6cd56603cad..daff2756cd90 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -518,10 +518,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 }
 
 /*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
+ * Awaken the no-CBs grace-period kthread if needed due to it legitimately
+ * being asleep.
  */
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 				 unsigned long flags)
@@ -533,7 +531,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	long lazy_len;
 	long len;
 	struct task_struct *t;
-	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
 	// If we are being polled or there is no kthread, just leave.
 	t = READ_ONCE(rdp->nocb_gp_kthread);
@@ -549,22 +546,22 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	lazy_len = READ_ONCE(rdp->lazy_len);
 	if (was_alldone) {
 		rdp->qlen_last_fqs_check = len;
+		rcu_nocb_unlock(rdp);
 		// Only lazy CBs in bypass list
 		if (lazy_len && bypass_len == lazy_len) {
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
 					   TPS("WakeLazy"));
 		} else if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp(rdp, false);
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("WakeEmpty"));
 		} else {
-			rcu_nocb_unlock(rdp);
 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
 					   TPS("WakeEmptyIsDeferred"));
 		}
+
+		return;
 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
 		/* ... or if many callbacks queued. */
 		rdp->qlen_last_fqs_check = len;
@@ -575,21 +572,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
-		smp_mb(); /* Enqueue before timer_pending(). */
-		if ((rdp->nocb_cb_sleep ||
-		     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
-		    !timer_pending(&rdp_gp->nocb_timer)) {
-			rcu_nocb_unlock(rdp);
-			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
-					   TPS("WakeOvfIsDeferred"));
-		} else {
-			rcu_nocb_unlock(rdp);
-			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
-		}
-	} else {
-		rcu_nocb_unlock(rdp);
-		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 	}
+
+	rcu_nocb_unlock(rdp);
+	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 }
 
 static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
@@ -966,7 +952,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
 					   unsigned long flags)
 	__releases(rdp_gp->nocb_gp_lock)
 {
-	int ndw;
 	int ret;
 
 	if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
@@ -974,8 +959,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
 		return false;
 	}
 
-	ndw = rdp_gp->nocb_defer_wakeup;
-	ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+	ret = __wake_nocb_gp(rdp_gp, rdp, false, flags);
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
 
 	return ret;
@@ -991,7 +975,6 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
 
 	raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
-	smp_mb__after_spinlock(); /* Timer expire before wakeup. */
 	do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
 }