[PATCH 4.9 167/171] sched/fair: Fix throttle_list starvation with low CFS quota

8 Nov 2018

4.9-stable review patch.  If anyone has any objections, please let me know.
------------------
From: Phil Auld pauld@redhat.com
commit baa9be4ffb55876923dc9716abc0a448e510ba30 upstream.
With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change from
c06f04c7048 to put throttled entries at the head of the list, later entries
on the list will starve.  Essentially, the same X processes will get pulled
off the list, given CPU time and then, when expired, get put back on the
head of the list where distribute_cfs_runtime will give runtime to the same
set of processes leaving the rest.
Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can
decide to put the throttled entry on the tail or the head of the list.  The
bit is set/cleared by the callers of distribute_cfs_runtime while they hold
cfs_bandwidth->lock.
This is easy to reproduce with a handful of CPU consumers. I use 'crash' on
the live system. In some cases you can simply look at the throttled list and
see the later entries are not changing:
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -976050
    2     ffff90b56cb2cc00  -484925
    3     ffff90b56cb2bc00  -658814
    4     ffff90b56cb2ba00  -275365
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -994147
    2     ffff90b56cb2cc00  -306051
    3     ffff90b56cb2bc00  -961321
    4     ffff90b56cb2ba00  -24490
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582
Sometimes it is easier to see by finding a process getting starved and looking
at the sched_info:
crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },
  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },
Signed-off-by: Phil Auld pauld@redhat.com
Reviewed-by: Ben Segall bsegall@google.com
Cc: Linus Torvalds torvalds@linux-foundation.org
Cc: Peter Zijlstra peterz@infradead.org
Cc: Thomas Gleixner tglx@linutronix.de
Cc: stable@vger.kernel.org
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Link: http://lkml.kernel.org/r/20181008143639.GA4019@pauld.bos.csb
Signed-off-by: Ingo Molnar mingo@kernel.org
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
---
 kernel/sched/fair.c  |   22 +++++++++++++++++++---
 kernel/sched/sched.h |    2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3976,9 +3976,13 @@ static void throttle_cfs_rq(struct cfs_r
/*
     * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
+	 * not running add to the tail so that later runqueues don't get starved.
     */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
     * If we're the first throttled task, make sure the bandwidth
@@ -4121,14 +4125,16 @@ static int do_sched_cfs_period_timer(str
     * in us over-using our runtime if it is all used during this loop, but
     * only by limited amounts in that extreme case.
     */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
    	runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
    	raw_spin_unlock(&cfs_b->lock);
    	/* we can't nest cfs_b->lock while distributing bandwidth */
    	runtime = distribute_cfs_runtime(cfs_b, runtime,
    					 runtime_expires);
    	raw_spin_lock(&cfs_b->lock);
+		cfs_b->distribute_running = 0;
    	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4239,6 +4245,11 @@ static void do_sched_cfs_slack_timer(str
/* confirm we're still not at a refresh boundary */
    raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
    if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
    	raw_spin_unlock(&cfs_b->lock);
    	return;
@@ -4248,6 +4259,9 @@ static void do_sched_cfs_slack_timer(str
    	runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
    raw_spin_unlock(&cfs_b->lock);
if (!runtime)
@@ -4258,6 +4272,7 @@ static void do_sched_cfs_slack_timer(str
    raw_spin_lock(&cfs_b->lock);
    if (expires == cfs_b->runtime_expires)
    	cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
    raw_spin_unlock(&cfs_b->lock);
 }
@@ -4366,6 +4381,7 @@ void init_cfs_bandwidth(struct cfs_bandw
    cfs_b->period_timer.function = sched_cfs_period_timer;
    hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -255,6 +255,8 @@ struct cfs_bandwidth {
    /* statistics */
    int nr_periods, nr_throttled;
    u64 throttled_time;
+
+	bool distribute_running;
 #endif
 };

    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

[PATCH 4.9 167/171] sched/fair: Fix throttle_list starvation with low CFS quota