Turns out a cfs_rq->runtime_remaining can become positive in assign_cfs_rq_runtime(), but this codepath has no call to unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with positive ->runtime_remaining, which breaks the math in distribute_cfs_runtime(): this function expects a negative value so that it may safely negate it into a positive value.
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where we expect negative values, and pull in a comment from the mailing list that didn't make it in [1].
[1]: https://lkml.kernel.org/r/BANLkTi=NmCxKX6EbDQcJYDJ5kKyG2N1ssw@mail.gmail.com
Cc: stable@vger.kernel.org
Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
 kernel/sched/fair.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1054d2cf6aaa..219ff3f328e5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4411,6 +4416,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 	cfs_rq->runtime_remaining += amount;
 
+	if (cfs_rq->runtime_remaining > 0 && cfs_rq_throttled(cfs_rq))
+		unthrottle_cfs_rq(cfs_rq);
+
 	return cfs_rq->runtime_remaining > 0;
 }
 
@@ -4439,11 +4447,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
-}
-
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -4628,6 +4631,10 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
+		/* By the above check, this should never be true */
+		WARN_ON(cfs_rq->runtime_remaining > 0);
+
+		/* Pick the minimum amount to return to a positive quota state */
 		runtime = -cfs_rq->runtime_remaining + 1;
 		if (runtime > remaining)
 			runtime = remaining;
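To make the breakage concrete, here is a standalone illustration of the arithmetic in that last hunk (plain C with made-up values, not kernel code; ->runtime_remaining is s64 in the kernel, and the pool handed to distribute_cfs_runtime() is u64 per the signature above):

#include <stdio.h>
#include <stdint.h>

/*
 * Standalone illustration (not kernel code) of what distribute_cfs_runtime()
 * computes per throttled cfs_rq.  The values are made up.
 */
static void hand_out(int64_t runtime_remaining, uint64_t remaining)
{
	/* minimum amount that would bring the cfs_rq back to positive quota */
	uint64_t runtime = -runtime_remaining + 1;

	if (runtime > remaining)
		runtime = remaining;

	printf("runtime_remaining=%lld -> hand out %llu of a %llu pool\n",
	       (long long)runtime_remaining,
	       (unsigned long long)runtime,
	       (unsigned long long)remaining);
}

int main(void)
{
	/* expected case: throttled cfs_rq with negative remaining quota */
	hand_out(-3000, 5000000);	/* hands out 3001 */

	/*
	 * broken case: throttled cfs_rq that already has positive quota;
	 * -runtime_remaining + 1 is negative, wraps to a huge u64 and is
	 * clamped to the whole pool, leaving nothing for the other
	 * throttled cfs_rqs in the list.
	 */
	hand_out(500, 5000000);		/* hands out 5000000 */
	return 0;
}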
On Tue, Aug 20, 2019 at 11:54:20AM +0100, Valentin Schneider wrote:
Turns out a cfs_rq->runtime_remaining can become positive in assign_cfs_rq_runtime(), but this codepath has no call to unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with positive ->runtime_remaining, which breaks the math in distribute_cfs_runtime(): this function expects a negative value so that it may safely negate it into a positive value.
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where we expect negative values, and pull in a comment from the mailing list that didn't make it in [1].
Cc: stable@vger.kernel.org
Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Thanks!
Peter Zijlstra peterz@infradead.org writes:
On Tue, Aug 20, 2019 at 11:54:20AM +0100, Valentin Schneider wrote:
Turns out a cfs_rq->runtime_remaining can become positive in assign_cfs_rq_runtime(), but this codepath has no call to unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with positive ->runtime_remaining, which breaks the math in distribute_cfs_runtime(): this function expects a negative value so that it may safely negate it into a positive value.
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where we expect negative values, and pull in a comment from the mailing list that didn't make it in [1].
This didn't exist because it's not supposed to be possible to call account_cfs_rq_runtime on a throttled cfs_rq at all, so that's the invariant being violated. Do you know what the code path causing this looks like?
This would allow both list del and add while distribute is doing a foreach, but I think that the racing behavior would just be to restart the distribute loop, which is fine.
Valentin Schneider valentin.schneider@arm.com writes:
Turns out a cfs_rq->runtime_remaining can become positive in assign_cfs_rq_runtime(), but this codepath has no call to unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with positive ->runtime_remaining, which breaks the math in distribute_cfs_runtime(): this function expects a negative value so that it may safely negate it into a positive value.
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where we expect negative values, and pull in a comment from the mailing list that didn't make it in [1].
Cc: stable@vger.kernel.org
Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Having now seen the rest of the thread:
Could you send the repro, as it doesn't seem to have reached lkml, so that I can confirm my guess as to what's going on?
It seems most likely we throttle during one of the remove-change-adds in set_cpus_allowed and friends or during the put half of pick_next_task followed by idle balance to drop the lock. Then distribute races with a later assign_cfs_rq_runtime so that the account finds runtime in the cfs_b.
Re clock_task, it's only frozen for the purposes of pelt, not delta_exec
The other possible way to fix this would be to skip assign if throttled, since the only time it could succeed is if we're racing with a distribute that will unthrottle us anyway.
The main advantage of that is avoiding the risk of screwy behavior due to unthrottling in the middle of pick_next/put_prev. The disadvantage is that we give up what unthrottling here buys us: we already have the lock, and if it works we don't need an IPI to trigger a preempt, etc. (But I think one of the issues is that we may trigger the preempt on the previous task, not the next, and I'm not 100% sure that will carry over correctly.)
On 22/08/2019 19:48, bsegall@google.com wrote:
Having now seen the rest of the thread:
Could you send the repro, as it doesn't seem to have reached lkml, so that I can confirm my guess as to what's going on?
Huh, odd. Here's the thing:
delay.c:

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>

unsigned long NUM_LOOPS = 2500000*250;

/* simple loop based delay: */
static void delay_loop(unsigned long loops)
{
	asm volatile(
		"	test %0,%0	\n"
		"	jz 3f		\n"
		"	jmp 1f		\n"

		".align 16		\n"
		"1:	jmp 2f		\n"

		".align 16		\n"
		"2:	dec %0		\n"
		"	jnz 2b		\n"
		"3:	dec %0		\n"

		: /* we don't need output */
		: "a" (loops)
	);
}

void __const_udelay(unsigned long xloops)
{
	int d0;

	xloops *= 4;
	asm("mull %%edx"
		: "=d" (xloops), "=&a" (d0)
		: "1" (xloops), "0" (NUM_LOOPS));

	delay_loop(++xloops);
}

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}

static void *thread_start(void *arg)
{
	while (1) {
		__udelay((unsigned long)arg);
		usleep(7000);
	}
}

int main(int argc, char *argv[])
{
	int i;
	int thread;
	unsigned long timeout;
	pthread_t new_th;

	if (argc != 3) {
		printf("./delay nr_thread work_loop\n");
		exit(-1);
	}

	thread = atoi(argv[1]);
	timeout = (unsigned long)atoi(argv[2]);

	for (i = 0; i < thread; i++) {
		pthread_create(&new_th, NULL, thread_start, (void *)timeout);
		usleep(100);
	}

	while (1) {
		sleep(10);
	}
}
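The delay binary used by the script below can presumably be built along these lines on x86 (the exact command is my assumption, it was not part of the posting):

	gcc -O2 -pthread -o delay delay.c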
do-delay.sh:

#!/bin/bash

mkdir /sys/fs/cgroup/cpu/test1
echo 100000 > /sys/fs/cgroup/cpu/cpu.cfs_period_us
echo 1600000 > /sys/fs/cgroup/cpu/test1/cpu.cfs_quota_us
./delay 500 1000 &
echo $! > /sys/fs/cgroup/cpu/test1/cgroup.procs

mkdir /sys/fs/cgroup/cpu/test2
echo 100000 > /sys/fs/cgroup/cpu/test2/cpu.cfs_period_us
echo 800000 > /sys/fs/cgroup/cpu/test2/cpu.cfs_quota_us
./delay 500 1000 &
echo $! > /sys/fs/cgroup/cpu/test2/cgroup.procs

prev=0; while true; do curr=`cat /sys/fs/cgroup/cpu/test1/cpuacct.usage` && echo $(($curr-$prev)) && prev=$curr && sleep 1; done
I ran the thing on a dual-socket x86 test box and could trigger the issue quite rapidly (w/ the WARN_ON in distribute_cfs_runtime()).
It seems most likely we throttle during one of the remove-change-adds in set_cpus_allowed and friends or during the put half of pick_next_task followed by idle balance to drop the lock. Then distribute races with a later assign_cfs_rq_runtime so that the account finds runtime in the cfs_b.
I should still have a trace laying around, let me have a look.
Re clock_task, it's only frozen for the purposes of pelt, not delta_exec
Noted, thanks. But then we shouldn't expect throttled rq's to call into update_curr(), right? Maybe just right after they've been throttled, but not beyond that. Otherwise I fail to see how that would make sense.
The other possible way to fix this would be to skip assign if throttled, since the only time it could succeed is if we're racing with a distribute that will unthrottle us anyway.
So pretty much the change Liangyan originally proposed? (so much for trying to help :p)
The main advantage of that is avoiding the risk of screwy behavior due to unthrottling in the middle of pick_next/put_prev. The disadvantage is that we give up what unthrottling here buys us: we already have the lock, and if it works we don't need an IPI to trigger a preempt, etc. (But I think one of the issues is that we may trigger the preempt on the previous task, not the next, and I'm not 100% sure that will carry over correctly.)
On 22/08/2019 21:40, Valentin Schneider wrote:
On 22/08/2019 19:48, bsegall@google.com wrote:
Re we shouldn't get account_cfs_rq_runtime() called on throttled cfs_rq's, with this:

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 171eef3f08f9..1acb88024cad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4385,6 +4385,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4411,6 +4416,8 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 	cfs_rq->runtime_remaining += amount;
 
+	WARN_ON(cfs_rq_throttled(cfs_rq) && cfs_rq->runtime_remaining > 0);
+
 	return cfs_rq->runtime_remaining > 0;
 }
 
@@ -4436,12 +4443,9 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
 
-	__account_cfs_rq_runtime(cfs_rq, delta_exec);
-}
+	WARN_ON(cfs_rq_throttled(cfs_rq));
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 /* check whether cfs_rq, or any parent, is throttled */
---
I get this:
[  204.798643] Call Trace:
[  204.798645]  put_prev_entity+0x8d/0x100
[  204.798647]  put_prev_task_fair+0x22/0x40
[  204.798648]  pick_next_task_idle+0x36/0x50
[  204.798650]  __schedule+0x61d/0x6c0
[  204.798651]  schedule+0x2d/0x90
[  204.798653]  exit_to_usermode_loop+0x61/0x100
[  204.798654]  prepare_exit_to_usermode+0x91/0xa0
[  204.798656]  retint_user+0x8/0x8
(this is a hit on the account_cfs_rq_runtime() WARN_ON)
Resend.
Sorry that my previous email had a formatting issue.
On 2019/8/23 2:48 AM, bsegall@google.com wrote:
Valentin Schneider valentin.schneider@arm.com writes:
Turns out a cfs_rq->runtime_remaining can become positive in assign_cfs_rq_runtime(), but this codepath has no call to unthrottle_cfs_rq().
This can leave us in a situation where we have a throttled cfs_rq with positive ->runtime_remaining, which breaks the math in distribute_cfs_runtime(): this function expects a negative value so that it may safely negate it into a positive value.
Add the missing unthrottle_cfs_rq(). While at it, add a WARN_ON where we expect negative values, and pull in a comment from the mailing list that didn't make it in [1].
Cc: stable@vger.kernel.org
Fixes: ec12cb7f31e2 ("sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth")
Reported-by: Liangyan <liangyan.peng@linux.alibaba.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Having now seen the rest of the thread:
Could you send the repro, as it doesn't seem to have reached lkml, so that I can confirm my guess as to what's going on?
It seems most likely we throttle during one of the remove-change-adds in set_cpus_allowed and friends or during the put half of pick_next_task followed by idle balance to drop the lock. Then distribute races with a later assign_cfs_rq_runtime so that the account finds runtime in the cfs_b.
pick_next_task_fair() calls update_curr(), which tries to refill but gets zero runtime because cfs_b->runtime is 0; the current cfs_rq is then throttled because cfs_rq->runtime_remaining has hit 0. pick_next_task_fair() then calls put_prev_entity(), which goes through __account_cfs_rq_runtime() and does get new runtime assigned, because dequeue_entity() on another CPU has just called return_cfs_rq_runtime() and returned some runtime to cfs_b->runtime.

Please check the debug log below to trace this logic.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f..0da556c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4395,6 +4395,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
+	if (cfs_rq->throttled && smp_processor_id()==20) {
+		pr_err("__account_cfs_rq_runtime:cfs_rq=%p,comm=%s,pid=%d,cpu=%d\n",cfs_rq,current->comm,current->pid,smp_processor_id());
+		dump_stack();
+	}
+	//if (cfs_rq->throttled)
+	//	return;
 	/*
 	 * if we're unable to extend our runtime we resched so that the active
 	 * hierarchy can be throttled
@@ -4508,6 +4514,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		sub_nr_running(rq, task_delta);
 
 	cfs_rq->throttled = 1;
+	{
+		if (cfs_rq->nr_running > 0 && smp_processor_id()==20) {
+			pr_err("throttle_cfs_rq:cfs_rq=%p,comm=%s,pid=%d,cpu=%d\n",cfs_rq,current->comm,current->pid,smp_processor_id());
+			dump_stack();
+		}
+	}
+
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	empty = list_empty(&cfs_b->throttled_cfs_rq);
[  257.406397] throttle_cfs_rq:cfs_rq=00000000b012d731,comm=delay,pid=4307,cpu=20
[  257.407251] CPU: 20 PID: 4307 Comm: delay Tainted: G        W   E     5.2.0-rc3+ #9
[  257.408795] Call Trace:
[  257.409085]  dump_stack+0x5c/0x7b
[  257.409482]  throttle_cfs_rq+0x2c3/0x2d0
[  257.409940]  check_cfs_rq_runtime+0x2f/0x50
[  257.410430]  pick_next_task_fair+0xb1/0x740
[  257.410918]  __schedule+0x104/0x670
[  257.411333]  schedule+0x33/0x90
[  257.411706]  exit_to_usermode_loop+0x6a/0xf0
[  257.412201]  prepare_exit_to_usermode+0x80/0xc0
[  257.412734]  retint_user+0x8/0x8
[  257.413114] RIP: 0033:0x4006d0
[  257.413480] Code: 7d f8 48 8b 45 f8 48 85 c0 74 24 eb 0d 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 eb 0e 0f 1f 40 00 66 2e 0f 1f 84 00 00 00 00 00 <48> ff c8 75 fb 48 ff c8 90 5d c3 55 48 89 e5 48 83 ec 18 48 89 7d
[  257.415630] RSP: 002b:00007f9b74abbe90 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13
[  257.416508] RAX: 0000000000060f7b RBX: 0000000000000000 RCX: 0000000000000000
[  257.417333] RDX: 00000000002625b3 RSI: 0000000000000000 RDI: 00000000002625b4
[  257.418155] RBP: 00007f9b74abbe90 R08: 00007f9b74abc700 R09: 00007f9b74abc700
[  257.418983] R10: 00007f9b74abc9d0 R11: 0000000000000000 R12: 00007ffee72e1afe
[  257.419813] R13: 00007ffee72e1aff R14: 00007ffee72e1b00 R15: 0000000000000000

[  257.420718] __account_cfs_rq_runtime:cfs_rq=00000000b012d731,comm=delay,pid=4307,cpu=20
[  257.421646] CPU: 20 PID: 4307 Comm: delay Tainted: G        W   E     5.2.0-rc3+ #9
[  257.422538] Call Trace:
[  257.424712]  dump_stack+0x5c/0x7b
[  257.425101]  __account_cfs_rq_runtime+0x1d7/0x1e0
[  257.425656]  put_prev_entity+0x90/0x100
[  257.426102]  pick_next_task_fair+0x334/0x740
[  257.426605]  __schedule+0x104/0x670
[  257.427013]  schedule+0x33/0x90
[  257.427384]  exit_to_usermode_loop+0x6a/0xf0
[  257.427879]  prepare_exit_to_usermode+0x80/0xc0
[  257.428406]  retint_user+0x8/0x8
[  257.428783] RIP: 0033:0x4006d0
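To follow the interleaving in one place, here is a toy single-threaded model of it (plain C, made-up unit values, crude stand-ins for the real functions; not kernel code). Replaying the steps in the order described above ends in exactly the state the traces show: a throttled cfs_rq with positive ->runtime_remaining.

#include <stdio.h>
#include <stdint.h>

/*
 * Toy, single-threaded model of the interleaving above, NOT kernel code.
 * "pool" stands in for cfs_b->runtime, "rq_state" for the per-cpu cfs_rq;
 * the slice of 5 units and the other numbers are made up.
 */
struct pool     { int64_t runtime; };
struct rq_state { int64_t runtime_remaining; int throttled; };

/* rough stand-in for __account_cfs_rq_runtime() + assign_cfs_rq_runtime() */
static void account(struct pool *cfs_b, struct rq_state *cfs_rq, int64_t delta)
{
	cfs_rq->runtime_remaining -= delta;
	if (cfs_rq->runtime_remaining > 0)
		return;

	int64_t want = 5 - cfs_rq->runtime_remaining;	/* refill up to a slice */
	int64_t got  = cfs_b->runtime < want ? cfs_b->runtime : want;

	cfs_b->runtime -= got;
	cfs_rq->runtime_remaining += got;
}

int main(void)
{
	struct pool cfs_b = { .runtime = 0 };
	struct rq_state cfs_rq = { .runtime_remaining = 1, .throttled = 0 };

	/*
	 * CPU A, pick_next_task_fair(): update_curr() uses up the last of the
	 * quota, the refill fails because the pool is empty, and
	 * check_cfs_rq_runtime() throttles the cfs_rq.
	 */
	account(&cfs_b, &cfs_rq, 1);
	if (cfs_rq.runtime_remaining <= 0)
		cfs_rq.throttled = 1;

	/* CPU B, dequeue_entity() -> return_cfs_rq_runtime(): slack goes back */
	cfs_b.runtime += 10;

	/*
	 * CPU A again, put_prev_entity() -> __account_cfs_rq_runtime(): this
	 * time the refill succeeds, so we end up throttled with positive quota.
	 */
	account(&cfs_b, &cfs_rq, 0);

	printf("throttled=%d runtime_remaining=%lld\n",
	       cfs_rq.throttled, (long long)cfs_rq.runtime_remaining);
	return 0;
}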
Re clock_task, it's only frozen for the purposes of pelt, not delta_exec
The other possible way to fix this would be to skip assign if throttled, since the only time it could succeed is if we're racing with a distribute that will unthrottle us anyway.
I posted a similar patch earlier, here: https://lkml.org/lkml/2019/8/14/1176
The main advantage of that is avoiding the risk of screwy behavior due to unthrottling in the middle of pick_next/put_prev. The disadvantage is that we give up what unthrottling here buys us: we already have the lock, and if it works we don't need an IPI to trigger a preempt, etc. (But I think one of the issues is that we may trigger the preempt on the previous task, not the next, and I'm not 100% sure that will carry over correctly.)
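For reference, that alternative (refusing to refill an already-throttled cfs_rq) is essentially the check that is commented out in the debug diff above. In __account_cfs_rq_runtime() it would look roughly like the sketch below; this is not the exact patch linked above, and it assumes cfs_rq_throttled() has been moved up as the posted patch already does:

static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	/* dock the time we just ran */
	cfs_rq->runtime_remaining -= delta_exec;

	if (likely(cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * Sketch of the alternative: a throttled cfs_rq should not try to
	 * refill itself here; the only way it can find runtime is by racing
	 * with a distribute that is about to unthrottle it anyway, so let
	 * that happen from the period timer instead.
	 */
	if (cfs_rq_throttled(cfs_rq))
		return;

	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_curr(rq_of(cfs_rq));
}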