This is a note to let you know that I've just added the patch titled
pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence(a)redhat.com>
Date: Fri, 17 Nov 2017 15:29:21 -0800
Subject: pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
From: Joe Lawrence <joe.lawrence(a)redhat.com>
commit d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c upstream.
round_pipe_size() contains a right-bit-shift expression which may
overflow; this would cause undefined results in a subsequent
roundup_pow_of_two() call.
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so:
- 4 bytes wide on 32-bit (0 to 0xffffffff)
- 8 bytes wide on 64-bit (0 to 0xffffffffffffffff)
That means that in 32-bit round_pipe_size(), nr_pages may overflow to 0:
size=0x00000000 nr_pages=0x0
size=0x00000001 nr_pages=0x1
size=0xfffff000 nr_pages=0xfffff
size=0xfffff001 nr_pages=0x0 << !
size=0xffffffff nr_pages=0x0 << !
This is bad because roundup_pow_of_two(n) is undefined when n == 0!
64-bit is not a problem: the unsigned int size is still 4 bytes wide
(as on 32-bit), and the larger, 8-byte-wide unsigned long is
sufficient to hold the largest value of the bit-shift expression:
size=0xffffffff nr_pages=0x100000
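As an aside, the wrap-around is easy to reproduce outside the kernel by
modelling the 32-bit case with uint32_t. The sketch below is only an
illustration, not part of the patch, and assumes PAGE_SHIFT = 12:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                           /* assumed 4 KiB pages */
#define PAGE_SIZE  ((uint32_t)1 << PAGE_SHIFT)

/* The old round_pipe_size() arithmetic with a 32-bit "unsigned long". */
static uint32_t nr_pages_32(uint32_t size)
{
        return (size + PAGE_SIZE - 1) >> PAGE_SHIFT;    /* may wrap to 0 */
}

int main(void)
{
        uint32_t sizes[] = { 0x0, 0x1, 0xfffff000, 0xfffff001, 0xffffffff };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("size=0x%08x nr_pages=0x%x\n",
                       (unsigned int)sizes[i],
                       (unsigned int)nr_pages_32(sizes[i]));
        return 0;
}

It prints the table above, with nr_pages collapsing to 0 for the last
two sizes.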
Modify round_pipe_size() to return 0 when nr_pages == 0 and update its
callers to handle that accordingly.
Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@red…
Signed-off-by: Joe Lawrence <joe.lawrence(a)redhat.com>
Reported-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Michael Kerrisk <mtk.manpages(a)gmail.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Dong Jinguang <dongjinguang(a)huawei.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/pipe.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fo
/*
* Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
+ * of pages. Returns 0 on error.
*/
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
+ if (size < pipe_min_size)
+ size = pipe_min_size;
+
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages == 0)
+ return 0;
+
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
@@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_in
long ret = 0;
size = round_pipe_size(arg);
+ if (size == 0)
+ return -EINVAL;
nr_pages = size >> PAGE_SHIFT;
if (!nr_pages)
@@ -1123,13 +1131,18 @@ out_revert_acct:
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos)
{
+ unsigned int rounded_pipe_max_size;
int ret;
ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
if (ret < 0 || !write)
return ret;
- pipe_max_size = round_pipe_size(pipe_max_size);
+ rounded_pipe_max_size = round_pipe_size(pipe_max_size);
+ if (rounded_pipe_max_size == 0)
+ return -EINVAL;
+
+ pipe_max_size = rounded_pipe_max_size;
return ret;
}
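For context, round_pipe_size() is reached from user space both via
fcntl(F_SETPIPE_SZ) and via the fs.pipe-max-size sysctl handled by
pipe_proc_fn() above. Below is a rough user-space sketch (an
illustration, not part of the patch) that requests a size whose page
rounding overflows on 32-bit; with this fix the call is expected to
fail with EINVAL instead of feeding 0 to roundup_pow_of_two() (other
errors such as EPERM are possible depending on privileges and the
pipe-max-size limit):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];

        if (pipe(fds))
                return 1;

        /* A size whose page rounding wraps to 0 in 32-bit arithmetic. */
        if (fcntl(fds[0], F_SETPIPE_SZ, 0xfffff001) < 0)
                perror("F_SETPIPE_SZ");  /* EINVAL expected with this fix */
        else
                printf("pipe resized to %d bytes\n",
                       fcntl(fds[0], F_GETPIPE_SZ));

        close(fds[0]);
        close(fds[1]);
        return 0;
}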
Patches currently in stable-queue which might be from joe.lawrence(a)redhat.com are
queue-4.9/pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
This is a note to let you know that I've just added the patch titled
pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence(a)redhat.com>
Date: Fri, 17 Nov 2017 15:29:21 -0800
Subject: pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
From: Joe Lawrence <joe.lawrence(a)redhat.com>
commit d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c upstream.
round_pipe_size() contains a right-bit-shift expression which may
overflow; this would cause undefined results in a subsequent
roundup_pow_of_two() call.
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so:
- 4 bytes wide on 32-bit (0 to 0xffffffff)
- 8 bytes wide on 64-bit (0 to 0xffffffffffffffff)
That means that in 32-bit round_pipe_size(), nr_pages may overflow to 0:
size=0x00000000 nr_pages=0x0
size=0x00000001 nr_pages=0x1
size=0xfffff000 nr_pages=0xfffff
size=0xfffff001 nr_pages=0x0 << !
size=0xffffffff nr_pages=0x0 << !
This is bad because roundup_pow_of_two(n) is undefined when n == 0!
64-bit is not a problem: the unsigned int size is still 4 bytes wide
(as on 32-bit), and the larger, 8-byte-wide unsigned long is
sufficient to hold the largest value of the bit-shift expression:
size=0xffffffff nr_pages=0x100000
Modify round_pipe_size() to return 0 when nr_pages == 0 and update its
callers to handle that accordingly.
Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@red…
Signed-off-by: Joe Lawrence <joe.lawrence(a)redhat.com>
Reported-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Michael Kerrisk <mtk.manpages(a)gmail.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Dong Jinguang <dongjinguang(a)huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/pipe.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1001,6 +1001,9 @@ static long pipe_set_size(struct pipe_in
{
struct pipe_buffer *bufs;
+ if (!nr_pages)
+ return -EINVAL;
+
/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* expect a lot of shrink+grow operations, just free and allocate
@@ -1045,13 +1048,19 @@ static long pipe_set_size(struct pipe_in
/*
* Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
+ * of pages. Returns 0 on error.
*/
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
+ if (size < pipe_min_size)
+ size = pipe_min_size;
+
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages == 0)
+ return 0;
+
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
@@ -1062,13 +1071,18 @@ static inline unsigned int round_pipe_si
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos)
{
+ unsigned int rounded_pipe_max_size;
int ret;
ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
if (ret < 0 || !write)
return ret;
- pipe_max_size = round_pipe_size(pipe_max_size);
+ rounded_pipe_max_size = round_pipe_size(pipe_max_size);
+ if (rounded_pipe_max_size == 0)
+ return -EINVAL;
+
+ pipe_max_size = rounded_pipe_max_size;
return ret;
}
Patches currently in stable-queue which might be from joe.lawrence(a)redhat.com are
queue-4.4/pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
This is a note to let you know that I've just added the patch titled
pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence(a)redhat.com>
Date: Fri, 17 Nov 2017 15:29:21 -0800
Subject: pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
From: Joe Lawrence <joe.lawrence(a)redhat.com>
commit d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c upstream.
round_pipe_size() contains a right-bit-shift expression which may
overflow; this would cause undefined results in a subsequent
roundup_pow_of_two() call.
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so:
- 4 bytes wide on 32-bit (0 to 0xffffffff)
- 8 bytes wide on 64-bit (0 to 0xffffffffffffffff)
That means that in 32-bit round_pipe_size(), nr_pages may overflow to 0:
size=0x00000000 nr_pages=0x0
size=0x00000001 nr_pages=0x1
size=0xfffff000 nr_pages=0xfffff
size=0xfffff001 nr_pages=0x0 << !
size=0xffffffff nr_pages=0x0 << !
This is bad because roundup_pow_of_two(n) is undefined when n == 0!
64-bit is not a problem: the unsigned int size is still 4 bytes wide
(as on 32-bit), and the larger, 8-byte-wide unsigned long is
sufficient to hold the largest value of the bit-shift expression:
size=0xffffffff nr_pages=0x100000
Modify round_pipe_size() to return 0 when nr_pages == 0 and update its
callers to handle that accordingly.
Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@red…
Signed-off-by: Joe Lawrence <joe.lawrence(a)redhat.com>
Reported-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Michael Kerrisk <mtk.manpages(a)gmail.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Dong Jinguang <dongjinguang(a)huawei.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/pipe.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fo
/*
* Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
+ * of pages. Returns 0 on error.
*/
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
+ if (size < pipe_min_size)
+ size = pipe_min_size;
+
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages == 0)
+ return 0;
+
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
@@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_in
long ret = 0;
size = round_pipe_size(arg);
+ if (size == 0)
+ return -EINVAL;
nr_pages = size >> PAGE_SHIFT;
if (!nr_pages)
@@ -1123,13 +1131,18 @@ out_revert_acct:
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos)
{
+ unsigned int rounded_pipe_max_size;
int ret;
ret = proc_douintvec_minmax(table, write, buf, lenp, ppos);
if (ret < 0 || !write)
return ret;
- pipe_max_size = round_pipe_size(pipe_max_size);
+ rounded_pipe_max_size = round_pipe_size(pipe_max_size);
+ if (rounded_pipe_max_size == 0)
+ return -EINVAL;
+
+ pipe_max_size = rounded_pipe_max_size;
return ret;
}
Patches currently in stable-queue which might be from joe.lawrence(a)redhat.com are
queue-4.14/pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
This is a note to let you know that I've just added the patch titled
pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence(a)redhat.com>
Date: Fri, 17 Nov 2017 15:29:21 -0800
Subject: pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
From: Joe Lawrence <joe.lawrence(a)redhat.com>
commit d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c upstream.
round_pipe_size() contains a right-bit-shift expression which may
overflow; this would cause undefined results in a subsequent
roundup_pow_of_two() call.
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so:
- 4 bytes wide on 32-bit (0 to 0xffffffff)
- 8 bytes wide on 64-bit (0 to 0xffffffffffffffff)
That means that in 32-bit round_pipe_size(), nr_pages may overflow to 0:
size=0x00000000 nr_pages=0x0
size=0x00000001 nr_pages=0x1
size=0xfffff000 nr_pages=0xfffff
size=0xfffff001 nr_pages=0x0 << !
size=0xffffffff nr_pages=0x0 << !
This is bad because roundup_pow_of_two(n) is undefined when n == 0!
64-bit is not a problem: the unsigned int size is still 4 bytes wide
(as on 32-bit), and the larger, 8-byte-wide unsigned long is
sufficient to hold the largest value of the bit-shift expression:
size=0xffffffff nr_pages=0x100000
Modify round_pipe_size() to return 0 when nr_pages == 0 and update its
callers to handle that accordingly.
Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@red…
Signed-off-by: Joe Lawrence <joe.lawrence(a)redhat.com>
Reported-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Michael Kerrisk <mtk.manpages(a)gmail.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Dong Jinguang <dongjinguang(a)huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/pipe.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1002,6 +1002,9 @@ static long pipe_set_size(struct pipe_in
{
struct pipe_buffer *bufs;
+ if (!nr_pages)
+ return -EINVAL;
+
/*
* We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
* expect a lot of shrink+grow operations, just free and allocate
@@ -1046,13 +1049,19 @@ static long pipe_set_size(struct pipe_in
/*
* Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
+ * of pages. Returns 0 on error.
*/
static inline unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
+ if (size < pipe_min_size)
+ size = pipe_min_size;
+
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages == 0)
+ return 0;
+
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
@@ -1063,13 +1072,18 @@ static inline unsigned int round_pipe_si
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos)
{
+ unsigned int rounded_pipe_max_size;
int ret;
ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
if (ret < 0 || !write)
return ret;
- pipe_max_size = round_pipe_size(pipe_max_size);
+ rounded_pipe_max_size = round_pipe_size(pipe_max_size);
+ if (rounded_pipe_max_size == 0)
+ return -EINVAL;
+
+ pipe_max_size = rounded_pipe_max_size;
return ret;
}
Patches currently in stable-queue which might be from joe.lawrence(a)redhat.com are
queue-3.18/pipe-avoid-round_pipe_size-nr_pages-overflow-on-32-bit.patch
On Fri, Jan 19, 2018 at 02:08:35PM +0100, David Woodhouse wrote:
> On Fri, 2018-01-19 at 13:26 +0100, Greg Kroah-Hartman wrote:
> >
> > I don't have a way to test this, I'll merge it into the existing patch
> > and push out a new tree to see how 0-day and Guenter's build-farm handle
> > it.
>
> It seems to work (and boot in qemu) here.
Ok, great, thanks for the fix.
greg k-h
This is a note to let you know that I've just added the patch titled
sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
sched-deadline-use-the-revised-wakeup-rule-for-suspending-constrained-dl-tasks.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From 3effcb4247e74a51f5d8b775a1ee4abf87cc089a Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Date: Mon, 29 May 2017 16:24:03 +0200
Subject: sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks
From: Daniel Bristot de Oliveira <bristot(a)redhat.com>
commit 3effcb4247e74a51f5d8b775a1ee4abf87cc089a upstream.
We have been facing some problems with self-suspending constrained
deadline tasks. The main reason is that the original CBS was not
designed for such sort of tasks.
One problem reported by Xunlei Pang takes place when a task
suspends, and then is awakened before the deadline, but so close
to the deadline that its remaining runtime can cause the task
to have an absolute density higher than allowed. In such a situation,
the original CBS assumes that the task is facing an early activation,
and so it replenishes the task and sets another deadline, one deadline
in the future. This rule works fine for implicit deadline tasks.
Moreover, it allows the system to adapt the period of a task whose
external event source suffered from clock drift.
However, this opens the window for bandwidth leakage for constrained
deadline tasks. For instance, a task with the following parameters:
runtime = 5 ms
deadline = 7 ms
[density] = 5 / 7 = 0.71
period = 1000 ms
If the task runs for 1 ms and then suspends for another 1 ms,
it will be awakened with the following parameters:
remaining runtime = 4
laxity = 5
presenting an absolute density of 4 / 5 = 0.80.
In this case, the original CBS would assume the task had an early
wakeup. Then, CBS will reset the runtime, and the absolute deadline will
be postponed by one relative deadline, allowing the task to run.
The problem is that, if the task runs this pattern forever, it will keep
receiving bandwidth, being able to run 1 ms every 2 ms. Following this
behavior, the task would be able to run 500 ms in 1 sec, thus running
more than the 5 ms / 1 sec that admission control allowed it to run.
Trying to address the self-suspending case, Luca Abeni, Giuseppe
Lipari, and Juri Lelli [1] revisited the CBS in order to deal with
self-suspending tasks. In the new approach, rather than
replenishing/postponing the absolute deadline, the revised wakeup rule
adjusts the remaining runtime, reducing it to fit into the allowed
density.
A revised version of the idea is:
At a given time t, the maximum absolute density of a task cannot be
higher than its relative density, that is:
runtime / (deadline - t) <= dl_runtime / dl_deadline
Knowing the laxity of a task (deadline - t), it is possible to move
it to the other side of the inequality, making it possible to define the
maximum remaining runtime a task can use within the absolute deadline
without over-running the allowed density:
runtime = (dl_runtime / dl_deadline) * (deadline - t)
For instance, in our previous example, the task could still run:
runtime = ( 5 / 7 ) * 5
runtime = 3.57 ms
Without causing damage to other deadline tasks. It is noteworthy
that the laxity cannot be negative because that would cause a negative
runtime. Thus, this patch depends on the patch:
df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline")
Which throttles a constrained deadline task activated after the
deadline.
Finally, it is also possible to use the revised wakeup rule for
all other tasks, but that would require some more discussions
about pros and cons.
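For readers wondering how the division above is carried out without
floating point: dl_density holds the dl_runtime / dl_deadline ratio in
fixed point (see to_ratio() and the '>> BW_SHIFT' in the diff below).
A stand-alone sketch redoing the 5 ms / 7 ms example follows; the
20-bit shift is an assumption used here purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20     /* assumed fixed-point scaling, illustration only */

/* runtime/deadline as a fixed-point ratio, like the kernel's to_ratio(). */
static uint64_t to_ratio(uint64_t deadline, uint64_t runtime)
{
        return (runtime << BW_SHIFT) / deadline;
}

int main(void)
{
        uint64_t dl_runtime  = 5000000ULL;      /* 5 ms, in ns */
        uint64_t dl_deadline = 7000000ULL;      /* 7 ms, in ns */
        uint64_t laxity      = 5000000ULL;      /* deadline - t = 5 ms */

        uint64_t dl_density = to_ratio(dl_deadline, dl_runtime);
        uint64_t runtime    = (dl_density * laxity) >> BW_SHIFT;

        /* Prints roughly 3571000 ns, i.e. the 3.57 ms worked out above. */
        printf("adjusted runtime = %llu ns\n", (unsigned long long)runtime);
        return 0;
}

This mirrors what update_dl_revised_wakeup() does in the diff: scale the
laxity by the task's density instead of granting a full replenishment.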
Reported-by: Xunlei Pang <xpang(a)redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot(a)redhat.com>
[peterz: replaced dl_is_constrained with dl_is_implicit]
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: Juri Lelli <juri.lelli(a)arm.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Luca Abeni <luca.abeni(a)santannapisa.it>
Cc: Mike Galbraith <efault(a)gmx.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Romulo Silva de Oliveira <romulo.deoliveira(a)ufsc.br>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta(a)sssup.it>
Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.149580380…
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/linux/sched.h | 3 -
kernel/sched/core.c | 2
kernel/sched/deadline.c | 98 ++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 90 insertions(+), 13 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1411,7 +1411,8 @@ struct sched_dl_entity {
u64 dl_runtime; /* maximum runtime for each instance */
u64 dl_deadline; /* relative deadline of each instance */
u64 dl_period; /* separation of two instances (period) */
- u64 dl_bw; /* dl_runtime / dl_deadline */
+ u64 dl_bw; /* dl_runtime / dl_period */
+ u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2183,6 +2183,7 @@ void __dl_clear_params(struct task_struc
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
@@ -3911,6 +3912,7 @@ __setparam_dl(struct task_struct *p, con
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
/*
* Changing the parameters of a task is 'tricky' and we're not doing
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -484,13 +484,84 @@ static bool dl_entity_overflow(struct sc
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -500,6 +571,14 @@ static void update_dl_entity(struct sche
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -961,11 +1040,6 @@ static void dequeue_dl_entity(struct sch
__dequeue_dl_entity(dl_se);
}
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline < dl_se->dl_period;
-}
-
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -997,7 +1071,7 @@ static void enqueue_task_dl(struct rq *r
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);
/*
Patches currently in stable-queue which might be from bristot(a)redhat.com are
queue-4.9/sched-deadline-zero-out-positive-runtime-after-throttling-constrained-tasks.patch
queue-4.9/sched-deadline-use-the-revised-wakeup-rule-for-suspending-constrained-dl-tasks.patch
This is a note to let you know that I've just added the patch titled
sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
sched-deadline-use-the-revised-wakeup-rule-for-suspending-constrained-dl-tasks.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From 3effcb4247e74a51f5d8b775a1ee4abf87cc089a Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Date: Mon, 29 May 2017 16:24:03 +0200
Subject: sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks
From: Daniel Bristot de Oliveira <bristot(a)redhat.com>
commit 3effcb4247e74a51f5d8b775a1ee4abf87cc089a upstream.
We have been facing some problems with self-suspending constrained
deadline tasks. The main reason is that the original CBS was not
designed for such sort of tasks.
One problem reported by Xunlei Pang takes place when a task
suspends, and then is awakened before the deadline, but so close
to the deadline that its remaining runtime can cause the task
to have an absolute density higher than allowed. In such a situation,
the original CBS assumes that the task is facing an early activation,
and so it replenishes the task and sets another deadline, one deadline
in the future. This rule works fine for implicit deadline tasks.
Moreover, it allows the system to adapt the period of a task whose
external event source suffered from clock drift.
However, this opens the window for bandwidth leakage for constrained
deadline tasks. For instance, a task with the following parameters:
runtime = 5 ms
deadline = 7 ms
[density] = 5 / 7 = 0.71
period = 1000 ms
If the task runs for 1 ms and then suspends for another 1 ms,
it will be awakened with the following parameters:
remaining runtime = 4
laxity = 5
presenting an absolute density of 4 / 5 = 0.80.
In this case, the original CBS would assume the task had an early
wakeup. Then, CBS will reset the runtime, and the absolute deadline will
be postponed by one relative deadline, allowing the task to run.
The problem is that, if the task runs this pattern forever, it will keep
receiving bandwidth, being able to run 1 ms every 2 ms. Following this
behavior, the task would be able to run 500 ms in 1 sec, thus running
more than the 5 ms / 1 sec that admission control allowed it to run.
Trying to address the self-suspending case, Luca Abeni, Giuseppe
Lipari, and Juri Lelli [1] revisited the CBS in order to deal with
self-suspending tasks. In the new approach, rather than
replenishing/postponing the absolute deadline, the revised wakeup rule
adjusts the remaining runtime, reducing it to fit into the allowed
density.
A revised version of the idea is:
At a given time t, the maximum absolute density of a task cannot be
higher than its relative density, that is:
runtime / (deadline - t) <= dl_runtime / dl_deadline
Knowing the laxity of a task (deadline - t), it is possible to move
it to the other side of the inequality, making it possible to define the
maximum remaining runtime a task can use within the absolute deadline
without over-running the allowed density:
runtime = (dl_runtime / dl_deadline) * (deadline - t)
For instance, in our previous example, the task could still run:
runtime = ( 5 / 7 ) * 5
runtime = 3.57 ms
Without causing damage to other deadline tasks. It is noteworthy
that the laxity cannot be negative because that would cause a negative
runtime. Thus, this patch depends on the patch:
df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline")
Which throttles a constrained deadline task activated after the
deadline.
Finally, it is also possible to use the revised wakeup rule for
all other tasks, but that would require some more discussions
about pros and cons.
Reported-by: Xunlei Pang <xpang(a)redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot(a)redhat.com>
[peterz: replaced dl_is_constrained with dl_is_implicit]
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: Juri Lelli <juri.lelli(a)arm.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Luca Abeni <luca.abeni(a)santannapisa.it>
Cc: Mike Galbraith <efault(a)gmx.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Romulo Silva de Oliveira <romulo.deoliveira(a)ufsc.br>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta(a)sssup.it>
Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.149580380…
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/linux/sched.h | 3 -
kernel/sched/core.c | 2
kernel/sched/deadline.c | 98 ++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 90 insertions(+), 13 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1312,7 +1312,8 @@ struct sched_dl_entity {
u64 dl_runtime; /* maximum runtime for each instance */
u64 dl_deadline; /* relative deadline of each instance */
u64 dl_period; /* separation of two instances (period) */
- u64 dl_bw; /* dl_runtime / dl_deadline */
+ u64 dl_bw; /* dl_runtime / dl_period */
+ u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2109,6 +2109,7 @@ void __dl_clear_params(struct task_struc
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
@@ -3647,6 +3648,7 @@ __setparam_dl(struct task_struct *p, con
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
/*
* Changing the parameters of a task is 'tricky' and we're not doing
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -480,13 +480,84 @@ static bool dl_entity_overflow(struct sc
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -505,6 +576,14 @@ static void update_dl_entity(struct sche
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -991,11 +1070,6 @@ static void dequeue_dl_entity(struct sch
__dequeue_dl_entity(dl_se);
}
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline < dl_se->dl_period;
-}
-
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -1027,7 +1101,7 @@ static void enqueue_task_dl(struct rq *r
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);
/*
Patches currently in stable-queue which might be from bristot(a)redhat.com are
queue-4.4/sched-deadline-zero-out-positive-runtime-after-throttling-constrained-tasks.patch
queue-4.4/sched-deadline-use-the-revised-wakeup-rule-for-suspending-constrained-dl-tasks.patch
On Fri, Jan 19, 2018 at 11:21:54AM +0100, David Woodhouse wrote:
> On Fri, 2018-01-19 at 10:34 +0100, David Woodhouse wrote:
> > On Thu, 2018-01-18 at 19:10 +0100, Greg Kroah-Hartman wrote:
> > >
> > > On Thu, Jan 18, 2018 at 08:41:58AM -0800, Guenter Roeck wrote:
> > > >
> > > >
> > > > Building i386:defconfig ... failed
> > > > --------------
> > > > Error log:
> > > > arch/x86/entry/entry_32.S: Assembler messages:
> > > > arch/x86/entry/entry_32.S:230: Error: too many memory references
> > > > for `mov'
> > > Ick, no good, 0-day has pointed this out as well.
> > >
> > > Razvan and David, any ideas?
> >
> > CALL_NOSPEC PT_EBX(%esp)
> >
> > That turns into a retpoline with
> >
> > mov PT_EBX(%esp), 0(%esp)
> >
> > Which is doubly wrong, because not only can't you have two memory
> > operands to a 'mov' but %esp has already *moved* by the time we get
> > here so we'd be using the wrong source anyway.
> >
> > We need to pick a victim register and load PT_EBX(%esp) into it, then
> > CALL_NOSPEC %\reg.
> >
> > We'll fix this and also the RSP-clobbering in context switch that you
> > just sent a "fails to apply" message for.
>
> Try this. Not even build tested. I think we can have %edx here, as it
> would be the second argument to the kthread function, and clobbered by
> it too.
>
> Signed-off-by-if-it-works: David Woodhouse <dwmw(a)amazon.co.uk>
>
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -227,7 +227,8 @@ ENTRY(ret_from_kernel_thread)
> pushl $0x0202 # Reset kernel eflags
> popfl
> movl PT_EBP(%esp), %eax
> - CALL_NOSPEC PT_EBX(%esp)
> + movl PT_EBX(%esp), %edx
> + CALL_NOSPEC %edx
> movl $0, PT_EAX(%esp)
>
I don't have a way to test this, I'll merge it into the existing patch
and push out a new tree to see how 0-day and Guenter's build-farm handle
it.
thanks,
greg k-h
This is a note to let you know that I've just added the patch titled
x86/tsc: Fix erroneous TSC rate on Skylake Xeon
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-tsc-fix-erroneous-tsc-rate-on-skylake-xeon.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From b511203093489eb1829cb4de86e8214752205ac6 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown(a)intel.com>
Date: Fri, 22 Dec 2017 00:27:55 -0500
Subject: x86/tsc: Fix erroneous TSC rate on Skylake Xeon
From: Len Brown <len.brown(a)intel.com>
commit b511203093489eb1829cb4de86e8214752205ac6 upstream.
The INTEL_FAM6_SKYLAKE_X hardcoded crystal_khz value of 25 MHz is
problematic:
- SKX workstations (with the same model # as the server variants) use a
24 MHz crystal. This results in a -4.0% time drift rate on SKX workstations.
- SKX servers subject the crystal to an EMI reduction circuit that reduces its
actual frequency by (approximately) -0.25%. This results in -1 second per
10 minute time drift as compared to network time.
This issue can also trigger a timer and power problem, on configurations
that use the LAPIC timer (versus the TSC deadline timer). Clock ticks
scheduled with the LAPIC timer arrive a few usec before the time they are
expected (according to the slow TSC). This causes Linux to poll-idle, when
it should be in an idle power saving state. The idle and clock code do not
graciously recover from this error, sometimes resulting in significant
polling and measurable power impact.
Stop using native_calibrate_tsc() for INTEL_FAM6_SKYLAKE_X.
native_calibrate_tsc() will return 0, boot will run with tsc_khz = cpu_khz,
and the TSC refined calibration will update tsc_khz to correct for the
difference.
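As a quick sanity check of the workstation figure above, a throwaway
sketch (not part of the patch): if the kernel assumes a 25 MHz crystal
while the part actually runs at 24 MHz, timekeeping drifts by
(24 - 25) / 25 = -4.0%:

#include <stdio.h>

int main(void)
{
        double assumed_mhz = 25.0;      /* hardcoded crystal rate for SKYLAKE_X */
        double actual_mhz  = 24.0;      /* SKX workstation crystal */

        double drift_pct = (actual_mhz - assumed_mhz) / assumed_mhz * 100.0;

        printf("workstation drift: %.1f%%\n", drift_pct);      /* -4.0% */
        printf("per 10 minutes: %.1f s\n",
               600.0 * drift_pct / 100.0);                      /* -24.0 s */
        return 0;
}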
[ tglx: Sanitized change log ]
Fixes: 6baf3d61821f ("x86/tsc: Add additional Intel CPU models to the crystal quirk list")
Signed-off-by: Len Brown <len.brown(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: peterz(a)infradead.org
Cc: Prarit Bhargava <prarit(a)redhat.com>
Link: https://lkml.kernel.org/r/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.15139204…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kernel/tsc.c | 1 -
1 file changed, 1 deletion(-)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -693,7 +693,6 @@ unsigned long native_calibrate_tsc(void)
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
- case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ATOM_DENVERTON:
crystal_khz = 25000; /* 25.0 MHz */
break;
Patches currently in stable-queue which might be from len.brown(a)intel.com are
queue-4.9/x86-tsc-fix-erroneous-tsc-rate-on-skylake-xeon.patch
This is a note to let you know that I've just added the patch titled
x86/retpoline: Fill RSB on context switch for affected CPUs
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From c995efd5a740d9cbafbf58bde4973e8b50b4d761 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw(a)amazon.co.uk>
Date: Fri, 12 Jan 2018 17:49:25 +0000
Subject: x86/retpoline: Fill RSB on context switch for affected CPUs
From: David Woodhouse <dwmw(a)amazon.co.uk>
commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream.
On context switch from a shallow call stack to a deeper one, as the CPU
does 'ret' up the deeper side it may encounter RSB entries (predictions for
where the 'ret' goes to) which were populated in userspace.
This is problematic if neither SMEP nor KPTI (the latter of which marks
userspace pages as NX for the kernel) are active, as malicious code in
userspace may then be executed speculatively.
Overwrite the CPU's return prediction stack with calls which are predicted
to return to an infinite loop, to "capture" speculation if this
happens. This is required both for retpoline, and also in conjunction with
IBRS for !SMEP && !KPTI.
On Skylake+ the problem is slightly different, and an *underflow* of the
RSB may cause errant branch predictions to occur. So there it's not so much
overwrite, as *filling* the RSB to attempt to prevent it getting
empty. This is only a partial solution for Skylake+ since there are many
other conditions which may result in the RSB becoming empty. The full
solution on Skylake+ is to use IBRS, which will prevent the problem even
when the RSB becomes empty. With IBRS, the RSB-stuffing will not be
required on context switch.
[ tglx: Added missing vendor check and slightly massaged comments and
changelog ]
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Arjan van de Ven <arjan(a)linux.intel.com>
Cc: gnomes(a)lxorguk.ukuu.org.uk
Cc: Rik van Riel <riel(a)redhat.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: thomas.lendacky(a)amd.com
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Jiri Kosina <jikos(a)kernel.org>
Cc: Andy Lutomirski <luto(a)amacapital.net>
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Kees Cook <keescook(a)google.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh(a)linux-foundation.org>
Cc: Paul Turner <pjt(a)google.com>
Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/entry/entry_32.S | 11 +++++++++++
arch/x86/entry/entry_64.S | 11 +++++++++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++
4 files changed, 59 insertions(+)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -229,6 +229,17 @@ ENTRY(__switch_to_asm)
movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popl %esi
popl %edi
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -427,6 +427,17 @@ ENTRY(__switch_to_asm)
movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popq %r15
popq %r14
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -200,6 +200,7 @@
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,6 +22,7 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
+#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
@@ -154,6 +155,23 @@ disable:
return SPECTRE_V2_CMD_NONE;
}
+/* Check for Skylake-like CPUs (for RSB handling) */
+static bool __init is_skylake_era(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ return true;
+ }
+ }
+ return false;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -212,6 +230,24 @@ retpoline_auto:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
+
+ /*
+ * If neither SMEP or KPTI are available, there is a risk of
+ * hitting userspace addresses in the RSB after a context switch
+ * from a shallow call stack to a deeper one. To prevent this fill
+ * the entire RSB, even when using IBRS.
+ *
+ * Skylake era CPUs have a separate issue with *underflow* of the
+ * RSB, when they will predict 'ret' targets from the generic BTB.
+ * The proper mitigation for this is IBRS. If IBRS is not supported
+ * or deactivated in favour of retpolines the RSB fill on context
+ * switch is required.
+ */
+ if ((!boot_cpu_has(X86_FEATURE_PTI) &&
+ !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Filling RSB on context switch\n");
+ }
}
#undef pr_fmt
Patches currently in stable-queue which might be from dwmw(a)amazon.co.uk are
queue-4.9/module-add-retpoline-tag-to-vermagic.patch
queue-4.9/x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch
queue-4.9/x86-retpoline-add-lfence-to-the-retpoline-rsb-filling-rsb-macros.patch